diff options
Diffstat (limited to 'sbin/hastd')
49 files changed, 15699 insertions, 0 deletions
diff --git a/sbin/hastd/Makefile b/sbin/hastd/Makefile new file mode 100644 index 0000000..7ff6ee8 --- /dev/null +++ b/sbin/hastd/Makefile @@ -0,0 +1,45 @@ +# $FreeBSD$ + +.include <bsd.own.mk> + +PROG= hastd +SRCS= activemap.c +SRCS+= control.c crc32.c +SRCS+= ebuf.c event.c +SRCS+= hast_checksum.c hast_compression.c hast_proto.c hastd.c hooks.c +SRCS+= lzf.c +SRCS+= metadata.c +SRCS+= nv.c +SRCS+= secondary.c +SRCS+= parse.y pjdlog.c primary.c +SRCS+= proto.c proto_common.c proto_socketpair.c proto_tcp.c proto_uds.c +SRCS+= rangelock.c +SRCS+= subr.c +SRCS+= token.l +SRCS+= y.tab.h +MAN= hastd.8 hast.conf.5 + +NO_WFORMAT= +NO_WCAST_ALIGN= +NO_WMISSING_VARIABLE_DECLARATIONS= +CFLAGS+=-I${.CURDIR} +CFLAGS+=-DHAVE_CAPSICUM +CFLAGS+=-DPROTO_TCP_DEFAULT_PORT=8457 +CFLAGS+=-DINET +.if ${MK_INET6_SUPPORT} != "no" +CFLAGS+=-DINET6 +.endif + +DPADD= ${LIBGEOM} ${LIBBSDXML} ${LIBSBUF} ${LIBL} ${LIBPTHREAD} ${LIBUTIL} +LDADD= -lgeom -lbsdxml -lsbuf -lpthread -lutil +.if ${MK_OPENSSL} != "no" +DPADD+= ${LIBCRYPTO} +LDADD+= -lcrypto +CFLAGS+=-DHAVE_CRYPTO +.endif + +YFLAGS+=-v + +CLEANFILES=y.tab.c y.tab.h y.output + +.include <bsd.prog.mk> diff --git a/sbin/hastd/activemap.c b/sbin/hastd/activemap.c new file mode 100644 index 0000000..64b95e3 --- /dev/null +++ b/sbin/hastd/activemap.c @@ -0,0 +1,701 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
#ifndef PJDLOG_ASSERT
#include <assert.h>
#define	PJDLOG_ASSERT(...)	assert(__VA_ARGS__)
#endif

#define	ACTIVEMAP_MAGIC	0xac71e4

/*
 * Book-keeping for one activemap: the media is divided into fixed-size
 * extents and each extent is tracked in three bitmaps (in-memory dirty,
 * on-disk dirty, to-be-synchronized) plus a per-extent counter of pending
 * writes.
 */
struct activemap {
	int		 am_magic;	/* ACTIVEMAP_MAGIC while usable. */
	off_t		 am_mediasize;	/* Size of the media, in bytes. */
	uint32_t	 am_extentsize;	/* Size of one extent, in bytes;
					   always a power of 2. */
	uint8_t		 am_extentshift;/* 2 ^ am_extentshift ==
					   am_extentsize */
	int		 am_nextents;	/* How many extents cover the media. */
	size_t		 am_mapsize;	/* Size of one bitmap, in bytes. */
	uint16_t	*am_memtab;	/* Per-extent count of pending
					   writes. */
	bitstr_t	*am_diskmap;	/* Dirty extents as stored on disk. */
	bitstr_t	*am_memmap;	/* Dirty extents as known in memory. */
	size_t		 am_diskmapsize;/* am_mapsize rounded up to the
					   sector size. */
	uint64_t	 am_ndirty;	/* How many extents are dirty. */
	bitstr_t	*am_syncmap;	/* Extents still to be synchronized. */
	off_t		 am_syncoff;	/* Next synchronization offset; -1 and
					   -2 are used as markers by the
					   synchronization functions. */
	TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* Extents kept dirty
					   on purpose to reduce the number of
					   bitmap updates. */
	int		 am_nkeepdirty;	/* Current length of am_keepdirty. */
	int		 am_nkeepdirty_limit; /* Maximum length of
					   am_keepdirty. */
};

/*
 * One element of the am_keepdirty list: the number of an extent that is
 * being kept dirty.
 */
struct keepdirty {
	int			kd_extent;
	TAILQ_ENTRY(keepdirty)	kd_next;
};

/*
 * Count the bits set in x.  Used to derive the extent shift from
 * (extentsize - 1), which works because the extent size is a power of 2.
 */
static uint32_t
bitcount32(uint32_t x)
{
	uint32_t pairs, nibbles, sum;

	/* Sum neighbouring bits, then neighbouring 2-bit fields. */
	pairs = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1);
	nibbles = (pairs & 0x33333333) + ((pairs & 0xcccccccc) >> 2);
	/* Fold nibbles into bytes, then bytes into the lowest byte. */
	sum = (nibbles + (nibbles >> 4)) & 0x0f0f0f0f;
	sum += sum >> 8;
	sum += sum >> 16;
	return (sum & 0x000000ff);
}

/*
 * Translate a byte offset on the media into the number of the extent that
 * contains it.  The offset must lie within the media.
 */
static __inline int
off2ext(const struct activemap *amp, off_t offset)
{
	int idx;

	PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize);
	idx = (offset >> amp->am_extentshift);
	PJDLOG_ASSERT(idx >= 0 && idx < amp->am_nextents);
	return (idx);
}

/*
 * Translate an extent number into the byte offset at which that extent
 * starts.  The extent number must be valid for this map.
 */
static __inline off_t
ext2off(const struct activemap *amp, int extent)
{
	off_t start;

	PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents);
	start = ((off_t)extent << amp->am_extentshift);
	PJDLOG_ASSERT(start >= 0 && start < amp->am_mediasize);
	return (start);
}
/*
 * Function calculates number of requests needed to synchronize the given
 * extent.  Every extent but the last one is am_extentsize bytes long; the
 * last one may be shorter when the media size is not a multiple of the
 * extent size.  One request covers at most MAXPHYS bytes.
 */
static __inline int
ext2reqs(const struct activemap *amp, int ext)
{
	off_t left;

	if (ext < amp->am_nextents - 1)
		return (((amp->am_extentsize - 1) / MAXPHYS) + 1);

	PJDLOG_ASSERT(ext == amp->am_nextents - 1);
	left = amp->am_mediasize % amp->am_extentsize;
	if (left == 0)
		left = amp->am_extentsize;
	return (((left - 1) / MAXPHYS) + 1);
}

/*
 * Initialize activemap structure and allocate memory for internal needs.
 *
 * ampp       - where the pointer to the new map is stored on success.
 * mediasize  - size of the media in bytes, must be greater than 0.
 * extentsize - size of one extent in bytes, must be a power of 2.
 * sectorsize - sector size in bytes, must be a power of 2; the on-disk
 *              bitmap size is rounded up to it.
 * keepdirty  - maximum number of extents kept dirty, must be greater
 *              than 0.
 *
 * Function returns 0 on success and -1 if any of the allocations failed.
 * *ampp is not touched on failure.
 */
int
activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize,
    uint32_t sectorsize, uint32_t keepdirty)
{
	struct activemap *amp;

	PJDLOG_ASSERT(ampp != NULL);
	PJDLOG_ASSERT(mediasize > 0);
	PJDLOG_ASSERT(extentsize > 0);
	PJDLOG_ASSERT(powerof2(extentsize));
	PJDLOG_ASSERT(sectorsize > 0);
	PJDLOG_ASSERT(powerof2(sectorsize));
	PJDLOG_ASSERT(keepdirty > 0);

	amp = malloc(sizeof(*amp));
	if (amp == NULL)
		return (-1);

	amp->am_mediasize = mediasize;
	amp->am_nkeepdirty_limit = keepdirty;
	amp->am_extentsize = extentsize;
	/* extentsize is a power of 2, so popcount(extentsize - 1) is log2. */
	amp->am_extentshift = bitcount32(extentsize - 1);
	amp->am_nextents = ((mediasize - 1) / extentsize) + 1;
	amp->am_mapsize = sizeof(bitstr_t) * bitstr_size(amp->am_nextents);
	amp->am_diskmapsize = roundup2(amp->am_mapsize, sectorsize);
	amp->am_ndirty = 0;
	/* -2 is the "nothing to synchronize" marker, see activemap_sync_rewind(). */
	amp->am_syncoff = -2;
	TAILQ_INIT(&amp->am_keepdirty);
	amp->am_nkeepdirty = 0;

	amp->am_memtab = calloc(amp->am_nextents, sizeof(amp->am_memtab[0]));
	amp->am_diskmap = calloc(1, amp->am_diskmapsize);
	amp->am_memmap = bit_alloc(amp->am_nextents);
	amp->am_syncmap = bit_alloc(amp->am_nextents);

	/*
	 * Check to see if any of the allocations above failed.
	 */
	if (amp->am_memtab == NULL || amp->am_diskmap == NULL ||
	    amp->am_memmap == NULL || amp->am_syncmap == NULL) {
		/* free(NULL) is a no-op, so no per-pointer checks are needed. */
		free(amp->am_memtab);
		free(amp->am_diskmap);
		free(amp->am_memmap);
		free(amp->am_syncmap);
		amp->am_magic = 0;
		free(amp);
		errno = ENOMEM;
		return (-1);
	}

	amp->am_magic = ACTIVEMAP_MAGIC;
	*ampp = amp;

	return (0);
}

/*
 * Look the given extent up on the keepdirty list.
 * Returns the list element or NULL if the extent is not on the list.
 */
static struct keepdirty *
keepdirty_find(struct activemap *amp, int extent)
{
	struct keepdirty *kd;

	TAILQ_FOREACH(kd, &amp->am_keepdirty, kd_next) {
		if (kd->kd_extent == extent)
			break;
	}
	/* TAILQ_FOREACH leaves kd == NULL when the loop runs to the end. */
	return (kd);
}

/*
 * Put the given extent at the head of the keepdirty list.
 *
 * Returns false if the extent was already on the list (it is only moved to
 * the head) and true if it was not, in which case the element at the tail
 * is evicted first when the list is full.
 */
static bool
keepdirty_add(struct activemap *amp, int extent)
{
	struct keepdirty *kd;

	kd = keepdirty_find(amp, extent);
	if (kd != NULL) {
		/*
		 * Only move element at the beginning.
		 */
		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
		TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
		return (false);
	}
	/*
	 * Add new element, but first remove the most unused one if
	 * we have too many.
	 */
	if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) {
		kd = TAILQ_LAST(&amp->am_keepdirty, skeepdirty);
		PJDLOG_ASSERT(kd != NULL);
		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
		amp->am_nkeepdirty--;
		/*
		 * The list may legitimately become empty here: a limit of 1
		 * is accepted by activemap_init().
		 */
		PJDLOG_ASSERT(amp->am_nkeepdirty >= 0);
	}
	/* Reuse the evicted element if there is one. */
	if (kd == NULL)
		kd = malloc(sizeof(*kd));
	/* We can ignore allocation failure. */
	if (kd != NULL) {
		kd->kd_extent = extent;
		amp->am_nkeepdirty++;
		TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
	}

	return (true);
}

/*
 * Mark every extent from the keepdirty list as dirty in the on-disk bitmap.
 */
static void
keepdirty_fill(struct activemap *amp)
{
	struct keepdirty *kd;

	TAILQ_FOREACH(kd, &amp->am_keepdirty, kd_next)
		bit_set(amp->am_diskmap, kd->kd_extent);
}

/*
 * Remove and free all elements of the keepdirty list.
 */
static void
keepdirty_free(struct activemap *amp)
{
	struct keepdirty *kd;

	while ((kd = TAILQ_FIRST(&amp->am_keepdirty)) != NULL) {
		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
		amp->am_nkeepdirty--;
		free(kd);
	}
	PJDLOG_ASSERT(amp->am_nkeepdirty == 0);
}

/*
 * Function frees resources allocated by activemap_init() function.
 *
 * NOTE(review): the structure itself, which activemap_init() obtains with
 * malloc(), is not released here.  Callers are not visible from this file,
 * so this is left as is - confirm whether they free it or whether it leaks.
 */
void
activemap_free(struct activemap *amp)
{

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);

	/* Invalidate the magic so that later use trips the assertions. */
	amp->am_magic = 0;

	keepdirty_free(amp);
	free(amp->am_memtab);
	free(amp->am_diskmap);
	free(amp->am_memmap);
	free(amp->am_syncmap);
}

/*
 * Function should be called before we handle write requests. It updates
 * internal structures and returns true if on-disk metadata should be updated.
 *
 * offset, length - the region about to be written; length must be greater
 *                  than 0 and the whole region must lie within the media.
 */
bool
activemap_write_start(struct activemap *amp, off_t offset, off_t length)
{
	bool modified;
	off_t end;
	int ext, lastext;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
	PJDLOG_ASSERT(length > 0);

	modified = false;
	end = offset + length - 1;

	ext = off2ext(amp, offset);
	lastext = off2ext(amp, end);
	for (; ext <= lastext; ext++) {
		/*
		 * If the number of pending writes is increased from 0,
		 * the extent becomes dirty in the in-memory bitmap.
		 */
		if (amp->am_memtab[ext]++ == 0) {
			PJDLOG_ASSERT(!bit_test(amp->am_memmap, ext));
			bit_set(amp->am_memmap, ext);
			amp->am_ndirty++;
		}
		/*
		 * An extent that was not on the keepdirty list yet means
		 * the on-disk bitmap has to change.  By returning true we
		 * inform the caller that on-disk bitmap was modified and
		 * has to be flushed to disk.
		 */
		if (keepdirty_add(amp, ext))
			modified = true;
	}

	return (modified);
}
/*
 * To be called once a write has been confirmed.  Drops one pending write
 * from every extent touched by the region and returns true when the
 * on-disk metadata has to be updated as a result.
 */
bool
activemap_write_complete(struct activemap *amp, off_t offset, off_t length)
{
	bool changed;
	int cur, first, last;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
	PJDLOG_ASSERT(length > 0);

	changed = false;
	first = off2ext(amp, offset);
	last = off2ext(amp, offset + length - 1);

	for (cur = first; cur <= last; cur++) {
		PJDLOG_ASSERT(amp->am_memtab[cur] > 0);
		PJDLOG_ASSERT(bit_test(amp->am_memmap, cur));
		amp->am_memtab[cur]--;
		if (amp->am_memtab[cur] != 0)
			continue;
		/*
		 * That was the last pending write: the extent is clean in
		 * memory now.  The on-disk bitmap only needs flushing when
		 * the extent is not being kept dirty on purpose.
		 */
		bit_clear(amp->am_memmap, cur);
		amp->am_ndirty--;
		if (keepdirty_find(amp, cur) == NULL)
			changed = true;
	}

	return (changed);
}

/*
 * To be called once synchronization of a whole extent has finished.
 * Subtracts the requests accounted for that extent and returns true when
 * the extent became clean, i.e. on-disk metadata should be updated.
 */
bool
activemap_extent_complete(struct activemap *amp, int extent)
{
	int nreqs;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
	PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents);

	nreqs = ext2reqs(amp, extent);
	PJDLOG_ASSERT(amp->am_memtab[extent] >= nreqs);
	amp->am_memtab[extent] -= nreqs;
	PJDLOG_ASSERT(bit_test(amp->am_memmap, extent));
	if (amp->am_memtab[extent] != 0) {
		/* Other writes are still pending on this extent. */
		return (false);
	}

	bit_clear(amp->am_memmap, extent);
	amp->am_ndirty--;
	return (true);
}

/*
 * Report how many extents are currently dirty.
 */
uint64_t
activemap_ndirty(const struct activemap *amp)
{

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);

	return (amp->am_ndirty);
}
/*
 * Tell whether the on-disk bitmap and the in-memory bitmap differ, which
 * means the bitmap should be flushed to the disk.
 */
bool
activemap_differ(const struct activemap *amp)
{
	int cmp;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);

	cmp = memcmp(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
	return (cmp != 0);
}

/*
 * Size of the bitmap, in bytes.
 */
size_t
activemap_size(const struct activemap *amp)
{

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);

	return (amp->am_mapsize);
}

/*
 * Number of bytes the bitmap occupies on disk: activemap_size() rounded up
 * to the sector size.
 */
size_t
activemap_ondisk_size(const struct activemap *amp)
{

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);

	return (amp->am_diskmapsize);
}

/*
 * Load a bitmap that was read from disk into all three internal bitmaps.
 * The buffer has to hold at least activemap_size() bytes.
 */
void
activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size)
{
	int cur;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
	PJDLOG_ASSERT(size >= amp->am_mapsize);

	memcpy(amp->am_diskmap, buf, amp->am_mapsize);
	memcpy(amp->am_memmap, buf, amp->am_mapsize);
	memcpy(amp->am_syncmap, buf, amp->am_mapsize);

	bit_ffs(amp->am_memmap, amp->am_nextents, &cur);
	if (cur == -1) {
		/* Nothing is dirty, so there is nothing more to set up. */
		return;
	}

	/* Start synchronization from the first dirty extent. */
	activemap_sync_rewind(amp);

	/*
	 * Dirty extents have to stay dirty until they are synchronized, so
	 * each of them is given as many pending writes as it takes requests
	 * to synchronize it.  The dirty counter is recomputed on the way.
	 */
	amp->am_ndirty = 0;
	for (; cur < amp->am_nextents; cur++) {
		if (!bit_test(amp->am_memmap, cur))
			continue;
		amp->am_memtab[cur] = ext2reqs(amp, cur);
		amp->am_ndirty++;
	}
}
/*
 * Merge a bitmap received from elsewhere into the existing one: every
 * extent that is dirty in the given bitmap and not yet scheduled for
 * synchronization locally becomes dirty and scheduled here as well.
 * The buffer has to hold at least activemap_size() bytes.
 */
void
activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size)
{
	bitstr_t *remote = __DECONST(bitstr_t *, buf);
	int cur;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
	PJDLOG_ASSERT(size >= amp->am_mapsize);

	bit_ffs(remote, amp->am_nextents, &cur);
	if (cur == -1) {
		/* The given bitmap has no dirty extents at all. */
		return;
	}

	/*
	 * Dirty extents have to stay dirty until they are synchronized, so
	 * each newly merged extent is given as many pending writes as it
	 * takes requests to synchronize it.
	 */
	for (; cur < amp->am_nextents; cur++) {
		/* Skip extents already scheduled locally or clean remotely. */
		if (bit_test(amp->am_syncmap, cur) || !bit_test(remote, cur))
			continue;
		bit_set(amp->am_syncmap, cur);
		bit_set(amp->am_memmap, cur);
		bit_set(amp->am_diskmap, cur);
		if (amp->am_memtab[cur] == 0)
			amp->am_ndirty++;
		amp->am_memtab[cur] = ext2reqs(amp, cur);
	}

	/* Start synchronization from the first dirty extent. */
	activemap_sync_rewind(amp);
}

/*
 * Build the bitmap that should be written to disk and return a pointer to
 * it.  The on-disk bitmap is the in-memory one plus all the extents from
 * the keepdirty list.  When sizep is not NULL it receives the on-disk size.
 */
const unsigned char *
activemap_bitmap(struct activemap *amp, size_t *sizep)
{
	const unsigned char *ondisk;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);

	if (sizep != NULL)
		*sizep = amp->am_diskmapsize;
	memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
	keepdirty_fill(amp);
	ondisk = (const unsigned char *)amp->am_diskmap;
	return (ondisk);
}
/*
 * Compute how many bytes are needed to store the bitmap on disk for the
 * given media, extent and sector sizes, without creating a map.
 */
size_t
activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
    uint32_t sectorsize)
{
	uint64_t extents, bytes;

	PJDLOG_ASSERT(mediasize > 0);
	PJDLOG_ASSERT(extentsize > 0);
	PJDLOG_ASSERT(powerof2(extentsize));
	PJDLOG_ASSERT(sectorsize > 0);
	PJDLOG_ASSERT(powerof2(sectorsize));

	extents = ((mediasize - 1) / extentsize) + 1;
	bytes = sizeof(bitstr_t) * bitstr_size(extents);
	return (roundup2(bytes, sectorsize));
}

/*
 * Restart synchronization: the next activemap_sync_offset() call will begin
 * with the first extent that needs synchronizing, if there is any.
 */
void
activemap_sync_rewind(struct activemap *amp)
{
	int first;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);

	bit_ffs(amp->am_syncmap, amp->am_nextents, &first);
	/*
	 * -2 means there is nothing to synchronize, -1 means that
	 * synchronization should start from the beginning.
	 */
	amp->am_syncoff = (first == -1) ? -2 : -1;
}

/*
 * Hand out the next region to synchronize.
 *
 * Returns the offset of the region and stores its length (at most MAXPHYS)
 * in *lengthp, or returns -1 when there is nothing left to synchronize.
 * *syncextp is set to the number of the extent whose synchronization has
 * just been fully handed out, or to -1 if no extent was finished by this
 * call.
 */
off_t
activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp)
{
	off_t off, chunk, extstart;
	int ext;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
	PJDLOG_ASSERT(lengthp != NULL);
	PJDLOG_ASSERT(syncextp != NULL);

	*syncextp = -1;

	if (amp->am_syncoff == -2)
		return (-1);

	if (amp->am_syncoff >= 0 &&
	    (amp->am_syncoff + MAXPHYS >= amp->am_mediasize ||
	    off2ext(amp, amp->am_syncoff) !=
	    off2ext(amp, amp->am_syncoff + MAXPHYS))) {
		/*
		 * The next step would leave the current extent (or the
		 * media), so the current extent is done: take it off the
		 * synchronization map and report it to the caller.
		 */
		ext = off2ext(amp, amp->am_syncoff);
		bit_clear(amp->am_syncmap, ext);
		*syncextp = ext;
		amp->am_syncoff = -1;
	}

	if (amp->am_syncoff == -1) {
		/* Look for the first extent that still needs synchronizing. */
		bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
		if (ext == -1) {
			amp->am_syncoff = -2;
			return (-1);
		}
		amp->am_syncoff = ext2off(amp, ext);
	} else {
		/* Still within the same extent, simply advance. */
		amp->am_syncoff += MAXPHYS;
		if (amp->am_syncoff >= amp->am_mediasize) {
			amp->am_syncoff = -2;
			return (-1);
		}
	}

	off = amp->am_syncoff;
	/* Bytes from the current offset to the end of its extent... */
	extstart = ext2off(amp, off2ext(amp, off));
	chunk = extstart + amp->am_extentsize - off;
	/* ...limited by the end of the media and by MAXPHYS. */
	if (off + chunk > amp->am_mediasize)
		chunk = amp->am_mediasize - off;
	if (chunk > MAXPHYS)
		chunk = MAXPHYS;

	PJDLOG_ASSERT(chunk >= 0 && chunk <= MAXPHYS);
	PJDLOG_ASSERT(off >= 0 && off < amp->am_mediasize);
	PJDLOG_ASSERT(off + chunk >= 0 &&
	    off + chunk <= amp->am_mediasize);

	*lengthp = chunk;
	return (off);
}

/*
 * Schedule the extent(s) containing the given region for synchronization.
 * Returns true if at least one extent was newly scheduled.
 */
bool
activemap_need_sync(struct activemap *amp, off_t offset, off_t length)
{
	bool changed;
	int cur, first, last;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);

	changed = false;
	first = off2ext(amp, offset);
	last = off2ext(amp, offset + length - 1);

	for (cur = first; cur <= last; cur++) {
		if (bit_test(amp->am_syncmap, cur)) {
			/* This one is scheduled already. */
			PJDLOG_ASSERT(bit_test(amp->am_memmap, cur));
			continue;
		}
		bit_set(amp->am_syncmap, cur);
		if (!bit_test(amp->am_memmap, cur)) {
			bit_set(amp->am_memmap, cur);
			amp->am_ndirty++;
		}
		/* Account for the requests synchronization will take. */
		amp->am_memtab[cur] += ext2reqs(amp, cur);
		changed = true;
	}

	return (changed);
}

/*
 * Print the in-memory (M), on-disk (D) and synchronization (S) bitmaps to
 * standard output, one line each, one digit per extent.
 */
void
activemap_dump(const struct activemap *amp)
{
	int ext;

	printf("M: ");
	for (ext = 0; ext < amp->am_nextents; ext++) {
		printf("%d", bit_test(amp->am_memmap, ext) ? 1 : 0);
	}
	printf("\n");

	printf("D: ");
	for (ext = 0; ext < amp->am_nextents; ext++) {
		printf("%d", bit_test(amp->am_diskmap, ext) ? 1 : 0);
	}
	printf("\n");

	printf("S: ");
	for (ext = 0; ext < amp->am_nextents; ext++) {
		printf("%d", bit_test(amp->am_syncmap, ext) ? 1 : 0);
	}
	printf("\n");
}
#ifndef	_ACTIVEMAP_H_
#define	_ACTIVEMAP_H_

/*
 * The prototypes below use off_t and size_t, so pull their definitions in
 * here instead of relying on the including file to have done it.
 */
#include <sys/types.h>

#include <stdbool.h>
#include <stdint.h>

/* Opaque to the users; the layout is private to activemap.c. */
struct activemap;

/* Creation and destruction.  activemap_init() returns 0 or -1. */
int activemap_init(struct activemap **ampp, uint64_t mediasize,
    uint32_t extentsize, uint32_t sectorsize, uint32_t keepdirty);
void activemap_free(struct activemap *amp);

/*
 * Write tracking.  The functions returning bool return true when the
 * on-disk metadata should be updated.
 */
bool activemap_write_start(struct activemap *amp, off_t offset, off_t length);
bool activemap_write_complete(struct activemap *amp, off_t offset,
    off_t length);
bool activemap_extent_complete(struct activemap *amp, int extent);
uint64_t activemap_ndirty(const struct activemap *amp);

/* Access to the bitmap: comparing, sizing, loading, merging, exporting. */
bool activemap_differ(const struct activemap *amp);
size_t activemap_size(const struct activemap *amp);
size_t activemap_ondisk_size(const struct activemap *amp);
void activemap_copyin(struct activemap *amp, const unsigned char *buf,
    size_t size);
void activemap_merge(struct activemap *amp, const unsigned char *buf,
    size_t size);
const unsigned char *activemap_bitmap(struct activemap *amp, size_t *sizep);

/* On-disk bitmap size for the given geometry; needs no activemap. */
size_t activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
    uint32_t sectorsize);

/* Synchronization.  activemap_sync_offset() returns -1 when done. */
void activemap_sync_rewind(struct activemap *amp);
off_t activemap_sync_offset(struct activemap *amp, off_t *lengthp,
    int *syncextp);
bool activemap_need_sync(struct activemap *amp, off_t offset, off_t length);

/* Debugging aid: prints the bitmaps to standard output. */
void activemap_dump(const struct activemap *amp);

#endif	/* !_ACTIVEMAP_H_ */
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/wait.h> + +#include <errno.h> +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include "hast.h" +#include "hastd.h" +#include "hast_checksum.h" +#include "hast_compression.h" +#include "hast_proto.h" +#include "hooks.h" +#include "nv.h" +#include "pjdlog.h" +#include "proto.h" +#include "subr.h" + +#include "control.h" + +void +child_cleanup(struct hast_resource *res) +{ + + proto_close(res->hr_ctrl); + res->hr_ctrl = NULL; + if (res->hr_event != NULL) { + proto_close(res->hr_event); + res->hr_event = NULL; + } + if (res->hr_conn != NULL) { + proto_close(res->hr_conn); + res->hr_conn = NULL; + } + res->hr_workerpid = 0; +} + +static void +control_set_role_common(struct hastd_config *cfg, struct nv *nvout, + uint8_t role, struct hast_resource *res, const char *name, unsigned int no) +{ + int oldrole; + + /* Name is always needed. */ + if (name != NULL) + nv_add_string(nvout, name, "resource%u", no); + + if (res == NULL) { + PJDLOG_ASSERT(cfg != NULL); + PJDLOG_ASSERT(name != NULL); + + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + if (strcmp(res->hr_name, name) == 0) + break; + } + if (res == NULL) { + nv_add_int16(nvout, EHAST_NOENTRY, "error%u", no); + return; + } + } + PJDLOG_ASSERT(res != NULL); + + /* Send previous role back. */ + nv_add_string(nvout, role2str(res->hr_role), "role%u", no); + + /* Nothing changed, return here. */ + if (role == res->hr_role) + return; + + pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); + pjdlog_info("Role changed to %s.", role2str(role)); + + /* Change role to the new one. */ + oldrole = res->hr_role; + res->hr_role = role; + pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); + + /* + * If previous role was primary or secondary we have to kill process + * doing that work. 
+ */ + if (res->hr_workerpid != 0) { + if (kill(res->hr_workerpid, SIGTERM) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to kill worker process %u", + (unsigned int)res->hr_workerpid); + } else if (waitpid(res->hr_workerpid, NULL, 0) != + res->hr_workerpid) { + pjdlog_errno(LOG_WARNING, + "Error while waiting for worker process %u", + (unsigned int)res->hr_workerpid); + } else { + pjdlog_debug(1, "Worker process %u stopped.", + (unsigned int)res->hr_workerpid); + } + child_cleanup(res); + } + + /* Start worker process if we are changing to primary. */ + if (role == HAST_ROLE_PRIMARY) + hastd_primary(res); + pjdlog_prefix_set("%s", ""); + hook_exec(res->hr_exec, "role", res->hr_name, role2str(oldrole), + role2str(res->hr_role), NULL); +} + +void +control_set_role(struct hast_resource *res, uint8_t role) +{ + + control_set_role_common(NULL, NULL, role, res, NULL, 0); +} + +static void +control_status_worker(struct hast_resource *res, struct nv *nvout, + unsigned int no) +{ + struct nv *cnvin, *cnvout; + const char *str; + int error; + + cnvin = NULL; + + /* + * Prepare and send command to worker process. + */ + cnvout = nv_alloc(); + nv_add_uint8(cnvout, CONTROL_STATUS, "cmd"); + error = nv_error(cnvout); + if (error != 0) { + pjdlog_common(LOG_ERR, 0, error, + "Unable to prepare control header"); + goto end; + } + if (hast_proto_send(res, res->hr_ctrl, cnvout, NULL, 0) == -1) { + error = errno; + pjdlog_errno(LOG_ERR, "Unable to send control header"); + goto end; + } + + /* + * Receive response. 
+ */ + if (hast_proto_recv_hdr(res->hr_ctrl, &cnvin) == -1) { + error = errno; + pjdlog_errno(LOG_ERR, "Unable to receive control header"); + goto end; + } + + error = nv_get_int16(cnvin, "error"); + if (error != 0) + goto end; + + if ((str = nv_get_string(cnvin, "status")) == NULL) { + error = ENOENT; + pjdlog_errno(LOG_ERR, "Field 'status' is missing."); + goto end; + } + nv_add_string(nvout, str, "status%u", no); + nv_add_uint64(nvout, nv_get_uint64(cnvin, "dirty"), "dirty%u", no); + nv_add_uint32(nvout, nv_get_uint32(cnvin, "extentsize"), + "extentsize%u", no); + nv_add_uint32(nvout, nv_get_uint32(cnvin, "keepdirty"), + "keepdirty%u", no); + nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_read"), + "stat_read%u", no); + nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_write"), + "stat_write%u", no); + nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_delete"), + "stat_delete%u", no); + nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_flush"), + "stat_flush%u", no); + nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_activemap_update"), + "stat_activemap_update%u", no); + nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_read_error"), + "stat_read_error%u", no); + nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_write_error"), + "stat_write_error%u", no); + nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_delete_error"), + "stat_delete_error%u", no); + nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_flush_error"), + "stat_flush_error%u", no); +end: + if (cnvin != NULL) + nv_free(cnvin); + if (cnvout != NULL) + nv_free(cnvout); + if (error != 0) + nv_add_int16(nvout, error, "error"); +} + +static void +control_status(struct hastd_config *cfg, struct nv *nvout, + struct hast_resource *res, const char *name, unsigned int no) +{ + + PJDLOG_ASSERT(cfg != NULL); + PJDLOG_ASSERT(nvout != NULL); + PJDLOG_ASSERT(name != NULL); + + /* Name is always needed. 
*/ + nv_add_string(nvout, name, "resource%u", no); + + if (res == NULL) { + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + if (strcmp(res->hr_name, name) == 0) + break; + } + if (res == NULL) { + nv_add_int16(nvout, EHAST_NOENTRY, "error%u", no); + return; + } + } + PJDLOG_ASSERT(res != NULL); + nv_add_string(nvout, res->hr_provname, "provname%u", no); + nv_add_string(nvout, res->hr_localpath, "localpath%u", no); + nv_add_string(nvout, res->hr_remoteaddr, "remoteaddr%u", no); + if (res->hr_sourceaddr[0] != '\0') + nv_add_string(nvout, res->hr_sourceaddr, "sourceaddr%u", no); + switch (res->hr_replication) { + case HAST_REPLICATION_FULLSYNC: + nv_add_string(nvout, "fullsync", "replication%u", no); + break; + case HAST_REPLICATION_MEMSYNC: + nv_add_string(nvout, "memsync", "replication%u", no); + break; + case HAST_REPLICATION_ASYNC: + nv_add_string(nvout, "async", "replication%u", no); + break; + default: + nv_add_string(nvout, "unknown", "replication%u", no); + break; + } + nv_add_string(nvout, checksum_name(res->hr_checksum), + "checksum%u", no); + nv_add_string(nvout, compression_name(res->hr_compression), + "compression%u", no); + nv_add_string(nvout, role2str(res->hr_role), "role%u", no); + nv_add_int32(nvout, res->hr_workerpid, "workerpid%u", no); + + switch (res->hr_role) { + case HAST_ROLE_PRIMARY: + PJDLOG_ASSERT(res->hr_workerpid != 0); + /* FALLTHROUGH */ + case HAST_ROLE_SECONDARY: + if (res->hr_workerpid != 0) + break; + /* FALLTHROUGH */ + default: + return; + } + + /* + * If we are here, it means that we have a worker process, which we + * want to ask some questions. 
+ */ + control_status_worker(res, nvout, no); +} + +void +control_handle(struct hastd_config *cfg) +{ + struct proto_conn *conn; + struct nv *nvin, *nvout; + unsigned int ii; + const char *str; + uint8_t cmd, role; + int error; + + if (proto_accept(cfg->hc_controlconn, &conn) == -1) { + pjdlog_errno(LOG_ERR, "Unable to accept control connection"); + return; + } + + cfg->hc_controlin = conn; + nvin = nvout = NULL; + role = HAST_ROLE_UNDEF; + + if (hast_proto_recv_hdr(conn, &nvin) == -1) { + pjdlog_errno(LOG_ERR, "Unable to receive control header"); + nvin = NULL; + goto close; + } + + /* Obtain command code. 0 means that nv_get_uint8() failed. */ + cmd = nv_get_uint8(nvin, "cmd"); + if (cmd == 0) { + pjdlog_error("Control header is missing 'cmd' field."); + goto close; + } + + /* Allocate outgoing nv structure. */ + nvout = nv_alloc(); + if (nvout == NULL) { + pjdlog_error("Unable to allocate header for control response."); + goto close; + } + + error = 0; + + str = nv_get_string(nvin, "resource0"); + if (str == NULL) { + pjdlog_error("Control header is missing 'resource0' field."); + error = EHAST_INVALID; + goto fail; + } + if (cmd == HASTCTL_CMD_SETROLE) { + role = nv_get_uint8(nvin, "role"); + switch (role) { + case HAST_ROLE_INIT: + case HAST_ROLE_PRIMARY: + case HAST_ROLE_SECONDARY: + break; + default: + pjdlog_error("Invalid role received (%hhu).", role); + error = EHAST_INVALID; + goto fail; + } + } + if (strcmp(str, "all") == 0) { + struct hast_resource *res; + + /* All configured resources. */ + + ii = 0; + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + switch (cmd) { + case HASTCTL_CMD_SETROLE: + control_set_role_common(cfg, nvout, role, res, + res->hr_name, ii++); + break; + case HASTCTL_CMD_STATUS: + control_status(cfg, nvout, res, res->hr_name, + ii++); + break; + default: + pjdlog_error("Invalid command received (%hhu).", + cmd); + error = EHAST_UNIMPLEMENTED; + goto fail; + } + } + } else { + /* Only selected resources. 
*/ + + for (ii = 0; ; ii++) { + str = nv_get_string(nvin, "resource%u", ii); + if (str == NULL) + break; + switch (cmd) { + case HASTCTL_CMD_SETROLE: + control_set_role_common(cfg, nvout, role, NULL, + str, ii); + break; + case HASTCTL_CMD_STATUS: + control_status(cfg, nvout, NULL, str, ii); + break; + default: + pjdlog_error("Invalid command received (%hhu).", + cmd); + error = EHAST_UNIMPLEMENTED; + goto fail; + } + } + } + if (nv_error(nvout) != 0) + goto close; +fail: + if (error != 0) + nv_add_int16(nvout, error, "error"); + + if (hast_proto_send(NULL, conn, nvout, NULL, 0) == -1) + pjdlog_errno(LOG_ERR, "Unable to send control response"); +close: + if (nvin != NULL) + nv_free(nvin); + if (nvout != NULL) + nv_free(nvout); + proto_close(conn); + cfg->hc_controlin = NULL; +} + +/* + * Thread handles control requests from the parent. + */ +void * +ctrl_thread(void *arg) +{ + struct hast_resource *res = arg; + struct nv *nvin, *nvout; + uint8_t cmd; + + for (;;) { + if (hast_proto_recv_hdr(res->hr_ctrl, &nvin) == -1) { + if (sigexit_received) + pthread_exit(NULL); + pjdlog_errno(LOG_ERR, + "Unable to receive control message"); + kill(getpid(), SIGTERM); + pthread_exit(NULL); + } + cmd = nv_get_uint8(nvin, "cmd"); + if (cmd == 0) { + pjdlog_error("Control message is missing 'cmd' field."); + nv_free(nvin); + continue; + } + nvout = nv_alloc(); + switch (cmd) { + case CONTROL_STATUS: + if (res->hr_remotein != NULL && + res->hr_remoteout != NULL) { + nv_add_string(nvout, "complete", "status"); + } else { + nv_add_string(nvout, "degraded", "status"); + } + nv_add_uint32(nvout, (uint32_t)res->hr_extentsize, + "extentsize"); + if (res->hr_role == HAST_ROLE_PRIMARY) { + nv_add_uint32(nvout, + (uint32_t)res->hr_keepdirty, "keepdirty"); + nv_add_uint64(nvout, + (uint64_t)(activemap_ndirty(res->hr_amp) * + res->hr_extentsize), "dirty"); + } else { + nv_add_uint32(nvout, (uint32_t)0, "keepdirty"); + nv_add_uint64(nvout, (uint64_t)0, "dirty"); + } + nv_add_uint64(nvout, 
res->hr_stat_read, "stat_read"); + nv_add_uint64(nvout, res->hr_stat_write, "stat_write"); + nv_add_uint64(nvout, res->hr_stat_delete, + "stat_delete"); + nv_add_uint64(nvout, res->hr_stat_flush, "stat_flush"); + nv_add_uint64(nvout, res->hr_stat_activemap_update, + "stat_activemap_update"); + nv_add_uint64(nvout, res->hr_stat_read_error, + "stat_read_error"); + nv_add_uint64(nvout, res->hr_stat_write_error + + res->hr_stat_activemap_write_error, + "stat_write_error"); + nv_add_uint64(nvout, res->hr_stat_delete_error, + "stat_delete_error"); + nv_add_uint64(nvout, res->hr_stat_flush_error + + res->hr_stat_activemap_flush_error, + "stat_flush_error"); + nv_add_int16(nvout, 0, "error"); + break; + case CONTROL_RELOAD: + /* + * When parent receives SIGHUP and discovers that + * something related to us has changes, it sends reload + * message to us. + */ + PJDLOG_ASSERT(res->hr_role == HAST_ROLE_PRIMARY); + primary_config_reload(res, nvin); + nv_add_int16(nvout, 0, "error"); + break; + default: + nv_add_int16(nvout, EINVAL, "error"); + break; + } + nv_free(nvin); + if (nv_error(nvout) != 0) { + pjdlog_error("Unable to create answer on control message."); + nv_free(nvout); + continue; + } + if (hast_proto_send(NULL, res->hr_ctrl, nvout, NULL, 0) == -1) { + pjdlog_errno(LOG_ERR, + "Unable to send reply to control message"); + } + nv_free(nvout); + } + /* NOTREACHED */ + return (NULL); +} diff --git a/sbin/hastd/control.h b/sbin/hastd/control.h new file mode 100644 index 0000000..0795c70 --- /dev/null +++ b/sbin/hastd/control.h @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _CONTROL_H_ +#define _CONTROL_H_ + +#define CONTROL_STATUS 10 +#define CONTROL_RELOAD 11 + +struct hastd_config; +struct hast_resource; + +void child_cleanup(struct hast_resource *res); + +void control_set_role(struct hast_resource *res, uint8_t role); + +void control_handle(struct hastd_config *cfg); + +void *ctrl_thread(void *arg); + +#endif /* !_CONTROL_H_ */ diff --git a/sbin/hastd/crc32.c b/sbin/hastd/crc32.c new file mode 100644 index 0000000..e8bc74a --- /dev/null +++ b/sbin/hastd/crc32.c @@ -0,0 +1,115 @@ +/*- + * COPYRIGHT (C) 1986 Gary S. Brown. You may use this program, or + * code or tables extracted from it, as desired without restriction. + */ + +/* + * First, the polynomial itself and its table of feedback terms. 
The + * polynomial is + * X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0 + * + * Note that we take it "backwards" and put the highest-order term in + * the lowest-order bit. The X^32 term is "implied"; the LSB is the + * X^31 term, etc. The X^0 term (usually shown as "+1") results in + * the MSB being 1. + * + * Note that the usual hardware shift register implementation, which + * is what we're using (we're merely optimizing it by doing eight-bit + * chunks at a time) shifts bits into the lowest-order term. In our + * implementation, that means shifting towards the right. Why do we + * do it this way? Because the calculated CRC must be transmitted in + * order from highest-order term to lowest-order term. UARTs transmit + * characters in order from LSB to MSB. By storing the CRC this way + * we hand it to the UART in the order low-byte to high-byte; the UART + * sends each low-bit to high-bit; and the result is transmission bit + * by bit from highest- to lowest-order term without requiring any bit + * shuffling on our part. Reception works similarly. + * + * The feedback terms table consists of 256, 32-bit entries. Notes: + * + * The table can be generated at runtime if desired; code to do so + * is shown later. It might not be obvious, but the feedback + * terms simply represent the results of eight shift/xor opera- + * tions for all combinations of data and CRC register values. + * + * The values must be right-shifted by eight bits by the "updcrc" + * logic; the shift must be unsigned (bring in zeroes). On some + * hardware you could probably optimize the shift in assembler by + * using byte-swap instructions. + * polynomial $edb88320 + * + * + * CRC32 code derived from work by Gary S. Brown.
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <stdint.h> + +#include <crc32.h> + +uint32_t crc32_tab[] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, + 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, + 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, + 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, + 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, + 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, + 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, + 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, + 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, + 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, + 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 
0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, + 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, + 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, + 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, + 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, + 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, + 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, + 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d +}; + +/* + * A function that calculates the CRC-32 based on the table above is + * given below for documentation purposes. An equivalent implementation + * of this function that's actually used in the kernel can be found + * in sys/libkern.h, where it can be inlined. + * + * uint32_t + * crc32(const void *buf, size_t size) + * { + * const uint8_t *p = buf; + * uint32_t crc; + * + * crc = ~0U; + * while (size--) + * crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); + * return crc ^ ~0U; + * } + */ diff --git a/sbin/hastd/crc32.h b/sbin/hastd/crc32.h new file mode 100644 index 0000000..3812a83 --- /dev/null +++ b/sbin/hastd/crc32.h @@ -0,0 +1,28 @@ +/*- + * COPYRIGHT (C) 1986 Gary S. Brown. 
You may use this program, or + * code or tables extracted from it, as desired without restriction. + * + * $FreeBSD$ + */ + +#ifndef _CRC32_H_ +#define _CRC32_H_ + +#include <stdint.h> /* uint32_t */ +#include <stdlib.h> /* size_t */ + +extern uint32_t crc32_tab[]; + +static __inline uint32_t +crc32(const void *buf, size_t size) +{ + const uint8_t *p = buf; + uint32_t crc; + + crc = ~0U; + while (size--) + crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); + return (crc ^ ~0U); +} + +#endif /* !_CRC32_H_ */ diff --git a/sbin/hastd/ebuf.c b/sbin/hastd/ebuf.c new file mode 100644 index 0000000..1ae2a26 --- /dev/null +++ b/sbin/hastd/ebuf.c @@ -0,0 +1,259 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> + +#include <errno.h> +#include <stdbool.h> +#include <stdint.h> +#include <strings.h> +#include <unistd.h> + +#include <pjdlog.h> + +#include "ebuf.h" + +#ifndef PJDLOG_ASSERT +#include <assert.h> +#define PJDLOG_ASSERT(...) assert(__VA_ARGS__) +#endif + +#define EBUF_MAGIC 0xeb0f41c +struct ebuf { + /* Magic to assert the caller uses valid structure. */ + int eb_magic; + /* Address where we did the allocation. */ + unsigned char *eb_start; + /* Allocation end address. */ + unsigned char *eb_end; + /* Start of real data. */ + unsigned char *eb_used; + /* Size of real data. */ + size_t eb_size; +}; + +static int ebuf_head_extend(struct ebuf *eb, size_t size); +static int ebuf_tail_extend(struct ebuf *eb, size_t size); + +struct ebuf * +ebuf_alloc(size_t size) +{ + struct ebuf *eb; + int rerrno; + + eb = malloc(sizeof(*eb)); + if (eb == NULL) + return (NULL); + size += PAGE_SIZE; + eb->eb_start = malloc(size); + if (eb->eb_start == NULL) { + rerrno = errno; + free(eb); + errno = rerrno; + return (NULL); + } + eb->eb_end = eb->eb_start + size; + /* + * We set start address for real data not at the first entry, because + * we want to be able to add data at the front. 
+ */ + eb->eb_used = eb->eb_start + PAGE_SIZE / 4; + eb->eb_size = 0; + eb->eb_magic = EBUF_MAGIC; + + return (eb); +} + +void +ebuf_free(struct ebuf *eb) +{ + + PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + eb->eb_magic = 0; + + free(eb->eb_start); + free(eb); +} + +int +ebuf_add_head(struct ebuf *eb, const void *data, size_t size) +{ + + PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + if (size > (size_t)(eb->eb_used - eb->eb_start)) { + /* + * We can't add more entries at the front, so we have to extend + * our buffer. + */ + if (ebuf_head_extend(eb, size) == -1) + return (-1); + } + PJDLOG_ASSERT(size <= (size_t)(eb->eb_used - eb->eb_start)); + + eb->eb_size += size; + eb->eb_used -= size; + /* + * If data is NULL the caller just wants to reserve place. + */ + if (data != NULL) + bcopy(data, eb->eb_used, size); + + return (0); +} + +int +ebuf_add_tail(struct ebuf *eb, const void *data, size_t size) +{ + + PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + if (size > (size_t)(eb->eb_end - (eb->eb_used + eb->eb_size))) { + /* + * We can't add more entries at the back, so we have to extend + * our buffer. + */ + if (ebuf_tail_extend(eb, size) == -1) + return (-1); + } + PJDLOG_ASSERT(size <= + (size_t)(eb->eb_end - (eb->eb_used + eb->eb_size))); + + /* + * If data is NULL the caller just wants to reserve space. + */ + if (data != NULL) + bcopy(data, eb->eb_used + eb->eb_size, size); + eb->eb_size += size; + + return (0); +} + +void +ebuf_del_head(struct ebuf *eb, size_t size) +{ + + PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC); + PJDLOG_ASSERT(size <= eb->eb_size); + + eb->eb_used += size; + eb->eb_size -= size; +} + +void +ebuf_del_tail(struct ebuf *eb, size_t size) +{ + + PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC); + PJDLOG_ASSERT(size <= eb->eb_size); + + eb->eb_size -= size; +} + +/* + * Return pointer to the data and data size. 
+ */ +void * +ebuf_data(struct ebuf *eb, size_t *sizep) +{ + + PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + if (sizep != NULL) + *sizep = eb->eb_size; + return (eb->eb_size > 0 ? eb->eb_used : NULL); +} + +/* + * Return data size. + */ +size_t +ebuf_size(struct ebuf *eb) +{ + + PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + return (eb->eb_size); +} + +/* + * Function adds size + (PAGE_SIZE / 4) bytes at the front of the buffer. + * The data is copied into a newly allocated buffer and the old one is + * freed. Returns 0 on success or -1 if malloc(3) fails, in which case + * the buffer is left unchanged. + */ +static int +ebuf_head_extend(struct ebuf *eb, size_t size) +{ + unsigned char *newstart, *newused; + size_t newsize; + + PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + newsize = eb->eb_end - eb->eb_start + (PAGE_SIZE / 4) + size; + + newstart = malloc(newsize); + if (newstart == NULL) + return (-1); + newused = + newstart + (PAGE_SIZE / 4) + size + (eb->eb_used - eb->eb_start); + + bcopy(eb->eb_used, newused, eb->eb_size); + + /* + * The data now lives in the new buffer; release the old allocation + * so that it is not leaked on every head extension. + */ + free(eb->eb_start); + + eb->eb_start = newstart; + eb->eb_used = newused; + eb->eb_end = newstart + newsize; + + return (0); +} + +/* + * Function adds size + ((3 * PAGE_SIZE) / 4) bytes at the back. + * Returns 0 on success or -1 if realloc(3) fails, in which case the + * buffer is left unchanged. + */ +static int +ebuf_tail_extend(struct ebuf *eb, size_t size) +{ + unsigned char *newstart; + size_t newsize, usedoff; + + PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + newsize = eb->eb_end - eb->eb_start + size + ((3 * PAGE_SIZE) / 4); + /* + * Remember the data offset now; after a successful realloc(3) the + * old pointers must not be used any more, not even for arithmetic. + */ + usedoff = eb->eb_used - eb->eb_start; + + newstart = realloc(eb->eb_start, newsize); + if (newstart == NULL) + return (-1); + + eb->eb_used = newstart + usedoff; + eb->eb_start = newstart; + eb->eb_end = newstart + newsize; + + return (0); +} diff --git a/sbin/hastd/ebuf.h b/sbin/hastd/ebuf.h new file mode 100644 index 0000000..06275e7 --- /dev/null +++ b/sbin/hastd/ebuf.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _EBUF_H_ +#define _EBUF_H_ + +#include <stdlib.h> /* size_t */ + +struct ebuf; + +struct ebuf *ebuf_alloc(size_t size); +void ebuf_free(struct ebuf *eb); + +int ebuf_add_head(struct ebuf *eb, const void *data, size_t size); +int ebuf_add_tail(struct ebuf *eb, const void *data, size_t size); + +void ebuf_del_head(struct ebuf *eb, size_t size); +void ebuf_del_tail(struct ebuf *eb, size_t size); + +void *ebuf_data(struct ebuf *eb, size_t *sizep); +size_t ebuf_size(struct ebuf *eb); + +#endif /* !_EBUF_H_ */ diff --git a/sbin/hastd/event.c b/sbin/hastd/event.c new file mode 100644 index 0000000..ef65df1 --- /dev/null +++ b/sbin/hastd/event.c @@ -0,0 +1,161 @@ +/*- + * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <errno.h> + +#include "hast.h" +#include "hast_proto.h" +#include "hooks.h" +#include "nv.h" +#include "pjdlog.h" +#include "proto.h" +#include "subr.h" + +#include "event.h" + +void +event_send(const struct hast_resource *res, int event) +{ + struct nv *nvin, *nvout; + int error; + + PJDLOG_ASSERT(res != NULL); + PJDLOG_ASSERT(event >= EVENT_MIN && event <= EVENT_MAX); + + nvin = nvout = NULL; + + /* + * Prepare and send event to parent process. + */ + nvout = nv_alloc(); + nv_add_uint8(nvout, (uint8_t)event, "event"); + error = nv_error(nvout); + if (error != 0) { + pjdlog_common(LOG_ERR, 0, error, + "Unable to prepare event header"); + goto done; + } + if (hast_proto_send(res, res->hr_event, nvout, NULL, 0) == -1) { + pjdlog_errno(LOG_ERR, "Unable to send event header"); + goto done; + } + if (hast_proto_recv_hdr(res->hr_event, &nvin) == -1) { + pjdlog_errno(LOG_ERR, "Unable to receive event header"); + goto done; + } + /* + * Do nothing with the answer. We only wait for it to be sure not + * to exit too quickly after sending an event and exiting immediately. 
+ */ +done: + if (nvin != NULL) + nv_free(nvin); + if (nvout != NULL) + nv_free(nvout); +} + +int +event_recv(const struct hast_resource *res) +{ + struct nv *nvin, *nvout; + const char *evstr; + uint8_t event; + int error; + + PJDLOG_ASSERT(res != NULL); + + nvin = nvout = NULL; + + if (hast_proto_recv_hdr(res->hr_event, &nvin) == -1) { + /* + * First error log as debug. This is because worker process + * most likely exited. + */ + pjdlog_common(LOG_DEBUG, 1, errno, + "Unable to receive event header"); + goto fail; + } + + event = nv_get_uint8(nvin, "event"); + if (event == EVENT_NONE) { + pjdlog_error("Event header is missing 'event' field."); + goto fail; + } + + switch (event) { + case EVENT_CONNECT: + evstr = "connect"; + break; + case EVENT_DISCONNECT: + evstr = "disconnect"; + break; + case EVENT_SYNCSTART: + evstr = "syncstart"; + break; + case EVENT_SYNCDONE: + evstr = "syncdone"; + break; + case EVENT_SYNCINTR: + evstr = "syncintr"; + break; + case EVENT_SPLITBRAIN: + evstr = "split-brain"; + break; + default: + pjdlog_error("Event header contain invalid event number (%hhu).", + event); + goto fail; + } + + pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); + hook_exec(res->hr_exec, evstr, res->hr_name, NULL); + pjdlog_prefix_set("%s", ""); + + nvout = nv_alloc(); + nv_add_int16(nvout, 0, "error"); + error = nv_error(nvout); + if (error != 0) { + pjdlog_common(LOG_ERR, 0, error, + "Unable to prepare event header"); + goto fail; + } + if (hast_proto_send(res, res->hr_event, nvout, NULL, 0) == -1) { + pjdlog_errno(LOG_ERR, "Unable to send event header"); + goto fail; + } + nv_free(nvin); + nv_free(nvout); + return (0); +fail: + if (nvin != NULL) + nv_free(nvin); + if (nvout != NULL) + nv_free(nvout); + return (-1); +} diff --git a/sbin/hastd/event.h b/sbin/hastd/event.h new file mode 100644 index 0000000..1614bf1 --- /dev/null +++ b/sbin/hastd/event.h @@ -0,0 +1,46 @@ +/*- + * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _EVENT_H_ +#define _EVENT_H_ + +#define EVENT_NONE 0 +#define EVENT_CONNECT 1 +#define EVENT_DISCONNECT 2 +#define EVENT_SYNCSTART 3 +#define EVENT_SYNCDONE 4 +#define EVENT_SYNCINTR 5 +#define EVENT_SPLITBRAIN 6 + +#define EVENT_MIN EVENT_CONNECT +#define EVENT_MAX EVENT_SPLITBRAIN + +void event_send(const struct hast_resource *res, int event); +int event_recv(const struct hast_resource *res); + +#endif /* !_EVENT_H_ */ diff --git a/sbin/hastd/hast.conf.5 b/sbin/hastd/hast.conf.5 new file mode 100644 index 0000000..3d921e4 --- /dev/null +++ b/sbin/hastd/hast.conf.5 @@ -0,0 +1,449 @@ +.\" Copyright (c) 2010 The FreeBSD Foundation +.\" Copyright (c) 2010-2012 Pawel Jakub Dawidek <pawel@dawidek.net> +.\" All rights reserved. +.\" +.\" This documentation was written by Pawel Jakub Dawidek under sponsorship from +.\" the FreeBSD Foundation. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd January 25, 2012 +.Dt HAST.CONF 5 +.Os +.Sh NAME +.Nm hast.conf +.Nd configuration file for the +.Xr hastd 8 +daemon and the +.Xr hastctl 8 +utility +.Sh DESCRIPTION +The +.Nm +file is used by both the +.Xr hastd 8 +daemon +and the +.Xr hastctl 8 +control utility. +The configuration file is designed so that exactly the same file can be +(and should be) used on both HAST nodes. +Every line starting with # is treated as a comment and ignored.
+.Sh CONFIGURATION FILE SYNTAX +The general syntax of the +.Nm +file is as follows: +.Bd -literal -offset indent +# Global section +control <addr> +listen <addr> +replication <mode> +checksum <algorithm> +compression <algorithm> +timeout <seconds> +exec <path> +metaflush on | off +pidfile <path> + +on <node> { + # Node section + control <addr> + listen <addr> + pidfile <path> +} + +on <node> { + # Node section + control <addr> + listen <addr> + pidfile <path> +} + +resource <name> { + # Resource section + replication <mode> + checksum <algorithm> + compression <algorithm> + name <name> + local <path> + timeout <seconds> + exec <path> + metaflush on | off + + on <node> { + # Resource-node section + name <name> + # Required + local <path> + metaflush on | off + # Required + remote <addr> + source <addr> + } + on <node> { + # Resource-node section + name <name> + # Required + local <path> + metaflush on | off + # Required + remote <addr> + source <addr> + } +} +.Ed +.Pp +Most of the various available configuration parameters are optional. +If a parameter is not defined in a particular section, it will be +inherited from the parent section. +For example, if the +.Ic listen +parameter is not defined in the node section, it will be inherited from +the global section. +In case the global section does not define the +.Ic listen +parameter at all, the default value will be used. +.Sh CONFIGURATION FILE DESCRIPTION +The +.Aq node +argument can be replaced by a full hostname as obtained by +.Xr gethostname 3 , +by only the first part of the hostname, by the node's UUID as found in the +.Va kern.hostuuid +.Xr sysctl 8 +variable +or by the node's hostid as found in the +.Va kern.hostid +.Xr sysctl 8 +variable. +.Pp +The following statements are available: +.Bl -tag -width ".Ic xxxx" +.It Ic control Aq addr +.Pp +Address for communication with +.Xr hastctl 8 .
+Each of the following examples defines the same control address: +.Bd -literal -offset indent +uds:///var/run/hastctl +unix:///var/run/hastctl +/var/run/hastctl +.Ed +.Pp +The default value is +.Pa uds:///var/run/hastctl . +.It Ic pidfile Aq path +.Pp +File in which to store the process ID of the main +.Xr hastd 8 +process. +.Pp +The default value is +.Pa /var/run/hastd.pid . +.It Ic listen Aq addr +.Pp +Address to listen on, in the form: +.Bd -literal -offset indent +protocol://protocol-specific-address +.Ed +.Pp +Each of the following examples defines the same listen address: +.Bd -literal -offset indent +0.0.0.0 +0.0.0.0:8457 +tcp://0.0.0.0 +tcp://0.0.0.0:8457 +tcp4://0.0.0.0 +tcp4://0.0.0.0:8457 +.Ed +.Pp +Multiple listen addresses can be specified. +By default +.Nm hastd +listens on +.Pa tcp4://0.0.0.0:8457 +and +.Pa tcp6://[::]:8457 +if the kernel supports IPv4 and IPv6, respectively. +.It Ic replication Aq mode +.Pp +Replication mode should be one of the following: +.Bl -tag -width ".Ic xxxx" +.It Ic memsync +.Pp +Report the write operation as completed when the local write completes and +when the remote node acknowledges the data receipt, but before it +actually stores the data. +The data on the remote node will be stored directly after sending the +acknowledgement. +This mode is intended to reduce latency, but still provides very good +reliability. +The only situation where some small amount of data could be lost is when +the data is stored on the primary node and sent to the secondary. +The secondary node then acknowledges data receipt and the primary reports +success to an application. +However, it may happen that the secondary goes down before the received +data is really stored locally. +Before the secondary node returns, the primary node dies entirely. +When the secondary node comes back to life it becomes the new primary. +Unfortunately, some small amount of data which was confirmed to the +application as stored has been lost. +The risk of such a situation is very small.
+The +.Ic memsync +replication mode is the default. +.It Ic fullsync +.Pp +Mark the write operation as completed when local as well as remote +write completes. +This is the safest and the slowest replication mode. +.It Ic async +.Pp +The write operation is reported as complete right after the local write +completes. +This is the fastest and the most dangerous replication mode. +This mode should be used when replicating to a distant node where +latency is too high for other modes. +.El +.It Ic checksum Aq algorithm +.Pp +Checksum algorithm should be one of the following: +.Bl -tag -width ".Ic sha256" +.It Ic none +No checksum will be calculated for the data being sent over the network. +This is the default setting. +.It Ic crc32 +CRC32 checksum will be calculated. +.It Ic sha256 +SHA256 checksum will be calculated. +.El +.It Ic compression Aq algorithm +.Pp +Compression algorithm should be one of the following: +.Bl -tag -width ".Ic none" +.It Ic none +Data sent over the network will not be compressed. +.It Ic hole +Only blocks that contain all zeros will be compressed. +This is very useful for initial synchronization where potentially many blocks +are still all zeros. +There should be no measurable performance overhead when this algorithm is being +used. +This is the default setting. +.It Ic lzf +The LZF algorithm by Marc Alexander Lehmann will be used to compress the data +sent over the network. +LZF is a very fast, general purpose compression algorithm. +.El +.It Ic timeout Aq seconds +.Pp +Connection timeout in seconds. +The default value is +.Va 20 . +.It Ic exec Aq path +.Pp +Execute the given program on various HAST events. +Below is the list of currently implemented events and arguments the given +program is executed with: +.Bl -tag -width ".Ic xxxx" +.It Ic "<path> role <resource> <oldrole> <newrole>" +.Pp +Executed on both primary and secondary nodes when resource role is changed.
+.Pp +.It Ic "<path> connect <resource>" +.Pp +Executed on both primary and secondary nodes when connection for the given +resource between the nodes is established. +.Pp +.It Ic "<path> disconnect <resource>" +.Pp +Executed on both primary and secondary nodes when connection for the given +resource between the nodes is lost. +.Pp +.It Ic "<path> syncstart <resource>" +.Pp +Executed on primary node when synchronization process of secondary node is +started. +.Pp +.It Ic "<path> syncdone <resource>" +.Pp +Executed on primary node when synchronization process of secondary node is +completed successfully. +.Pp +.It Ic "<path> syncintr <resource>" +.Pp +Executed on primary node when synchronization process of secondary node is +interrupted, most likely due to secondary node outage or connection failure +between the nodes. +.Pp +.It Ic "<path> split-brain <resource>" +.Pp +Executed on both primary and secondary nodes when split-brain condition is +detected. +.Pp +.El +The +.Aq path +argument should contain full path to executable program. +If the given program exits with code different than +.Va 0 , +.Nm hastd +will log it as an error. +.Pp +The +.Aq resource +argument is resource name from the configuration file. +.Pp +The +.Aq oldrole +argument is previous resource role (before the change). +It can be one of: +.Ar init , +.Ar secondary , +.Ar primary . +.Pp +The +.Aq newrole +argument is current resource role (after the change). +It can be one of: +.Ar init , +.Ar secondary , +.Ar primary . +.Pp +.It Ic metaflush on | off +.Pp +When set to +.Va on , +flush write cache of the local provider after every metadata (activemap) update. +Flushing write cache ensures that provider will not reorder writes and that +metadata will be properly updated before real data is stored. +If the local provider does not support flushing write cache (it returns +.Er EOPNOTSUPP +on the +.Cm BIO_FLUSH +request), +.Nm hastd +will disable +.Ic metaflush +automatically. 
+The default value is +.Va on . +.Pp +.It Ic name Aq name +.Pp +GEOM provider name that will appear as +.Pa /dev/hast/<name> . +If name is not defined, resource name will be used as provider name. +.It Ic local Aq path +.Pp +Path to the local component which will be used as backend provider for +the resource. +This can be either a GEOM provider or a regular file. +.It Ic remote Aq addr +.Pp +Address of the remote +.Nm hastd +daemon. +Format is the same as for the +.Ic listen +statement. +When operating as a primary node this address will be used to connect to +the secondary node. +When operating as a secondary node only connections from this address +will be accepted. +.Pp +A special value of +.Va none +can be used when the remote address is not yet known (e.g., the other node is not +set up yet). +.It Ic source Aq addr +.Pp +Local address to bind to before connecting to the remote +.Nm hastd +daemon. +Format is the same as for the +.Ic listen +statement. +.El +.Sh FILES +.Bl -tag -width ".Pa /var/run/hastctl" -compact +.It Pa /etc/hast.conf +The default +.Xr hastctl 8 +and +.Xr hastd 8 +configuration file. +.It Pa /var/run/hastctl +Control socket used by the +.Xr hastctl 8 +control utility to communicate with the +.Xr hastd 8 +daemon.
+.El +.Sh EXAMPLES +The example configuration file can look as follows: +.Bd -literal -offset indent +listen tcp://0.0.0.0 + +on hasta { + listen tcp://2001:db8::1/64 +} +on hastb { + listen tcp://2001:db8::2/64 +} + +resource shared { + local /dev/da0 + + on hasta { + remote tcp://10.0.0.2 + } + on hastb { + remote tcp://10.0.0.1 + } +} +resource tank { + on hasta { + local /dev/mirror/tanka + source tcp://10.0.0.1 + remote tcp://10.0.0.2 + } + on hastb { + local /dev/mirror/tankb + source tcp://10.0.0.2 + remote tcp://10.0.0.1 + } +} +.Ed +.Sh SEE ALSO +.Xr gethostname 3 , +.Xr geom 4 , +.Xr hastctl 8 , +.Xr hastd 8 +.Sh AUTHORS +The +.Nm +was written by +.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +under sponsorship of the FreeBSD Foundation. diff --git a/sbin/hastd/hast.h b/sbin/hastd/hast.h new file mode 100644 index 0000000..65c24f8 --- /dev/null +++ b/sbin/hastd/hast.h @@ -0,0 +1,264 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HAST_H_ +#define _HAST_H_ + +#include <sys/queue.h> +#include <sys/socket.h> + +#include <arpa/inet.h> + +#include <netinet/in.h> + +#include <limits.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdint.h> + +#include <activemap.h> + +#include "proto.h" + +/* + * Version history: + * 0 - initial version + * 1 - HIO_KEEPALIVE added + * 2 - "memsync" and "received" attributes added for memsync mode + */ +#define HAST_PROTO_VERSION 2 + +#define EHAST_OK 0 +#define EHAST_NOENTRY 1 +#define EHAST_INVALID 2 +#define EHAST_NOMEMORY 3 +#define EHAST_UNIMPLEMENTED 4 + +#define HASTCTL_CMD_UNKNOWN 0 +#define HASTCTL_CMD_SETROLE 1 +#define HASTCTL_CMD_STATUS 2 + +#define HAST_ROLE_UNDEF 0 +#define HAST_ROLE_INIT 1 +#define HAST_ROLE_PRIMARY 2 +#define HAST_ROLE_SECONDARY 3 + +#define HAST_SYNCSRC_UNDEF 0 +#define HAST_SYNCSRC_PRIMARY 1 +#define HAST_SYNCSRC_SECONDARY 2 + +#define HIO_UNDEF 0 +#define HIO_READ 1 +#define HIO_WRITE 2 +#define HIO_DELETE 3 +#define HIO_FLUSH 4 +#define HIO_KEEPALIVE 5 + +#define HAST_USER "hast" +#define HAST_TIMEOUT 20 +#define HAST_CONFIG "/etc/hast.conf" +#define HAST_CONTROL "/var/run/hastctl" +#define HASTD_LISTEN_TCP4 "tcp4://0.0.0.0:8457" +#define HASTD_LISTEN_TCP6 "tcp6://[::]:8457" +#define HASTD_PIDFILE "/var/run/hastd.pid" + +/* Default extent size. */ +#define HAST_EXTENTSIZE 2097152 +/* Default maximum number of extents that are kept dirty. 
*/ +#define HAST_KEEPDIRTY 64 + +#define HAST_ADDRSIZE 1024 +#define HAST_TOKEN_SIZE 16 + +/* Number of seconds to sleep between reconnect retries or keepalive packets. */ +#define HAST_KEEPALIVE 10 + +struct hastd_listen { + /* Address to listen on. */ + char hl_addr[HAST_ADDRSIZE]; + /* Protocol-specific data. */ + struct proto_conn *hl_conn; + TAILQ_ENTRY(hastd_listen) hl_next; +}; + +struct hastd_config { + /* Address to communicate with hastctl(8). */ + char hc_controladdr[HAST_ADDRSIZE]; + /* Protocol-specific data. */ + struct proto_conn *hc_controlconn; + /* Incoming control connection. */ + struct proto_conn *hc_controlin; + /* PID file path. */ + char hc_pidfile[PATH_MAX]; + /* List of addresses to listen on. */ + TAILQ_HEAD(, hastd_listen) hc_listen; + /* List of resources. */ + TAILQ_HEAD(, hast_resource) hc_resources; +}; + +#define HAST_REPLICATION_FULLSYNC 0 +#define HAST_REPLICATION_MEMSYNC 1 +#define HAST_REPLICATION_ASYNC 2 + +#define HAST_COMPRESSION_NONE 0 +#define HAST_COMPRESSION_HOLE 1 +#define HAST_COMPRESSION_LZF 2 + +#define HAST_CHECKSUM_NONE 0 +#define HAST_CHECKSUM_CRC32 1 +#define HAST_CHECKSUM_SHA256 2 + +/* + * Structure that describes single resource. + */ +struct hast_resource { + /* Resource name. */ + char hr_name[NAME_MAX]; + /* Negotiated replication mode (HAST_REPLICATION_*). */ + int hr_replication; + /* Configured replication mode (HAST_REPLICATION_*). */ + int hr_original_replication; + /* Provider name that will appear in /dev/hast/. */ + char hr_provname[NAME_MAX]; + /* Synchronization extent size. */ + int hr_extentsize; + /* Maximum number of extents that are kept dirty. */ + int hr_keepdirty; + /* Path to a program to execute on various events. */ + char hr_exec[PATH_MAX]; + /* Compression algorithm. */ + int hr_compression; + /* Checksum algorithm. */ + int hr_checksum; + /* Protocol version. */ + int hr_version; + + /* Path to local component. 
*/ + char hr_localpath[PATH_MAX]; + /* Descriptor to access local component. */ + int hr_localfd; + /* Offset into local component. */ + off_t hr_localoff; + /* Size of usable space. */ + off_t hr_datasize; + /* Size of entire local provider. */ + off_t hr_local_mediasize; + /* Sector size of local provider. */ + unsigned int hr_local_sectorsize; + /* Is flushing write cache supported by the local provider? */ + bool hr_localflush; + /* Flush write cache on metadata updates? */ + int hr_metaflush; + + /* Descriptor for /dev/ggctl communication. */ + int hr_ggatefd; + /* Unit number for ggate communication. */ + int hr_ggateunit; + + /* Address of the remote component. */ + char hr_remoteaddr[HAST_ADDRSIZE]; + /* Local address to bind to for outgoing connections. */ + char hr_sourceaddr[HAST_ADDRSIZE]; + /* Connection for incoming data. */ + struct proto_conn *hr_remotein; + /* Connection for outgoing data. */ + struct proto_conn *hr_remoteout; + /* Token to verify both in and out connection are coming from + the same node (not necessarily from the same address). */ + unsigned char hr_token[HAST_TOKEN_SIZE]; + /* Connection timeout. */ + int hr_timeout; + + /* Resource unique identifier. */ + uint64_t hr_resuid; + /* Primary's local modification count. */ + uint64_t hr_primary_localcnt; + /* Primary's remote modification count. */ + uint64_t hr_primary_remotecnt; + /* Secondary's local modification count. */ + uint64_t hr_secondary_localcnt; + /* Secondary's remote modification count. */ + uint64_t hr_secondary_remotecnt; + /* Synchronization source. */ + uint8_t hr_syncsrc; + + /* Resource role: HAST_ROLE_{INIT,PRIMARY,SECONDARY}. */ + int hr_role; + /* Previous resource role: HAST_ROLE_{INIT,PRIMARY,SECONDARY}. */ + int hr_previous_role; + /* PID of child worker process. 0 - no child. */ + pid_t hr_workerpid; + /* Control commands from parent to child. */ + struct proto_conn *hr_ctrl; + /* Events from child to parent. 
*/ + struct proto_conn *hr_event; + /* Connection requests from child to parent. */ + struct proto_conn *hr_conn; + + /* Activemap structure. */ + struct activemap *hr_amp; + /* Lock used to synchronize access to hr_amp. */ + pthread_mutex_t hr_amp_lock; + /* Lock used to synchronize access to hr_amp diskmap. */ + pthread_mutex_t hr_amp_diskmap_lock; + + /* Number of BIO_READ requests. */ + uint64_t hr_stat_read; + /* Number of BIO_WRITE requests. */ + uint64_t hr_stat_write; + /* Number of BIO_DELETE requests. */ + uint64_t hr_stat_delete; + /* Number of BIO_FLUSH requests. */ + uint64_t hr_stat_flush; + /* Number of activemap updates. */ + uint64_t hr_stat_activemap_update; + /* Number of local read errors. */ + uint64_t hr_stat_read_error; + /* Number of local write errors. */ + uint64_t hr_stat_write_error; + /* Number of local delete errors. */ + uint64_t hr_stat_delete_error; + /* Number of flush errors. */ + uint64_t hr_stat_flush_error; + /* Number of activemap write errors. */ + uint64_t hr_stat_activemap_write_error; + /* Number of activemap flush errors. */ + uint64_t hr_stat_activemap_flush_error; + + /* Next resource. */ + TAILQ_ENTRY(hast_resource) hr_next; +}; + +struct hastd_config *yy_config_parse(const char *config, bool exitonerror); +void yy_config_free(struct hastd_config *config); + +#endif /* !_HAST_H_ */ diff --git a/sbin/hastd/hast_checksum.c b/sbin/hastd/hast_checksum.c new file mode 100644 index 0000000..795744e --- /dev/null +++ b/sbin/hastd/hast_checksum.c @@ -0,0 +1,160 @@ +/*- + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <errno.h>
+#include <string.h>
+#include <strings.h>
+
+#ifdef HAVE_CRYPTO
+#include <openssl/sha.h>
+#endif
+
+#include <crc32.h>
+#include <hast.h>
+#include <nv.h>
+#include <pjdlog.h>
+
+#include "hast_checksum.h"
+
+/* Size of the largest digest any compiled-in algorithm can produce. */
+#ifdef HAVE_CRYPTO
+#define	MAX_HASH_SIZE	SHA256_DIGEST_LENGTH
+#else
+#define	MAX_HASH_SIZE	4
+#endif
+
+/*
+ * Compute the CRC32 of the given buffer, copy the raw 32-bit result into
+ * 'hash' and report the number of bytes written through 'hsizep'.
+ */
+static void
+hast_crc32_checksum(const unsigned char *data, size_t size,
+    unsigned char *hash, size_t *hsizep)
+{
+	const uint32_t sum = crc32(data, size);
+
+	/* XXXPJD: Do we have to use htole32() on crc first? */
+	memcpy(hash, &sum, sizeof(sum));
+	*hsizep = sizeof(sum);
+}
+
+#ifdef HAVE_CRYPTO
+/*
+ * Compute the SHA256 digest of the given buffer into 'hash' and report
+ * SHA256_DIGEST_LENGTH through 'hsizep'.
+ */
+static void
+hast_sha256_checksum(const unsigned char *data, size_t size,
+    unsigned char *hash, size_t *hsizep)
+{
+	SHA256_CTX sha;
+
+	SHA256_Init(&sha);
+	SHA256_Update(&sha, data, size);
+	SHA256_Final(hash, &sha);
+	*hsizep = SHA256_DIGEST_LENGTH;
+}
+#endif	/* HAVE_CRYPTO */
+
+/*
+ * Map a HAST_CHECKSUM_* constant to the name used for it in the nv header.
+ * Values that are not recognized yield "unknown".
+ */
+const char *
+checksum_name(int num)
+{
+
+	if (num == HAST_CHECKSUM_NONE)
+		return ("none");
+	if (num == HAST_CHECKSUM_CRC32)
+		return ("crc32");
+	if (num == HAST_CHECKSUM_SHA256)
+		return ("sha256");
+	return ("unknown");
+}
+
+/*
+ * Send-side stage: compute the checksum configured for the resource over
+ * the outgoing data and record the algorithm name ("checksum") and the
+ * digest ("hash") in the nv header.  The data itself is left untouched.
+ *
+ * Returns 0 on success (including when no checksum is configured) and -1
+ * with errno set when the nv header could not be updated.
+ */
+int
+checksum_send(const struct hast_resource *res, struct nv *nv, void **datap,
+    size_t *sizep, bool *freedatap __unused)
+{
+	unsigned char digest[MAX_HASH_SIZE];
+	size_t digest_size;
+
+	switch (res->hr_checksum) {
+	case HAST_CHECKSUM_NONE:
+		return (0);
+	case HAST_CHECKSUM_CRC32:
+		hast_crc32_checksum(*datap, *sizep, digest, &digest_size);
+		break;
+#ifdef HAVE_CRYPTO
+	case HAST_CHECKSUM_SHA256:
+		hast_sha256_checksum(*datap, *sizep, digest, &digest_size);
+		break;
+#endif
+	default:
+		PJDLOG_ABORT("Invalid checksum: %d.", res->hr_checksum);
+	}
+
+	nv_add_string(nv, checksum_name(res->hr_checksum), "checksum");
+	nv_add_uint8_array(nv, digest, digest_size, "hash");
+	if (nv_error(nv) == 0)
+		return (0);
+
+	errno = nv_error(nv);
+	return (-1);
+}
+
+/*
+ * Receive-side stage: when the nv header names a checksum algorithm,
+ * recompute the digest over the received data and compare it with the
+ * "hash" entry carried in the header.
+ *
+ * Returns 0 when no checksum was sent or when the digests match, and -1
+ * when the hash is missing, the algorithm is not known, or the digest
+ * size or contents differ.
+ */
+int
+checksum_recv(const struct hast_resource *res __unused, struct nv *nv,
+    void **datap, size_t *sizep, bool *freedatap __unused)
+{
+	unsigned char computed[MAX_HASH_SIZE];
+	const unsigned char *received;
+	size_t computed_size, received_size;
+	const char *algo;
+
+	algo = nv_get_string(nv, "checksum");
+	if (algo == NULL) {
+		/* No checksum. */
+		return (0);
+	}
+
+	received = nv_get_uint8_array(nv, &received_size, "hash");
+	if (received == NULL) {
+		/* Hash not found. */
+		pjdlog_error("Hash is missing.");
+		return (-1);
+	}
+
+	if (strcmp(algo, "crc32") == 0) {
+		hast_crc32_checksum(*datap, *sizep, computed, &computed_size);
+#ifdef HAVE_CRYPTO
+	} else if (strcmp(algo, "sha256") == 0) {
+		hast_sha256_checksum(*datap, *sizep, computed, &computed_size);
+#endif
+	} else {
+		/* Unknown checksum algorithm. */
+		pjdlog_error("Unknown checksum algorithm '%s'.", algo);
+		return (-1);
+	}
+
+	if (received_size != computed_size) {
+		/* Different hash size. */
+		pjdlog_error("Invalid hash size (%zu) for %s, should be %zu.",
+		    received_size, algo, computed_size);
+		return (-1);
+	}
+	if (memcmp(received, computed, computed_size) != 0) {
+		/* Hash mismatch. */
+		pjdlog_error("Hash mismatch.");
+		return (-1);
+	}
+
+	return (0);
+}
diff --git a/sbin/hastd/hast_checksum.h b/sbin/hastd/hast_checksum.h
new file mode 100644
index 0000000..9799828
--- /dev/null
+++ b/sbin/hastd/hast_checksum.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HAST_CHECKSUM_H_ +#define _HAST_CHECKSUM_H_ + +#include <stdlib.h> /* size_t */ + +#include <hast.h> +#include <nv.h> + +const char *checksum_name(int num); + +int checksum_send(const struct hast_resource *res, struct nv *nv, + void **datap, size_t *sizep, bool *freedatap); +int checksum_recv(const struct hast_resource *res, struct nv *nv, + void **datap, size_t *sizep, bool *freedatap); + +#endif /* !_HAST_CHECKSUM_H_ */ diff --git a/sbin/hastd/hast_compression.c b/sbin/hastd/hast_compression.c new file mode 100644 index 0000000..f524eb1 --- /dev/null +++ b/sbin/hastd/hast_compression.c @@ -0,0 +1,283 @@ +/*- + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/endian.h>
+
+#include <errno.h>
+#include <string.h>
+#include <strings.h>
+
+#include <hast.h>
+#include <lzf.h>
+#include <nv.h>
+#include <pjdlog.h>
+
+#include "hast_compression.h"
+
+/*
+ * Return true when the buffer consists solely of zero bytes.
+ * 'size' must be a multiple of 8 (asserted below).  The buffer is read
+ * through a uint64_t pointer, so it is presumably expected to be suitably
+ * aligned -- NOTE(review): confirm against the callers.
+ * An empty buffer is reported as all zeros without touching memory.
+ */
+static bool
+allzeros(const void *data, size_t size)
+{
+	const uint64_t *p = data;
+	size_t i;
+	uint64_t v;
+
+	PJDLOG_ASSERT((size % sizeof(*p)) == 0);
+
+	/*
+	 * This is the fastest method I found for checking if the given
+	 * buffer contain all zeros.
+	 * Because inside the loop we don't check at every step, we would
+	 * get an answer only after walking through entire buffer.
+	 * To return early if the buffer doesn't contain all zeros, we probe
+	 * 8 bytes at the beginning, in the middle and at the end of the buffer
+	 * first.
+	 */
+
+	size >>= 3;	/* divide by 8 */
+	/* Without this guard the probe below would read p[0] and p[-1]. */
+	if (size == 0)
+		return (true);
+	if ((p[0] | p[size >> 1] | p[size - 1]) != 0)
+		return (false);
+	v = 0;
+	for (i = 0; i < size; i++)
+		v |= *p++;
+	return (v == 0);
+}
+
+/*
+ * "hole" compression: when the whole buffer is zeros, replace it with a
+ * newly allocated 4-byte buffer that holds only the original length
+ * (little-endian) and store the new length in *sizep.
+ *
+ * Returns the new buffer, or NULL (with *sizep untouched) when the data is
+ * empty, does not fit the 32-bit length field, is not all zeros, or memory
+ * could not be allocated.
+ */
+static void *
+hast_hole_compress(const unsigned char *data, size_t *sizep)
+{
+	uint32_t size;
+	void *newbuf;
+
+	/* The original length is transmitted as a 32-bit value. */
+	if (*sizep == 0 || *sizep > UINT32_MAX)
+		return (NULL);
+	if (!allzeros(data, *sizep))
+		return (NULL);
+
+	newbuf = malloc(sizeof(size));
+	if (newbuf == NULL) {
+		pjdlog_warning("Unable to compress (no memory: %zu).",
+		    (size_t)*sizep);
+		return (NULL);
+	}
+	size = htole32((uint32_t)*sizep);
+	bcopy(&size, newbuf, sizeof(size));
+	*sizep = sizeof(size);
+
+	return (newbuf);
+}
+
+/*
+ * Inverse of hast_hole_compress(): 'data' must be exactly the 4-byte
+ * little-endian length.  Returns a newly allocated, zero-filled buffer of
+ * that length and stores the length in *sizep, or returns NULL on a
+ * malformed input or allocation failure.
+ */
+static void *
+hast_hole_decompress(const unsigned char *data, size_t *sizep)
+{
+	uint32_t size;
+	void *newbuf;
+
+	if (*sizep != sizeof(size)) {
+		pjdlog_error("Unable to decompress (invalid size: %zu).",
+		    *sizep);
+		return (NULL);
+	}
+
+	bcopy(data, &size, sizeof(size));
+	size = le32toh(size);
+
+	newbuf = calloc(1, size);
+	if (newbuf == NULL) {
+		pjdlog_error("Unable to decompress (no memory: %zu).",
+		    (size_t)size);
+		return (NULL);
+	}
+	*sizep = size;
+
+	return (newbuf);
+}
+
+/* Minimum block size to try to compress. */
+#define	HAST_LZF_COMPRESS_MIN	1024
+
+/*
+ * LZF compression.  The result is a newly allocated buffer made of the
+ * original length (32-bit little-endian) followed by the LZF stream; its
+ * total length is stored in *sizep.
+ *
+ * The output buffer is sized so that compression only succeeds when it
+ * saves at least HAST_LZF_COMPRESS_MIN bytes.  Returns NULL (with *sizep
+ * untouched) when the input is too small, does not fit the 32-bit length
+ * field, does not compress well enough, or memory could not be allocated.
+ */
+static void *
+hast_lzf_compress(const unsigned char *data, size_t *sizep)
+{
+	unsigned char *newbuf;
+	uint32_t origsize;
+	size_t newsize;
+
+	if (*sizep <= HAST_LZF_COMPRESS_MIN || *sizep > UINT32_MAX)
+		return (NULL);
+	origsize = (uint32_t)*sizep;
+
+	newsize = sizeof(origsize) + origsize - HAST_LZF_COMPRESS_MIN;
+	newbuf = malloc(newsize);
+	if (newbuf == NULL) {
+		pjdlog_warning("Unable to compress (no memory: %zu).",
+		    newsize);
+		return (NULL);
+	}
+	newsize = lzf_compress(data, *sizep, newbuf + sizeof(origsize),
+	    newsize - sizeof(origsize));
+	if (newsize == 0) {
+		free(newbuf);
+		return (NULL);
+	}
+	origsize = htole32(origsize);
+	bcopy(&origsize, newbuf, sizeof(origsize));
+
+	*sizep = sizeof(origsize) + newsize;
+	return (newbuf);
+}
+
+/*
+ * Inverse of hast_lzf_compress().  The input comes from the peer, so every
+ * length is validated and a malformed packet is reported as an error
+ * instead of tripping an assertion.
+ *
+ * Returns a newly allocated buffer with the decompressed data and stores
+ * its length in *sizep, or returns NULL on malformed input, decompression
+ * failure or allocation failure.
+ */
+static void *
+hast_lzf_decompress(const unsigned char *data, size_t *sizep)
+{
+	unsigned char *newbuf;
+	uint32_t origsize;
+	size_t newsize;
+
+	if (*sizep <= sizeof(origsize)) {
+		pjdlog_error("Unable to decompress (invalid size: %zu).",
+		    *sizep);
+		return (NULL);
+	}
+
+	bcopy(data, &origsize, sizeof(origsize));
+	origsize = le32toh(origsize);
+	/* The sender never LZF-compresses blocks this small. */
+	if (origsize <= HAST_LZF_COMPRESS_MIN) {
+		pjdlog_error("Unable to decompress (invalid original size: %zu).",
+		    (size_t)origsize);
+		return (NULL);
+	}
+
+	newbuf = malloc(origsize);
+	if (newbuf == NULL) {
+		pjdlog_error("Unable to decompress (no memory: %zu).",
+		    (size_t)origsize);
+		return (NULL);
+	}
+	newsize = lzf_decompress(data + sizeof(origsize),
+	    *sizep - sizeof(origsize), newbuf, origsize);
+	if (newsize == 0) {
+		free(newbuf);
+		pjdlog_error("Unable to decompress.");
+		return (NULL);
+	}
+	if (newsize != origsize) {
+		free(newbuf);
+		pjdlog_error("Unable to decompress (size mismatch: %zu != %zu).",
+		    newsize, (size_t)origsize);
+		return (NULL);
+	}
+
+	*sizep = newsize;
+	return (newbuf);
+}
+
+/*
+ * Map a HAST_COMPRESSION_* constant to the name used for it in the nv
+ * header.  Values that are not recognized yield "unknown".
+ */
+const char *
+compression_name(int num)
+{
+
+	switch (num) {
+	case HAST_COMPRESSION_NONE:
+		return ("none");
+	case HAST_COMPRESSION_HOLE:
+		return ("hole");
+	case HAST_COMPRESSION_LZF:
+		return ("lzf");
+	}
+	return ("unknown");
+}
+
+/*
+ * Send-side stage: try to compress the outgoing data with the algorithm
+ * configured for the resource.
+ *
+ * On successful compression the algorithm actually used is recorded in the
+ * nv header under "compression", *datap and *sizep are switched to the new
+ * buffer (the previous one is freed when *freedatap was set) and
+ * *freedatap is set to true so the caller releases the new buffer.
+ *
+ * Returns 0 on success, which includes the case where the data could not
+ * be compressed and is sent unchanged, and -1 with errno set when the nv
+ * header could not be updated.
+ */
+int
+compression_send(const struct hast_resource *res, struct nv *nv, void **datap,
+    size_t *sizep, bool *freedatap)
+{
+	unsigned char *newbuf;
+	int compression;
+	size_t size;
+
+	size = *sizep;
+	compression = res->hr_compression;
+
+	switch (compression) {
+	case HAST_COMPRESSION_NONE:
+		return (0);
+	case HAST_COMPRESSION_HOLE:
+		newbuf = hast_hole_compress(*datap, &size);
+		break;
+	case HAST_COMPRESSION_LZF:
+		/* Try 'hole' compression first. */
+		newbuf = hast_hole_compress(*datap, &size);
+		if (newbuf != NULL)
+			compression = HAST_COMPRESSION_HOLE;
+		else
+			newbuf = hast_lzf_compress(*datap, &size);
+		break;
+	default:
+		PJDLOG_ABORT("Invalid compression: %d.", res->hr_compression);
+	}
+
+	if (newbuf == NULL) {
+		/* Unable to compress the data. */
+		return (0);
+	}
+	nv_add_string(nv, compression_name(compression), "compression");
+	if (nv_error(nv) != 0) {
+		free(newbuf);
+		errno = nv_error(nv);
+		return (-1);
+	}
+	if (*freedatap)
+		free(*datap);
+	*freedatap = true;
+	*datap = newbuf;
+	*sizep = size;
+
+	return (0);
+}
+
+/*
+ * Receive-side stage: when the nv header names a compression algorithm,
+ * decompress the received data into a newly allocated buffer, switch
+ * *datap and *sizep to it (freeing the previous buffer when *freedatap was
+ * set) and set *freedatap to true.
+ *
+ * Returns 0 on success or when the data was not compressed, and -1 when
+ * the algorithm is unknown or decompression failed.
+ */
+int
+compression_recv(const struct hast_resource *res __unused, struct nv *nv,
+    void **datap, size_t *sizep, bool *freedatap)
+{
+	unsigned char *newbuf;
+	const char *algo;
+	size_t size;
+
+	algo = nv_get_string(nv, "compression");
+	if (algo == NULL)
+		return (0);	/* No compression. */
+
+	newbuf = NULL;
+	size = *sizep;
+
+	if (strcmp(algo, "hole") == 0)
+		newbuf = hast_hole_decompress(*datap, &size);
+	else if (strcmp(algo, "lzf") == 0)
+		newbuf = hast_lzf_decompress(*datap, &size);
+	else {
+		pjdlog_error("Unknown compression algorithm '%s'.", algo);
+		return (-1);	/* Unknown compression algorithm. */
+	}
+
+	if (newbuf == NULL)
+		return (-1);
+	if (*freedatap)
+		free(*datap);
+	*freedatap = true;
+	*datap = newbuf;
+	*sizep = size;
+
+	return (0);
+}
diff --git a/sbin/hastd/hast_compression.h b/sbin/hastd/hast_compression.h
new file mode 100644
index 0000000..eabdfb2
--- /dev/null
+++ b/sbin/hastd/hast_compression.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _HAST_COMPRESSION_H_ +#define _HAST_COMPRESSION_H_ + +#include <stdlib.h> /* size_t */ + +#include <hast.h> +#include <nv.h> + +const char *compression_name(int num); + +int compression_send(const struct hast_resource *res, struct nv *nv, + void **datap, size_t *sizep, bool *freedatap); +int compression_recv(const struct hast_resource *res, struct nv *nv, + void **datap, size_t *sizep, bool *freedatap); + +#endif /* !_HAST_COMPRESSION_H_ */ diff --git a/sbin/hastd/hast_proto.c b/sbin/hastd/hast_proto.c new file mode 100644 index 0000000..dd41fb1 --- /dev/null +++ b/sbin/hastd/hast_proto.c @@ -0,0 +1,222 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/endian.h> + +#include <errno.h> +#include <strings.h> + +#include <hast.h> +#include <ebuf.h> +#include <nv.h> +#include <pjdlog.h> +#include <proto.h> + +#ifdef HAVE_CRYPTO +#include "hast_checksum.h" +#endif +#include "hast_compression.h" +#include "hast_proto.h" + +struct hast_main_header { + /* Protocol version. */ + uint8_t version; + /* Size of nv headers. */ + uint32_t size; +} __packed; + +typedef int hps_send_t(const struct hast_resource *, struct nv *nv, void **, + size_t *, bool *); +typedef int hps_recv_t(const struct hast_resource *, struct nv *nv, void **, + size_t *, bool *); + +struct hast_pipe_stage { + const char *hps_name; + hps_send_t *hps_send; + hps_recv_t *hps_recv; +}; + +static struct hast_pipe_stage pipeline[] = { + { "compression", compression_send, compression_recv }, +#ifdef HAVE_CRYPTO + { "checksum", checksum_send, checksum_recv } +#endif +}; + +/* + * Send the given nv structure via conn. + * We keep headers in nv structure and pass data in separate argument. + * There can be no data at all (data is NULL then). 
+ */ +int +hast_proto_send(const struct hast_resource *res, struct proto_conn *conn, + struct nv *nv, const void *data, size_t size) +{ + struct hast_main_header hdr; + struct ebuf *eb; + bool freedata; + void *dptr, *hptr; + size_t hsize; + int ret; + + dptr = (void *)(uintptr_t)data; + freedata = false; + ret = -1; + + if (data != NULL) { + unsigned int ii; + + for (ii = 0; ii < sizeof(pipeline) / sizeof(pipeline[0]); + ii++) { + (void)pipeline[ii].hps_send(res, nv, &dptr, &size, + &freedata); + } + nv_add_uint32(nv, size, "size"); + if (nv_error(nv) != 0) { + errno = nv_error(nv); + goto end; + } + } + + eb = nv_hton(nv); + if (eb == NULL) + goto end; + + hdr.version = res != NULL ? res->hr_version : HAST_PROTO_VERSION; + hdr.size = htole32((uint32_t)ebuf_size(eb)); + if (ebuf_add_head(eb, &hdr, sizeof(hdr)) == -1) + goto end; + + hptr = ebuf_data(eb, &hsize); + if (proto_send(conn, hptr, hsize) == -1) + goto end; + if (data != NULL && proto_send(conn, dptr, size) == -1) + goto end; + + ret = 0; +end: + if (freedata) + free(dptr); + return (ret); +} + +int +hast_proto_recv_hdr(const struct proto_conn *conn, struct nv **nvp) +{ + struct hast_main_header hdr; + struct nv *nv; + struct ebuf *eb; + void *hptr; + + eb = NULL; + nv = NULL; + + if (proto_recv(conn, &hdr, sizeof(hdr)) == -1) + goto fail; + + if (hdr.version > HAST_PROTO_VERSION) { + errno = ERPCMISMATCH; + goto fail; + } + + hdr.size = le32toh(hdr.size); + + eb = ebuf_alloc(hdr.size); + if (eb == NULL) + goto fail; + if (ebuf_add_tail(eb, NULL, hdr.size) == -1) + goto fail; + hptr = ebuf_data(eb, NULL); + PJDLOG_ASSERT(hptr != NULL); + if (proto_recv(conn, hptr, hdr.size) == -1) + goto fail; + nv = nv_ntoh(eb); + if (nv == NULL) + goto fail; + + *nvp = nv; + return (0); +fail: + if (eb != NULL) + ebuf_free(eb); + return (-1); +} + +int +hast_proto_recv_data(const struct hast_resource *res, struct proto_conn *conn, + struct nv *nv, void *data, size_t size) +{ + unsigned int ii; + bool freedata; + size_t 
dsize; + void *dptr; + int ret; + + PJDLOG_ASSERT(data != NULL); + PJDLOG_ASSERT(size > 0); + + ret = -1; + freedata = false; + dptr = data; + + dsize = nv_get_uint32(nv, "size"); + if (dsize > size) { + errno = EINVAL; + goto end; + } else if (dsize == 0) { + (void)nv_set_error(nv, 0); + } else { + if (proto_recv(conn, data, dsize) == -1) + goto end; + for (ii = sizeof(pipeline) / sizeof(pipeline[0]); ii > 0; + ii--) { + ret = pipeline[ii - 1].hps_recv(res, nv, &dptr, + &dsize, &freedata); + if (ret == -1) + goto end; + } + ret = -1; + if (dsize > size) { + errno = EINVAL; + goto end; + } + if (dptr != data) + bcopy(dptr, data, dsize); + } + + ret = 0; +end: + if (freedata) + free(dptr); + return (ret); +} diff --git a/sbin/hastd/hast_proto.h b/sbin/hastd/hast_proto.h new file mode 100644 index 0000000..49f3b56 --- /dev/null +++ b/sbin/hastd/hast_proto.h @@ -0,0 +1,46 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HAST_PROTO_H_ +#define _HAST_PROTO_H_ + +#include <stdlib.h> /* size_t */ + +#include <nv.h> +#include <proto.h> + +int hast_proto_send(const struct hast_resource *res, struct proto_conn *conn, + struct nv *nv, const void *data, size_t size); +int hast_proto_recv_hdr(const struct proto_conn *conn, struct nv **nvp); +int hast_proto_recv_data(const struct hast_resource *res, + struct proto_conn *conn, struct nv *nv, void *data, size_t size); + +#endif /* !_HAST_PROTO_H_ */ diff --git a/sbin/hastd/hastd.8 b/sbin/hastd/hastd.8 new file mode 100644 index 0000000..017e895 --- /dev/null +++ b/sbin/hastd/hastd.8 @@ -0,0 +1,232 @@ +.\" Copyright (c) 2010 The FreeBSD Foundation +.\" All rights reserved. +.\" +.\" This software was developed by Pawel Jakub Dawidek under sponsorship from +.\" the FreeBSD Foundation. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd February 1, 2010 +.Dt HASTD 8 +.Os +.Sh NAME +.Nm hastd +.Nd "Highly Available Storage daemon" +.Sh SYNOPSIS +.Nm +.Op Fl dFh +.Op Fl c Ar config +.Op Fl P Ar pidfile +.Sh DESCRIPTION +The +.Nm +daemon is responsible for managing highly available GEOM providers. +.Pp +.Nm +allows data to be stored transparently on two physically separated machines +connected over the TCP/IP network. +Only one machine (cluster node) can actively use storage provided by +.Nm . +This machine is called the primary. +The +.Nm +daemon operates on the block level, which makes it transparent to file +systems and applications. +.Pp +There is one main +.Nm +daemon which starts a new worker process as soon as a role for the given +resource is changed to primary or as soon as a role for the given +resource is changed to secondary and the remote (primary) node +successfully connects to it. +Every worker process gets a new process title (see +.Xr setproctitle 3 ) , +which describes its role and the resource it controls.
+The exact format is: +.Bd -literal -offset indent +hastd: <resource name> (<role>) +.Ed +.Pp +If (and only if) +.Nm +operates in the primary role for the given resource, a corresponding +.Pa /dev/hast/<name> +disk-like device (GEOM provider) is created. +File systems and applications can send I/O requests to this +provider. +Every write, delete and flush operation +.Dv ( BIO_WRITE , BIO_DELETE , BIO_FLUSH ) +is sent to the local component and replicated on the remote (secondary) node +if it is available. +Read operations +.Dv ( BIO_READ ) +are handled locally unless an I/O error occurs or the local version of the data +is not up-to-date yet (synchronization is in progress). +.Pp +The +.Nm +daemon uses the GEOM Gate class to receive I/O requests from the +in-kernel GEOM infrastructure. +The +.Nm geom_gate.ko +module is loaded automatically if the kernel was not compiled with the +following option: +.Bd -ragged -offset indent +.Cd "options GEOM_GATE" +.Ed +.Pp +The connection between two +.Nm +daemons is always initiated from the one running as primary to the one +running as secondary. +When the primary +.Nm +is unable to connect or the connection fails, it will try to re-establish +the connection every few seconds. +Once the connection is established, the primary +.Nm +will synchronize every extent that was modified during the connection outage +to the secondary +.Nm . +.Pp +It is possible that in the case of a connection outage between the nodes the +.Nm +primary role for the given resource will be configured on both nodes. +This in turn leads to incompatible data modifications. +Such a condition is called a split-brain and cannot be automatically +resolved by the +.Nm +daemon, as doing so would most likely lead to data corruption or loss of +important changes. +Even though it cannot be fixed by +.Nm +itself, it will be detected and a further connection between independently +modified nodes will not be possible.
+Once this situation is manually resolved by an administrator, the resource +on one of the nodes can be initialized (erasing local data), which makes +a connection to the remote node possible again. +Connection of the freshly initialized component will trigger a full resource +synchronization. +.Pp +A +.Nm +daemon never picks its role automatically. +The role has to be configured with the +.Xr hastctl 8 +control utility by additional software like +.Nm ucarp +or +.Nm heartbeat +that can reliably manage role separation and switch the secondary node to the +primary role in case of the primary's failure. +.Pp +The +.Nm +daemon can be started with the following command line arguments: +.Bl -tag -width ".Fl P Ar pidfile" +.It Fl c Ar config +Specify an alternative location of the configuration file. +The default location is +.Pa /etc/hast.conf . +.It Fl d +Print or log debugging information. +This option can be specified multiple times to raise the verbosity +level. +.It Fl F +Start the +.Nm +daemon in the foreground. +By default +.Nm +starts in the background. +.It Fl h +Print the +.Nm +usage message. +.It Fl P Ar pidfile +Specify an alternative location of the file where the main process PID will be +stored. +The default location is +.Pa /var/run/hastd.pid . +.El +.Sh FILES +.Bl -tag -width ".Pa /var/run/hastctl" -compact +.It Pa /etc/hast.conf +The configuration file for +.Nm +and +.Xr hastctl 8 . +.It Pa /var/run/hastctl +Control socket used by the +.Xr hastctl 8 +control utility to communicate with +.Nm . +.It Pa /var/run/hastd.pid +The default location of the +.Nm +PID file. +.El +.Sh EXIT STATUS +Exit status is 0 on success, or one of the values described in +.Xr sysexits 3 +on failure. +.Sh EXAMPLES +Launch +.Nm +on both nodes. +Set the role for resource +.Nm shared +to primary on +.Nm nodeA +and to secondary on +.Nm nodeB . +Create a file system on the +.Pa /dev/hast/shared +provider and mount it.
+.Bd -literal -offset indent +nodeB# hastd +nodeB# hastctl role secondary shared + +nodeA# hastd +nodeA# hastctl role primary shared +nodeA# newfs -U /dev/hast/shared +nodeA# mount -o noatime /dev/hast/shared /shared +.Ed +.Sh SEE ALSO +.Xr sysexits 3 , +.Xr geom 4 , +.Xr hast.conf 5 , +.Xr ggatec 8 , +.Xr ggated 8 , +.Xr ggatel 8 , +.Xr hastctl 8 , +.Xr mount 8 , +.Xr newfs 8 , +.Xr g_bio 9 +.Sh AUTHORS +The +.Nm +was developed by +.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +under sponsorship of the FreeBSD Foundation. diff --git a/sbin/hastd/hastd.c b/sbin/hastd/hastd.c new file mode 100644 index 0000000..06b38e9 --- /dev/null +++ b/sbin/hastd/hastd.c @@ -0,0 +1,1337 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker.h> +#include <sys/module.h> +#include <sys/stat.h> +#include <sys/wait.h> + +#include <err.h> +#include <errno.h> +#include <libutil.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <time.h> +#include <unistd.h> + +#include <activemap.h> +#include <pjdlog.h> + +#include "control.h" +#include "event.h" +#include "hast.h" +#include "hast_proto.h" +#include "hastd.h" +#include "hooks.h" +#include "subr.h" + +/* Path to configuration file. */ +const char *cfgpath = HAST_CONFIG; +/* Hastd configuration. */ +static struct hastd_config *cfg; +/* Was SIGINT or SIGTERM signal received? */ +bool sigexit_received = false; +/* Path to pidfile. */ +static const char *pidfile; +/* Pidfile handle. */ +struct pidfh *pfh; +/* Do we run in foreground? */ +static bool foreground; + +/* How often check for hooks running for too long. */ +#define REPORT_INTERVAL 5 + +static void +usage(void) +{ + + errx(EX_USAGE, "[-dFh] [-c config] [-P pidfile]"); +} + +static void +g_gate_load(void) +{ + + if (modfind("g_gate") == -1) { + /* Not present in kernel, try loading it. 
*/ + if (kldload("geom_gate") == -1 || modfind("g_gate") == -1) { + if (errno != EEXIST) { + pjdlog_exit(EX_OSERR, + "Unable to load geom_gate module"); + } + } + } +} + +void +descriptors_cleanup(struct hast_resource *res) +{ + struct hast_resource *tres, *tmres; + struct hastd_listen *lst; + + TAILQ_FOREACH_SAFE(tres, &cfg->hc_resources, hr_next, tmres) { + if (tres == res) { + PJDLOG_VERIFY(res->hr_role == HAST_ROLE_SECONDARY || + (res->hr_remotein == NULL && + res->hr_remoteout == NULL)); + continue; + } + if (tres->hr_remotein != NULL) + proto_close(tres->hr_remotein); + if (tres->hr_remoteout != NULL) + proto_close(tres->hr_remoteout); + if (tres->hr_ctrl != NULL) + proto_close(tres->hr_ctrl); + if (tres->hr_event != NULL) + proto_close(tres->hr_event); + if (tres->hr_conn != NULL) + proto_close(tres->hr_conn); + TAILQ_REMOVE(&cfg->hc_resources, tres, hr_next); + free(tres); + } + if (cfg->hc_controlin != NULL) + proto_close(cfg->hc_controlin); + proto_close(cfg->hc_controlconn); + while ((lst = TAILQ_FIRST(&cfg->hc_listen)) != NULL) { + TAILQ_REMOVE(&cfg->hc_listen, lst, hl_next); + if (lst->hl_conn != NULL) + proto_close(lst->hl_conn); + free(lst); + } + (void)pidfile_close(pfh); + hook_fini(); + pjdlog_fini(); +} + +static const char * +dtype2str(mode_t mode) +{ + + if (S_ISBLK(mode)) + return ("block device"); + else if (S_ISCHR(mode)) + return ("character device"); + else if (S_ISDIR(mode)) + return ("directory"); + else if (S_ISFIFO(mode)) + return ("pipe or FIFO"); + else if (S_ISLNK(mode)) + return ("symbolic link"); + else if (S_ISREG(mode)) + return ("regular file"); + else if (S_ISSOCK(mode)) + return ("socket"); + else if (S_ISWHT(mode)) + return ("whiteout"); + else + return ("unknown"); +} + +void +descriptors_assert(const struct hast_resource *res, int pjdlogmode) +{ + char msg[256]; + struct stat sb; + long maxfd; + bool isopen; + mode_t mode; + int fd; + + /* + * At this point descriptor to syslog socket is closed, so if we want + * to log 
assertion message, we have to first store it in 'msg' local + * buffer and then open syslog socket and log it. + */ + msg[0] = '\0'; + + maxfd = sysconf(_SC_OPEN_MAX); + if (maxfd == -1) { + pjdlog_init(pjdlogmode); + pjdlog_prefix_set("[%s] (%s) ", res->hr_name, + role2str(res->hr_role)); + pjdlog_errno(LOG_WARNING, "sysconf(_SC_OPEN_MAX) failed"); + pjdlog_fini(); + maxfd = 16384; + } + for (fd = 0; fd <= maxfd; fd++) { + if (fstat(fd, &sb) == 0) { + isopen = true; + mode = sb.st_mode; + } else if (errno == EBADF) { + isopen = false; + mode = 0; + } else { + (void)snprintf(msg, sizeof(msg), + "Unable to fstat descriptor %d: %s", fd, + strerror(errno)); + break; + } + if (fd == STDIN_FILENO || fd == STDOUT_FILENO || + fd == STDERR_FILENO) { + if (!isopen) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d (%s) is closed, but should be open.", + fd, (fd == STDIN_FILENO ? "stdin" : + (fd == STDOUT_FILENO ? "stdout" : "stderr"))); + break; + } + } else if (fd == proto_descriptor(res->hr_event)) { + if (!isopen) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d (event) is closed, but should be open.", + fd); + break; + } + if (!S_ISSOCK(mode)) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d (event) is %s, but should be %s.", + fd, dtype2str(mode), dtype2str(S_IFSOCK)); + break; + } + } else if (fd == proto_descriptor(res->hr_ctrl)) { + if (!isopen) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d (ctrl) is closed, but should be open.", + fd); + break; + } + if (!S_ISSOCK(mode)) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d (ctrl) is %s, but should be %s.", + fd, dtype2str(mode), dtype2str(S_IFSOCK)); + break; + } + } else if (res->hr_role == HAST_ROLE_PRIMARY && + fd == proto_descriptor(res->hr_conn)) { + if (!isopen) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d (conn) is closed, but should be open.", + fd); + break; + } + if (!S_ISSOCK(mode)) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d (conn) is %s, but should be %s.", + 
fd, dtype2str(mode), dtype2str(S_IFSOCK)); + break; + } + } else if (res->hr_role == HAST_ROLE_SECONDARY && + res->hr_conn != NULL && + fd == proto_descriptor(res->hr_conn)) { + if (isopen) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d (conn) is open, but should be closed.", + fd); + break; + } + } else if (res->hr_role == HAST_ROLE_SECONDARY && + fd == proto_descriptor(res->hr_remotein)) { + if (!isopen) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d (remote in) is closed, but should be open.", + fd); + break; + } + if (!S_ISSOCK(mode)) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d (remote in) is %s, but should be %s.", + fd, dtype2str(mode), dtype2str(S_IFSOCK)); + break; + } + } else if (res->hr_role == HAST_ROLE_SECONDARY && + fd == proto_descriptor(res->hr_remoteout)) { + if (!isopen) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d (remote out) is closed, but should be open.", + fd); + break; + } + if (!S_ISSOCK(mode)) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d (remote out) is %s, but should be %s.", + fd, dtype2str(mode), dtype2str(S_IFSOCK)); + break; + } + } else { + if (isopen) { + (void)snprintf(msg, sizeof(msg), + "Descriptor %d is open (%s), but should be closed.", + fd, dtype2str(mode)); + break; + } + } + } + if (msg[0] != '\0') { + pjdlog_init(pjdlogmode); + pjdlog_prefix_set("[%s] (%s) ", res->hr_name, + role2str(res->hr_role)); + PJDLOG_ABORT("%s", msg); + } +} + +static void +child_exit_log(unsigned int pid, int status) +{ + + if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { + pjdlog_debug(1, "Worker process exited gracefully (pid=%u).", + pid); + } else if (WIFSIGNALED(status)) { + pjdlog_error("Worker process killed (pid=%u, signal=%d).", + pid, WTERMSIG(status)); + } else { + pjdlog_error("Worker process exited ungracefully (pid=%u, exitcode=%d).", + pid, WIFEXITED(status) ? 
WEXITSTATUS(status) : -1); + } +} + +static void +child_exit(void) +{ + struct hast_resource *res; + int status; + pid_t pid; + + while ((pid = wait3(&status, WNOHANG, NULL)) > 0) { + /* Find resource related to the process that just exited. */ + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + if (pid == res->hr_workerpid) + break; + } + if (res == NULL) { + /* + * This can happen when new connection arrives and we + * cancel child responsible for the old one or if this + * was hook which we executed. + */ + hook_check_one(pid, status); + continue; + } + pjdlog_prefix_set("[%s] (%s) ", res->hr_name, + role2str(res->hr_role)); + child_exit_log(pid, status); + child_cleanup(res); + if (res->hr_role == HAST_ROLE_PRIMARY) { + /* + * Restart child process if it was killed by signal + * or exited because of temporary problem. + */ + if (WIFSIGNALED(status) || + (WIFEXITED(status) && + WEXITSTATUS(status) == EX_TEMPFAIL)) { + sleep(1); + pjdlog_info("Restarting worker process."); + hastd_primary(res); + } else { + res->hr_role = HAST_ROLE_INIT; + pjdlog_info("Changing resource role back to %s.", + role2str(res->hr_role)); + } + } + pjdlog_prefix_set("%s", ""); + } +} + +static bool +resource_needs_restart(const struct hast_resource *res0, + const struct hast_resource *res1) +{ + + PJDLOG_ASSERT(strcmp(res0->hr_name, res1->hr_name) == 0); + + if (strcmp(res0->hr_provname, res1->hr_provname) != 0) + return (true); + if (strcmp(res0->hr_localpath, res1->hr_localpath) != 0) + return (true); + if (res0->hr_role == HAST_ROLE_INIT || + res0->hr_role == HAST_ROLE_SECONDARY) { + if (strcmp(res0->hr_remoteaddr, res1->hr_remoteaddr) != 0) + return (true); + if (strcmp(res0->hr_sourceaddr, res1->hr_sourceaddr) != 0) + return (true); + if (res0->hr_replication != res1->hr_replication) + return (true); + if (res0->hr_checksum != res1->hr_checksum) + return (true); + if (res0->hr_compression != res1->hr_compression) + return (true); + if (res0->hr_timeout != res1->hr_timeout) + 
return (true); + if (strcmp(res0->hr_exec, res1->hr_exec) != 0) + return (true); + /* + * When metaflush has changed we don't really need restart, + * but it is just easier this way. + */ + if (res0->hr_metaflush != res1->hr_metaflush) + return (true); + } + return (false); +} + +static bool +resource_needs_reload(const struct hast_resource *res0, + const struct hast_resource *res1) +{ + + PJDLOG_ASSERT(strcmp(res0->hr_name, res1->hr_name) == 0); + PJDLOG_ASSERT(strcmp(res0->hr_provname, res1->hr_provname) == 0); + PJDLOG_ASSERT(strcmp(res0->hr_localpath, res1->hr_localpath) == 0); + + if (res0->hr_role != HAST_ROLE_PRIMARY) + return (false); + + if (strcmp(res0->hr_remoteaddr, res1->hr_remoteaddr) != 0) + return (true); + if (strcmp(res0->hr_sourceaddr, res1->hr_sourceaddr) != 0) + return (true); + if (res0->hr_replication != res1->hr_replication) + return (true); + if (res0->hr_checksum != res1->hr_checksum) + return (true); + if (res0->hr_compression != res1->hr_compression) + return (true); + if (res0->hr_timeout != res1->hr_timeout) + return (true); + if (strcmp(res0->hr_exec, res1->hr_exec) != 0) + return (true); + if (res0->hr_metaflush != res1->hr_metaflush) + return (true); + return (false); +} + +static void +resource_reload(const struct hast_resource *res) +{ + struct nv *nvin, *nvout; + int error; + + PJDLOG_ASSERT(res->hr_role == HAST_ROLE_PRIMARY); + + nvout = nv_alloc(); + nv_add_uint8(nvout, CONTROL_RELOAD, "cmd"); + nv_add_string(nvout, res->hr_remoteaddr, "remoteaddr"); + nv_add_string(nvout, res->hr_sourceaddr, "sourceaddr"); + nv_add_int32(nvout, (int32_t)res->hr_replication, "replication"); + nv_add_int32(nvout, (int32_t)res->hr_checksum, "checksum"); + nv_add_int32(nvout, (int32_t)res->hr_compression, "compression"); + nv_add_int32(nvout, (int32_t)res->hr_timeout, "timeout"); + nv_add_string(nvout, res->hr_exec, "exec"); + nv_add_int32(nvout, (int32_t)res->hr_metaflush, "metaflush"); + if (nv_error(nvout) != 0) { + nv_free(nvout); + 
pjdlog_error("Unable to allocate header for reload message."); + return; + } + if (hast_proto_send(res, res->hr_ctrl, nvout, NULL, 0) == -1) { + pjdlog_errno(LOG_ERR, "Unable to send reload message"); + nv_free(nvout); + return; + } + nv_free(nvout); + + /* Receive response. */ + if (hast_proto_recv_hdr(res->hr_ctrl, &nvin) == -1) { + pjdlog_errno(LOG_ERR, "Unable to receive reload reply"); + return; + } + error = nv_get_int16(nvin, "error"); + nv_free(nvin); + if (error != 0) { + pjdlog_common(LOG_ERR, 0, error, "Reload failed"); + return; + } +} + +static void +hastd_reload(void) +{ + struct hastd_config *newcfg; + struct hast_resource *nres, *cres, *tres; + struct hastd_listen *nlst, *clst; + struct pidfh *newpfh; + unsigned int nlisten; + uint8_t role; + pid_t otherpid; + + pjdlog_info("Reloading configuration..."); + + newpfh = NULL; + + newcfg = yy_config_parse(cfgpath, false); + if (newcfg == NULL) + goto failed; + + /* + * Check if control address has changed. + */ + if (strcmp(cfg->hc_controladdr, newcfg->hc_controladdr) != 0) { + if (proto_server(newcfg->hc_controladdr, + &newcfg->hc_controlconn) == -1) { + pjdlog_errno(LOG_ERR, + "Unable to listen on control address %s", + newcfg->hc_controladdr); + goto failed; + } + } + /* + * Check if any listen address has changed. 
+ */ + nlisten = 0; + TAILQ_FOREACH(nlst, &newcfg->hc_listen, hl_next) { + TAILQ_FOREACH(clst, &cfg->hc_listen, hl_next) { + if (strcmp(nlst->hl_addr, clst->hl_addr) == 0) + break; + } + if (clst != NULL && clst->hl_conn != NULL) { + pjdlog_info("Keep listening on address %s.", + nlst->hl_addr); + nlst->hl_conn = clst->hl_conn; + nlisten++; + } else if (proto_server(nlst->hl_addr, &nlst->hl_conn) == 0) { + pjdlog_info("Listening on new address %s.", + nlst->hl_addr); + nlisten++; + } else { + pjdlog_errno(LOG_WARNING, + "Unable to listen on address %s", nlst->hl_addr); + } + } + if (nlisten == 0) { + pjdlog_error("No addresses to listen on."); + goto failed; + } + /* + * Check if pidfile's path has changed. + */ + if (!foreground && pidfile == NULL && + strcmp(cfg->hc_pidfile, newcfg->hc_pidfile) != 0) { + newpfh = pidfile_open(newcfg->hc_pidfile, 0600, &otherpid); + if (newpfh == NULL) { + if (errno == EEXIST) { + pjdlog_errno(LOG_WARNING, + "Another hastd is already running, pidfile: %s, pid: %jd.", + newcfg->hc_pidfile, (intmax_t)otherpid); + } else { + pjdlog_errno(LOG_WARNING, + "Unable to open or create pidfile %s", + newcfg->hc_pidfile); + } + } else if (pidfile_write(newpfh) == -1) { + /* Write PID to a file. */ + pjdlog_errno(LOG_WARNING, + "Unable to write PID to file %s", + newcfg->hc_pidfile); + } else { + pjdlog_debug(1, "PID stored in %s.", + newcfg->hc_pidfile); + } + } + + /* No failures from now on. */ + + /* + * Switch to new control socket. + */ + if (newcfg->hc_controlconn != NULL) { + pjdlog_info("Control socket changed from %s to %s.", + cfg->hc_controladdr, newcfg->hc_controladdr); + proto_close(cfg->hc_controlconn); + cfg->hc_controlconn = newcfg->hc_controlconn; + newcfg->hc_controlconn = NULL; + strlcpy(cfg->hc_controladdr, newcfg->hc_controladdr, + sizeof(cfg->hc_controladdr)); + } + /* + * Switch to new pidfile. 
+ */ + if (newpfh != NULL) { + pjdlog_info("Pidfile changed from %s to %s.", cfg->hc_pidfile, + newcfg->hc_pidfile); + (void)pidfile_remove(pfh); + pfh = newpfh; + (void)strlcpy(cfg->hc_pidfile, newcfg->hc_pidfile, + sizeof(cfg->hc_pidfile)); + } + /* + * Switch to new listen addresses. Close all that were removed. + */ + while ((clst = TAILQ_FIRST(&cfg->hc_listen)) != NULL) { + TAILQ_FOREACH(nlst, &newcfg->hc_listen, hl_next) { + if (strcmp(nlst->hl_addr, clst->hl_addr) == 0) + break; + } + if (nlst == NULL && clst->hl_conn != NULL) { + proto_close(clst->hl_conn); + pjdlog_info("No longer listening on address %s.", + clst->hl_addr); + } + TAILQ_REMOVE(&cfg->hc_listen, clst, hl_next); + free(clst); + } + TAILQ_CONCAT(&cfg->hc_listen, &newcfg->hc_listen, hl_next); + + /* + * Stop and remove resources that were removed from the configuration. + */ + TAILQ_FOREACH_SAFE(cres, &cfg->hc_resources, hr_next, tres) { + TAILQ_FOREACH(nres, &newcfg->hc_resources, hr_next) { + if (strcmp(cres->hr_name, nres->hr_name) == 0) + break; + } + if (nres == NULL) { + control_set_role(cres, HAST_ROLE_INIT); + TAILQ_REMOVE(&cfg->hc_resources, cres, hr_next); + pjdlog_info("Resource %s removed.", cres->hr_name); + free(cres); + } + } + /* + * Move new resources to the current configuration. + */ + TAILQ_FOREACH_SAFE(nres, &newcfg->hc_resources, hr_next, tres) { + TAILQ_FOREACH(cres, &cfg->hc_resources, hr_next) { + if (strcmp(cres->hr_name, nres->hr_name) == 0) + break; + } + if (cres == NULL) { + TAILQ_REMOVE(&newcfg->hc_resources, nres, hr_next); + TAILQ_INSERT_TAIL(&cfg->hc_resources, nres, hr_next); + pjdlog_info("Resource %s added.", nres->hr_name); + } + } + /* + * Deal with modified resources. + * Depending on what has changed exactly we might want to perform + * different actions. + * + * We do full resource restart in the following situations: + * Resource role is INIT or SECONDARY. + * Resource role is PRIMARY and path to local component or provider + * name has changed. 
+ * In case of PRIMARY, the worker process will be killed and restarted, + * which also means removing /dev/hast/<name> provider and + * recreating it. + * + * We do just reload (send SIGHUP to worker process) if we act as + * PRIMARY, but only if remote address, source address, replication + * mode, timeout, execution path or metaflush has changed. + * For those, there is no need to restart worker process. + * If PRIMARY receives SIGHUP, it will reconnect if remote address or + * source address has changed or it will set new timeout if only timeout + * has changed or it will update metaflush if only metaflush has + * changed. + */ + TAILQ_FOREACH_SAFE(nres, &newcfg->hc_resources, hr_next, tres) { + TAILQ_FOREACH(cres, &cfg->hc_resources, hr_next) { + if (strcmp(cres->hr_name, nres->hr_name) == 0) + break; + } + PJDLOG_ASSERT(cres != NULL); + if (resource_needs_restart(cres, nres)) { + pjdlog_info("Resource %s configuration was modified, restarting it.", + cres->hr_name); + role = cres->hr_role; + control_set_role(cres, HAST_ROLE_INIT); + TAILQ_REMOVE(&cfg->hc_resources, cres, hr_next); + free(cres); + TAILQ_REMOVE(&newcfg->hc_resources, nres, hr_next); + TAILQ_INSERT_TAIL(&cfg->hc_resources, nres, hr_next); + control_set_role(nres, role); + } else if (resource_needs_reload(cres, nres)) { + pjdlog_info("Resource %s configuration was modified, reloading it.", + cres->hr_name); + strlcpy(cres->hr_remoteaddr, nres->hr_remoteaddr, + sizeof(cres->hr_remoteaddr)); + strlcpy(cres->hr_sourceaddr, nres->hr_sourceaddr, + sizeof(cres->hr_sourceaddr)); + cres->hr_replication = nres->hr_replication; + cres->hr_checksum = nres->hr_checksum; + cres->hr_compression = nres->hr_compression; + cres->hr_timeout = nres->hr_timeout; + strlcpy(cres->hr_exec, nres->hr_exec, + sizeof(cres->hr_exec)); + cres->hr_metaflush = nres->hr_metaflush; + if (cres->hr_workerpid != 0) + resource_reload(cres); + } + } + + yy_config_free(newcfg); + pjdlog_info("Configuration reloaded successfully."); + 
return; +failed: + if (newcfg != NULL) { + if (newcfg->hc_controlconn != NULL) + proto_close(newcfg->hc_controlconn); + while ((nlst = TAILQ_FIRST(&newcfg->hc_listen)) != NULL) { + if (nlst->hl_conn != NULL) { + TAILQ_FOREACH(clst, &cfg->hc_listen, hl_next) { + if (strcmp(nlst->hl_addr, + clst->hl_addr) == 0) { + break; + } + } + if (clst == NULL || clst->hl_conn == NULL) + proto_close(nlst->hl_conn); + } + TAILQ_REMOVE(&newcfg->hc_listen, nlst, hl_next); + free(nlst); + } + yy_config_free(newcfg); + } + if (newpfh != NULL) + (void)pidfile_remove(newpfh); + pjdlog_warning("Configuration not reloaded."); +} + +static void +terminate_workers(void) +{ + struct hast_resource *res; + + pjdlog_info("Termination signal received, exiting."); + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + if (res->hr_workerpid == 0) + continue; + pjdlog_info("Terminating worker process (resource=%s, role=%s, pid=%u).", + res->hr_name, role2str(res->hr_role), res->hr_workerpid); + if (kill(res->hr_workerpid, SIGTERM) == 0) + continue; + pjdlog_errno(LOG_WARNING, + "Unable to send signal to worker process (resource=%s, role=%s, pid=%u).", + res->hr_name, role2str(res->hr_role), res->hr_workerpid); + } +} + +static void +listen_accept(struct hastd_listen *lst) +{ + struct hast_resource *res; + struct proto_conn *conn; + struct nv *nvin, *nvout, *nverr; + const char *resname; + const unsigned char *token; + char laddr[256], raddr[256]; + uint8_t version; + size_t size; + pid_t pid; + int status; + + proto_local_address(lst->hl_conn, laddr, sizeof(laddr)); + pjdlog_debug(1, "Accepting connection to %s.", laddr); + + if (proto_accept(lst->hl_conn, &conn) == -1) { + pjdlog_errno(LOG_ERR, "Unable to accept connection %s", laddr); + return; + } + + proto_local_address(conn, laddr, sizeof(laddr)); + proto_remote_address(conn, raddr, sizeof(raddr)); + pjdlog_info("Connection from %s to %s.", raddr, laddr); + + /* Error in setting timeout is not critical, but why should it fail? 
*/ + if (proto_timeout(conn, HAST_TIMEOUT) == -1) + pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); + + nvin = nvout = nverr = NULL; + + /* + * Before receiving any data see if the remote host has access to any + * resource. + */ + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + if (proto_address_match(conn, res->hr_remoteaddr)) + break; + } + if (res == NULL) { + pjdlog_error("Client %s isn't known.", raddr); + goto close; + } + /* Ok, remote host can access at least one resource. */ + + if (hast_proto_recv_hdr(conn, &nvin) == -1) { + pjdlog_errno(LOG_ERR, "Unable to receive header from %s", + raddr); + goto close; + } + + resname = nv_get_string(nvin, "resource"); + if (resname == NULL) { + pjdlog_error("No 'resource' field in the header received from %s.", + raddr); + goto close; + } + pjdlog_debug(2, "%s: resource=%s", raddr, resname); + version = nv_get_uint8(nvin, "version"); + pjdlog_debug(2, "%s: version=%hhu", raddr, version); + if (version == 0) { + /* + * If no version is sent, it means this is protocol version 1. + */ + version = 1; + } + if (version > HAST_PROTO_VERSION) { + pjdlog_info("Remote protocol version %hhu is not supported, falling back to version %hhu.", + version, (unsigned char)HAST_PROTO_VERSION); + version = HAST_PROTO_VERSION; + } + pjdlog_debug(1, "Negotiated protocol version %hhu.", version); + token = nv_get_uint8_array(nvin, &size, "token"); + /* + * NULL token means that this is the first connection. + */ + if (token != NULL && size != sizeof(res->hr_token)) { + pjdlog_error("Received token of invalid size from %s (expected %zu, got %zu).", + raddr, sizeof(res->hr_token), size); + goto close; + } + + /* + * From now on we want to send errors to the remote node. + */ + nverr = nv_alloc(); + + /* Find the resource named in the received header. */ + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + if (strcmp(resname, res->hr_name) == 0) + break; + } + /* Have we found the resource? 
*/ + if (res == NULL) { + pjdlog_error("No resource '%s' as requested by %s.", + resname, raddr); + nv_add_stringf(nverr, "errmsg", "Resource not configured."); + goto fail; + } + + /* Now that we know resource name setup log prefix. */ + pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); + + /* Does the remote host have access to this resource? */ + if (!proto_address_match(conn, res->hr_remoteaddr)) { + pjdlog_error("Client %s has no access to the resource.", raddr); + nv_add_stringf(nverr, "errmsg", "No access to the resource."); + goto fail; + } + /* Is the resource marked as secondary? */ + if (res->hr_role != HAST_ROLE_SECONDARY) { + pjdlog_warning("We act as %s for the resource and not as %s as requested by %s.", + role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY), + raddr); + nv_add_stringf(nverr, "errmsg", + "Remote node acts as %s for the resource and not as %s.", + role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY)); + if (res->hr_role == HAST_ROLE_PRIMARY) { + /* + * If we act as primary request the other side to wait + * for us a bit, as we might be finishing cleanups. + */ + nv_add_uint8(nverr, 1, "wait"); + } + goto fail; + } + /* Does token (if exists) match? */ + if (token != NULL && memcmp(token, res->hr_token, + sizeof(res->hr_token)) != 0) { + pjdlog_error("Token received from %s doesn't match.", raddr); + nv_add_stringf(nverr, "errmsg", "Token doesn't match."); + goto fail; + } + /* + * If there is no token, but we have half-open connection + * (only remotein) or full connection (worker process is running) + * we have to cancel those and accept the new connection. + */ + if (token == NULL) { + PJDLOG_ASSERT(res->hr_remoteout == NULL); + pjdlog_debug(1, "Initial connection from %s.", raddr); + if (res->hr_workerpid != 0) { + PJDLOG_ASSERT(res->hr_remotein == NULL); + pjdlog_debug(1, + "Worker process exists (pid=%u), stopping it.", + (unsigned int)res->hr_workerpid); + /* Stop child process. 
*/ + if (kill(res->hr_workerpid, SIGINT) == -1) { + pjdlog_errno(LOG_ERR, + "Unable to stop worker process (pid=%u)", + (unsigned int)res->hr_workerpid); + /* + * Other than logging the problem we + * ignore it - nothing smart to do. + */ + } + /* Wait for it to exit. */ + else if ((pid = waitpid(res->hr_workerpid, + &status, 0)) != res->hr_workerpid) { + /* We can only log the problem. */ + pjdlog_errno(LOG_ERR, + "Waiting for worker process (pid=%u) failed", + (unsigned int)res->hr_workerpid); + } else { + child_exit_log(res->hr_workerpid, status); + } + child_cleanup(res); + } else if (res->hr_remotein != NULL) { + char oaddr[256]; + + proto_remote_address(res->hr_remotein, oaddr, + sizeof(oaddr)); + pjdlog_debug(1, + "Canceling half-open connection from %s on connection from %s.", + oaddr, raddr); + proto_close(res->hr_remotein); + res->hr_remotein = NULL; + } + } + + /* + * Checks and cleanups are done. + */ + + if (token == NULL) { + res->hr_version = version; + arc4random_buf(res->hr_token, sizeof(res->hr_token)); + nvout = nv_alloc(); + nv_add_uint8(nvout, version, "version"); + nv_add_uint8_array(nvout, res->hr_token, + sizeof(res->hr_token), "token"); + if (nv_error(nvout) != 0) { + pjdlog_common(LOG_ERR, 0, nv_error(nvout), + "Unable to prepare return header for %s", raddr); + nv_add_stringf(nverr, "errmsg", + "Remote node was unable to prepare return header: %s.", + strerror(nv_error(nvout))); + goto fail; + } + if (hast_proto_send(res, conn, nvout, NULL, 0) == -1) { + int error = errno; + + pjdlog_errno(LOG_ERR, "Unable to send response to %s", + raddr); + nv_add_stringf(nverr, "errmsg", + "Remote node was unable to send response: %s.", + strerror(error)); + goto fail; + } + res->hr_remotein = conn; + pjdlog_debug(1, "Incoming connection from %s configured.", + raddr); + } else { + res->hr_remoteout = conn; + pjdlog_debug(1, "Outgoing connection to %s configured.", raddr); + hastd_secondary(res, nvin); + } + nv_free(nvin); + nv_free(nvout); + 
nv_free(nverr); + pjdlog_prefix_set("%s", ""); + return; +fail: + if (nv_error(nverr) != 0) { + pjdlog_common(LOG_ERR, 0, nv_error(nverr), + "Unable to prepare error header for %s", raddr); + goto close; + } + if (hast_proto_send(NULL, conn, nverr, NULL, 0) == -1) { + pjdlog_errno(LOG_ERR, "Unable to send error to %s", raddr); + goto close; + } +close: + if (nvin != NULL) + nv_free(nvin); + if (nvout != NULL) + nv_free(nvout); + if (nverr != NULL) + nv_free(nverr); + proto_close(conn); + pjdlog_prefix_set("%s", ""); +} + +static void +connection_migrate(struct hast_resource *res) +{ + struct proto_conn *conn; + int16_t val = 0; + + pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); + + PJDLOG_ASSERT(res->hr_role == HAST_ROLE_PRIMARY); + + if (proto_recv(res->hr_conn, &val, sizeof(val)) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to receive connection command"); + return; + } + if (proto_client(res->hr_sourceaddr[0] != '\0' ? res->hr_sourceaddr : NULL, + res->hr_remoteaddr, &conn) == -1) { + val = errno; + pjdlog_errno(LOG_WARNING, + "Unable to create outgoing connection to %s", + res->hr_remoteaddr); + goto out; + } + if (proto_connect(conn, -1) == -1) { + val = errno; + pjdlog_errno(LOG_WARNING, "Unable to connect to %s", + res->hr_remoteaddr); + proto_close(conn); + goto out; + } + val = 0; +out: + if (proto_send(res->hr_conn, &val, sizeof(val)) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to send reply to connection request"); + } + if (val == 0 && proto_connection_send(res->hr_conn, conn) == -1) + pjdlog_errno(LOG_WARNING, "Unable to send connection"); + + pjdlog_prefix_set("%s", ""); +} + +static void +check_signals(void) +{ + struct timespec sigtimeout; + sigset_t mask; + int signo; + + sigtimeout.tv_sec = 0; + sigtimeout.tv_nsec = 0; + + PJDLOG_VERIFY(sigemptyset(&mask) == 0); + PJDLOG_VERIFY(sigaddset(&mask, SIGHUP) == 0); + PJDLOG_VERIFY(sigaddset(&mask, SIGINT) == 0); + PJDLOG_VERIFY(sigaddset(&mask, SIGTERM) == 0); + 
PJDLOG_VERIFY(sigaddset(&mask, SIGCHLD) == 0); + + while ((signo = sigtimedwait(&mask, NULL, &sigtimeout)) != -1) { + switch (signo) { + case SIGINT: + case SIGTERM: + sigexit_received = true; + terminate_workers(); + proto_close(cfg->hc_controlconn); + exit(EX_OK); + break; + case SIGCHLD: + child_exit(); + break; + case SIGHUP: + hastd_reload(); + break; + default: + PJDLOG_ABORT("Unexpected signal (%d).", signo); + } + } +} + +static void +main_loop(void) +{ + struct hast_resource *res; + struct hastd_listen *lst; + struct timeval seltimeout; + int fd, maxfd, ret; + time_t lastcheck, now; + fd_set rfds; + + lastcheck = time(NULL); + seltimeout.tv_sec = REPORT_INTERVAL; + seltimeout.tv_usec = 0; + + for (;;) { + check_signals(); + + /* Set up descriptors for select(2). */ + FD_ZERO(&rfds); + maxfd = fd = proto_descriptor(cfg->hc_controlconn); + PJDLOG_ASSERT(fd >= 0); + FD_SET(fd, &rfds); + TAILQ_FOREACH(lst, &cfg->hc_listen, hl_next) { + if (lst->hl_conn == NULL) + continue; + fd = proto_descriptor(lst->hl_conn); + PJDLOG_ASSERT(fd >= 0); + FD_SET(fd, &rfds); + maxfd = fd > maxfd ? fd : maxfd; + } + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + if (res->hr_event == NULL) + continue; + fd = proto_descriptor(res->hr_event); + PJDLOG_ASSERT(fd >= 0); + FD_SET(fd, &rfds); + maxfd = fd > maxfd ? fd : maxfd; + if (res->hr_role == HAST_ROLE_PRIMARY) { + /* Only primary workers ask for connections. */ + PJDLOG_ASSERT(res->hr_conn != NULL); + fd = proto_descriptor(res->hr_conn); + PJDLOG_ASSERT(fd >= 0); + FD_SET(fd, &rfds); + maxfd = fd > maxfd ? fd : maxfd; + } else { + PJDLOG_ASSERT(res->hr_conn == NULL); + } + } + + PJDLOG_ASSERT(maxfd + 1 <= (int)FD_SETSIZE); + ret = select(maxfd + 1, &rfds, NULL, NULL, &seltimeout); + now = time(NULL); + if (lastcheck + REPORT_INTERVAL <= now) { + hook_check(); + lastcheck = now; + } + if (ret == 0) { + /* + * select(2) timed out, so there should be no + * descriptors to check. 
+ */ + continue; + } else if (ret == -1) { + if (errno == EINTR) + continue; + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, "select() failed"); + } + + /* + * Check for signals before we do anything to update our + * info about terminated workers in the meantime. + */ + check_signals(); + + if (FD_ISSET(proto_descriptor(cfg->hc_controlconn), &rfds)) + control_handle(cfg); + TAILQ_FOREACH(lst, &cfg->hc_listen, hl_next) { + if (lst->hl_conn == NULL) + continue; + if (FD_ISSET(proto_descriptor(lst->hl_conn), &rfds)) + listen_accept(lst); + } + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + if (res->hr_event == NULL) + continue; + if (FD_ISSET(proto_descriptor(res->hr_event), &rfds)) { + if (event_recv(res) == 0) + continue; + /* The worker process exited? */ + proto_close(res->hr_event); + res->hr_event = NULL; + if (res->hr_conn != NULL) { + proto_close(res->hr_conn); + res->hr_conn = NULL; + } + continue; + } + if (res->hr_role == HAST_ROLE_PRIMARY) { + PJDLOG_ASSERT(res->hr_conn != NULL); + if (FD_ISSET(proto_descriptor(res->hr_conn), + &rfds)) { + connection_migrate(res); + } + } else { + PJDLOG_ASSERT(res->hr_conn == NULL); + } + } + } +} + +static void +dummy_sighandler(int sig __unused) +{ + /* Nothing to do. 
*/ +} + +int +main(int argc, char *argv[]) +{ + struct hastd_listen *lst; + pid_t otherpid; + int debuglevel; + sigset_t mask; + + foreground = false; + debuglevel = 0; + + for (;;) { + int ch; + + ch = getopt(argc, argv, "c:dFhP:"); + if (ch == -1) + break; + switch (ch) { + case 'c': + cfgpath = optarg; + break; + case 'd': + debuglevel++; + break; + case 'F': + foreground = true; + break; + case 'P': + pidfile = optarg; + break; + case 'h': + default: + usage(); + } + } + argc -= optind; + argv += optind; + + pjdlog_init(PJDLOG_MODE_STD); + pjdlog_debug_set(debuglevel); + + g_gate_load(); + + /* + * When path to the configuration file is relative, obtain full path, + * so we can always find the file, even after daemonizing and changing + * working directory to /. + */ + if (cfgpath[0] != '/') { + const char *newcfgpath; + + newcfgpath = realpath(cfgpath, NULL); + if (newcfgpath == NULL) { + pjdlog_exit(EX_CONFIG, + "Unable to obtain full path of %s", cfgpath); + } + cfgpath = newcfgpath; + } + + cfg = yy_config_parse(cfgpath, true); + PJDLOG_ASSERT(cfg != NULL); + + if (pidfile != NULL) { + if (strlcpy(cfg->hc_pidfile, pidfile, + sizeof(cfg->hc_pidfile)) >= sizeof(cfg->hc_pidfile)) { + pjdlog_exitx(EX_CONFIG, "Pidfile path is too long."); + } + } + + if (pidfile != NULL || !foreground) { + pfh = pidfile_open(cfg->hc_pidfile, 0600, &otherpid); + if (pfh == NULL) { + if (errno == EEXIST) { + pjdlog_exitx(EX_TEMPFAIL, + "Another hastd is already running, pidfile: %s, pid: %jd.", + cfg->hc_pidfile, (intmax_t)otherpid); + } + /* + * If we cannot create pidfile for other reasons, + * only warn. + */ + pjdlog_errno(LOG_WARNING, + "Unable to open or create pidfile %s", + cfg->hc_pidfile); + } + } + + /* + * Restore default actions for interesting signals in case parent + * process (like init(8)) decided to ignore some of them (like SIGHUP). 
+ */ + PJDLOG_VERIFY(signal(SIGHUP, SIG_DFL) != SIG_ERR); + PJDLOG_VERIFY(signal(SIGINT, SIG_DFL) != SIG_ERR); + PJDLOG_VERIFY(signal(SIGTERM, SIG_DFL) != SIG_ERR); + /* + * Because SIGCHLD is ignored by default, setup dummy handler for it, + * so we can mask it. + */ + PJDLOG_VERIFY(signal(SIGCHLD, dummy_sighandler) != SIG_ERR); + + PJDLOG_VERIFY(sigemptyset(&mask) == 0); + PJDLOG_VERIFY(sigaddset(&mask, SIGHUP) == 0); + PJDLOG_VERIFY(sigaddset(&mask, SIGINT) == 0); + PJDLOG_VERIFY(sigaddset(&mask, SIGTERM) == 0); + PJDLOG_VERIFY(sigaddset(&mask, SIGCHLD) == 0); + PJDLOG_VERIFY(sigprocmask(SIG_SETMASK, &mask, NULL) == 0); + + /* Listen on control address. */ + if (proto_server(cfg->hc_controladdr, &cfg->hc_controlconn) == -1) { + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, "Unable to listen on control address %s", + cfg->hc_controladdr); + } + /* Listen for remote connections. */ + TAILQ_FOREACH(lst, &cfg->hc_listen, hl_next) { + if (proto_server(lst->hl_addr, &lst->hl_conn) == -1) { + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, "Unable to listen on address %s", + lst->hl_addr); + } + } + + if (!foreground) { + if (daemon(0, 0) == -1) { + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, "Unable to daemonize"); + } + + /* Start logging to syslog. */ + pjdlog_mode_set(PJDLOG_MODE_SYSLOG); + } + if (pidfile != NULL || !foreground) { + /* Write PID to a file. 
*/ + if (pidfile_write(pfh) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to write PID to a file %s", + cfg->hc_pidfile); + } else { + pjdlog_debug(1, "PID stored in %s.", cfg->hc_pidfile); + } + } + + pjdlog_info("Started successfully, running protocol version %d.", + HAST_PROTO_VERSION); + + pjdlog_debug(1, "Listening on control address %s.", + cfg->hc_controladdr); + TAILQ_FOREACH(lst, &cfg->hc_listen, hl_next) + pjdlog_info("Listening on address %s.", lst->hl_addr); + + hook_init(); + + main_loop(); + + exit(0); +} diff --git a/sbin/hastd/hastd.h b/sbin/hastd/hastd.h new file mode 100644 index 0000000..d23e855 --- /dev/null +++ b/sbin/hastd/hastd.h @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HASTD_H_ +#define _HASTD_H_ + +#include <sys/param.h> +#include <libutil.h> + +#include <nv.h> + +#include "hast.h" + +extern const char *cfgpath; +extern bool sigexit_received; +extern struct pidfh *pfh; + +void descriptors_cleanup(struct hast_resource *res); +void descriptors_assert(const struct hast_resource *res, int pjdlogmode); + +void hastd_primary(struct hast_resource *res); +void hastd_secondary(struct hast_resource *res, struct nv *nvin); + +void primary_config_reload(struct hast_resource *res, struct nv *nv); + +#endif /* !_HASTD_H_ */ diff --git a/sbin/hastd/hooks.c b/sbin/hastd/hooks.c new file mode 100644 index 0000000..b1886ca --- /dev/null +++ b/sbin/hastd/hooks.c @@ -0,0 +1,391 @@ +/*- + * Copyright (c) 2010 The FreeBSD Foundation + * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/sysctl.h> +#include <sys/wait.h> + +#include <errno.h> +#include <fcntl.h> +#include <libgen.h> +#include <paths.h> +#include <signal.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> +#include <unistd.h> + +#include <pjdlog.h> + +#include "hooks.h" +#include "subr.h" +#include "synch.h" + +/* Report processes that are running for too long no more often than this (in seconds). */ +#define REPORT_INTERVAL 60 + +/* Are we initialized? */ +static bool hooks_initialized = false; + +/* + * Keep all processes we forked on a global queue, so we can report nicely + * when they finish or report that they are running for a long time. + */ +#define HOOKPROC_MAGIC_ALLOCATED 0x80090ca +#define HOOKPROC_MAGIC_ONLIST 0x80090c0 +struct hookproc { + /* Magic. */ + int hp_magic; + /* PID of a forked child. 
*/ + pid_t hp_pid; + /* When was the process forked? */ + time_t hp_birthtime; + /* When did we last log a report about it? */ + time_t hp_lastreport; + /* Path to executable and all the arguments we passed. */ + char hp_comm[PATH_MAX]; + TAILQ_ENTRY(hookproc) hp_next; +}; +static TAILQ_HEAD(, hookproc) hookprocs; +static pthread_mutex_t hookprocs_lock; + +static void hook_remove(struct hookproc *hp); +static void hook_free(struct hookproc *hp); + +static void +descriptors(void) +{ + int fd; + + /* + * Close all (or almost all) descriptors. + */ + if (pjdlog_mode_get() == PJDLOG_MODE_STD) { + closefrom(MAX(MAX(STDIN_FILENO, STDOUT_FILENO), + STDERR_FILENO) + 1); + return; + } + + closefrom(0); + + /* + * Redirect stdin, stdout and stderr to /dev/null. + */ + fd = open(_PATH_DEVNULL, O_RDONLY); + if (fd == -1) { + pjdlog_errno(LOG_WARNING, "Unable to open %s for reading", + _PATH_DEVNULL); + } else if (fd != STDIN_FILENO) { + if (dup2(fd, STDIN_FILENO) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to duplicate descriptor for stdin"); + } + close(fd); + } + fd = open(_PATH_DEVNULL, O_WRONLY); + if (fd == -1) { + pjdlog_errno(LOG_WARNING, "Unable to open %s for writing", + _PATH_DEVNULL); + } else { + if (fd != STDOUT_FILENO && dup2(fd, STDOUT_FILENO) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to duplicate descriptor for stdout"); + } + if (fd != STDERR_FILENO && dup2(fd, STDERR_FILENO) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to duplicate descriptor for stderr"); + } + if (fd != STDOUT_FILENO && fd != STDERR_FILENO) + close(fd); + } +} + +void +hook_init(void) +{ + + PJDLOG_ASSERT(!hooks_initialized); + + mtx_init(&hookprocs_lock); + TAILQ_INIT(&hookprocs); + hooks_initialized = true; +} + +void +hook_fini(void) +{ + struct hookproc *hp; + + PJDLOG_ASSERT(hooks_initialized); + + mtx_lock(&hookprocs_lock); + while ((hp = TAILQ_FIRST(&hookprocs)) != NULL) { + PJDLOG_ASSERT(hp->hp_magic == HOOKPROC_MAGIC_ONLIST); + PJDLOG_ASSERT(hp->hp_pid > 0); + + hook_remove(hp); + 
hook_free(hp); + } + mtx_unlock(&hookprocs_lock); + + mtx_destroy(&hookprocs_lock); + TAILQ_INIT(&hookprocs); + hooks_initialized = false; +} + +static struct hookproc * +hook_alloc(const char *path, char **args) +{ + struct hookproc *hp; + unsigned int ii; + + hp = malloc(sizeof(*hp)); + if (hp == NULL) { + pjdlog_error("Unable to allocate %zu bytes of memory for a hook.", + sizeof(*hp)); + return (NULL); + } + + hp->hp_pid = 0; + hp->hp_birthtime = hp->hp_lastreport = time(NULL); + (void)strlcpy(hp->hp_comm, path, sizeof(hp->hp_comm)); + /* We start at 2nd argument as we don't want to have exec name twice. */ + for (ii = 1; args[ii] != NULL; ii++) { + (void)snprlcat(hp->hp_comm, sizeof(hp->hp_comm), " %s", + args[ii]); + } + if (strlen(hp->hp_comm) >= sizeof(hp->hp_comm) - 1) { + pjdlog_error("Exec path too long, correct configuration file."); + free(hp); + return (NULL); + } + hp->hp_magic = HOOKPROC_MAGIC_ALLOCATED; + return (hp); +} + +static void +hook_add(struct hookproc *hp, pid_t pid) +{ + + PJDLOG_ASSERT(hp->hp_magic == HOOKPROC_MAGIC_ALLOCATED); + PJDLOG_ASSERT(hp->hp_pid == 0); + + hp->hp_pid = pid; + mtx_lock(&hookprocs_lock); + hp->hp_magic = HOOKPROC_MAGIC_ONLIST; + TAILQ_INSERT_TAIL(&hookprocs, hp, hp_next); + mtx_unlock(&hookprocs_lock); +} + +static void +hook_remove(struct hookproc *hp) +{ + + PJDLOG_ASSERT(hp->hp_magic == HOOKPROC_MAGIC_ONLIST); + PJDLOG_ASSERT(hp->hp_pid > 0); + PJDLOG_ASSERT(mtx_owned(&hookprocs_lock)); + + TAILQ_REMOVE(&hookprocs, hp, hp_next); + hp->hp_magic = HOOKPROC_MAGIC_ALLOCATED; +} + +static void +hook_free(struct hookproc *hp) +{ + + PJDLOG_ASSERT(hp->hp_magic == HOOKPROC_MAGIC_ALLOCATED); + PJDLOG_ASSERT(hp->hp_pid > 0); + + hp->hp_magic = 0; + free(hp); +} + +static struct hookproc * +hook_find(pid_t pid) +{ + struct hookproc *hp; + + PJDLOG_ASSERT(mtx_owned(&hookprocs_lock)); + + TAILQ_FOREACH(hp, &hookprocs, hp_next) { + PJDLOG_ASSERT(hp->hp_magic == HOOKPROC_MAGIC_ONLIST); + PJDLOG_ASSERT(hp->hp_pid > 0); + + if 
(hp->hp_pid == pid) + break; + } + + return (hp); +} + +void +hook_check_one(pid_t pid, int status) +{ + struct hookproc *hp; + + mtx_lock(&hookprocs_lock); + hp = hook_find(pid); + if (hp == NULL) { + mtx_unlock(&hookprocs_lock); + pjdlog_debug(1, "Unknown process pid=%u", pid); + return; + } + hook_remove(hp); + mtx_unlock(&hookprocs_lock); + if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { + pjdlog_debug(1, "Hook exited gracefully (pid=%u, cmd=[%s]).", + pid, hp->hp_comm); + } else if (WIFSIGNALED(status)) { + pjdlog_error("Hook was killed (pid=%u, signal=%d, cmd=[%s]).", + pid, WTERMSIG(status), hp->hp_comm); + } else { + pjdlog_error("Hook exited ungracefully (pid=%u, exitcode=%d, cmd=[%s]).", + pid, WIFEXITED(status) ? WEXITSTATUS(status) : -1, + hp->hp_comm); + } + hook_free(hp); +} + +void +hook_check(void) +{ + struct hookproc *hp, *hp2; + time_t now; + + PJDLOG_ASSERT(hooks_initialized); + + pjdlog_debug(2, "Checking hooks."); + + /* + * Report about processes that are running for a long time. + */ + now = time(NULL); + mtx_lock(&hookprocs_lock); + TAILQ_FOREACH_SAFE(hp, &hookprocs, hp_next, hp2) { + PJDLOG_ASSERT(hp->hp_magic == HOOKPROC_MAGIC_ONLIST); + PJDLOG_ASSERT(hp->hp_pid > 0); + + /* + * If the process doesn't exist we somehow missed it. + * Not much can be done except for logging this situation. + */ + if (kill(hp->hp_pid, 0) == -1 && errno == ESRCH) { + pjdlog_warning("Hook disappeared (pid=%u, cmd=[%s]).", + hp->hp_pid, hp->hp_comm); + hook_remove(hp); + hook_free(hp); + continue; + } + + /* + * Skip processes started or last reported less than + * REPORT_INTERVAL seconds ago. + */ + if (now - hp->hp_lastreport < REPORT_INTERVAL) + continue; + + /* + * Hook is running for too long, report it. + */ + pjdlog_warning("Hook is running for %ju seconds (pid=%u, cmd=[%s]).", + (uintmax_t)(now - hp->hp_birthtime), hp->hp_pid, + hp->hp_comm); + hp->hp_lastreport = now; + } + mtx_unlock(&hookprocs_lock); +} + +void +hook_exec(const char *path, ...) 
+{ + va_list ap; + + va_start(ap, path); + hook_execv(path, ap); + va_end(ap); +} + +void +hook_execv(const char *path, va_list ap) +{ + struct hookproc *hp; + char *args[64]; + unsigned int ii; + sigset_t mask; + pid_t pid; + + PJDLOG_ASSERT(hooks_initialized); + + if (path == NULL || path[0] == '\0') + return; + + memset(args, 0, sizeof(args)); + args[0] = basename(path); + for (ii = 1; ii < sizeof(args) / sizeof(args[0]); ii++) { + args[ii] = va_arg(ap, char *); + if (args[ii] == NULL) + break; + } + PJDLOG_ASSERT(ii < sizeof(args) / sizeof(args[0])); + + hp = hook_alloc(path, args); + if (hp == NULL) + return; + + pjdlog_debug(1, "Executing hook: %s", hp->hp_comm); + + pid = fork(); + switch (pid) { + case -1: /* Error. */ + pjdlog_errno(LOG_ERR, "Unable to fork to execute %s", path); + hook_free(hp); + return; + case 0: /* Child. */ + descriptors(); + PJDLOG_VERIFY(sigemptyset(&mask) == 0); + PJDLOG_VERIFY(sigprocmask(SIG_SETMASK, &mask, NULL) == 0); + /* + * Dummy handler set for SIGCHLD in the parent will be restored + * to SIG_IGN on execv(3) below, so there is no need to do + * anything with it. + */ + execv(path, args); + pjdlog_errno(LOG_ERR, "Unable to execute %s", path); + exit(EX_SOFTWARE); + default: /* Parent. */ + hook_add(hp, pid); + break; + } +} diff --git a/sbin/hastd/hooks.h b/sbin/hastd/hooks.h new file mode 100644 index 0000000..4ce435e --- /dev/null +++ b/sbin/hastd/hooks.h @@ -0,0 +1,48 @@ +/*- + * Copyright (c) 2010 The FreeBSD Foundation + * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HOOKS_H_ +#define _HOOKS_H_ + +#include <sys/types.h> + +#include <stdarg.h> +#include <stdbool.h> + +void hook_init(void); +void hook_fini(void); +void hook_check_one(pid_t pid, int status); +void hook_check(void); +void hook_exec(const char *path, ...); +void hook_execv(const char *path, va_list ap); + +#endif /* !_HOOKS_H_ */ diff --git a/sbin/hastd/lzf.c b/sbin/hastd/lzf.c new file mode 100644 index 0000000..cca6a17 --- /dev/null +++ b/sbin/hastd/lzf.c @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2000-2008 Marc Alexander Lehmann <schmorp@schmorp.de> + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#include "lzf.h" + +#define HSIZE (1 << (HLOG)) + +/* + * don't play with this unless you benchmark! 
+ * decompression is not dependent on the hash function + * the hashing function might seem strange, just believe me + * it works ;) + */ +#ifndef FRST +# define FRST(p) (((p[0]) << 8) | p[1]) +# define NEXT(v,p) (((v) << 8) | p[2]) +# if ULTRA_FAST +# define IDX(h) ((( h >> (3*8 - HLOG)) - h ) & (HSIZE - 1)) +# elif VERY_FAST +# define IDX(h) ((( h >> (3*8 - HLOG)) - h*5) & (HSIZE - 1)) +# else +# define IDX(h) ((((h ^ (h << 5)) >> (3*8 - HLOG)) - h*5) & (HSIZE - 1)) +# endif +#endif +/* + * IDX works because it is very similar to a multiplicative hash, e.g. + * ((h * 57321 >> (3*8 - HLOG)) & (HSIZE - 1)) + * the latter is also quite fast on newer CPUs, and compresses similarly. + * + * the next one is also quite good, albeit slow ;) + * (int)(cos(h & 0xffffff) * 1e6) + */ + +#if 0 +/* original lzv-like hash function, much worse and thus slower */ +# define FRST(p) (p[0] << 5) ^ p[1] +# define NEXT(v,p) ((v) << 5) ^ p[2] +# define IDX(h) ((h) & (HSIZE - 1)) +#endif + +#define MAX_LIT (1 << 5) +#define MAX_OFF (1 << 13) +#define MAX_REF ((1 << 8) + (1 << 3)) + +#if __GNUC__ >= 3 +# define expect(expr,value) __builtin_expect ((expr),(value)) +# define inline inline +#else +# define expect(expr,value) (expr) +# define inline static +#endif + +#define expect_false(expr) expect ((expr) != 0, 0) +#define expect_true(expr) expect ((expr) != 0, 1) + +/* + * compressed format + * + * 000LLLLL <L+1> ; literal + * LLLooooo oooooooo ; backref L + * 111ooooo LLLLLLLL oooooooo ; backref L+7 + * + */ + +unsigned int +lzf_compress (const void *const in_data, unsigned int in_len, + void *out_data, unsigned int out_len +#if LZF_STATE_ARG + , LZF_STATE htab +#endif + ) +{ +#if !LZF_STATE_ARG + LZF_STATE htab; +#endif + const u8 **hslot; + const u8 *ip = (const u8 *)in_data; + u8 *op = (u8 *)out_data; + const u8 *in_end = ip + in_len; + u8 *out_end = op + out_len; + const u8 *ref; + + /* off requires a type wide enough to hold a general pointer difference. 
+ * ISO C doesn't have that (size_t might not be enough and ptrdiff_t only + * works for differences within a single object). We also assume that no + * no bit pattern traps. Since the only platform that is both non-POSIX + * and fails to support both assumptions is windows 64 bit, we make a + * special workaround for it. + */ +#if defined (WIN32) && defined (_M_X64) + unsigned _int64 off; /* workaround for missing POSIX compliance */ +#else + unsigned long off; +#endif + unsigned int hval; + int lit; + + if (!in_len || !out_len) + return 0; + +#if INIT_HTAB + memset (htab, 0, sizeof (htab)); +# if 0 + for (hslot = htab; hslot < htab + HSIZE; hslot++) + *hslot++ = ip; +# endif +#endif + + lit = 0; op++; /* start run */ + + hval = FRST (ip); + while (ip < in_end - 2) + { + hval = NEXT (hval, ip); + hslot = htab + IDX (hval); + ref = *hslot; *hslot = ip; + + if (1 +#if INIT_HTAB + && ref < ip /* the next test will actually take care of this, but this is faster */ +#endif + && (off = ip - ref - 1) < MAX_OFF + && ip + 4 < in_end + && ref > (const u8 *)in_data +#if STRICT_ALIGN + && ref[0] == ip[0] + && ref[1] == ip[1] + && ref[2] == ip[2] +#else + && *(const u16 *)ref == *(const u16 *)ip + && ref[2] == ip[2] +#endif + ) + { + /* match found at *ref++ */ + unsigned int len = 2; + unsigned int maxlen = in_end - ip - len; + maxlen = maxlen > MAX_REF ? 
MAX_REF : maxlen; + + if (expect_false (op + 3 + 1 >= out_end)) /* first a faster conservative test */ + if (op - !lit + 3 + 1 >= out_end) /* second the exact but rare test */ + return 0; + + op [- lit - 1] = lit - 1; /* stop run */ + op -= !lit; /* undo run if length is zero */ + + for (;;) + { + if (expect_true (maxlen > 16)) + { + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + } + + do + len++; + while (len < maxlen && ref[len] == ip[len]); + + break; + } + + len -= 2; /* len is now #octets - 1 */ + ip++; + + if (len < 7) + { + *op++ = (off >> 8) + (len << 5); + } + else + { + *op++ = (off >> 8) + ( 7 << 5); + *op++ = len - 7; + } + + *op++ = off; + lit = 0; op++; /* start run */ + + ip += len + 1; + + if (expect_false (ip >= in_end - 2)) + break; + +#if ULTRA_FAST || VERY_FAST + --ip; +# if VERY_FAST && !ULTRA_FAST + --ip; +# endif + hval = FRST (ip); + + hval = NEXT (hval, ip); + htab[IDX (hval)] = ip; + ip++; + +# if VERY_FAST && !ULTRA_FAST + hval = NEXT (hval, ip); + htab[IDX (hval)] = ip; + ip++; +# endif +#else + ip -= len + 1; + + do + { + hval = NEXT (hval, ip); + htab[IDX (hval)] = ip; + ip++; + } + while (len--); +#endif + } + else + { + /* one more literal byte we must copy */ + if (expect_false (op >= out_end)) + return 0; + + lit++; *op++ = *ip++; + + if (expect_false (lit == MAX_LIT)) + { + op [- lit - 1] = lit - 
1; /* stop run */ + lit = 0; op++; /* start run */ + } + } + } + + if (op + 3 > out_end) /* at most 3 bytes can be missing here */ + return 0; + + while (ip < in_end) + { + lit++; *op++ = *ip++; + + if (expect_false (lit == MAX_LIT)) + { + op [- lit - 1] = lit - 1; /* stop run */ + lit = 0; op++; /* start run */ + } + } + + op [- lit - 1] = lit - 1; /* end run */ + op -= !lit; /* undo run if length is zero */ + + return op - (u8 *)out_data; +} + +#if AVOID_ERRNO +# define SET_ERRNO(n) +#else +# include <errno.h> +# define SET_ERRNO(n) errno = (n) +#endif + +#if (__i386 || __amd64) && __GNUC__ >= 3 +# define lzf_movsb(dst, src, len) \ + asm ("rep movsb" \ + : "=D" (dst), "=S" (src), "=c" (len) \ + : "0" (dst), "1" (src), "2" (len)); +#endif + +unsigned int +lzf_decompress (const void *const in_data, unsigned int in_len, + void *out_data, unsigned int out_len) +{ + u8 const *ip = (const u8 *)in_data; + u8 *op = (u8 *)out_data; + u8 const *const in_end = ip + in_len; + u8 *const out_end = op + out_len; + + do + { + unsigned int ctrl = *ip++; + + if (ctrl < (1 << 5)) /* literal run */ + { + ctrl++; + + if (op + ctrl > out_end) + { + SET_ERRNO (E2BIG); + return 0; + } + +#if CHECK_INPUT + if (ip + ctrl > in_end) + { + SET_ERRNO (EINVAL); + return 0; + } +#endif + +#ifdef lzf_movsb + lzf_movsb (op, ip, ctrl); +#else + do + *op++ = *ip++; + while (--ctrl); +#endif + } + else /* back reference */ + { + unsigned int len = ctrl >> 5; + + u8 *ref = op - ((ctrl & 0x1f) << 8) - 1; + +#if CHECK_INPUT + if (ip >= in_end) + { + SET_ERRNO (EINVAL); + return 0; + } +#endif + if (len == 7) + { + len += *ip++; +#if CHECK_INPUT + if (ip >= in_end) + { + SET_ERRNO (EINVAL); + return 0; + } +#endif + } + + ref -= *ip++; + + if (op + len + 2 > out_end) + { + SET_ERRNO (E2BIG); + return 0; + } + + if (ref < (u8 *)out_data) + { + SET_ERRNO (EINVAL); + return 0; + } + +#ifdef lzf_movsb + len += 2; + lzf_movsb (op, ref, len); +#else + *op++ = *ref++; + *op++ = *ref++; + + do + *op++ = *ref++; 
+ while (--len); +#endif + } + } + while (ip < in_end); + + return op - (u8 *)out_data; +} + diff --git a/sbin/hastd/lzf.h b/sbin/hastd/lzf.h new file mode 100644 index 0000000..d9563ef --- /dev/null +++ b/sbin/hastd/lzf.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2000-2008 Marc Alexander Lehmann <schmorp@schmorp.de> + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. 
If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#ifndef LZF_H +#define LZF_H + +/*********************************************************************** +** +** lzf -- an extremely fast/free compression/decompression-method +** http://liblzf.plan9.de/ +** +** This algorithm is believed to be patent-free. +** +***********************************************************************/ + +#define LZF_VERSION 0x0105 /* 1.5, API version */ + +/* + * Compress in_len bytes stored at the memory block starting at + * in_data and write the result to out_data, up to a maximum length + * of out_len bytes. + * + * If the output buffer is not large enough or any error occurs return 0, + * otherwise return the number of bytes used, which might be considerably + * more than in_len (but less than 104% of the original size), so it + * makes sense to always use out_len == in_len - 1), to ensure _some_ + * compression, and store the data uncompressed otherwise (with a flag, of + * course. + * + * lzf_compress might use different algorithms on different systems and + * even different runs, thus might result in different compressed strings + * depending on the phase of the moon or similar factors. However, all + * these strings are architecture-independent and will result in the + * original data when decompressed using lzf_decompress. + * + * The buffers must not be overlapping. + * + * If the option LZF_STATE_ARG is enabled, an extra argument must be + * supplied which is not reflected in this header file. Refer to lzfP.h + * and lzf_c.c. 
+ * + */ +unsigned int +lzf_compress (const void *const in_data, unsigned int in_len, + void *out_data, unsigned int out_len); + +/* + * Decompress data compressed with some version of the lzf_compress + * function and stored at location in_data and length in_len. The result + * will be stored at out_data up to a maximum of out_len characters. + * + * If the output buffer is not large enough to hold the decompressed + * data, a 0 is returned and errno is set to E2BIG. Otherwise the number + * of decompressed bytes (i.e. the original length of the data) is + * returned. + * + * If an error in the compressed data is detected, a zero is returned and + * errno is set to EINVAL. + * + * This function is very fast, about as fast as a copying loop. + */ +unsigned int +lzf_decompress (const void *const in_data, unsigned int in_len, + void *out_data, unsigned int out_len); + +/* + * Size of hashtable is (1 << HLOG) * sizeof (char *) + * decompression is independent of the hash table size + * the difference between 15 and 14 is very small + * for small blocks (and 14 is usually a bit faster). + * For a low-memory/faster configuration, use HLOG == 13; + * For best compression, use 15 or 16 (or more, up to 23). + */ +#ifndef HLOG +# define HLOG 16 +#endif + +/* + * Sacrifice very little compression quality in favour of compression speed. + * This gives almost the same compression as the default code, and is + * (very roughly) 15% faster. This is the preferred mode of operation. + */ +#ifndef VERY_FAST +# define VERY_FAST 1 +#endif + +/* + * Sacrifice some more compression quality in favour of compression speed. + * (roughly 1-2% worse compression for large blocks and + * 9-10% for small, redundant, blocks and >>20% better speed in both cases) + * In short: when in need for speed, enable this for binary data, + * possibly disable this for text data. 
+ */ +#ifndef ULTRA_FAST +# define ULTRA_FAST 0 +#endif + +/* + * Unconditionally aligning does not cost very much, so do it if unsure + */ +#ifndef STRICT_ALIGN +# define STRICT_ALIGN !(defined(__i386) || defined (__amd64)) +#endif + +/* + * You may choose to pre-set the hash table (might be faster on some + * modern cpus and large (>>64k) blocks, and also makes compression + * deterministic/repeatable when the configuration otherwise is the same). + */ +#ifndef INIT_HTAB +# define INIT_HTAB 1 +#endif + +/* + * Avoid assigning values to errno variable? for some embedding purposes + * (linux kernel for example), this is necessary. NOTE: this breaks + * the documentation in lzf.h. + */ +#ifndef AVOID_ERRNO +# define AVOID_ERRNO 0 +#endif + +/* + * Wether to pass the LZF_STATE variable as argument, or allocate it + * on the stack. For small-stack environments, define this to 1. + * NOTE: this breaks the prototype in lzf.h. + */ +#ifndef LZF_STATE_ARG +# define LZF_STATE_ARG 0 +#endif + +/* + * Wether to add extra checks for input validity in lzf_decompress + * and return EINVAL if the input stream has been corrupted. This + * only shields against overflowing the input buffer and will not + * detect most corrupted streams. + * This check is not normally noticeable on modern hardware + * (<1% slowdown), but might slow down older cpus considerably. + */ +#ifndef CHECK_INPUT +# define CHECK_INPUT 1 +#endif + +/*****************************************************************************/ +/* nothing should be changed below */ + +typedef unsigned char u8; + +typedef const u8 *LZF_STATE[1 << (HLOG)]; + +#if !STRICT_ALIGN +/* for unaligned accesses we need a 16 bit datatype. 
*/ +# include <limits.h> +# if USHRT_MAX == 65535 + typedef unsigned short u16; +# elif UINT_MAX == 65535 + typedef unsigned int u16; +# else +# undef STRICT_ALIGN +# define STRICT_ALIGN 1 +# endif +#endif + +#if ULTRA_FAST +# if defined(VERY_FAST) +# undef VERY_FAST +# endif +#endif + +#if INIT_HTAB +# ifdef __cplusplus +# include <cstring> +# else +# include <string.h> +# endif +#endif + +#endif diff --git a/sbin/hastd/metadata.c b/sbin/hastd/metadata.c new file mode 100644 index 0000000..6d9f366 --- /dev/null +++ b/sbin/hastd/metadata.c @@ -0,0 +1,225 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <errno.h> +#include <fcntl.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> + +#include <ebuf.h> +#include <nv.h> +#include <pjdlog.h> +#include <subr.h> + +#include "metadata.h" + +int +metadata_read(struct hast_resource *res, bool openrw) +{ + unsigned char *buf; + struct ebuf *eb; + struct nv *nv; + ssize_t done; + const char *str; + int rerrno; + bool opened_here; + + opened_here = false; + rerrno = 0; + + /* + * Is this first metadata_read() call for this resource? 
+ */ + if (res->hr_localfd == -1) { + if (provinfo(res, openrw) == -1) { + rerrno = errno; + goto fail; + } + opened_here = true; + pjdlog_debug(1, "Obtained info about %s.", res->hr_localpath); + if (openrw) { + if (flock(res->hr_localfd, LOCK_EX | LOCK_NB) == -1) { + rerrno = errno; + if (errno == EOPNOTSUPP) { + pjdlog_warning("Unable to lock %s (operation not supported), but continuing.", + res->hr_localpath); + } else { + pjdlog_errno(LOG_ERR, + "Unable to lock %s", + res->hr_localpath); + goto fail; + } + } + pjdlog_debug(1, "Locked %s.", res->hr_localpath); + } + } + + eb = ebuf_alloc(METADATA_SIZE); + if (eb == NULL) { + rerrno = errno; + pjdlog_errno(LOG_ERR, + "Unable to allocate memory to read metadata"); + goto fail; + } + if (ebuf_add_tail(eb, NULL, METADATA_SIZE) == -1) { + rerrno = errno; + pjdlog_errno(LOG_ERR, + "Unable to allocate memory to read metadata"); + ebuf_free(eb); + goto fail; + } + buf = ebuf_data(eb, NULL); + PJDLOG_ASSERT(buf != NULL); + done = pread(res->hr_localfd, buf, METADATA_SIZE, 0); + if (done == -1 || done != METADATA_SIZE) { + rerrno = errno; + pjdlog_errno(LOG_ERR, "Unable to read metadata"); + ebuf_free(eb); + goto fail; + } + nv = nv_ntoh(eb); + if (nv == NULL) { + rerrno = errno; + pjdlog_errno(LOG_ERR, "Metadata read from %s is invalid", + res->hr_localpath); + ebuf_free(eb); + goto fail; + } + + str = nv_get_string(nv, "resource"); + if (str != NULL && strcmp(str, res->hr_name) != 0) { + pjdlog_error("Provider %s is not part of resource %s.", + res->hr_localpath, res->hr_name); + nv_free(nv); + goto fail; + } + + res->hr_datasize = nv_get_uint64(nv, "datasize"); + res->hr_extentsize = (int)nv_get_uint32(nv, "extentsize"); + res->hr_keepdirty = (int)nv_get_uint32(nv, "keepdirty"); + res->hr_localoff = nv_get_uint64(nv, "offset"); + res->hr_resuid = nv_get_uint64(nv, "resuid"); + if (res->hr_role != HAST_ROLE_PRIMARY) { + /* Secondary or init role. 
*/ + res->hr_secondary_localcnt = nv_get_uint64(nv, "localcnt"); + res->hr_secondary_remotecnt = nv_get_uint64(nv, "remotecnt"); + } + if (res->hr_role != HAST_ROLE_SECONDARY) { + /* Primary or init role. */ + res->hr_primary_localcnt = nv_get_uint64(nv, "localcnt"); + res->hr_primary_remotecnt = nv_get_uint64(nv, "remotecnt"); + } + str = nv_get_string(nv, "prevrole"); + if (str != NULL) { + if (strcmp(str, "primary") == 0) + res->hr_previous_role = HAST_ROLE_PRIMARY; + else if (strcmp(str, "secondary") == 0) + res->hr_previous_role = HAST_ROLE_SECONDARY; + } + + if (nv_error(nv) != 0) { + errno = rerrno = nv_error(nv); + pjdlog_errno(LOG_ERR, "Unable to read metadata from %s", + res->hr_localpath); + nv_free(nv); + goto fail; + } + nv_free(nv); + return (0); +fail: + if (opened_here) { + close(res->hr_localfd); + res->hr_localfd = -1; + } + errno = rerrno; + return (-1); +} + +int +metadata_write(struct hast_resource *res) +{ + struct ebuf *eb; + struct nv *nv; + unsigned char *buf, *ptr; + size_t size; + ssize_t done; + int ret; + + buf = calloc(1, METADATA_SIZE); + if (buf == NULL) { + pjdlog_error("Unable to allocate %zu bytes for metadata.", + (size_t)METADATA_SIZE); + return (-1); + } + + ret = -1; + + nv = nv_alloc(); + nv_add_string(nv, res->hr_name, "resource"); + nv_add_uint64(nv, (uint64_t)res->hr_datasize, "datasize"); + nv_add_uint32(nv, (uint32_t)res->hr_extentsize, "extentsize"); + nv_add_uint32(nv, (uint32_t)res->hr_keepdirty, "keepdirty"); + nv_add_uint64(nv, (uint64_t)res->hr_localoff, "offset"); + nv_add_uint64(nv, res->hr_resuid, "resuid"); + if (res->hr_role == HAST_ROLE_PRIMARY || + res->hr_role == HAST_ROLE_INIT) { + nv_add_uint64(nv, res->hr_primary_localcnt, "localcnt"); + nv_add_uint64(nv, res->hr_primary_remotecnt, "remotecnt"); + } else /* if (res->hr_role == HAST_ROLE_SECONDARY) */ { + PJDLOG_ASSERT(res->hr_role == HAST_ROLE_SECONDARY); + nv_add_uint64(nv, res->hr_secondary_localcnt, "localcnt"); + nv_add_uint64(nv, 
res->hr_secondary_remotecnt, "remotecnt"); + } + nv_add_string(nv, role2str(res->hr_role), "prevrole"); + if (nv_error(nv) != 0) { + pjdlog_error("Unable to create metadata."); + goto end; + } + res->hr_previous_role = res->hr_role; + eb = nv_hton(nv); + PJDLOG_ASSERT(eb != NULL); + ptr = ebuf_data(eb, &size); + PJDLOG_ASSERT(ptr != NULL); + PJDLOG_ASSERT(size < METADATA_SIZE); + bcopy(ptr, buf, size); + done = pwrite(res->hr_localfd, buf, METADATA_SIZE, 0); + if (done == -1 || done != METADATA_SIZE) { + pjdlog_errno(LOG_ERR, "Unable to write metadata"); + goto end; + } + ret = 0; +end: + free(buf); + nv_free(nv); + return (ret); +} diff --git a/sbin/hastd/metadata.h b/sbin/hastd/metadata.h new file mode 100644 index 0000000..83d35f4 --- /dev/null +++ b/sbin/hastd/metadata.h @@ -0,0 +1,48 @@ +/*- + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _METADATA_H_ +#define _METADATA_H_ + +#include <stdbool.h> + +#include <hast.h> + +/* + * Maximum size of metadata. + * XXX: We should take sector size into account. + */ +#define METADATA_SIZE 4096 + +int metadata_read(struct hast_resource *res, bool openrw); +int metadata_write(struct hast_resource *res); + +#endif /* !_METADATA_H_ */ diff --git a/sbin/hastd/nv.c b/sbin/hastd/nv.c new file mode 100644 index 0000000..8dcf697 --- /dev/null +++ b/sbin/hastd/nv.c @@ -0,0 +1,966 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/endian.h> + +#include <bitstring.h> +#include <errno.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <ebuf.h> +#include <pjdlog.h> + +#include "nv.h" + +#ifndef PJDLOG_ASSERT +#include <assert.h> +#define PJDLOG_ASSERT(...) assert(__VA_ARGS__) +#endif +#ifndef PJDLOG_ABORT +#define PJDLOG_ABORT(...) 
abort() +#endif + +#define NV_TYPE_NONE 0 + +#define NV_TYPE_INT8 1 +#define NV_TYPE_UINT8 2 +#define NV_TYPE_INT16 3 +#define NV_TYPE_UINT16 4 +#define NV_TYPE_INT32 5 +#define NV_TYPE_UINT32 6 +#define NV_TYPE_INT64 7 +#define NV_TYPE_UINT64 8 +#define NV_TYPE_INT8_ARRAY 9 +#define NV_TYPE_UINT8_ARRAY 10 +#define NV_TYPE_INT16_ARRAY 11 +#define NV_TYPE_UINT16_ARRAY 12 +#define NV_TYPE_INT32_ARRAY 13 +#define NV_TYPE_UINT32_ARRAY 14 +#define NV_TYPE_INT64_ARRAY 15 +#define NV_TYPE_UINT64_ARRAY 16 +#define NV_TYPE_STRING 17 + +#define NV_TYPE_MASK 0x7f +#define NV_TYPE_FIRST NV_TYPE_INT8 +#define NV_TYPE_LAST NV_TYPE_STRING + +#define NV_ORDER_NETWORK 0x00 +#define NV_ORDER_HOST 0x80 + +#define NV_ORDER_MASK 0x80 + +#define NV_MAGIC 0xaea1e +struct nv { + int nv_magic; + int nv_error; + struct ebuf *nv_ebuf; +}; + +struct nvhdr { + uint8_t nvh_type; + uint8_t nvh_namesize; + uint32_t nvh_dsize; + char nvh_name[0]; +} __packed; +#define NVH_DATA(nvh) ((unsigned char *)nvh + NVH_HSIZE(nvh)) +#define NVH_HSIZE(nvh) \ + (sizeof(struct nvhdr) + roundup2((nvh)->nvh_namesize, 8)) +#define NVH_DSIZE(nvh) \ + (((nvh)->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST ? \ + (nvh)->nvh_dsize : \ + le32toh((nvh)->nvh_dsize)) +#define NVH_SIZE(nvh) (NVH_HSIZE(nvh) + roundup2(NVH_DSIZE(nvh), 8)) + +#define NV_CHECK(nv) do { \ + PJDLOG_ASSERT((nv) != NULL); \ + PJDLOG_ASSERT((nv)->nv_magic == NV_MAGIC); \ +} while (0) + +static void nv_add(struct nv *nv, const unsigned char *value, size_t vsize, + int type, const char *name); +static void nv_addv(struct nv *nv, const unsigned char *value, size_t vsize, + int type, const char *namefmt, va_list nameap); +static struct nvhdr *nv_find(struct nv *nv, int type, const char *namefmt, + va_list nameap); +static void nv_swap(struct nvhdr *nvh, bool tohost); + +/* + * Allocate and initialize new nv structure. + * Return NULL in case of malloc(3) failure. 
+ */ +struct nv * +nv_alloc(void) +{ + struct nv *nv; + + nv = malloc(sizeof(*nv)); + if (nv == NULL) + return (NULL); + nv->nv_ebuf = ebuf_alloc(0); + if (nv->nv_ebuf == NULL) { + free(nv); + return (NULL); + } + nv->nv_error = 0; + nv->nv_magic = NV_MAGIC; + return (nv); +} + +/* + * Free the given nv structure. + */ +void +nv_free(struct nv *nv) +{ + + if (nv == NULL) + return; + + NV_CHECK(nv); + + nv->nv_magic = 0; + ebuf_free(nv->nv_ebuf); + free(nv); +} + +/* + * Return error for the given nv structure. + */ +int +nv_error(const struct nv *nv) +{ + + if (nv == NULL) + return (ENOMEM); + + NV_CHECK(nv); + + return (nv->nv_error); +} + +/* + * Set error for the given nv structure and return previous error. + */ +int +nv_set_error(struct nv *nv, int error) +{ + int preverr; + + if (nv == NULL) + return (ENOMEM); + + NV_CHECK(nv); + + preverr = nv->nv_error; + nv->nv_error = error; + return (preverr); +} + +/* + * Validate correctness of the entire nv structure and all its elements. + * If extrap is not NULL, store number of extra bytes at the end of the buffer. + */ +int +nv_validate(struct nv *nv, size_t *extrap) +{ + struct nvhdr *nvh; + unsigned char *data, *ptr; + size_t dsize, size, vsize; + int error; + + if (nv == NULL) { + errno = ENOMEM; + return (-1); + } + + NV_CHECK(nv); + PJDLOG_ASSERT(nv->nv_error == 0); + + /* TODO: Check that names are unique? */ + + error = 0; + ptr = ebuf_data(nv->nv_ebuf, &size); + while (size > 0) { + /* + * Zeros at the end of the buffer are acceptable. + */ + if (ptr[0] == '\0') + break; + /* + * Minimum size at this point is size of nvhdr structure, one + * character long name plus terminating '\0'. 
+ */ + if (size < sizeof(*nvh) + 2) { + error = EINVAL; + break; + } + nvh = (struct nvhdr *)ptr; + if (size < NVH_HSIZE(nvh)) { + error = EINVAL; + break; + } + if (nvh->nvh_name[nvh->nvh_namesize - 1] != '\0') { + error = EINVAL; + break; + } + if (strlen(nvh->nvh_name) != + (size_t)(nvh->nvh_namesize - 1)) { + error = EINVAL; + break; + } + if ((nvh->nvh_type & NV_TYPE_MASK) < NV_TYPE_FIRST || + (nvh->nvh_type & NV_TYPE_MASK) > NV_TYPE_LAST) { + error = EINVAL; + break; + } + dsize = NVH_DSIZE(nvh); + if (dsize == 0) { + error = EINVAL; + break; + } + if (size < NVH_SIZE(nvh)) { + error = EINVAL; + break; + } + vsize = 0; + switch (nvh->nvh_type & NV_TYPE_MASK) { + case NV_TYPE_INT8: + case NV_TYPE_UINT8: + if (vsize == 0) + vsize = 1; + /* FALLTHROUGH */ + case NV_TYPE_INT16: + case NV_TYPE_UINT16: + if (vsize == 0) + vsize = 2; + /* FALLTHROUGH */ + case NV_TYPE_INT32: + case NV_TYPE_UINT32: + if (vsize == 0) + vsize = 4; + /* FALLTHROUGH */ + case NV_TYPE_INT64: + case NV_TYPE_UINT64: + if (vsize == 0) + vsize = 8; + if (dsize != vsize) { + error = EINVAL; + break; + } + break; + case NV_TYPE_INT8_ARRAY: + case NV_TYPE_UINT8_ARRAY: + break; + case NV_TYPE_INT16_ARRAY: + case NV_TYPE_UINT16_ARRAY: + if (vsize == 0) + vsize = 2; + /* FALLTHROUGH */ + case NV_TYPE_INT32_ARRAY: + case NV_TYPE_UINT32_ARRAY: + if (vsize == 0) + vsize = 4; + /* FALLTHROUGH */ + case NV_TYPE_INT64_ARRAY: + case NV_TYPE_UINT64_ARRAY: + if (vsize == 0) + vsize = 8; + if ((dsize % vsize) != 0) { + error = EINVAL; + break; + } + break; + case NV_TYPE_STRING: + data = NVH_DATA(nvh); + if (data[dsize - 1] != '\0') { + error = EINVAL; + break; + } + if (strlen((char *)data) != dsize - 1) { + error = EINVAL; + break; + } + break; + default: + PJDLOG_ABORT("invalid condition"); + } + if (error != 0) + break; + ptr += NVH_SIZE(nvh); + size -= NVH_SIZE(nvh); + } + if (error != 0) { + errno = error; + if (nv->nv_error == 0) + nv->nv_error = error; + return (-1); + } + if (extrap != NULL) + 
*extrap = size; + return (0); +} + +/* + * Convert the given nv structure to network byte order and return ebuf + * structure. + */ +struct ebuf * +nv_hton(struct nv *nv) +{ + struct nvhdr *nvh; + unsigned char *ptr; + size_t size; + + NV_CHECK(nv); + PJDLOG_ASSERT(nv->nv_error == 0); + + ptr = ebuf_data(nv->nv_ebuf, &size); + while (size > 0) { + /* + * Minimum size at this point is size of nvhdr structure, + * one character long name plus terminating '\0'. + */ + PJDLOG_ASSERT(size >= sizeof(*nvh) + 2); + nvh = (struct nvhdr *)ptr; + PJDLOG_ASSERT(NVH_SIZE(nvh) <= size); + nv_swap(nvh, false); + ptr += NVH_SIZE(nvh); + size -= NVH_SIZE(nvh); + } + + return (nv->nv_ebuf); +} + +/* + * Create nv structure based on ebuf received from the network. + */ +struct nv * +nv_ntoh(struct ebuf *eb) +{ + struct nv *nv; + size_t extra; + int rerrno; + + PJDLOG_ASSERT(eb != NULL); + + nv = malloc(sizeof(*nv)); + if (nv == NULL) + return (NULL); + nv->nv_error = 0; + nv->nv_ebuf = eb; + nv->nv_magic = NV_MAGIC; + + if (nv_validate(nv, &extra) == -1) { + rerrno = errno; + nv->nv_magic = 0; + free(nv); + errno = rerrno; + return (NULL); + } + /* + * Remove extra zeros at the end of the buffer. + */ + ebuf_del_tail(eb, extra); + + return (nv); +} + +#define NV_DEFINE_ADD(type, TYPE) \ +void \ +nv_add_##type(struct nv *nv, type##_t value, const char *namefmt, ...) \ +{ \ + va_list nameap; \ + \ + va_start(nameap, namefmt); \ + nv_addv(nv, (unsigned char *)&value, sizeof(value), \ + NV_TYPE_##TYPE, namefmt, nameap); \ + va_end(nameap); \ +} + +NV_DEFINE_ADD(int8, INT8) +NV_DEFINE_ADD(uint8, UINT8) +NV_DEFINE_ADD(int16, INT16) +NV_DEFINE_ADD(uint16, UINT16) +NV_DEFINE_ADD(int32, INT32) +NV_DEFINE_ADD(uint32, UINT32) +NV_DEFINE_ADD(int64, INT64) +NV_DEFINE_ADD(uint64, UINT64) + +#undef NV_DEFINE_ADD + +#define NV_DEFINE_ADD_ARRAY(type, TYPE) \ +void \ +nv_add_##type##_array(struct nv *nv, const type##_t *value, \ + size_t nsize, const char *namefmt, ...) 
\ +{ \ + va_list nameap; \ + \ + va_start(nameap, namefmt); \ + nv_addv(nv, (const unsigned char *)value, \ + sizeof(value[0]) * nsize, NV_TYPE_##TYPE##_ARRAY, namefmt, \ + nameap); \ + va_end(nameap); \ +} + +NV_DEFINE_ADD_ARRAY(int8, INT8) +NV_DEFINE_ADD_ARRAY(uint8, UINT8) +NV_DEFINE_ADD_ARRAY(int16, INT16) +NV_DEFINE_ADD_ARRAY(uint16, UINT16) +NV_DEFINE_ADD_ARRAY(int32, INT32) +NV_DEFINE_ADD_ARRAY(uint32, UINT32) +NV_DEFINE_ADD_ARRAY(int64, INT64) +NV_DEFINE_ADD_ARRAY(uint64, UINT64) + +#undef NV_DEFINE_ADD_ARRAY + +void +nv_add_string(struct nv *nv, const char *value, const char *namefmt, ...) +{ + va_list nameap; + size_t size; + + size = strlen(value) + 1; + + va_start(nameap, namefmt); + nv_addv(nv, (const unsigned char *)value, size, NV_TYPE_STRING, + namefmt, nameap); + va_end(nameap); +} + +void +nv_add_stringf(struct nv *nv, const char *name, const char *valuefmt, ...) +{ + va_list valueap; + + va_start(valueap, valuefmt); + nv_add_stringv(nv, name, valuefmt, valueap); + va_end(valueap); +} + +void +nv_add_stringv(struct nv *nv, const char *name, const char *valuefmt, + va_list valueap) +{ + char *value; + ssize_t size; + + size = vasprintf(&value, valuefmt, valueap); + if (size == -1) { + if (nv->nv_error == 0) + nv->nv_error = ENOMEM; + return; + } + size++; + nv_add(nv, (const unsigned char *)value, size, NV_TYPE_STRING, name); + free(value); +} + +#define NV_DEFINE_GET(type, TYPE) \ +type##_t \ +nv_get_##type(struct nv *nv, const char *namefmt, ...) 
\ +{ \ + struct nvhdr *nvh; \ + va_list nameap; \ + type##_t value; \ + \ + va_start(nameap, namefmt); \ + nvh = nv_find(nv, NV_TYPE_##TYPE, namefmt, nameap); \ + va_end(nameap); \ + if (nvh == NULL) \ + return (0); \ + PJDLOG_ASSERT((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST);\ + PJDLOG_ASSERT(sizeof(value) == nvh->nvh_dsize); \ + bcopy(NVH_DATA(nvh), &value, sizeof(value)); \ + \ + return (value); \ +} + +NV_DEFINE_GET(int8, INT8) +NV_DEFINE_GET(uint8, UINT8) +NV_DEFINE_GET(int16, INT16) +NV_DEFINE_GET(uint16, UINT16) +NV_DEFINE_GET(int32, INT32) +NV_DEFINE_GET(uint32, UINT32) +NV_DEFINE_GET(int64, INT64) +NV_DEFINE_GET(uint64, UINT64) + +#undef NV_DEFINE_GET + +#define NV_DEFINE_GET_ARRAY(type, TYPE) \ +const type##_t * \ +nv_get_##type##_array(struct nv *nv, size_t *sizep, \ + const char *namefmt, ...) \ +{ \ + struct nvhdr *nvh; \ + va_list nameap; \ + \ + va_start(nameap, namefmt); \ + nvh = nv_find(nv, NV_TYPE_##TYPE##_ARRAY, namefmt, nameap); \ + va_end(nameap); \ + if (nvh == NULL) \ + return (NULL); \ + PJDLOG_ASSERT((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST);\ + PJDLOG_ASSERT((nvh->nvh_dsize % sizeof(type##_t)) == 0); \ + if (sizep != NULL) \ + *sizep = nvh->nvh_dsize / sizeof(type##_t); \ + return ((type##_t *)(void *)NVH_DATA(nvh)); \ +} + +NV_DEFINE_GET_ARRAY(int8, INT8) +NV_DEFINE_GET_ARRAY(uint8, UINT8) +NV_DEFINE_GET_ARRAY(int16, INT16) +NV_DEFINE_GET_ARRAY(uint16, UINT16) +NV_DEFINE_GET_ARRAY(int32, INT32) +NV_DEFINE_GET_ARRAY(uint32, UINT32) +NV_DEFINE_GET_ARRAY(int64, INT64) +NV_DEFINE_GET_ARRAY(uint64, UINT64) + +#undef NV_DEFINE_GET_ARRAY + +const char * +nv_get_string(struct nv *nv, const char *namefmt, ...) 
+{ + struct nvhdr *nvh; + va_list nameap; + char *str; + + va_start(nameap, namefmt); + nvh = nv_find(nv, NV_TYPE_STRING, namefmt, nameap); + va_end(nameap); + if (nvh == NULL) + return (NULL); + PJDLOG_ASSERT((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST); + PJDLOG_ASSERT(nvh->nvh_dsize >= 1); + str = NVH_DATA(nvh); + PJDLOG_ASSERT(str[nvh->nvh_dsize - 1] == '\0'); + PJDLOG_ASSERT(strlen(str) == nvh->nvh_dsize - 1); + return (str); +} + +static bool +nv_vexists(struct nv *nv, const char *namefmt, va_list nameap) +{ + struct nvhdr *nvh; + int snverror, serrno; + + if (nv == NULL) + return (false); + + serrno = errno; + snverror = nv->nv_error; + + nvh = nv_find(nv, NV_TYPE_NONE, namefmt, nameap); + + errno = serrno; + nv->nv_error = snverror; + + return (nvh != NULL); +} + +bool +nv_exists(struct nv *nv, const char *namefmt, ...) +{ + va_list nameap; + bool ret; + + va_start(nameap, namefmt); + ret = nv_vexists(nv, namefmt, nameap); + va_end(nameap); + + return (ret); +} + +void +nv_assert(struct nv *nv, const char *namefmt, ...) +{ + va_list nameap; + + va_start(nameap, namefmt); + PJDLOG_ASSERT(nv_vexists(nv, namefmt, nameap)); + va_end(nameap); +} + +/* + * Dump content of the nv structure. 
+ */ +void +nv_dump(struct nv *nv) +{ + struct nvhdr *nvh; + unsigned char *data, *ptr; + size_t dsize, size; + unsigned int ii; + bool swap; + + if (nv_validate(nv, NULL) == -1) { + printf("error: %d\n", errno); + return; + } + + NV_CHECK(nv); + PJDLOG_ASSERT(nv->nv_error == 0); + + ptr = ebuf_data(nv->nv_ebuf, &size); + while (size > 0) { + PJDLOG_ASSERT(size >= sizeof(*nvh) + 2); + nvh = (struct nvhdr *)ptr; + PJDLOG_ASSERT(size >= NVH_SIZE(nvh)); + swap = ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_NETWORK); + dsize = NVH_DSIZE(nvh); + data = NVH_DATA(nvh); + printf(" %s", nvh->nvh_name); + switch (nvh->nvh_type & NV_TYPE_MASK) { + case NV_TYPE_INT8: + printf("(int8): %jd", (intmax_t)(*(int8_t *)data)); + break; + case NV_TYPE_UINT8: + printf("(uint8): %ju", (uintmax_t)(*(uint8_t *)data)); + break; + case NV_TYPE_INT16: + printf("(int16): %jd", swap ? + (intmax_t)le16toh(*(int16_t *)(void *)data) : + (intmax_t)*(int16_t *)(void *)data); + break; + case NV_TYPE_UINT16: + printf("(uint16): %ju", swap ? + (uintmax_t)le16toh(*(uint16_t *)(void *)data) : + (uintmax_t)*(uint16_t *)(void *)data); + break; + case NV_TYPE_INT32: + printf("(int32): %jd", swap ? + (intmax_t)le32toh(*(int32_t *)(void *)data) : + (intmax_t)*(int32_t *)(void *)data); + break; + case NV_TYPE_UINT32: + printf("(uint32): %ju", swap ? + (uintmax_t)le32toh(*(uint32_t *)(void *)data) : + (uintmax_t)*(uint32_t *)(void *)data); + break; + case NV_TYPE_INT64: + printf("(int64): %jd", swap ? + (intmax_t)le64toh(*(int64_t *)(void *)data) : + (intmax_t)*(int64_t *)(void *)data); + break; + case NV_TYPE_UINT64: + printf("(uint64): %ju", swap ? 
+ (uintmax_t)le64toh(*(uint64_t *)(void *)data) : + (uintmax_t)*(uint64_t *)(void *)data); + break; + case NV_TYPE_INT8_ARRAY: + printf("(int8 array):"); + for (ii = 0; ii < dsize; ii++) + printf(" %jd", (intmax_t)((int8_t *)data)[ii]); + break; + case NV_TYPE_UINT8_ARRAY: + printf("(uint8 array):"); + for (ii = 0; ii < dsize; ii++) + printf(" %ju", (uintmax_t)((uint8_t *)data)[ii]); + break; + case NV_TYPE_INT16_ARRAY: + printf("(int16 array):"); + for (ii = 0; ii < dsize / 2; ii++) { + printf(" %jd", swap ? + (intmax_t)le16toh(((int16_t *)(void *)data)[ii]) : + (intmax_t)((int16_t *)(void *)data)[ii]); + } + break; + case NV_TYPE_UINT16_ARRAY: + printf("(uint16 array):"); + for (ii = 0; ii < dsize / 2; ii++) { + printf(" %ju", swap ? + (uintmax_t)le16toh(((uint16_t *)(void *)data)[ii]) : + (uintmax_t)((uint16_t *)(void *)data)[ii]); + } + break; + case NV_TYPE_INT32_ARRAY: + printf("(int32 array):"); + for (ii = 0; ii < dsize / 4; ii++) { + printf(" %jd", swap ? + (intmax_t)le32toh(((int32_t *)(void *)data)[ii]) : + (intmax_t)((int32_t *)(void *)data)[ii]); + } + break; + case NV_TYPE_UINT32_ARRAY: + printf("(uint32 array):"); + for (ii = 0; ii < dsize / 4; ii++) { + printf(" %ju", swap ? + (uintmax_t)le32toh(((uint32_t *)(void *)data)[ii]) : + (uintmax_t)((uint32_t *)(void *)data)[ii]); + } + break; + case NV_TYPE_INT64_ARRAY: + printf("(int64 array):"); + for (ii = 0; ii < dsize / 8; ii++) { + printf(" %ju", swap ? + (uintmax_t)le64toh(((uint64_t *)(void *)data)[ii]) : + (uintmax_t)((uint64_t *)(void *)data)[ii]); + } + break; + case NV_TYPE_UINT64_ARRAY: + printf("(uint64 array):"); + for (ii = 0; ii < dsize / 8; ii++) { + printf(" %ju", swap ? 
+ (uintmax_t)le64toh(((uint64_t *)(void *)data)[ii]) : + (uintmax_t)((uint64_t *)(void *)data)[ii]); + } + break; + case NV_TYPE_STRING: + printf("(string): %s", (char *)data); + break; + default: + PJDLOG_ABORT("invalid condition"); + } + printf("\n"); + ptr += NVH_SIZE(nvh); + size -= NVH_SIZE(nvh); + } +} + +/* + * Local routines below. + */ + +static void +nv_add(struct nv *nv, const unsigned char *value, size_t vsize, int type, + const char *name) +{ + static unsigned char align[7]; + struct nvhdr *nvh; + size_t namesize; + + if (nv == NULL) { + errno = ENOMEM; + return; + } + + NV_CHECK(nv); + + namesize = strlen(name) + 1; + + nvh = malloc(sizeof(*nvh) + roundup2(namesize, 8)); + if (nvh == NULL) { + if (nv->nv_error == 0) + nv->nv_error = ENOMEM; + return; + } + nvh->nvh_type = NV_ORDER_HOST | type; + nvh->nvh_namesize = (uint8_t)namesize; + nvh->nvh_dsize = (uint32_t)vsize; + bcopy(name, nvh->nvh_name, namesize); + + /* Add header first. */ + if (ebuf_add_tail(nv->nv_ebuf, nvh, NVH_HSIZE(nvh)) == -1) { + PJDLOG_ASSERT(errno != 0); + if (nv->nv_error == 0) + nv->nv_error = errno; + free(nvh); + return; + } + free(nvh); + /* Add the actual data. */ + if (ebuf_add_tail(nv->nv_ebuf, value, vsize) == -1) { + PJDLOG_ASSERT(errno != 0); + if (nv->nv_error == 0) + nv->nv_error = errno; + return; + } + /* Align the data (if needed). 
*/ + vsize = roundup2(vsize, 8) - vsize; + if (vsize == 0) + return; + PJDLOG_ASSERT(vsize > 0 && vsize <= sizeof(align)); + if (ebuf_add_tail(nv->nv_ebuf, align, vsize) == -1) { + PJDLOG_ASSERT(errno != 0); + if (nv->nv_error == 0) + nv->nv_error = errno; + return; + } +} + +static void +nv_addv(struct nv *nv, const unsigned char *value, size_t vsize, int type, + const char *namefmt, va_list nameap) +{ + char name[255]; + size_t namesize; + + namesize = vsnprintf(name, sizeof(name), namefmt, nameap); + PJDLOG_ASSERT(namesize > 0 && namesize < sizeof(name)); + + nv_add(nv, value, vsize, type, name); +} + +static struct nvhdr * +nv_find(struct nv *nv, int type, const char *namefmt, va_list nameap) +{ + char name[255]; + struct nvhdr *nvh; + unsigned char *ptr; + size_t size, namesize; + + if (nv == NULL) { + errno = ENOMEM; + return (NULL); + } + + NV_CHECK(nv); + + namesize = vsnprintf(name, sizeof(name), namefmt, nameap); + PJDLOG_ASSERT(namesize > 0 && namesize < sizeof(name)); + namesize++; + + ptr = ebuf_data(nv->nv_ebuf, &size); + while (size > 0) { + PJDLOG_ASSERT(size >= sizeof(*nvh) + 2); + nvh = (struct nvhdr *)ptr; + PJDLOG_ASSERT(size >= NVH_SIZE(nvh)); + nv_swap(nvh, true); + if (strcmp(nvh->nvh_name, name) == 0) { + if (type != NV_TYPE_NONE && + (nvh->nvh_type & NV_TYPE_MASK) != type) { + errno = EINVAL; + if (nv->nv_error == 0) + nv->nv_error = EINVAL; + return (NULL); + } + return (nvh); + } + ptr += NVH_SIZE(nvh); + size -= NVH_SIZE(nvh); + } + errno = ENOENT; + if (nv->nv_error == 0) + nv->nv_error = ENOENT; + return (NULL); +} + +static void +nv_swap(struct nvhdr *nvh, bool tohost) +{ + unsigned char *data, *end, *p; + size_t vsize; + + data = NVH_DATA(nvh); + if (tohost) { + if ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST) + return; + nvh->nvh_dsize = le32toh(nvh->nvh_dsize); + end = data + nvh->nvh_dsize; + nvh->nvh_type &= ~NV_ORDER_MASK; + nvh->nvh_type |= NV_ORDER_HOST; + } else { + if ((nvh->nvh_type & NV_ORDER_MASK) == 
NV_ORDER_NETWORK) + return; + end = data + nvh->nvh_dsize; + nvh->nvh_dsize = htole32(nvh->nvh_dsize); + nvh->nvh_type &= ~NV_ORDER_MASK; + nvh->nvh_type |= NV_ORDER_NETWORK; + } + + vsize = 0; + + switch (nvh->nvh_type & NV_TYPE_MASK) { + case NV_TYPE_INT8: + case NV_TYPE_UINT8: + case NV_TYPE_INT8_ARRAY: + case NV_TYPE_UINT8_ARRAY: + break; + case NV_TYPE_INT16: + case NV_TYPE_UINT16: + case NV_TYPE_INT16_ARRAY: + case NV_TYPE_UINT16_ARRAY: + if (vsize == 0) + vsize = 2; + /* FALLTHROUGH */ + case NV_TYPE_INT32: + case NV_TYPE_UINT32: + case NV_TYPE_INT32_ARRAY: + case NV_TYPE_UINT32_ARRAY: + if (vsize == 0) + vsize = 4; + /* FALLTHROUGH */ + case NV_TYPE_INT64: + case NV_TYPE_UINT64: + case NV_TYPE_INT64_ARRAY: + case NV_TYPE_UINT64_ARRAY: + if (vsize == 0) + vsize = 8; + for (p = data; p < end; p += vsize) { + if (tohost) { + switch (vsize) { + case 2: + *(uint16_t *)(void *)p = + le16toh(*(uint16_t *)(void *)p); + break; + case 4: + *(uint32_t *)(void *)p = + le32toh(*(uint32_t *)(void *)p); + break; + case 8: + *(uint64_t *)(void *)p = + le64toh(*(uint64_t *)(void *)p); + break; + default: + PJDLOG_ABORT("invalid condition"); + } + } else { + switch (vsize) { + case 2: + *(uint16_t *)(void *)p = + htole16(*(uint16_t *)(void *)p); + break; + case 4: + *(uint32_t *)(void *)p = + htole32(*(uint32_t *)(void *)p); + break; + case 8: + *(uint64_t *)(void *)p = + htole64(*(uint64_t *)(void *)p); + break; + default: + PJDLOG_ABORT("invalid condition"); + } + } + } + break; + case NV_TYPE_STRING: + break; + default: + PJDLOG_ABORT("unrecognized type"); + } +} diff --git a/sbin/hastd/nv.h b/sbin/hastd/nv.h new file mode 100644 index 0000000..d49fa5d --- /dev/null +++ b/sbin/hastd/nv.h @@ -0,0 +1,133 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NV_H_ +#define _NV_H_ + +#include <sys/cdefs.h> + +#include <stdarg.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> + +#include <ebuf.h> + +struct nv; + +struct nv *nv_alloc(void); +void nv_free(struct nv *nv); +int nv_error(const struct nv *nv); +int nv_set_error(struct nv *nv, int error); +int nv_validate(struct nv *nv, size_t *extrap); + +struct ebuf *nv_hton(struct nv *nv); +struct nv *nv_ntoh(struct ebuf *eb); + +void nv_add_int8(struct nv *nv, int8_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_uint8(struct nv *nv, uint8_t value, const char *namefmt, ...) 
+ __printflike(3, 4); +void nv_add_int16(struct nv *nv, int16_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_uint16(struct nv *nv, uint16_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_int32(struct nv *nv, int32_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_uint32(struct nv *nv, uint32_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_int64(struct nv *nv, int64_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_uint64(struct nv *nv, uint64_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_int8_array(struct nv *nv, const int8_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_uint8_array(struct nv *nv, const uint8_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_int16_array(struct nv *nv, const int16_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_uint16_array(struct nv *nv, const uint16_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_int32_array(struct nv *nv, const int32_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_uint32_array(struct nv *nv, const uint32_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_int64_array(struct nv *nv, const int64_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_uint64_array(struct nv *nv, const uint64_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_string(struct nv *nv, const char *value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_stringf(struct nv *nv, const char *name, const char *valuefmt, ...) + __printflike(3, 4); +void nv_add_stringv(struct nv *nv, const char *name, const char *valuefmt, + va_list valueap) __printflike(3, 0); + +int8_t nv_get_int8(struct nv *nv, const char *namefmt, ...) 
+ __printflike(2, 3); +uint8_t nv_get_uint8(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +int16_t nv_get_int16(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +uint16_t nv_get_uint16(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +int32_t nv_get_int32(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +uint32_t nv_get_uint32(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +int64_t nv_get_int64(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +uint64_t nv_get_uint64(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +const int8_t *nv_get_int8_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const uint8_t *nv_get_uint8_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const int16_t *nv_get_int16_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const uint16_t *nv_get_uint16_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const int32_t *nv_get_int32_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const uint32_t *nv_get_uint32_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const int64_t *nv_get_int64_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const uint64_t *nv_get_uint64_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const char *nv_get_string(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); + +bool nv_exists(struct nv *nv, const char *namefmt, ...) __printflike(2, 3); +void nv_assert(struct nv *nv, const char *namefmt, ...) 
__printflike(2, 3); +void nv_dump(struct nv *nv); + +#endif /* !_NV_H_ */ diff --git a/sbin/hastd/parse.y b/sbin/hastd/parse.y new file mode 100644 index 0000000..6bfb537 --- /dev/null +++ b/sbin/hastd/parse.y @@ -0,0 +1,1037 @@ +%{ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> /* MAXHOSTNAMELEN */ +#include <sys/queue.h> +#include <sys/socket.h> +#include <sys/sysctl.h> + +#include <arpa/inet.h> + +#include <err.h> +#include <errno.h> +#include <stdio.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> + +#include <pjdlog.h> + +#include "hast.h" + +extern int depth; +extern int lineno; + +extern FILE *yyin; +extern char *yytext; + +static struct hastd_config *lconfig; +static struct hast_resource *curres; +static bool mynode, hadmynode; + +static char depth0_control[HAST_ADDRSIZE]; +static char depth0_pidfile[PATH_MAX]; +static char depth0_listen_tcp4[HAST_ADDRSIZE]; +static char depth0_listen_tcp6[HAST_ADDRSIZE]; +static TAILQ_HEAD(, hastd_listen) depth0_listen; +static int depth0_replication; +static int depth0_checksum; +static int depth0_compression; +static int depth0_timeout; +static char depth0_exec[PATH_MAX]; +static int depth0_metaflush; + +static char depth1_provname[PATH_MAX]; +static char depth1_localpath[PATH_MAX]; +static int depth1_metaflush; + +extern void yyerror(const char *); +extern int yylex(void); +extern void yyrestart(FILE *); + +static int isitme(const char *name); +static bool family_supported(int family); +static int node_names(char **namesp); +%} + +%token CONTROL PIDFILE LISTEN REPLICATION CHECKSUM COMPRESSION METAFLUSH +%token TIMEOUT EXEC RESOURCE NAME LOCAL REMOTE SOURCE ON OFF +%token FULLSYNC MEMSYNC ASYNC NONE CRC32 SHA256 HOLE LZF +%token NUM STR OB CB + +%type <str> remote_str +%type <num> replication_type +%type <num> checksum_type +%type <num> compression_type +%type <num> boolean + +%union +{ + int num; + char *str; +} + +%token <num> NUM +%token <str> STR + +%% + +statements: + | + statements statement + ; + +statement: + control_statement + | + pidfile_statement + | + listen_statement + | + replication_statement + | + checksum_statement + | + compression_statement + | + timeout_statement + | + exec_statement + | + 
metaflush_statement + | + node_statement + | + resource_statement + ; + +control_statement: CONTROL STR + { + switch (depth) { + case 0: + if (strlcpy(depth0_control, $2, + sizeof(depth0_control)) >= + sizeof(depth0_control)) { + pjdlog_error("control argument is too long."); + free($2); + return (1); + } + break; + case 1: + if (!mynode) + break; + if (strlcpy(lconfig->hc_controladdr, $2, + sizeof(lconfig->hc_controladdr)) >= + sizeof(lconfig->hc_controladdr)) { + pjdlog_error("control argument is too long."); + free($2); + return (1); + } + break; + default: + PJDLOG_ABORT("control at wrong depth level"); + } + free($2); + } + ; + +pidfile_statement: PIDFILE STR + { + switch (depth) { + case 0: + if (strlcpy(depth0_pidfile, $2, + sizeof(depth0_pidfile)) >= + sizeof(depth0_pidfile)) { + pjdlog_error("pidfile argument is too long."); + free($2); + return (1); + } + break; + case 1: + if (!mynode) + break; + if (strlcpy(lconfig->hc_pidfile, $2, + sizeof(lconfig->hc_pidfile)) >= + sizeof(lconfig->hc_pidfile)) { + pjdlog_error("pidfile argument is too long."); + free($2); + return (1); + } + break; + default: + PJDLOG_ABORT("pidfile at wrong depth level"); + } + free($2); + } + ; + +listen_statement: LISTEN STR + { + struct hastd_listen *lst; + + lst = calloc(1, sizeof(*lst)); + if (lst == NULL) { + pjdlog_error("Unable to allocate memory for listen address."); + free($2); + return (1); + } + if (strlcpy(lst->hl_addr, $2, sizeof(lst->hl_addr)) >= + sizeof(lst->hl_addr)) { + pjdlog_error("listen argument is too long."); + free($2); + free(lst); + return (1); + } + switch (depth) { + case 0: + TAILQ_INSERT_TAIL(&depth0_listen, lst, hl_next); + break; + case 1: + if (mynode) + TAILQ_INSERT_TAIL(&depth0_listen, lst, hl_next); + else + free(lst); + break; + default: + PJDLOG_ABORT("listen at wrong depth level"); + } + free($2); + } + ; + +replication_statement: REPLICATION replication_type + { + switch (depth) { + case 0: + depth0_replication = $2; + break; + case 1: + 
PJDLOG_ASSERT(curres != NULL); + curres->hr_replication = $2; + curres->hr_original_replication = $2; + break; + default: + PJDLOG_ABORT("replication at wrong depth level"); + } + } + ; + +replication_type: + FULLSYNC { $$ = HAST_REPLICATION_FULLSYNC; } + | + MEMSYNC { $$ = HAST_REPLICATION_MEMSYNC; } + | + ASYNC { $$ = HAST_REPLICATION_ASYNC; } + ; + +checksum_statement: CHECKSUM checksum_type + { + switch (depth) { + case 0: + depth0_checksum = $2; + break; + case 1: + PJDLOG_ASSERT(curres != NULL); + curres->hr_checksum = $2; + break; + default: + PJDLOG_ABORT("checksum at wrong depth level"); + } + } + ; + +checksum_type: + NONE { $$ = HAST_CHECKSUM_NONE; } + | + CRC32 { $$ = HAST_CHECKSUM_CRC32; } + | + SHA256 { $$ = HAST_CHECKSUM_SHA256; } + ; + +compression_statement: COMPRESSION compression_type + { + switch (depth) { + case 0: + depth0_compression = $2; + break; + case 1: + PJDLOG_ASSERT(curres != NULL); + curres->hr_compression = $2; + break; + default: + PJDLOG_ABORT("compression at wrong depth level"); + } + } + ; + +compression_type: + NONE { $$ = HAST_COMPRESSION_NONE; } + | + HOLE { $$ = HAST_COMPRESSION_HOLE; } + | + LZF { $$ = HAST_COMPRESSION_LZF; } + ; + +timeout_statement: TIMEOUT NUM + { + if ($2 <= 0) { + pjdlog_error("Negative or zero timeout."); + return (1); + } + switch (depth) { + case 0: + depth0_timeout = $2; + break; + case 1: + PJDLOG_ASSERT(curres != NULL); + curres->hr_timeout = $2; + break; + default: + PJDLOG_ABORT("timeout at wrong depth level"); + } + } + ; + +exec_statement: EXEC STR + { + switch (depth) { + case 0: + if (strlcpy(depth0_exec, $2, sizeof(depth0_exec)) >= + sizeof(depth0_exec)) { + pjdlog_error("Exec path is too long."); + free($2); + return (1); + } + break; + case 1: + PJDLOG_ASSERT(curres != NULL); + if (strlcpy(curres->hr_exec, $2, + sizeof(curres->hr_exec)) >= + sizeof(curres->hr_exec)) { + pjdlog_error("Exec path is too long."); + free($2); + return (1); + } + break; + default: + PJDLOG_ABORT("exec at wrong 
depth level"); + } + free($2); + } + ; + +metaflush_statement: METAFLUSH boolean + { + switch (depth) { + case 0: + depth0_metaflush = $2; + break; + case 1: + PJDLOG_ASSERT(curres != NULL); + depth1_metaflush = $2; + break; + case 2: + if (!mynode) + break; + PJDLOG_ASSERT(curres != NULL); + curres->hr_metaflush = $2; + break; + default: + PJDLOG_ABORT("metaflush at wrong depth level"); + } + } + ; + +boolean: + ON { $$ = 1; } + | + OFF { $$ = 0; } + ; + +node_statement: ON node_start OB node_entries CB + { + mynode = false; + } + ; + +node_start: STR + { + switch (isitme($1)) { + case -1: + free($1); + return (1); + case 0: + break; + case 1: + mynode = true; + break; + default: + PJDLOG_ABORT("invalid isitme() return value"); + } + free($1); + } + ; + +node_entries: + | + node_entries node_entry + ; + +node_entry: + control_statement + | + pidfile_statement + | + listen_statement + ; + +resource_statement: RESOURCE resource_start OB resource_entries CB + { + if (curres != NULL) { + /* + * There must be section for this node, at least with + * remote address configuration. + */ + if (!hadmynode) { + char *names; + + if (node_names(&names) != 0) + return (1); + pjdlog_error("No resource %s configuration for this node (acceptable node names: %s).", + curres->hr_name, names); + return (1); + } + + /* + * Let's see if there are some resource-level settings + * that we can use for node-level settings. + */ + if (curres->hr_provname[0] == '\0' && + depth1_provname[0] != '\0') { + /* + * Provider name is not set at node-level, + * but is set at resource-level, use it. + */ + strlcpy(curres->hr_provname, depth1_provname, + sizeof(curres->hr_provname)); + } + if (curres->hr_localpath[0] == '\0' && + depth1_localpath[0] != '\0') { + /* + * Path to local provider is not set at + * node-level, but is set at resource-level, + * use it. 
+ */ + strlcpy(curres->hr_localpath, depth1_localpath, + sizeof(curres->hr_localpath)); + } + if (curres->hr_metaflush == -1 && depth1_metaflush != -1) { + /* + * Metaflush is not set at node-level, + * but is set at resource-level, use it. + */ + curres->hr_metaflush = depth1_metaflush; + } + + /* + * If provider name is not given, use resource name + * as provider name. + */ + if (curres->hr_provname[0] == '\0') { + strlcpy(curres->hr_provname, curres->hr_name, + sizeof(curres->hr_provname)); + } + + /* + * Remote address has to be configured at this point. + */ + if (curres->hr_remoteaddr[0] == '\0') { + pjdlog_error("Remote address not configured for resource %s.", + curres->hr_name); + return (1); + } + /* + * Path to local provider has to be configured at this + * point. + */ + if (curres->hr_localpath[0] == '\0') { + pjdlog_error("Path to local component not configured for resource %s.", + curres->hr_name); + return (1); + } + + /* Put it onto resource list. */ + TAILQ_INSERT_TAIL(&lconfig->hc_resources, curres, hr_next); + curres = NULL; + } + } + ; + +resource_start: STR + { + /* Check if there is no duplicate entry. */ + TAILQ_FOREACH(curres, &lconfig->hc_resources, hr_next) { + if (strcmp(curres->hr_name, $1) == 0) { + pjdlog_error("Resource %s configured more than once.", + curres->hr_name); + free($1); + return (1); + } + } + + /* + * Clear those, so we can tell if they were set at + * resource-level or not. 
+ */ + depth1_provname[0] = '\0'; + depth1_localpath[0] = '\0'; + depth1_metaflush = -1; + hadmynode = false; + + curres = calloc(1, sizeof(*curres)); + if (curres == NULL) { + pjdlog_error("Unable to allocate memory for resource."); + free($1); + return (1); + } + if (strlcpy(curres->hr_name, $1, + sizeof(curres->hr_name)) >= + sizeof(curres->hr_name)) { + pjdlog_error("Resource name is too long."); + free(curres); + free($1); + return (1); + } + free($1); + curres->hr_role = HAST_ROLE_INIT; + curres->hr_previous_role = HAST_ROLE_INIT; + curres->hr_replication = -1; + curres->hr_original_replication = -1; + curres->hr_checksum = -1; + curres->hr_compression = -1; + curres->hr_version = 1; + curres->hr_timeout = -1; + curres->hr_exec[0] = '\0'; + curres->hr_provname[0] = '\0'; + curres->hr_localpath[0] = '\0'; + curres->hr_localfd = -1; + curres->hr_localflush = true; + curres->hr_metaflush = -1; + curres->hr_remoteaddr[0] = '\0'; + curres->hr_sourceaddr[0] = '\0'; + curres->hr_ggateunit = -1; + } + ; + +resource_entries: + | + resource_entries resource_entry + ; + +resource_entry: + replication_statement + | + checksum_statement + | + compression_statement + | + timeout_statement + | + exec_statement + | + metaflush_statement + | + name_statement + | + local_statement + | + resource_node_statement + ; + +name_statement: NAME STR + { + switch (depth) { + case 1: + if (strlcpy(depth1_provname, $2, + sizeof(depth1_provname)) >= + sizeof(depth1_provname)) { + pjdlog_error("name argument is too long."); + free($2); + return (1); + } + break; + case 2: + if (!mynode) + break; + PJDLOG_ASSERT(curres != NULL); + if (strlcpy(curres->hr_provname, $2, + sizeof(curres->hr_provname)) >= + sizeof(curres->hr_provname)) { + pjdlog_error("name argument is too long."); + free($2); + return (1); + } + break; + default: + PJDLOG_ABORT("name at wrong depth level"); + } + free($2); + } + ; + +local_statement: LOCAL STR + { + switch (depth) { + case 1: + if (strlcpy(depth1_localpath, 
$2,
		    sizeof(depth1_localpath)) >=
		    sizeof(depth1_localpath)) {
			pjdlog_error("local argument is too long.");
			free($2);
			return (1);
		}
		break;
	case 2:
		if (!mynode)
			break;
		PJDLOG_ASSERT(curres != NULL);
		if (strlcpy(curres->hr_localpath, $2,
		    sizeof(curres->hr_localpath)) >=
		    sizeof(curres->hr_localpath)) {
			pjdlog_error("local argument is too long.");
			free($2);
			return (1);
		}
		break;
	default:
		PJDLOG_ABORT("local at wrong depth level");
	}
	free($2);
	}
	;

/* Leaving an "on <node> { ... }" section inside a resource. */
resource_node_statement:ON resource_node_start OB resource_node_entries CB
	{
		mynode = false;
	}
	;

/*
 * Node name opening a per-node section of a resource.  mynode/hadmynode
 * are set when isitme() reports that the name refers to this host.
 */
resource_node_start:	STR
	{
		if (curres != NULL) {
			switch (isitme($1)) {
			case -1:
				free($1);
				return (1);
			case 0:
				break;
			case 1:
				mynode = hadmynode = true;
				break;
			default:
				PJDLOG_ABORT("invalid isitme() return value");
			}
		}
		free($1);
	}
	;

resource_node_entries:
	|
	resource_node_entries resource_node_entry
	;

resource_node_entry:
	name_statement
	|
	local_statement
	|
	remote_statement
	|
	source_statement
	|
	metaflush_statement
	;

/* Remote address; stored only when the enclosing section is for this node. */
remote_statement:	REMOTE remote_str
	{
		PJDLOG_ASSERT(depth == 2);
		if (mynode) {
			PJDLOG_ASSERT(curres != NULL);
			if (strlcpy(curres->hr_remoteaddr, $2,
			    sizeof(curres->hr_remoteaddr)) >=
			    sizeof(curres->hr_remoteaddr)) {
				pjdlog_error("remote argument is too long.");
				free($2);
				return (1);
			}
		}
		free($2);
	}
	;

/*
 * Remote address string.  The value is always heap-allocated, because
 * remote_statement frees it.
 */
remote_str:
	NONE
	{
		$$ = strdup("none");
		if ($$ == NULL) {
			/* Do not hand a NULL string to remote_statement. */
			pjdlog_error("Unable to allocate memory for remote address.");
			return (1);
		}
	}
	|
	STR
	{
		/* Pass the string through explicitly. */
		$$ = $1;
	}
	;

/* Source address; stored only when the enclosing section is for this node. */
source_statement:	SOURCE STR
	{
		PJDLOG_ASSERT(depth == 2);
		if (mynode) {
			PJDLOG_ASSERT(curres != NULL);
			if (strlcpy(curres->hr_sourceaddr, $2,
			    sizeof(curres->hr_sourceaddr)) >=
			    sizeof(curres->hr_sourceaddr)) {
				pjdlog_error("source argument is too long.");
				free($2);
				return (1);
			}
		}
		free($2);
	}
	;

%%

static int
isitme(const char *name)
{
	char buf[MAXHOSTNAMELEN];
	unsigned long hostid;
	char *pos;
	size_t bufsize;

	/*
	 * First check if the given name matches
our full hostname. + */ + if (gethostname(buf, sizeof(buf)) < 0) { + pjdlog_errno(LOG_ERR, "gethostname() failed"); + return (-1); + } + if (strcmp(buf, name) == 0) + return (1); + + /* + * Check if it matches first part of the host name. + */ + pos = strchr(buf, '.'); + if (pos != NULL && (size_t)(pos - buf) == strlen(name) && + strncmp(buf, name, pos - buf) == 0) { + return (1); + } + + /* + * Check if it matches host UUID. + */ + bufsize = sizeof(buf); + if (sysctlbyname("kern.hostuuid", buf, &bufsize, NULL, 0) < 0) { + pjdlog_errno(LOG_ERR, "sysctlbyname(kern.hostuuid) failed"); + return (-1); + } + if (strcasecmp(buf, name) == 0) + return (1); + + /* + * Check if it matches hostid. + */ + bufsize = sizeof(hostid); + if (sysctlbyname("kern.hostid", &hostid, &bufsize, NULL, 0) < 0) { + pjdlog_errno(LOG_ERR, "sysctlbyname(kern.hostid) failed"); + return (-1); + } + (void)snprintf(buf, sizeof(buf), "hostid%lu", hostid); + if (strcmp(buf, name) == 0) + return (1); + + /* + * Looks like this isn't about us. + */ + return (0); +} + +/* + * Report whether sockets of the given address family can be created. + * Only an EPROTONOSUPPORT failure of socket(2) counts as "unsupported"; + * any other outcome reports the family as supported. + */ +static bool +family_supported(int family) +{ + int sock; + + sock = socket(family, SOCK_STREAM, 0); + if (sock == -1 && errno == EPROTONOSUPPORT) + return (false); + if (sock >= 0) + (void)close(sock); + return (true); +} + +/* + * Build the list of names this node answers to, as a ", "-separated string: + * the first component of the host name (when the host name contains a dot + * past its first character), the full host name, the host UUID and + * "hostid<N>". On success *namesp points at a static buffer and 0 is + * returned; -1 is returned when gethostname() or sysctlbyname() fails. + */ +static int +node_names(char **namesp) +{ + static char names[MAXHOSTNAMELEN * 3]; + char buf[MAXHOSTNAMELEN]; + unsigned long hostid; + char *pos; + size_t bufsize; + + if (gethostname(buf, sizeof(buf)) < 0) { + pjdlog_errno(LOG_ERR, "gethostname() failed"); + return (-1); + } + + /* + * The buffer is static, so start from an empty string on every call. + * Only the short-name branch below overwrites it; without the reset a + * host name with no domain part would be appended to the previous + * call's result. + */ + names[0] = '\0'; + + /* First component of the host name. */ + pos = strchr(buf, '.'); + if (pos != NULL && pos != buf) { + (void)strlcpy(names, buf, MIN((size_t)(pos - buf + 1), + sizeof(names))); + (void)strlcat(names, ", ", sizeof(names)); + } + + /* Full host name. */ + (void)strlcat(names, buf, sizeof(names)); + (void)strlcat(names, ", ", sizeof(names)); + + /* Host UUID. 
*/ + bufsize = sizeof(buf); + if (sysctlbyname("kern.hostuuid", buf, &bufsize, NULL, 0) < 0) { + pjdlog_errno(LOG_ERR, "sysctlbyname(kern.hostuuid) failed"); + return (-1); + } + (void)strlcat(names, buf, sizeof(names)); + (void)strlcat(names, ", ", sizeof(names)); + + /* Host ID. */ + bufsize = sizeof(hostid); + if (sysctlbyname("kern.hostid", &hostid, &bufsize, NULL, 0) < 0) { + pjdlog_errno(LOG_ERR, "sysctlbyname(kern.hostid) failed"); + return (-1); + } + (void)snprintf(buf, sizeof(buf), "hostid%lu", hostid); + (void)strlcat(names, buf, sizeof(names)); + + *namesp = names; + + return (0); +} + +/* + * Parser error hook: log the message together with the current line number + * and the text of the current token. + */ +void +yyerror(const char *str) +{ + + pjdlog_error("Unable to parse configuration file at line %d near '%s': %s", + lineno, yytext, str); +} + +struct hastd_config * +yy_config_parse(const char *config, bool exitonerror) +{ + int ret; + + curres = NULL; + mynode = false; + depth = 0; + lineno = 0; + + depth0_timeout = HAST_TIMEOUT; + depth0_replication = HAST_REPLICATION_MEMSYNC; + depth0_checksum = HAST_CHECKSUM_NONE; + depth0_compression = HAST_COMPRESSION_HOLE; + strlcpy(depth0_control, HAST_CONTROL, sizeof(depth0_control)); + strlcpy(depth0_pidfile, HASTD_PIDFILE, sizeof(depth0_pidfile)); + TAILQ_INIT(&depth0_listen); + strlcpy(depth0_listen_tcp4, HASTD_LISTEN_TCP4, + sizeof(depth0_listen_tcp4)); + strlcpy(depth0_listen_tcp6, HASTD_LISTEN_TCP6, + sizeof(depth0_listen_tcp6)); + depth0_exec[0] = '\0'; + depth0_metaflush = 1; + + lconfig = calloc(1, sizeof(*lconfig)); + if (lconfig == NULL) { + pjdlog_error("Unable to allocate memory for configuration."); + if (exitonerror) + exit(EX_TEMPFAIL); + return (NULL); + } + + TAILQ_INIT(&lconfig->hc_listen); + TAILQ_INIT(&lconfig->hc_resources); + + yyin = fopen(config, "r"); + if (yyin == NULL) { + pjdlog_errno(LOG_ERR, "Unable to open configuration file %s", + config); + yy_config_free(lconfig); + if (exitonerror) + exit(EX_OSFILE); + return (NULL); + } + yyrestart(yyin); + ret = yyparse(); + fclose(yyin); + if (ret != 0) { 
+ yy_config_free(lconfig); + if (exitonerror) + exit(EX_CONFIG); + return (NULL); + } + + /* + * Let's see if everything is set up. + */ + if (lconfig->hc_controladdr[0] == '\0') { + strlcpy(lconfig->hc_controladdr, depth0_control, + sizeof(lconfig->hc_controladdr)); + } + if (lconfig->hc_pidfile[0] == '\0') { + strlcpy(lconfig->hc_pidfile, depth0_pidfile, + sizeof(lconfig->hc_pidfile)); + } + if (!TAILQ_EMPTY(&depth0_listen)) + TAILQ_CONCAT(&lconfig->hc_listen, &depth0_listen, hl_next); + if (TAILQ_EMPTY(&lconfig->hc_listen)) { + struct hastd_listen *lst; + + if (family_supported(AF_INET)) { + lst = calloc(1, sizeof(*lst)); + if (lst == NULL) { + pjdlog_error("Unable to allocate memory for listen address."); + yy_config_free(lconfig); + if (exitonerror) + exit(EX_TEMPFAIL); + return (NULL); + } + (void)strlcpy(lst->hl_addr, depth0_listen_tcp4, + sizeof(lst->hl_addr)); + TAILQ_INSERT_TAIL(&lconfig->hc_listen, lst, hl_next); + } else { + pjdlog_debug(1, + "No IPv4 support in the kernel, not listening on IPv4 address."); + } + if (family_supported(AF_INET6)) { + lst = calloc(1, sizeof(*lst)); + if (lst == NULL) { + pjdlog_error("Unable to allocate memory for listen address."); + yy_config_free(lconfig); + if (exitonerror) + exit(EX_TEMPFAIL); + return (NULL); + } + (void)strlcpy(lst->hl_addr, depth0_listen_tcp6, + sizeof(lst->hl_addr)); + TAILQ_INSERT_TAIL(&lconfig->hc_listen, lst, hl_next); + } else { + pjdlog_debug(1, + "No IPv6 support in the kernel, not listening on IPv6 address."); + } + if (TAILQ_EMPTY(&lconfig->hc_listen)) { + pjdlog_error("No address to listen on."); + yy_config_free(lconfig); + if (exitonerror) + exit(EX_TEMPFAIL); + return (NULL); + } + } + TAILQ_FOREACH(curres, &lconfig->hc_resources, hr_next) { + PJDLOG_ASSERT(curres->hr_provname[0] != '\0'); + PJDLOG_ASSERT(curres->hr_localpath[0] != '\0'); + PJDLOG_ASSERT(curres->hr_remoteaddr[0] != '\0'); + + if (curres->hr_replication == -1) { + /* + * Replication is not set at resource-level. 
+ * Use global or default setting. + */ + curres->hr_replication = depth0_replication; + curres->hr_original_replication = depth0_replication; + } + if (curres->hr_checksum == -1) { + /* + * Checksum is not set at resource-level. + * Use global or default setting. + */ + curres->hr_checksum = depth0_checksum; + } + if (curres->hr_compression == -1) { + /* + * Compression is not set at resource-level. + * Use global or default setting. + */ + curres->hr_compression = depth0_compression; + } + if (curres->hr_timeout == -1) { + /* + * Timeout is not set at resource-level. + * Use global or default setting. + */ + curres->hr_timeout = depth0_timeout; + } + if (curres->hr_exec[0] == '\0') { + /* + * Exec is not set at resource-level. + * Use global or default setting. + */ + strlcpy(curres->hr_exec, depth0_exec, + sizeof(curres->hr_exec)); + } + if (curres->hr_metaflush == -1) { + /* + * Metaflush is not set at resource-level. + * Use global or default setting. + */ + curres->hr_metaflush = depth0_metaflush; + } + } + + return (lconfig); +} + +void +yy_config_free(struct hastd_config *config) +{ + struct hastd_listen *lst; + struct hast_resource *res; + + while ((lst = TAILQ_FIRST(&depth0_listen)) != NULL) { + TAILQ_REMOVE(&depth0_listen, lst, hl_next); + free(lst); + } + while ((lst = TAILQ_FIRST(&config->hc_listen)) != NULL) { + TAILQ_REMOVE(&config->hc_listen, lst, hl_next); + free(lst); + } + while ((res = TAILQ_FIRST(&config->hc_resources)) != NULL) { + TAILQ_REMOVE(&config->hc_resources, res, hr_next); + free(res); + } + free(config); +} diff --git a/sbin/hastd/pjdlog.c b/sbin/hastd/pjdlog.c new file mode 100644 index 0000000..bc4018f --- /dev/null +++ b/sbin/hastd/pjdlog.c @@ -0,0 +1,614 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * Copyright (c) 2011 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +#include <assert.h> +#include <errno.h> +#include <libutil.h> +#include <printf.h> +#include <stdarg.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> + +#include "pjdlog.h" + +#define PJDLOG_NEVER_INITIALIZED 0 +#define PJDLOG_NOT_INITIALIZED 1 +#define PJDLOG_INITIALIZED 2 + +static int pjdlog_initialized = PJDLOG_NEVER_INITIALIZED; +static int pjdlog_mode, pjdlog_debug_level; +static char pjdlog_prefix[128]; + +static int +pjdlog_printf_arginfo_humanized_number(const struct printf_info *pi __unused, + size_t n, int *argt) +{ + + assert(n >= 1); + argt[0] = PA_INT | PA_FLAG_INTMAX; + return (1); +} + +static int +pjdlog_printf_render_humanized_number(struct __printf_io *io, + const struct printf_info *pi, const void * const *arg) +{ + char buf[5]; + intmax_t num; + int ret; + + num = *(const intmax_t *)arg[0]; + humanize_number(buf, sizeof(buf), (int64_t)num, "", HN_AUTOSCALE, + HN_NOSPACE | HN_DECIMAL); + ret = __printf_out(io, pi, buf, strlen(buf)); + __printf_flush(io); + return (ret); +} + +static int +pjdlog_printf_arginfo_sockaddr(const struct printf_info *pi __unused, + size_t n, int *argt) +{ + + assert(n >= 1); + argt[0] = PA_POINTER; + return (1); +} + +static int +pjdlog_printf_render_sockaddr(struct __printf_io *io, + const struct printf_info *pi, const void * const *arg) +{ + const struct sockaddr_storage *ss; + char buf[64]; + int ret; + + ss = *(const struct sockaddr_storage * const *)arg[0]; + switch (ss->ss_family) { + case AF_INET: + { + char addr[INET_ADDRSTRLEN]; + const struct sockaddr_in *sin; + unsigned int port; + + sin = (const struct sockaddr_in *)ss; + port = ntohs(sin->sin_port); + if (inet_ntop(ss->ss_family, &sin->sin_addr, addr, + sizeof(addr)) == NULL) { + PJDLOG_ABORT("inet_ntop(AF_INET) failed: %s.", + 
strerror(errno)); + } + snprintf(buf, sizeof(buf), "%s:%u", addr, port); + break; + } + case AF_INET6: + { + char addr[INET6_ADDRSTRLEN]; + const struct sockaddr_in6 *sin; + unsigned int port; + + sin = (const struct sockaddr_in6 *)ss; + port = ntohs(sin->sin6_port); + if (inet_ntop(ss->ss_family, &sin->sin6_addr, addr, + sizeof(addr)) == NULL) { + PJDLOG_ABORT("inet_ntop(AF_INET6) failed: %s.", + strerror(errno)); + } + snprintf(buf, sizeof(buf), "[%s]:%u", addr, port); + break; + } + default: + snprintf(buf, sizeof(buf), "[unsupported family %hhu]", + ss->ss_family); + break; + } + ret = __printf_out(io, pi, buf, strlen(buf)); + __printf_flush(io); + return (ret); +} + +/* + * Initialize logging in the given mode (PJDLOG_MODE_STD or + * PJDLOG_MODE_SYSLOG). The custom printf renderers are registered only on + * the very first initialization. The debug level is reset to 0 and the + * prefix is cleared. errno is preserved. + */ +void +pjdlog_init(int mode) +{ + int saved_errno; + + assert(pjdlog_initialized == PJDLOG_NEVER_INITIALIZED || + pjdlog_initialized == PJDLOG_NOT_INITIALIZED); + assert(mode == PJDLOG_MODE_STD || mode == PJDLOG_MODE_SYSLOG); + + saved_errno = errno; + + if (pjdlog_initialized == PJDLOG_NEVER_INITIALIZED) { + __use_xprintf = 1; + register_printf_render_std("T"); + register_printf_render('N', + pjdlog_printf_render_humanized_number, + pjdlog_printf_arginfo_humanized_number); + register_printf_render('S', + pjdlog_printf_render_sockaddr, + pjdlog_printf_arginfo_sockaddr); + } + + if (mode == PJDLOG_MODE_SYSLOG) + openlog(NULL, LOG_PID | LOG_NDELAY, LOG_DAEMON); + pjdlog_mode = mode; + pjdlog_debug_level = 0; + bzero(pjdlog_prefix, sizeof(pjdlog_prefix)); + + pjdlog_initialized = PJDLOG_INITIALIZED; + + errno = saved_errno; +} + +/* + * Shut logging down: close syslog when it was in use and mark the module as + * not initialized, so that pjdlog_init() may be called again. + * errno is preserved. + */ +void +pjdlog_fini(void) +{ + int saved_errno; + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + saved_errno = errno; + + if (pjdlog_mode == PJDLOG_MODE_SYSLOG) + closelog(); + + pjdlog_initialized = PJDLOG_NOT_INITIALIZED; + + errno = saved_errno; +} + +/* + * Configure where the logs should go. + * By default they are sent to stdout/stderr, but after going into background + * (eg. 
by calling daemon(3)) application is responsible for changing mode to + * PJDLOG_MODE_SYSLOG, so logs will be sent to syslog. + */ +void +pjdlog_mode_set(int mode) +{ + int saved_errno; + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + assert(mode == PJDLOG_MODE_STD || mode == PJDLOG_MODE_SYSLOG); + + if (pjdlog_mode == mode) + return; + + saved_errno = errno; + + if (mode == PJDLOG_MODE_SYSLOG) + openlog(NULL, LOG_PID | LOG_NDELAY, LOG_DAEMON); + else /* if (mode == PJDLOG_MODE_STD) */ + closelog(); + + pjdlog_mode = mode; + + errno = saved_errno; +} + +/* + * Return current mode. + */ +int +pjdlog_mode_get(void) +{ + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + return (pjdlog_mode); +} + +/* + * Set debug level. All the logs above the level specified here will be + * ignored. + */ +void +pjdlog_debug_set(int level) +{ + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + assert(level >= 0); + + pjdlog_debug_level = level; +} + +/* + * Return current debug level. + */ +int +pjdlog_debug_get(void) +{ + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + return (pjdlog_debug_level); +} + +/* + * Set prefix that will be used before each log. + * The format must not be NULL: pjdlogv_prefix_set(), which does the work, + * asserts that. + */ +void +pjdlog_prefix_set(const char *fmt, ...) +{ + va_list ap; + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + va_start(ap, fmt); + pjdlogv_prefix_set(fmt, ap); + va_end(ap); +} + +/* + * Set prefix that will be used before each log. + * The format must not be NULL (asserted below). The formatted prefix is + * truncated to the size of pjdlog_prefix. errno is preserved. + */ +void +pjdlogv_prefix_set(const char *fmt, va_list ap) +{ + int saved_errno; + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + assert(fmt != NULL); + + saved_errno = errno; + + vsnprintf(pjdlog_prefix, sizeof(pjdlog_prefix), fmt, ap); + + errno = saved_errno; +} + +/* + * Convert log level into string. 
+ */ +static const char * +pjdlog_level_string(int loglevel) +{ + + switch (loglevel) { + case LOG_EMERG: + return ("EMERG"); + case LOG_ALERT: + return ("ALERT"); + case LOG_CRIT: + return ("CRIT"); + case LOG_ERR: + return ("ERROR"); + case LOG_WARNING: + return ("WARNING"); + case LOG_NOTICE: + return ("NOTICE"); + case LOG_INFO: + return ("INFO"); + case LOG_DEBUG: + return ("DEBUG"); + } + assert(!"Invalid log level."); + abort(); /* XXX: gcc */ +} + +/* + * Common log routine. + */ +void +pjdlog_common(int loglevel, int debuglevel, int error, const char *fmt, ...) +{ + va_list ap; + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + va_start(ap, fmt); + pjdlogv_common(loglevel, debuglevel, error, fmt, ap); + va_end(ap); +} + +/* + * Common log routine, which can handle regular log level as well as debug + * level. We decide here where to send the logs (stdout/stderr or syslog). + */ +void +pjdlogv_common(int loglevel, int debuglevel, int error, const char *fmt, + va_list ap) +{ + int saved_errno; + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + assert(loglevel == LOG_EMERG || loglevel == LOG_ALERT || + loglevel == LOG_CRIT || loglevel == LOG_ERR || + loglevel == LOG_WARNING || loglevel == LOG_NOTICE || + loglevel == LOG_INFO || loglevel == LOG_DEBUG); + assert(loglevel != LOG_DEBUG || debuglevel > 0); + assert(error >= -1); + + /* Ignore debug above configured level. */ + if (loglevel == LOG_DEBUG && debuglevel > pjdlog_debug_level) + return; + + saved_errno = errno; + + switch (pjdlog_mode) { + case PJDLOG_MODE_STD: + { + FILE *out; + + /* + * We send errors and warning to stderr and the rest to stdout. 
+ */ + switch (loglevel) { + case LOG_EMERG: + case LOG_ALERT: + case LOG_CRIT: + case LOG_ERR: + case LOG_WARNING: + out = stderr; + break; + case LOG_NOTICE: + case LOG_INFO: + case LOG_DEBUG: + out = stdout; + break; + default: + assert(!"Invalid loglevel."); + abort(); /* XXX: gcc */ + } + + fprintf(out, "[%s]", pjdlog_level_string(loglevel)); + /* Attach debuglevel if this is debug log. */ + if (loglevel == LOG_DEBUG) + fprintf(out, "[%d]", debuglevel); + fprintf(out, " %s", pjdlog_prefix); + vfprintf(out, fmt, ap); + if (error != -1) + fprintf(out, ": %s.", strerror(error)); + fprintf(out, "\n"); + fflush(out); + break; + } + case PJDLOG_MODE_SYSLOG: + { + char log[1024]; + int len; + + len = snprintf(log, sizeof(log), "%s", pjdlog_prefix); + if ((size_t)len < sizeof(log)) + len += vsnprintf(log + len, sizeof(log) - len, fmt, ap); + if (error != -1 && (size_t)len < sizeof(log)) { + (void)snprintf(log + len, sizeof(log) - len, ": %s.", + strerror(error)); + } + syslog(loglevel, "%s", log); + break; + } + default: + assert(!"Invalid mode."); + } + + errno = saved_errno; +} + +/* + * Regular logs. + */ +void +pjdlogv(int loglevel, const char *fmt, va_list ap) +{ + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + /* LOG_DEBUG is invalid here, pjdlogv?_debug() should be used. */ + assert(loglevel == LOG_EMERG || loglevel == LOG_ALERT || + loglevel == LOG_CRIT || loglevel == LOG_ERR || + loglevel == LOG_WARNING || loglevel == LOG_NOTICE || + loglevel == LOG_INFO); + + pjdlogv_common(loglevel, 0, -1, fmt, ap); +} + +/* + * Regular logs. + */ +void +pjdlog(int loglevel, const char *fmt, ...) +{ + va_list ap; + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + va_start(ap, fmt); + pjdlogv(loglevel, fmt, ap); + va_end(ap); +} + +/* + * Debug logs. + */ +void +pjdlogv_debug(int debuglevel, const char *fmt, va_list ap) +{ + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + pjdlogv_common(LOG_DEBUG, debuglevel, -1, fmt, ap); +} + +/* + * Debug logs. 
+ */ +void +pjdlog_debug(int debuglevel, const char *fmt, ...) +{ + va_list ap; + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + va_start(ap, fmt); + pjdlogv_debug(debuglevel, fmt, ap); + va_end(ap); +} + +/* + * Error logs with errno logging. + */ +void +pjdlogv_errno(int loglevel, const char *fmt, va_list ap) +{ + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + pjdlogv_common(loglevel, 0, errno, fmt, ap); +} + +/* + * Error logs with errno logging. + */ +void +pjdlog_errno(int loglevel, const char *fmt, ...) +{ + va_list ap; + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + va_start(ap, fmt); + pjdlogv_errno(loglevel, fmt, ap); + va_end(ap); +} + +/* + * Log error, errno and exit. + */ +void +pjdlogv_exit(int exitcode, const char *fmt, va_list ap) +{ + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + pjdlogv_errno(LOG_ERR, fmt, ap); + exit(exitcode); + /* NOTREACHED */ +} + +/* + * Log error, errno and exit. + */ +void +pjdlog_exit(int exitcode, const char *fmt, ...) +{ + va_list ap; + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + va_start(ap, fmt); + pjdlogv_exit(exitcode, fmt, ap); + /* NOTREACHED */ + va_end(ap); +} + +/* + * Log error and exit. + */ +void +pjdlogv_exitx(int exitcode, const char *fmt, va_list ap) +{ + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + pjdlogv(LOG_ERR, fmt, ap); + exit(exitcode); + /* NOTREACHED */ +} + +/* + * Log error and exit. + */ +void +pjdlog_exitx(int exitcode, const char *fmt, ...) +{ + va_list ap; + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + va_start(ap, fmt); + pjdlogv_exitx(exitcode, fmt, ap); + /* NOTREACHED */ + va_end(ap); +} + +/* + * Log failure message and exit. + */ +void +pjdlog_abort(const char *func, const char *file, int line, + const char *failedexpr, const char *fmt, ...) +{ + va_list ap; + + assert(pjdlog_initialized == PJDLOG_INITIALIZED); + + /* + * When there is no message we pass __func__ as 'fmt'. 
+ * It would be cleaner to pass NULL or "", but gcc generates a warning + * for both of those. + */ + if (fmt != func) { + va_start(ap, fmt); + pjdlogv_critical(fmt, ap); + va_end(ap); + } + if (failedexpr == NULL) { + if (func == NULL) { + pjdlog_critical("Aborted at file %s, line %d.", file, + line); + } else { + pjdlog_critical("Aborted at function %s, file %s, line %d.", + func, file, line); + } + } else { + if (func == NULL) { + pjdlog_critical("Assertion failed: (%s), file %s, line %d.", + failedexpr, file, line); + } else { + pjdlog_critical("Assertion failed: (%s), function %s, file %s, line %d.", + failedexpr, func, file, line); + } + } + abort(); +} diff --git a/sbin/hastd/pjdlog.h b/sbin/hastd/pjdlog.h new file mode 100644 index 0000000..0f01f79 --- /dev/null +++ b/sbin/hastd/pjdlog.h @@ -0,0 +1,117 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * Copyright (c) 2011 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PJDLOG_H_ +#define _PJDLOG_H_ + +#include <sys/cdefs.h> + +#include <stdarg.h> +#include <sysexits.h> +#include <syslog.h> + +#define PJDLOG_MODE_STD 0 +#define PJDLOG_MODE_SYSLOG 1 + +void pjdlog_init(int mode); +void pjdlog_fini(void); + +void pjdlog_mode_set(int mode); +int pjdlog_mode_get(void); + +void pjdlog_debug_set(int level); +int pjdlog_debug_get(void); + +void pjdlog_prefix_set(const char *fmt, ...) __printflike(1, 2); +void pjdlogv_prefix_set(const char *fmt, va_list ap) __printflike(1, 0); + +void pjdlog_common(int loglevel, int debuglevel, int error, const char *fmt, + ...) __printflike(4, 5); +void pjdlogv_common(int loglevel, int debuglevel, int error, const char *fmt, + va_list ap) __printflike(4, 0); + +void pjdlog(int loglevel, const char *fmt, ...) __printflike(2, 3); +void pjdlogv(int loglevel, const char *fmt, va_list ap) __printflike(2, 0); + +#define pjdlogv_emergency(fmt, ap) pjdlogv(LOG_EMERG, (fmt), (ap)) +#define pjdlog_emergency(...) pjdlog(LOG_EMERG, __VA_ARGS__) +#define pjdlogv_alert(fmt, ap) pjdlogv(LOG_ALERT, (fmt), (ap)) +#define pjdlog_alert(...) pjdlog(LOG_ALERT, __VA_ARGS__) +#define pjdlogv_critical(fmt, ap) pjdlogv(LOG_CRIT, (fmt), (ap)) +#define pjdlog_critical(...) pjdlog(LOG_CRIT, __VA_ARGS__) +#define pjdlogv_error(fmt, ap) pjdlogv(LOG_ERR, (fmt), (ap)) +#define pjdlog_error(...) 
pjdlog(LOG_ERR, __VA_ARGS__) +#define pjdlogv_warning(fmt, ap) pjdlogv(LOG_WARNING, (fmt), (ap)) +#define pjdlog_warning(...) pjdlog(LOG_WARNING, __VA_ARGS__) +#define pjdlogv_notice(fmt, ap) pjdlogv(LOG_NOTICE, (fmt), (ap)) +#define pjdlog_notice(...) pjdlog(LOG_NOTICE, __VA_ARGS__) +#define pjdlogv_info(fmt, ap) pjdlogv(LOG_INFO, (fmt), (ap)) +#define pjdlog_info(...) pjdlog(LOG_INFO, __VA_ARGS__) + +void pjdlog_debug(int debuglevel, const char *fmt, ...) __printflike(2, 3); +void pjdlogv_debug(int debuglevel, const char *fmt, va_list ap) __printflike(2, 0); + +void pjdlog_errno(int loglevel, const char *fmt, ...) __printflike(2, 3); +void pjdlogv_errno(int loglevel, const char *fmt, va_list ap) __printflike(2, 0); + +void pjdlog_exit(int exitcode, const char *fmt, ...) __printflike(2, 3) __dead2; +void pjdlogv_exit(int exitcode, const char *fmt, va_list ap) __printflike(2, 0) __dead2; + +void pjdlog_exitx(int exitcode, const char *fmt, ...) __printflike(2, 3) __dead2; +void pjdlogv_exitx(int exitcode, const char *fmt, va_list ap) __printflike(2, 0) __dead2; + +void pjdlog_abort(const char *func, const char *file, int line, + const char *failedexpr, const char *fmt, ...) __printflike(5, 6) __dead2; + +#define PJDLOG_VERIFY(expr) do { \ + if (!(expr)) { \ + pjdlog_abort(__func__, __FILE__, __LINE__, #expr, \ + __func__); \ + } \ +} while (0) +#define PJDLOG_RVERIFY(expr, ...) do { \ + if (!(expr)) { \ + pjdlog_abort(__func__, __FILE__, __LINE__, #expr, \ + __VA_ARGS__); \ + } \ +} while (0) +#define PJDLOG_ABORT(...) pjdlog_abort(__func__, __FILE__, \ + __LINE__, NULL, __VA_ARGS__) +#ifdef NDEBUG +#define PJDLOG_ASSERT(expr) do { } while (0) +#define PJDLOG_RASSERT(...) do { } while (0) +#else +#define PJDLOG_ASSERT(expr) PJDLOG_VERIFY(expr) +#define PJDLOG_RASSERT(...) 
PJDLOG_RVERIFY(__VA_ARGS__) +#endif + +#endif /* !_PJDLOG_H_ */ diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c new file mode 100644 index 0000000..09ae17b --- /dev/null +++ b/sbin/hastd/primary.c @@ -0,0 +1,2477 @@ +/*- + * Copyright (c) 2009 The FreeBSD Foundation + * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/bio.h> +#include <sys/disk.h> +#include <sys/stat.h> + +#include <geom/gate/g_gate.h> + +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <libgeom.h> +#include <pthread.h> +#include <signal.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> + +#include <activemap.h> +#include <nv.h> +#include <rangelock.h> + +#include "control.h" +#include "event.h" +#include "hast.h" +#include "hast_proto.h" +#include "hastd.h" +#include "hooks.h" +#include "metadata.h" +#include "proto.h" +#include "pjdlog.h" +#include "refcnt.h" +#include "subr.h" +#include "synch.h" + +/* There is only one remote component for now: component number 1. */ +#define ISREMOTE(no) ((no) == 1) + +struct hio { + /* + * Number of components we are still waiting for. + * When this field goes to 0, we can send the request back to the + * kernel. Each component has to decrease this counter by one + * even on failure. + */ + refcnt_t hio_countdown; + /* + * Each component has a place to store its own error. + * Once the request is handled by all components we can decide if the + * request overall is successful or not. + */ + int *hio_errors; + /* + * Structure used to communicate with GEOM Gate class. + */ + struct g_gate_ctl_io hio_ggio; + /* + * Request was already confirmed to GEOM Gate. + */ + bool hio_done; + /* + * Remember replication from the time the request was initiated, + * so we won't get confused when replication changes on reload. + */ + int hio_replication; + /* + * Queue linkage, used as an array indexed by component number. + * The hio_free_next and hio_done_next names below alias entry 0. + */ + TAILQ_ENTRY(hio) *hio_next; +}; +#define hio_free_next hio_next[0] +#define hio_done_next hio_next[0] + +/* + * Free list holds unused structures. When free list is empty, we have to wait + * until some in-progress requests are freed. 
+ */ +static TAILQ_HEAD(, hio) hio_free_list; +static pthread_mutex_t hio_free_list_lock; +static pthread_cond_t hio_free_list_cond; +/* + * There is one send list for every component. One request is placed on all + * send lists - each component gets the same request, but each component is + * responsible for managing its own send list. + */ +static TAILQ_HEAD(, hio) *hio_send_list; +static pthread_mutex_t *hio_send_list_lock; +static pthread_cond_t *hio_send_list_cond; +/* + * There is one recv list for every component, although local components don't + * use recv lists as local requests are done synchronously. + */ +static TAILQ_HEAD(, hio) *hio_recv_list; +static pthread_mutex_t *hio_recv_list_lock; +static pthread_cond_t *hio_recv_list_cond; +/* + * Request is placed on done list by the slowest component (the one that + * decreased hio_countdown from 1 to 0). + */ +static TAILQ_HEAD(, hio) hio_done_list; +static pthread_mutex_t hio_done_list_lock; +static pthread_cond_t hio_done_list_cond; +/* + * The structures below are for interaction with the sync thread. + */ +static bool sync_inprogress; +static pthread_mutex_t sync_lock; +static pthread_cond_t sync_cond; +/* + * The lock below synchronizes access to remote connections. + */ +static pthread_rwlock_t *hio_remote_lock; + +/* + * Lock to synchronize metadata updates. Also synchronizes access to + * hr_primary_localcnt and hr_primary_remotecnt fields. + */ +static pthread_mutex_t metadata_lock; + +/* + * Maximum number of outstanding I/O requests. + */ +#define HAST_HIO_MAX 256 +/* + * Number of components. At this point there are only two components: local + * and remote, but in the future it might be possible to use multiple local + * and remote components. 
+ */ +#define HAST_NCOMPONENTS 2 + +/* + * True when both remote connections (hr_remotein and hr_remoteout) are set. + * The component number 'no' is accepted but not consulted. + */ +#define ISCONNECTED(res, no) \ + ((res)->hr_remotein != NULL && (res)->hr_remoteout != NULL) + +/* + * Append 'hio' to the tail of the per-component list hio_<name>_list[ncomp] + * while holding that list's lock. Waiters on the list's condition variable + * are woken only when the list was empty before the insertion. + */ +#define QUEUE_INSERT1(hio, name, ncomp) do { \ + bool _wakeup; \ + \ + mtx_lock(&hio_##name##_list_lock[(ncomp)]); \ + _wakeup = TAILQ_EMPTY(&hio_##name##_list[(ncomp)]); \ + TAILQ_INSERT_TAIL(&hio_##name##_list[(ncomp)], (hio), \ + hio_next[(ncomp)]); \ + mtx_unlock(&hio_##name##_list_lock[(ncomp)]); \ + if (_wakeup) \ + cv_broadcast(&hio_##name##_list_cond[(ncomp)]); \ +} while (0) +/* + * Same as QUEUE_INSERT1, but for the single (not per-component) + * hio_<name>_list. + */ +#define QUEUE_INSERT2(hio, name) do { \ + bool _wakeup; \ + \ + mtx_lock(&hio_##name##_list_lock); \ + _wakeup = TAILQ_EMPTY(&hio_##name##_list); \ + TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_##name##_next);\ + mtx_unlock(&hio_##name##_list_lock); \ + if (_wakeup) \ + cv_broadcast(&hio_##name##_list_cond); \ +} while (0) +/* + * Remove the first request from hio_<name>_list[ncomp] and store it in 'hio'. + * While the list is empty the macro sleeps in cv_timedwait(); with a non-zero + * 'timeout' it sleeps at most once, so 'hio' may be left NULL. + */ +#define QUEUE_TAKE1(hio, name, ncomp, timeout) do { \ + bool _last; \ + \ + mtx_lock(&hio_##name##_list_lock[(ncomp)]); \ + _last = false; \ + while (((hio) = TAILQ_FIRST(&hio_##name##_list[(ncomp)])) == NULL && !_last) { \ + cv_timedwait(&hio_##name##_list_cond[(ncomp)], \ + &hio_##name##_list_lock[(ncomp)], (timeout)); \ + if ((timeout) != 0) \ + _last = true; \ + } \ + if ((hio) != NULL) { \ + TAILQ_REMOVE(&hio_##name##_list[(ncomp)], (hio), \ + hio_next[(ncomp)]); \ + } \ + mtx_unlock(&hio_##name##_list_lock[(ncomp)]); \ +} while (0) +/* + * Remove the first request from hio_<name>_list and store it in 'hio', + * sleeping in cv_wait() until the list is not empty. + */ +#define QUEUE_TAKE2(hio, name) do { \ + mtx_lock(&hio_##name##_list_lock); \ + while (((hio) = TAILQ_FIRST(&hio_##name##_list)) == NULL) { \ + cv_wait(&hio_##name##_list_cond, \ + &hio_##name##_list_lock); \ + } \ + TAILQ_REMOVE(&hio_##name##_list, (hio), hio_##name##_next); \ + mtx_unlock(&hio_##name##_list_lock); \ +} while (0) + +/* + * SYNCREQ tags a request by setting gctl_unit to -1 (and gctl_seq to 1); + * SYNCREQDONE retags it with -2. The IS* macros test for those values. + */ +#define SYNCREQ(hio) do { \ + (hio)->hio_ggio.gctl_unit = -1; \ + (hio)->hio_ggio.gctl_seq = 1; \ +} while (0) +#define ISSYNCREQ(hio) ((hio)->hio_ggio.gctl_unit == -1) +#define SYNCREQDONE(hio) do { (hio)->hio_ggio.gctl_unit = -2; } while (0) +#define ISSYNCREQDONE(hio) 
((hio)->hio_ggio.gctl_unit == -2) + +static struct hast_resource *gres; + +static pthread_mutex_t range_lock; +static struct rangelocks *range_regular; +static bool range_regular_wait; +static pthread_cond_t range_regular_cond; +static struct rangelocks *range_sync; +static bool range_sync_wait; +static pthread_cond_t range_sync_cond; +static bool fullystarted; + +static void *ggate_recv_thread(void *arg); +static void *local_send_thread(void *arg); +static void *remote_send_thread(void *arg); +static void *remote_recv_thread(void *arg); +static void *ggate_send_thread(void *arg); +static void *sync_thread(void *arg); +static void *guard_thread(void *arg); + +static void +cleanup(struct hast_resource *res) +{ + int rerrno; + + /* Remember errno. */ + rerrno = errno; + + /* Destroy ggate provider if we created one. */ + if (res->hr_ggateunit >= 0) { + struct g_gate_ctl_destroy ggiod; + + bzero(&ggiod, sizeof(ggiod)); + ggiod.gctl_version = G_GATE_VERSION; + ggiod.gctl_unit = res->hr_ggateunit; + ggiod.gctl_force = 1; + if (ioctl(res->hr_ggatefd, G_GATE_CMD_DESTROY, &ggiod) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to destroy hast/%s device", + res->hr_provname); + } + res->hr_ggateunit = -1; + } + + /* Restore errno. */ + errno = rerrno; +} + +static __dead2 void +primary_exit(int exitcode, const char *fmt, ...) +{ + va_list ap; + + PJDLOG_ASSERT(exitcode != EX_OK); + va_start(ap, fmt); + pjdlogv_errno(LOG_ERR, fmt, ap); + va_end(ap); + cleanup(gres); + exit(exitcode); +} + +static __dead2 void +primary_exitx(int exitcode, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + pjdlogv(exitcode == EX_OK ? LOG_INFO : LOG_ERR, fmt, ap); + va_end(ap); + cleanup(gres); + exit(exitcode); +} + +/* Expects res->hr_amp locked, returns unlocked. 
*/ +static int +hast_activemap_flush(struct hast_resource *res) +{ + const unsigned char *buf; + size_t size; + int ret; + + mtx_lock(&res->hr_amp_diskmap_lock); + buf = activemap_bitmap(res->hr_amp, &size); + mtx_unlock(&res->hr_amp_lock); + PJDLOG_ASSERT(buf != NULL); + PJDLOG_ASSERT((size % res->hr_local_sectorsize) == 0); + ret = 0; + if (pwrite(res->hr_localfd, buf, size, METADATA_SIZE) != + (ssize_t)size) { + pjdlog_errno(LOG_ERR, "Unable to flush activemap to disk"); + res->hr_stat_activemap_write_error++; + ret = -1; + } + if (ret == 0 && res->hr_metaflush == 1 && + g_flush(res->hr_localfd) == -1) { + if (errno == EOPNOTSUPP) { + pjdlog_warning("The %s provider doesn't support flushing write cache. Disabling it.", + res->hr_localpath); + res->hr_metaflush = 0; + } else { + pjdlog_errno(LOG_ERR, + "Unable to flush disk cache on activemap update"); + res->hr_stat_activemap_flush_error++; + ret = -1; + } + } + mtx_unlock(&res->hr_amp_diskmap_lock); + return (ret); +} + +static bool +real_remote(const struct hast_resource *res) +{ + + return (strcmp(res->hr_remoteaddr, "none") != 0); +} + +static void +init_environment(struct hast_resource *res __unused) +{ + struct hio *hio; + unsigned int ii, ncomps; + + /* + * In the future it might be per-resource value. + */ + ncomps = HAST_NCOMPONENTS; + + /* + * Allocate memory needed by lists. 
+ */ + hio_send_list = malloc(sizeof(hio_send_list[0]) * ncomps); + if (hio_send_list == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for send lists.", + sizeof(hio_send_list[0]) * ncomps); + } + hio_send_list_lock = malloc(sizeof(hio_send_list_lock[0]) * ncomps); + if (hio_send_list_lock == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for send list locks.", + sizeof(hio_send_list_lock[0]) * ncomps); + } + hio_send_list_cond = malloc(sizeof(hio_send_list_cond[0]) * ncomps); + if (hio_send_list_cond == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for send list condition variables.", + sizeof(hio_send_list_cond[0]) * ncomps); + } + hio_recv_list = malloc(sizeof(hio_recv_list[0]) * ncomps); + if (hio_recv_list == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for recv lists.", + sizeof(hio_recv_list[0]) * ncomps); + } + hio_recv_list_lock = malloc(sizeof(hio_recv_list_lock[0]) * ncomps); + if (hio_recv_list_lock == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for recv list locks.", + sizeof(hio_recv_list_lock[0]) * ncomps); + } + hio_recv_list_cond = malloc(sizeof(hio_recv_list_cond[0]) * ncomps); + if (hio_recv_list_cond == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for recv list condition variables.", + sizeof(hio_recv_list_cond[0]) * ncomps); + } + hio_remote_lock = malloc(sizeof(hio_remote_lock[0]) * ncomps); + if (hio_remote_lock == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for remote connections locks.", + sizeof(hio_remote_lock[0]) * ncomps); + } + + /* + * Initialize lists, their locks and theirs condition variables. 
+ */ + TAILQ_INIT(&hio_free_list); + mtx_init(&hio_free_list_lock); + cv_init(&hio_free_list_cond); + for (ii = 0; ii < HAST_NCOMPONENTS; ii++) { + TAILQ_INIT(&hio_send_list[ii]); + mtx_init(&hio_send_list_lock[ii]); + cv_init(&hio_send_list_cond[ii]); + TAILQ_INIT(&hio_recv_list[ii]); + mtx_init(&hio_recv_list_lock[ii]); + cv_init(&hio_recv_list_cond[ii]); + rw_init(&hio_remote_lock[ii]); + } + TAILQ_INIT(&hio_done_list); + mtx_init(&hio_done_list_lock); + cv_init(&hio_done_list_cond); + mtx_init(&metadata_lock); + + /* + * Allocate requests pool and initialize requests. + */ + for (ii = 0; ii < HAST_HIO_MAX; ii++) { + hio = malloc(sizeof(*hio)); + if (hio == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for hio request.", + sizeof(*hio)); + } + refcnt_init(&hio->hio_countdown, 0); + hio->hio_errors = malloc(sizeof(hio->hio_errors[0]) * ncomps); + if (hio->hio_errors == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable allocate %zu bytes of memory for hio errors.", + sizeof(hio->hio_errors[0]) * ncomps); + } + hio->hio_next = malloc(sizeof(hio->hio_next[0]) * ncomps); + if (hio->hio_next == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable allocate %zu bytes of memory for hio_next field.", + sizeof(hio->hio_next[0]) * ncomps); + } + hio->hio_ggio.gctl_version = G_GATE_VERSION; + hio->hio_ggio.gctl_data = malloc(MAXPHYS); + if (hio->hio_ggio.gctl_data == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for gctl_data.", + MAXPHYS); + } + hio->hio_ggio.gctl_length = MAXPHYS; + hio->hio_ggio.gctl_error = 0; + TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_free_next); + } +} + +static bool +init_resuid(struct hast_resource *res) +{ + + mtx_lock(&metadata_lock); + if (res->hr_resuid != 0) { + mtx_unlock(&metadata_lock); + return (false); + } else { + /* Initialize unique resource identifier. 
*/ + arc4random_buf(&res->hr_resuid, sizeof(res->hr_resuid)); + mtx_unlock(&metadata_lock); + if (metadata_write(res) == -1) + exit(EX_NOINPUT); + return (true); + } +} + +static void +init_local(struct hast_resource *res) +{ + unsigned char *buf; + size_t mapsize; + + if (metadata_read(res, true) == -1) + exit(EX_NOINPUT); + mtx_init(&res->hr_amp_lock); + if (activemap_init(&res->hr_amp, res->hr_datasize, res->hr_extentsize, + res->hr_local_sectorsize, res->hr_keepdirty) == -1) { + primary_exit(EX_TEMPFAIL, "Unable to create activemap"); + } + mtx_init(&range_lock); + cv_init(&range_regular_cond); + if (rangelock_init(&range_regular) == -1) + primary_exit(EX_TEMPFAIL, "Unable to create regular range lock"); + cv_init(&range_sync_cond); + if (rangelock_init(&range_sync) == -1) + primary_exit(EX_TEMPFAIL, "Unable to create sync range lock"); + mapsize = activemap_ondisk_size(res->hr_amp); + buf = calloc(1, mapsize); + if (buf == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate buffer for activemap."); + } + if (pread(res->hr_localfd, buf, mapsize, METADATA_SIZE) != + (ssize_t)mapsize) { + primary_exit(EX_NOINPUT, "Unable to read activemap"); + } + activemap_copyin(res->hr_amp, buf, mapsize); + free(buf); + if (res->hr_resuid != 0) + return; + /* + * We're using provider for the first time. Initialize local and remote + * counters. We don't initialize resuid here, as we want to do it just + * in time. The reason for this is that we want to inform secondary + * that there were no writes yet, so there is no need to synchronize + * anything. 
+ */ + res->hr_primary_localcnt = 0; + res->hr_primary_remotecnt = 0; + if (metadata_write(res) == -1) + exit(EX_NOINPUT); +} + +static int +primary_connect(struct hast_resource *res, struct proto_conn **connp) +{ + struct proto_conn *conn; + int16_t val; + + val = 1; + if (proto_send(res->hr_conn, &val, sizeof(val)) == -1) { + primary_exit(EX_TEMPFAIL, + "Unable to send connection request to parent"); + } + if (proto_recv(res->hr_conn, &val, sizeof(val)) == -1) { + primary_exit(EX_TEMPFAIL, + "Unable to receive reply to connection request from parent"); + } + if (val != 0) { + errno = val; + pjdlog_errno(LOG_WARNING, "Unable to connect to %s", + res->hr_remoteaddr); + return (-1); + } + if (proto_connection_recv(res->hr_conn, true, &conn) == -1) { + primary_exit(EX_TEMPFAIL, + "Unable to receive connection from parent"); + } + if (proto_connect_wait(conn, res->hr_timeout) == -1) { + pjdlog_errno(LOG_WARNING, "Unable to connect to %s", + res->hr_remoteaddr); + proto_close(conn); + return (-1); + } + /* Error in setting timeout is not critical, but why should it fail? */ + if (proto_timeout(conn, res->hr_timeout) == -1) + pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); + + *connp = conn; + + return (0); +} + +/* + * Function instructs GEOM_GATE to handle reads directly from within the kernel. 
+ */ +static void +enable_direct_reads(struct hast_resource *res) +{ + struct g_gate_ctl_modify ggiomodify; + + bzero(&ggiomodify, sizeof(ggiomodify)); + ggiomodify.gctl_version = G_GATE_VERSION; + ggiomodify.gctl_unit = res->hr_ggateunit; + ggiomodify.gctl_modify = GG_MODIFY_READPROV | GG_MODIFY_READOFFSET; + strlcpy(ggiomodify.gctl_readprov, res->hr_localpath, + sizeof(ggiomodify.gctl_readprov)); + ggiomodify.gctl_readoffset = res->hr_localoff; + if (ioctl(res->hr_ggatefd, G_GATE_CMD_MODIFY, &ggiomodify) == 0) + pjdlog_debug(1, "Direct reads enabled."); + else + pjdlog_errno(LOG_WARNING, "Failed to enable direct reads"); +} + +static int +init_remote(struct hast_resource *res, struct proto_conn **inp, + struct proto_conn **outp) +{ + struct proto_conn *in, *out; + struct nv *nvout, *nvin; + const unsigned char *token; + unsigned char *map; + const char *errmsg; + int32_t extentsize; + int64_t datasize; + uint32_t mapsize; + uint8_t version; + size_t size; + int error; + + PJDLOG_ASSERT((inp == NULL && outp == NULL) || (inp != NULL && outp != NULL)); + PJDLOG_ASSERT(real_remote(res)); + + in = out = NULL; + errmsg = NULL; + + if (primary_connect(res, &out) == -1) + return (ECONNREFUSED); + + error = ECONNABORTED; + + /* + * First handshake step. + * Setup outgoing connection with remote node. 
+ */ + nvout = nv_alloc(); + nv_add_string(nvout, res->hr_name, "resource"); + nv_add_uint8(nvout, HAST_PROTO_VERSION, "version"); + if (nv_error(nvout) != 0) { + pjdlog_common(LOG_WARNING, 0, nv_error(nvout), + "Unable to allocate header for connection with %s", + res->hr_remoteaddr); + nv_free(nvout); + goto close; + } + if (hast_proto_send(res, out, nvout, NULL, 0) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to send handshake header to %s", + res->hr_remoteaddr); + nv_free(nvout); + goto close; + } + nv_free(nvout); + if (hast_proto_recv_hdr(out, &nvin) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to receive handshake header from %s", + res->hr_remoteaddr); + goto close; + } + errmsg = nv_get_string(nvin, "errmsg"); + if (errmsg != NULL) { + pjdlog_warning("%s", errmsg); + if (nv_exists(nvin, "wait")) + error = EBUSY; + nv_free(nvin); + goto close; + } + version = nv_get_uint8(nvin, "version"); + if (version == 0) { + /* + * If no version is sent, it means this is protocol version 1. + */ + version = 1; + } + if (version > HAST_PROTO_VERSION) { + pjdlog_warning("Invalid version received (%hhu).", version); + nv_free(nvin); + goto close; + } + res->hr_version = version; + pjdlog_debug(1, "Negotiated protocol version %d.", res->hr_version); + token = nv_get_uint8_array(nvin, &size, "token"); + if (token == NULL) { + pjdlog_warning("Handshake header from %s has no 'token' field.", + res->hr_remoteaddr); + nv_free(nvin); + goto close; + } + if (size != sizeof(res->hr_token)) { + pjdlog_warning("Handshake header from %s contains 'token' of wrong size (got %zu, expected %zu).", + res->hr_remoteaddr, size, sizeof(res->hr_token)); + nv_free(nvin); + goto close; + } + bcopy(token, res->hr_token, sizeof(res->hr_token)); + nv_free(nvin); + + /* + * Second handshake step. + * Setup incoming connection with remote node. 
+ */ + if (primary_connect(res, &in) == -1) + goto close; + + nvout = nv_alloc(); + nv_add_string(nvout, res->hr_name, "resource"); + nv_add_uint8_array(nvout, res->hr_token, sizeof(res->hr_token), + "token"); + if (res->hr_resuid == 0) { + /* + * The resuid field was not yet initialized. + * Because we do synchronization inside init_resuid(), it is + * possible that someone already initialized it, the function + * will return false then, but if we successfully initialized + * it, we will get true. True means that there were no writes + * to this resource yet and we want to inform secondary that + * synchronization is not needed by sending "virgin" argument. + */ + if (init_resuid(res)) + nv_add_int8(nvout, 1, "virgin"); + } + nv_add_uint64(nvout, res->hr_resuid, "resuid"); + nv_add_uint64(nvout, res->hr_primary_localcnt, "localcnt"); + nv_add_uint64(nvout, res->hr_primary_remotecnt, "remotecnt"); + if (nv_error(nvout) != 0) { + pjdlog_common(LOG_WARNING, 0, nv_error(nvout), + "Unable to allocate header for connection with %s", + res->hr_remoteaddr); + nv_free(nvout); + goto close; + } + if (hast_proto_send(res, in, nvout, NULL, 0) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to send handshake header to %s", + res->hr_remoteaddr); + nv_free(nvout); + goto close; + } + nv_free(nvout); + if (hast_proto_recv_hdr(out, &nvin) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to receive handshake header from %s", + res->hr_remoteaddr); + goto close; + } + errmsg = nv_get_string(nvin, "errmsg"); + if (errmsg != NULL) { + pjdlog_warning("%s", errmsg); + nv_free(nvin); + goto close; + } + datasize = nv_get_int64(nvin, "datasize"); + if (datasize != res->hr_datasize) { + pjdlog_warning("Data size differs between nodes (local=%jd, remote=%jd).", + (intmax_t)res->hr_datasize, (intmax_t)datasize); + nv_free(nvin); + goto close; + } + extentsize = nv_get_int32(nvin, "extentsize"); + if (extentsize != res->hr_extentsize) { + pjdlog_warning("Extent size differs between nodes 
(local=%zd, remote=%zd).", + (ssize_t)res->hr_extentsize, (ssize_t)extentsize); + nv_free(nvin); + goto close; + } + res->hr_secondary_localcnt = nv_get_uint64(nvin, "localcnt"); + res->hr_secondary_remotecnt = nv_get_uint64(nvin, "remotecnt"); + res->hr_syncsrc = nv_get_uint8(nvin, "syncsrc"); + if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) + enable_direct_reads(res); + if (nv_exists(nvin, "virgin")) { + /* + * Secondary was reinitialized, bump localcnt if it is 0 as + * only we have the data. + */ + PJDLOG_ASSERT(res->hr_syncsrc == HAST_SYNCSRC_PRIMARY); + PJDLOG_ASSERT(res->hr_secondary_localcnt == 0); + + if (res->hr_primary_localcnt == 0) { + PJDLOG_ASSERT(res->hr_secondary_remotecnt == 0); + + mtx_lock(&metadata_lock); + res->hr_primary_localcnt++; + pjdlog_debug(1, "Increasing localcnt to %ju.", + (uintmax_t)res->hr_primary_localcnt); + (void)metadata_write(res); + mtx_unlock(&metadata_lock); + } + } + map = NULL; + mapsize = nv_get_uint32(nvin, "mapsize"); + if (mapsize > 0) { + map = malloc(mapsize); + if (map == NULL) { + pjdlog_error("Unable to allocate memory for remote activemap (mapsize=%ju).", + (uintmax_t)mapsize); + nv_free(nvin); + goto close; + } + /* + * Remote node have some dirty extents on its own, lets + * download its activemap. + */ + if (hast_proto_recv_data(res, out, nvin, map, + mapsize) == -1) { + pjdlog_errno(LOG_ERR, + "Unable to receive remote activemap"); + nv_free(nvin); + free(map); + goto close; + } + /* + * Merge local and remote bitmaps. + */ + activemap_merge(res->hr_amp, map, mapsize); + free(map); + /* + * Now that we merged bitmaps from both nodes, flush it to the + * disk before we start to synchronize. + */ + mtx_lock(&res->hr_amp_lock); + (void)hast_activemap_flush(res); + } + nv_free(nvin); +#ifdef notyet + /* Setup directions. 
*/ + if (proto_send(out, NULL, 0) == -1) + pjdlog_errno(LOG_WARNING, "Unable to set connection direction"); + if (proto_recv(in, NULL, 0) == -1) + pjdlog_errno(LOG_WARNING, "Unable to set connection direction"); +#endif + pjdlog_info("Connected to %s.", res->hr_remoteaddr); + if (res->hr_original_replication == HAST_REPLICATION_MEMSYNC && + res->hr_version < 2) { + pjdlog_warning("The 'memsync' replication mode is not supported by the remote node, falling back to 'fullsync' mode."); + res->hr_replication = HAST_REPLICATION_FULLSYNC; + } else if (res->hr_replication != res->hr_original_replication) { + /* + * This is in case hastd disconnected and was upgraded. + */ + res->hr_replication = res->hr_original_replication; + } + if (inp != NULL && outp != NULL) { + *inp = in; + *outp = out; + } else { + res->hr_remotein = in; + res->hr_remoteout = out; + } + event_send(res, EVENT_CONNECT); + return (0); +close: + if (errmsg != NULL && strcmp(errmsg, "Split-brain condition!") == 0) + event_send(res, EVENT_SPLITBRAIN); + proto_close(out); + if (in != NULL) + proto_close(in); + return (error); +} + +static void +sync_start(void) +{ + + mtx_lock(&sync_lock); + sync_inprogress = true; + mtx_unlock(&sync_lock); + cv_signal(&sync_cond); +} + +static void +sync_stop(void) +{ + + mtx_lock(&sync_lock); + if (sync_inprogress) + sync_inprogress = false; + mtx_unlock(&sync_lock); +} + +static void +init_ggate(struct hast_resource *res) +{ + struct g_gate_ctl_create ggiocreate; + struct g_gate_ctl_cancel ggiocancel; + + /* + * We communicate with ggate via /dev/ggctl. Open it. + */ + res->hr_ggatefd = open("/dev/" G_GATE_CTL_NAME, O_RDWR); + if (res->hr_ggatefd == -1) + primary_exit(EX_OSFILE, "Unable to open /dev/" G_GATE_CTL_NAME); + /* + * Create provider before trying to connect, as connection failure + * is not critical, but may take some time. 
+ */ + bzero(&ggiocreate, sizeof(ggiocreate)); + ggiocreate.gctl_version = G_GATE_VERSION; + ggiocreate.gctl_mediasize = res->hr_datasize; + ggiocreate.gctl_sectorsize = res->hr_local_sectorsize; + ggiocreate.gctl_flags = 0; + ggiocreate.gctl_maxcount = 0; + ggiocreate.gctl_timeout = 0; + ggiocreate.gctl_unit = G_GATE_NAME_GIVEN; + snprintf(ggiocreate.gctl_name, sizeof(ggiocreate.gctl_name), "hast/%s", + res->hr_provname); + if (ioctl(res->hr_ggatefd, G_GATE_CMD_CREATE, &ggiocreate) == 0) { + pjdlog_info("Device hast/%s created.", res->hr_provname); + res->hr_ggateunit = ggiocreate.gctl_unit; + return; + } + if (errno != EEXIST) { + primary_exit(EX_OSERR, "Unable to create hast/%s device", + res->hr_provname); + } + pjdlog_debug(1, + "Device hast/%s already exists, we will try to take it over.", + res->hr_provname); + /* + * If we received EEXIST, we assume that the process who created the + * provider died and didn't clean up. In that case we will start from + * where he left of. + */ + bzero(&ggiocancel, sizeof(ggiocancel)); + ggiocancel.gctl_version = G_GATE_VERSION; + ggiocancel.gctl_unit = G_GATE_NAME_GIVEN; + snprintf(ggiocancel.gctl_name, sizeof(ggiocancel.gctl_name), "hast/%s", + res->hr_provname); + if (ioctl(res->hr_ggatefd, G_GATE_CMD_CANCEL, &ggiocancel) == 0) { + pjdlog_info("Device hast/%s recovered.", res->hr_provname); + res->hr_ggateunit = ggiocancel.gctl_unit; + return; + } + primary_exit(EX_OSERR, "Unable to take over hast/%s device", + res->hr_provname); +} + +void +hastd_primary(struct hast_resource *res) +{ + pthread_t td; + pid_t pid; + int error, mode, debuglevel; + + /* + * Create communication channel for sending control commands from + * parent to child. + */ + if (proto_client(NULL, "socketpair://", &res->hr_ctrl) == -1) { + /* TODO: There's no need for this to be fatal error. 
*/ + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, + "Unable to create control sockets between parent and child"); + } + /* + * Create communication channel for sending events from child to parent. + */ + if (proto_client(NULL, "socketpair://", &res->hr_event) == -1) { + /* TODO: There's no need for this to be fatal error. */ + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, + "Unable to create event sockets between child and parent"); + } + /* + * Create communication channel for sending connection requests from + * child to parent. + */ + if (proto_client(NULL, "socketpair://", &res->hr_conn) == -1) { + /* TODO: There's no need for this to be fatal error. */ + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, + "Unable to create connection sockets between child and parent"); + } + + pid = fork(); + if (pid == -1) { + /* TODO: There's no need for this to be fatal error. */ + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_TEMPFAIL, "Unable to fork"); + } + + if (pid > 0) { + /* This is parent. */ + /* Declare that we are receiver. */ + proto_recv(res->hr_event, NULL, 0); + proto_recv(res->hr_conn, NULL, 0); + /* Declare that we are sender. */ + proto_send(res->hr_ctrl, NULL, 0); + res->hr_workerpid = pid; + return; + } + + gres = res; + mode = pjdlog_mode_get(); + debuglevel = pjdlog_debug_get(); + + /* Declare that we are sender. */ + proto_send(res->hr_event, NULL, 0); + proto_send(res->hr_conn, NULL, 0); + /* Declare that we are receiver. 
*/ + proto_recv(res->hr_ctrl, NULL, 0); + descriptors_cleanup(res); + + descriptors_assert(res, mode); + + pjdlog_init(mode); + pjdlog_debug_set(debuglevel); + pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); + setproctitle("%s (%s)", res->hr_name, role2str(res->hr_role)); + + init_local(res); + init_ggate(res); + init_environment(res); + + if (drop_privs(res) != 0) { + cleanup(res); + exit(EX_CONFIG); + } + pjdlog_info("Privileges successfully dropped."); + + /* + * Create the guard thread first, so we can handle signals from the + * very beginning. + */ + error = pthread_create(&td, NULL, guard_thread, res); + PJDLOG_ASSERT(error == 0); + /* + * Create the control thread before sending any event to the parent, + * as we can deadlock when parent sends control request to worker, + * but worker has no control thread started yet, so parent waits. + * In the meantime worker sends an event to the parent, but parent + * is unable to handle the event, because it waits for control + * request response. 
+ */ + error = pthread_create(&td, NULL, ctrl_thread, res); + PJDLOG_ASSERT(error == 0); + if (real_remote(res)) { + error = init_remote(res, NULL, NULL); + if (error == 0) { + sync_start(); + } else if (error == EBUSY) { + time_t start = time(NULL); + + pjdlog_warning("Waiting for remote node to become %s for %ds.", + role2str(HAST_ROLE_SECONDARY), + res->hr_timeout); + for (;;) { + sleep(1); + error = init_remote(res, NULL, NULL); + if (error != EBUSY) + break; + if (time(NULL) > start + res->hr_timeout) + break; + } + if (error == EBUSY) { + pjdlog_warning("Remote node is still %s, starting anyway.", + role2str(HAST_ROLE_PRIMARY)); + } + } + } + error = pthread_create(&td, NULL, ggate_recv_thread, res); + PJDLOG_ASSERT(error == 0); + error = pthread_create(&td, NULL, local_send_thread, res); + PJDLOG_ASSERT(error == 0); + error = pthread_create(&td, NULL, remote_send_thread, res); + PJDLOG_ASSERT(error == 0); + error = pthread_create(&td, NULL, remote_recv_thread, res); + PJDLOG_ASSERT(error == 0); + error = pthread_create(&td, NULL, ggate_send_thread, res); + PJDLOG_ASSERT(error == 0); + fullystarted = true; + (void)sync_thread(res); +} + +static void +reqlog(int loglevel, int debuglevel, struct g_gate_ctl_io *ggio, + const char *fmt, ...) 
+{ + char msg[1024]; + va_list ap; + + va_start(ap, fmt); + (void)vsnprintf(msg, sizeof(msg), fmt, ap); + va_end(ap); + switch (ggio->gctl_cmd) { + case BIO_READ: + (void)snprlcat(msg, sizeof(msg), "READ(%ju, %ju).", + (uintmax_t)ggio->gctl_offset, (uintmax_t)ggio->gctl_length); + break; + case BIO_DELETE: + (void)snprlcat(msg, sizeof(msg), "DELETE(%ju, %ju).", + (uintmax_t)ggio->gctl_offset, (uintmax_t)ggio->gctl_length); + break; + case BIO_FLUSH: + (void)snprlcat(msg, sizeof(msg), "FLUSH."); + break; + case BIO_WRITE: + (void)snprlcat(msg, sizeof(msg), "WRITE(%ju, %ju).", + (uintmax_t)ggio->gctl_offset, (uintmax_t)ggio->gctl_length); + break; + default: + (void)snprlcat(msg, sizeof(msg), "UNKNOWN(%u).", + (unsigned int)ggio->gctl_cmd); + break; + } + pjdlog_common(loglevel, debuglevel, -1, "%s", msg); +} + +static void +remote_close(struct hast_resource *res, int ncomp) +{ + + rw_wlock(&hio_remote_lock[ncomp]); + /* + * Check for a race between dropping rlock and acquiring wlock - + * another thread can close connection in-between. + */ + if (!ISCONNECTED(res, ncomp)) { + PJDLOG_ASSERT(res->hr_remotein == NULL); + PJDLOG_ASSERT(res->hr_remoteout == NULL); + rw_unlock(&hio_remote_lock[ncomp]); + return; + } + + PJDLOG_ASSERT(res->hr_remotein != NULL); + PJDLOG_ASSERT(res->hr_remoteout != NULL); + + pjdlog_debug(2, "Closing incoming connection to %s.", + res->hr_remoteaddr); + proto_close(res->hr_remotein); + res->hr_remotein = NULL; + pjdlog_debug(2, "Closing outgoing connection to %s.", + res->hr_remoteaddr); + proto_close(res->hr_remoteout); + res->hr_remoteout = NULL; + + rw_unlock(&hio_remote_lock[ncomp]); + + pjdlog_warning("Disconnected from %s.", res->hr_remoteaddr); + + /* + * Stop synchronization if in-progress. + */ + sync_stop(); + + event_send(res, EVENT_DISCONNECT); +} + +/* + * Acknowledge write completion to the kernel, but don't update activemap yet. 
+ */ +static void +write_complete(struct hast_resource *res, struct hio *hio) +{ + struct g_gate_ctl_io *ggio; + unsigned int ncomp; + + PJDLOG_ASSERT(!hio->hio_done); + + ggio = &hio->hio_ggio; + PJDLOG_ASSERT(ggio->gctl_cmd == BIO_WRITE); + + /* + * Bump local count if this is first write after + * connection failure with remote node. + */ + ncomp = 1; + rw_rlock(&hio_remote_lock[ncomp]); + if (!ISCONNECTED(res, ncomp)) { + mtx_lock(&metadata_lock); + if (res->hr_primary_localcnt == res->hr_secondary_remotecnt) { + res->hr_primary_localcnt++; + pjdlog_debug(1, "Increasing localcnt to %ju.", + (uintmax_t)res->hr_primary_localcnt); + (void)metadata_write(res); + } + mtx_unlock(&metadata_lock); + } + rw_unlock(&hio_remote_lock[ncomp]); + if (ioctl(res->hr_ggatefd, G_GATE_CMD_DONE, ggio) == -1) + primary_exit(EX_OSERR, "G_GATE_CMD_DONE failed"); + hio->hio_done = true; +} + +/* + * Thread receives ggate I/O requests from the kernel and passes them to + * appropriate threads: + * WRITE - always goes to both local_send and remote_send threads + * READ (when the block is up-to-date on local component) - + * only local_send thread + * READ (when the block isn't up-to-date on local component) - + * only remote_send thread + * DELETE - always goes to both local_send and remote_send threads + * FLUSH - always goes to both local_send and remote_send threads + */ +static void * +ggate_recv_thread(void *arg) +{ + struct hast_resource *res = arg; + struct g_gate_ctl_io *ggio; + struct hio *hio; + unsigned int ii, ncomp, ncomps; + int error; + + for (;;) { + pjdlog_debug(2, "ggate_recv: Taking free request."); + QUEUE_TAKE2(hio, free); + pjdlog_debug(2, "ggate_recv: (%p) Got free request.", hio); + ggio = &hio->hio_ggio; + ggio->gctl_unit = res->hr_ggateunit; + ggio->gctl_length = MAXPHYS; + ggio->gctl_error = 0; + hio->hio_done = false; + hio->hio_replication = res->hr_replication; + pjdlog_debug(2, + "ggate_recv: (%p) Waiting for request from the kernel.", + hio); + if 
(ioctl(res->hr_ggatefd, G_GATE_CMD_START, ggio) == -1) { + if (sigexit_received) + pthread_exit(NULL); + primary_exit(EX_OSERR, "G_GATE_CMD_START failed"); + } + error = ggio->gctl_error; + switch (error) { + case 0: + break; + case ECANCELED: + /* Exit gracefully. */ + if (!sigexit_received) { + pjdlog_debug(2, + "ggate_recv: (%p) Received cancel from the kernel.", + hio); + pjdlog_info("Received cancel from the kernel, exiting."); + } + pthread_exit(NULL); + case ENOMEM: + /* + * Buffer too small? Impossible, we allocate MAXPHYS + * bytes - request can't be bigger than that. + */ + /* FALLTHROUGH */ + case ENXIO: + default: + primary_exitx(EX_OSERR, "G_GATE_CMD_START failed: %s.", + strerror(error)); + } + + ncomp = 0; + ncomps = HAST_NCOMPONENTS; + + for (ii = 0; ii < ncomps; ii++) + hio->hio_errors[ii] = EINVAL; + reqlog(LOG_DEBUG, 2, ggio, + "ggate_recv: (%p) Request received from the kernel: ", + hio); + + /* + * Inform all components about new write request. + * For read request prefer local component unless the given + * range is out-of-date, then use remote component. + */ + switch (ggio->gctl_cmd) { + case BIO_READ: + res->hr_stat_read++; + ncomps = 1; + mtx_lock(&metadata_lock); + if (res->hr_syncsrc == HAST_SYNCSRC_UNDEF || + res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) { + /* + * This range is up-to-date on local component, + * so handle request locally. + */ + /* Local component is 0 for now. */ + ncomp = 0; + } else /* if (res->hr_syncsrc == + HAST_SYNCSRC_SECONDARY) */ { + PJDLOG_ASSERT(res->hr_syncsrc == + HAST_SYNCSRC_SECONDARY); + /* + * This range is out-of-date on local component, + * so send request to the remote node. + */ + /* Remote component is 1 for now. */ + ncomp = 1; + } + mtx_unlock(&metadata_lock); + break; + case BIO_WRITE: + res->hr_stat_write++; + if (res->hr_resuid == 0 && + res->hr_primary_localcnt == 0) { + /* This is first write. 
*/ + res->hr_primary_localcnt = 1; + } + for (;;) { + mtx_lock(&range_lock); + if (rangelock_islocked(range_sync, + ggio->gctl_offset, ggio->gctl_length)) { + pjdlog_debug(2, + "regular: Range offset=%jd length=%zu locked.", + (intmax_t)ggio->gctl_offset, + (size_t)ggio->gctl_length); + range_regular_wait = true; + cv_wait(&range_regular_cond, &range_lock); + range_regular_wait = false; + mtx_unlock(&range_lock); + continue; + } + if (rangelock_add(range_regular, + ggio->gctl_offset, ggio->gctl_length) == -1) { + mtx_unlock(&range_lock); + pjdlog_debug(2, + "regular: Range offset=%jd length=%zu is already locked, waiting.", + (intmax_t)ggio->gctl_offset, + (size_t)ggio->gctl_length); + sleep(1); + continue; + } + mtx_unlock(&range_lock); + break; + } + mtx_lock(&res->hr_amp_lock); + if (activemap_write_start(res->hr_amp, + ggio->gctl_offset, ggio->gctl_length)) { + res->hr_stat_activemap_update++; + (void)hast_activemap_flush(res); + } else { + mtx_unlock(&res->hr_amp_lock); + } + break; + case BIO_DELETE: + res->hr_stat_delete++; + break; + case BIO_FLUSH: + res->hr_stat_flush++; + break; + } + pjdlog_debug(2, + "ggate_recv: (%p) Moving request to the send queues.", hio); + if (hio->hio_replication == HAST_REPLICATION_MEMSYNC && + ggio->gctl_cmd == BIO_WRITE) { + /* Each remote request needs two responses in memsync. */ + refcnt_init(&hio->hio_countdown, ncomps + 1); + } else { + refcnt_init(&hio->hio_countdown, ncomps); + } + for (ii = ncomp; ii < ncomps; ii++) + QUEUE_INSERT1(hio, send, ii); + } + /* NOTREACHED */ + return (NULL); +} + +/* + * Thread reads from or writes to local component. + * If local read fails, it redirects it to remote_send thread. + */ +static void * +local_send_thread(void *arg) +{ + struct hast_resource *res = arg; + struct g_gate_ctl_io *ggio; + struct hio *hio; + unsigned int ncomp, rncomp; + ssize_t ret; + + /* Local component is 0 for now. */ + ncomp = 0; + /* Remote component is 1 for now. 
*/ + rncomp = 1; + + for (;;) { + pjdlog_debug(2, "local_send: Taking request."); + QUEUE_TAKE1(hio, send, ncomp, 0); + pjdlog_debug(2, "local_send: (%p) Got request.", hio); + ggio = &hio->hio_ggio; + switch (ggio->gctl_cmd) { + case BIO_READ: + ret = pread(res->hr_localfd, ggio->gctl_data, + ggio->gctl_length, + ggio->gctl_offset + res->hr_localoff); + if (ret == ggio->gctl_length) + hio->hio_errors[ncomp] = 0; + else if (!ISSYNCREQ(hio)) { + /* + * If READ failed, try to read from remote node. + */ + if (ret == -1) { + reqlog(LOG_WARNING, 0, ggio, + "Local request failed (%s), trying remote node. ", + strerror(errno)); + } else if (ret != ggio->gctl_length) { + reqlog(LOG_WARNING, 0, ggio, + "Local request failed (%zd != %jd), trying remote node. ", + ret, (intmax_t)ggio->gctl_length); + } + QUEUE_INSERT1(hio, send, rncomp); + continue; + } + break; + case BIO_WRITE: + ret = pwrite(res->hr_localfd, ggio->gctl_data, + ggio->gctl_length, + ggio->gctl_offset + res->hr_localoff); + if (ret == -1) { + hio->hio_errors[ncomp] = errno; + reqlog(LOG_WARNING, 0, ggio, + "Local request failed (%s): ", + strerror(errno)); + } else if (ret != ggio->gctl_length) { + hio->hio_errors[ncomp] = EIO; + reqlog(LOG_WARNING, 0, ggio, + "Local request failed (%zd != %jd): ", + ret, (intmax_t)ggio->gctl_length); + } else { + hio->hio_errors[ncomp] = 0; + if (hio->hio_replication == + HAST_REPLICATION_ASYNC) { + ggio->gctl_error = 0; + write_complete(res, hio); + } + } + break; + case BIO_DELETE: + ret = g_delete(res->hr_localfd, + ggio->gctl_offset + res->hr_localoff, + ggio->gctl_length); + if (ret == -1) { + hio->hio_errors[ncomp] = errno; + reqlog(LOG_WARNING, 0, ggio, + "Local request failed (%s): ", + strerror(errno)); + } else { + hio->hio_errors[ncomp] = 0; + } + break; + case BIO_FLUSH: + if (!res->hr_localflush) { + ret = -1; + errno = EOPNOTSUPP; + break; + } + ret = g_flush(res->hr_localfd); + if (ret == -1) { + if (errno == EOPNOTSUPP) + res->hr_localflush = false; + 
hio->hio_errors[ncomp] = errno; + reqlog(LOG_WARNING, 0, ggio, + "Local request failed (%s): ", + strerror(errno)); + } else { + hio->hio_errors[ncomp] = 0; + } + break; + } + + if (hio->hio_replication != HAST_REPLICATION_MEMSYNC || + ggio->gctl_cmd != BIO_WRITE || ISSYNCREQ(hio)) { + if (refcnt_release(&hio->hio_countdown) > 0) + continue; + } else { + /* + * Depending on hio_countdown value, requests finished + * in the following order: + * 0: remote memsync, remote final, local write + * 1: remote memsync, local write, (remote final) + * 2: local write, (remote memsync), (remote final) + */ + switch (refcnt_release(&hio->hio_countdown)) { + case 0: + /* + * Local write was the last to finish. + */ + break; + case 1: + /* + * Local write finished after remote memsync + * reply arrived. We can complete the write now. + */ + if (hio->hio_errors[0] == 0) + write_complete(res, hio); + continue; + case 2: + /* + * Local write was the first to finish. + */ + continue; + default: + PJDLOG_ABORT("Invalid hio_countdown."); + } + } + if (ISSYNCREQ(hio)) { + mtx_lock(&sync_lock); + SYNCREQDONE(hio); + mtx_unlock(&sync_lock); + cv_signal(&sync_cond); + } else { + pjdlog_debug(2, + "local_send: (%p) Moving request to the done queue.", + hio); + QUEUE_INSERT2(hio, done); + } + } + /* NOTREACHED */ + return (NULL); +} + /* + * Send a HIO_KEEPALIVE header to the remote node over res->hr_remoteout. + * Nothing is sent when the component is not connected or when the header + * cannot be built. If sending fails, the remote connection is closed + * with remote_close(). The read lock on hio_remote_lock[ncomp] is held + * only while the connection is being used. 
+ */ +static void +keepalive_send(struct hast_resource *res, unsigned int ncomp) +{ + struct nv *nv; + + rw_rlock(&hio_remote_lock[ncomp]); + + if (!ISCONNECTED(res, ncomp)) { + rw_unlock(&hio_remote_lock[ncomp]); + return; + } + + PJDLOG_ASSERT(res->hr_remotein != NULL); + PJDLOG_ASSERT(res->hr_remoteout != NULL); + + nv = nv_alloc(); + nv_add_uint8(nv, HIO_KEEPALIVE, "cmd"); + if (nv_error(nv) != 0) { + rw_unlock(&hio_remote_lock[ncomp]); + nv_free(nv); + pjdlog_debug(1, + "keepalive_send: Unable to prepare header to send."); + return; + } + if (hast_proto_send(res, res->hr_remoteout, nv, NULL, 0) == -1) { + rw_unlock(&hio_remote_lock[ncomp]); + pjdlog_common(LOG_DEBUG, 1, 
errno, + "keepalive_send: Unable to send request"); + nv_free(nv); + remote_close(res, ncomp); + return; + } + + rw_unlock(&hio_remote_lock[ncomp]); + nv_free(nv); + pjdlog_debug(2, "keepalive_send: Request sent."); +} + +/* + * Thread sends requests to the secondary node. + * Each request taken from the send queue of the remote component is + * translated into a HIO_* command header (cmd, seq, offset, length) and + * sent together with the data for writes. + * When QUEUE_TAKE1() leaves hio NULL (presumably a HAST_KEEPALIVE timeout; + * the macro is not visible here -- confirm), a keepalive is sent instead, + * at most once per HAST_KEEPALIVE interval. + */ +static void * +remote_send_thread(void *arg) +{ + struct hast_resource *res = arg; + struct g_gate_ctl_io *ggio; + time_t lastcheck, now; + struct hio *hio; + struct nv *nv; + unsigned int ncomp; + bool wakeup; + uint64_t offset, length; + uint8_t cmd; + void *data; + + /* Remote component is 1 for now. */ + ncomp = 1; + lastcheck = time(NULL); + + for (;;) { + pjdlog_debug(2, "remote_send: Taking request."); + QUEUE_TAKE1(hio, send, ncomp, HAST_KEEPALIVE); + if (hio == NULL) { + now = time(NULL); + if (lastcheck + HAST_KEEPALIVE <= now) { + keepalive_send(res, ncomp); + lastcheck = now; + } + continue; + } + pjdlog_debug(2, "remote_send: (%p) Got request.", hio); + ggio = &hio->hio_ggio; + switch (ggio->gctl_cmd) { + case BIO_READ: + cmd = HIO_READ; + data = NULL; + offset = ggio->gctl_offset; + length = ggio->gctl_length; + break; + case BIO_WRITE: + cmd = HIO_WRITE; + data = ggio->gctl_data; + offset = ggio->gctl_offset; + length = ggio->gctl_length; + break; + case BIO_DELETE: + cmd = HIO_DELETE; + data = NULL; + offset = ggio->gctl_offset; + length = ggio->gctl_length; + break; + case BIO_FLUSH: + cmd = HIO_FLUSH; + data = NULL; + offset = 0; + length = 0; + break; + default: + PJDLOG_ABORT("invalid condition"); + } + nv = nv_alloc(); + nv_add_uint8(nv, cmd, "cmd"); + nv_add_uint64(nv, (uint64_t)ggio->gctl_seq, 
"seq"); + nv_add_uint64(nv, offset, "offset"); + nv_add_uint64(nv, length, "length"); + /* Only regular (non-sync) memsync writes ask the remote for the early ack. */ if (hio->hio_replication == HAST_REPLICATION_MEMSYNC && + ggio->gctl_cmd == BIO_WRITE && !ISSYNCREQ(hio)) { + nv_add_uint8(nv, 1, "memsync"); + } + if (nv_error(nv) != 0) { + hio->hio_errors[ncomp] = nv_error(nv); + pjdlog_debug(2, + "remote_send: (%p) Unable to prepare header to send.", + hio); + 
reqlog(LOG_ERR, 0, ggio, + "Unable to prepare header to send (%s): ", + strerror(nv_error(nv))); + /* Move failed request immediately to the done queue. */ + goto done_queue; + } + /* + * Protect connection from disappearing. + */ + rw_rlock(&hio_remote_lock[ncomp]); + if (!ISCONNECTED(res, ncomp)) { + rw_unlock(&hio_remote_lock[ncomp]); + hio->hio_errors[ncomp] = ENOTCONN; + goto done_queue; + } + /* + * Move the request to recv queue before sending it, because + * in different order we can get reply before we move request + * to recv queue. + */ + pjdlog_debug(2, + "remote_send: (%p) Moving request to the recv queue.", + hio); + mtx_lock(&hio_recv_list_lock[ncomp]); + wakeup = TAILQ_EMPTY(&hio_recv_list[ncomp]); + TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio, hio_next[ncomp]); + mtx_unlock(&hio_recv_list_lock[ncomp]); + if (hast_proto_send(res, res->hr_remoteout, nv, data, + data != NULL ? length : 0) == -1) { + hio->hio_errors[ncomp] = errno; + rw_unlock(&hio_remote_lock[ncomp]); + pjdlog_debug(2, + "remote_send: (%p) Unable to send request.", hio); + reqlog(LOG_ERR, 0, ggio, + "Unable to send request (%s): ", + strerror(hio->hio_errors[ncomp])); + remote_close(res, ncomp); + /* + * Take request back from the receive queue and move + * it immediately to the done queue. 
+ */ + mtx_lock(&hio_recv_list_lock[ncomp]); + TAILQ_REMOVE(&hio_recv_list[ncomp], hio, + hio_next[ncomp]); + mtx_unlock(&hio_recv_list_lock[ncomp]); + goto done_queue; + } + rw_unlock(&hio_remote_lock[ncomp]); + nv_free(nv); + if (wakeup) + cv_signal(&hio_recv_list_cond[ncomp]); + continue; +done_queue: + nv_free(nv); + if (ISSYNCREQ(hio)) { + if (refcnt_release(&hio->hio_countdown) > 0) + continue; + mtx_lock(&sync_lock); + SYNCREQDONE(hio); + mtx_unlock(&sync_lock); + cv_signal(&sync_cond); + continue; + } + if (ggio->gctl_cmd == BIO_WRITE) { + mtx_lock(&res->hr_amp_lock); + if (activemap_need_sync(res->hr_amp, ggio->gctl_offset, + ggio->gctl_length)) { + (void)hast_activemap_flush(res); + } else { + mtx_unlock(&res->hr_amp_lock); + } + if (hio->hio_replication == HAST_REPLICATION_MEMSYNC) + /* A memsync write was counted with one extra reference for the second remote reply, so that reference is dropped here as well. */ (void)refcnt_release(&hio->hio_countdown); + } + if (refcnt_release(&hio->hio_countdown) > 0) + continue; + pjdlog_debug(2, + "remote_send: (%p) Moving request to the done queue.", + hio); + QUEUE_INSERT2(hio, done); + } + /* NOTREACHED */ + return (NULL); +} + +/* + * Thread receives answers from the secondary node and passes them to the + * ggate_send thread. + * It sleeps on hio_recv_list_cond[] until the recv list of the remote + * component is non-empty. Replies are matched to pending requests by the + * seq field of the received header; when the connection is gone, pending + * requests are taken off the list one per loop iteration and completed + * through the done_queue path. 
+ */ +static void * +remote_recv_thread(void *arg) +{ + struct hast_resource *res = arg; + struct g_gate_ctl_io *ggio; + struct hio *hio; + struct nv *nv; + unsigned int ncomp; + uint64_t seq; + bool memsyncack; + int error; + + /* Remote component is 1 for now. */ + ncomp = 1; + + for (;;) { + /* Wait until there is anything to receive. */ + mtx_lock(&hio_recv_list_lock[ncomp]); + while (TAILQ_EMPTY(&hio_recv_list[ncomp])) { + pjdlog_debug(2, "remote_recv: No requests, waiting."); + cv_wait(&hio_recv_list_cond[ncomp], + &hio_recv_list_lock[ncomp]); + } + mtx_unlock(&hio_recv_list_lock[ncomp]); + + memsyncack = false; + + rw_rlock(&hio_remote_lock[ncomp]); + if (!ISCONNECTED(res, ncomp)) { + rw_unlock(&hio_remote_lock[ncomp]); + /* + * Connection is dead, so move all pending requests to + * the done queue (one-by-one). 
+ */ + mtx_lock(&hio_recv_list_lock[ncomp]); + hio = TAILQ_FIRST(&hio_recv_list[ncomp]); + PJDLOG_ASSERT(hio != NULL); + TAILQ_REMOVE(&hio_recv_list[ncomp], hio, + hio_next[ncomp]); + mtx_unlock(&hio_recv_list_lock[ncomp]); + goto done_queue; + } + if (hast_proto_recv_hdr(res->hr_remotein, &nv) == -1) { + pjdlog_errno(LOG_ERR, + "Unable to receive reply header"); + rw_unlock(&hio_remote_lock[ncomp]); + remote_close(res, ncomp); + continue; + } + rw_unlock(&hio_remote_lock[ncomp]); + seq = nv_get_uint64(nv, "seq"); + if (seq == 0) { + pjdlog_error("Header contains no 'seq' field."); + nv_free(nv); + continue; + } + memsyncack = nv_exists(nv, "received"); + mtx_lock(&hio_recv_list_lock[ncomp]); + TAILQ_FOREACH(hio, &hio_recv_list[ncomp], hio_next[ncomp]) { + if (hio->hio_ggio.gctl_seq == seq) { + TAILQ_REMOVE(&hio_recv_list[ncomp], hio, + hio_next[ncomp]); + break; + } + } + mtx_unlock(&hio_recv_list_lock[ncomp]); + if (hio == NULL) { + pjdlog_error("Found no request matching received 'seq' field (%ju).", + (uintmax_t)seq); + nv_free(nv); + continue; + } + ggio = &hio->hio_ggio; + error = nv_get_int16(nv, "error"); + if (error != 0) { + /* Request failed on remote side. 
*/ + hio->hio_errors[ncomp] = error; + reqlog(LOG_WARNING, 0, ggio, + "Remote request failed (%s): ", strerror(error)); + nv_free(nv); + goto done_queue; + } + switch (ggio->gctl_cmd) { + case BIO_READ: + rw_rlock(&hio_remote_lock[ncomp]); + if (!ISCONNECTED(res, ncomp)) { + rw_unlock(&hio_remote_lock[ncomp]); + nv_free(nv); + goto done_queue; + } + if (hast_proto_recv_data(res, res->hr_remotein, nv, + ggio->gctl_data, ggio->gctl_length) == -1) { + hio->hio_errors[ncomp] = errno; + pjdlog_errno(LOG_ERR, + "Unable to receive reply data"); + rw_unlock(&hio_remote_lock[ncomp]); + nv_free(nv); + remote_close(res, ncomp); + goto done_queue; + } + rw_unlock(&hio_remote_lock[ncomp]); + break; + case BIO_WRITE: + case BIO_DELETE: + case BIO_FLUSH: + break; + default: + PJDLOG_ABORT("invalid condition"); + } + hio->hio_errors[ncomp] = 0; + nv_free(nv); +done_queue: + if (hio->hio_replication != HAST_REPLICATION_MEMSYNC || + hio->hio_ggio.gctl_cmd != BIO_WRITE || ISSYNCREQ(hio)) { + if (refcnt_release(&hio->hio_countdown) > 0) + continue; + } else { + /* + * Depending on hio_countdown value, requests finished + * in the following order: + * + * 0: local write, remote memsync, remote final + * or + * 0: remote memsync, local write, remote final + * + * 1: local write, remote memsync, (remote final) + * or + * 1: remote memsync, remote final, (local write) + * + * 2: remote memsync, (local write), (remote final) + * or + * 2: remote memsync, (remote final), (local write) + */ + switch (refcnt_release(&hio->hio_countdown)) { + case 0: + /* + * Remote final reply arrived. + */ + PJDLOG_ASSERT(!memsyncack); + break; + case 1: + if (memsyncack) { + /* + * Local request already finished, so we + * can complete the write. + */ + if (hio->hio_errors[0] == 0) + write_complete(res, hio); + /* + * We still need to wait for final + * remote reply. 
+ */ + pjdlog_debug(2, + "remote_recv: (%p) Moving request back to the recv queue.", + hio); + mtx_lock(&hio_recv_list_lock[ncomp]); + TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], + hio, hio_next[ncomp]); + mtx_unlock(&hio_recv_list_lock[ncomp]); + } else { + /* + * Remote final reply arrived before + * local write finished. + * Nothing to do in such case. + */ + } + continue; + case 2: + /* + * We received remote memsync reply even before + * local write finished. + */ + PJDLOG_ASSERT(memsyncack); + + pjdlog_debug(2, + "remote_recv: (%p) Moving request back to the recv queue.", + hio); + mtx_lock(&hio_recv_list_lock[ncomp]); + TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio, + hio_next[ncomp]); + mtx_unlock(&hio_recv_list_lock[ncomp]); + continue; + default: + PJDLOG_ABORT("Invalid hio_countdown."); + } + } + if (ISSYNCREQ(hio)) { + mtx_lock(&sync_lock); + SYNCREQDONE(hio); + mtx_unlock(&sync_lock); + cv_signal(&sync_cond); + } else { + pjdlog_debug(2, + "remote_recv: (%p) Moving request to the done queue.", + hio); + QUEUE_INSERT2(hio, done); + } + } + /* NOTREACHED */ + return (NULL); +} + +/* + * Thread sends answer to the kernel. + * It takes finished requests from the done queue and derives gctl_error + * from the per-component hio_errors[] array. For writes it updates the + * activemap, releases the regular range lock and calls write_complete() + * unless hio_done is already set; every other command is answered with + * the G_GATE_CMD_DONE ioctl. The request is then returned to the free + * queue. 
+ */ +static void * +ggate_send_thread(void *arg) +{ + struct hast_resource *res = arg; + struct g_gate_ctl_io *ggio; + struct hio *hio; + unsigned int ii, ncomps; + + ncomps = HAST_NCOMPONENTS; + + for (;;) { + pjdlog_debug(2, "ggate_send: Taking request."); + QUEUE_TAKE2(hio, done); + pjdlog_debug(2, "ggate_send: (%p) Got request.", hio); + ggio = &hio->hio_ggio; + for (ii = 0; ii < ncomps; ii++) { + if (hio->hio_errors[ii] == 0) { + /* + * One successful request is enough to declare + * success. + */ + ggio->gctl_error = 0; + break; + } + } + if (ii == ncomps) { + /* + * None of the requests were successful. + * Use the error from local component except the + * case when we did only remote request. 
+ */ + if (ggio->gctl_cmd == BIO_READ && + res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) + ggio->gctl_error = hio->hio_errors[1]; + else + ggio->gctl_error = hio->hio_errors[0]; + } + if (ggio->gctl_error == 0 && ggio->gctl_cmd == BIO_WRITE) { + mtx_lock(&res->hr_amp_lock); + if (activemap_write_complete(res->hr_amp, + ggio->gctl_offset, ggio->gctl_length)) { + res->hr_stat_activemap_update++; + /* NOTE(review): there is no unlock on this path; hast_activemap_flush() presumably releases hr_amp_lock itself -- confirm. */ (void)hast_activemap_flush(res); + } else { + mtx_unlock(&res->hr_amp_lock); + } + } + if (ggio->gctl_cmd == BIO_WRITE) { + /* + * Unlock range we locked. + */ + mtx_lock(&range_lock); + rangelock_del(range_regular, ggio->gctl_offset, + ggio->gctl_length); + if (range_sync_wait) + cv_signal(&range_sync_cond); + mtx_unlock(&range_lock); + if (!hio->hio_done) + write_complete(res, hio); + } else { + if (ioctl(res->hr_ggatefd, G_GATE_CMD_DONE, ggio) == -1) { + primary_exit(EX_OSERR, + "G_GATE_CMD_DONE failed"); + } + } + /* The error statistics count failures of the local component (index 0) only. */ if (hio->hio_errors[0]) { + switch (ggio->gctl_cmd) { + case BIO_READ: + res->hr_stat_read_error++; + break; + case BIO_WRITE: + res->hr_stat_write_error++; + break; + case BIO_DELETE: + res->hr_stat_delete_error++; + break; + case BIO_FLUSH: + res->hr_stat_flush_error++; + break; + } + } + pjdlog_debug(2, + "ggate_send: (%p) Moving request to the free queue.", hio); + QUEUE_INSERT2(hio, free); + } + /* NOTREACHED */ + return (NULL); +} + +/* + * Thread synchronizes local and remote components. + * For each range returned by activemap_sync_offset() it locks the range, + * reads the data from the synchronization source and then writes it to + * the synchronization target, reusing one hio for both steps. 
+ */ +static void * +sync_thread(void *arg __unused) /* NOTE(review): arg is marked __unused but is used to initialise res below. */ +{ + struct hast_resource *res = arg; + struct hio *hio; + struct g_gate_ctl_io *ggio; + struct timeval tstart, tend, tdiff; + unsigned int ii, ncomp, ncomps; + off_t offset, length, synced; + bool dorewind, directreads; + int syncext; + + ncomps = HAST_NCOMPONENTS; + dorewind = true; + synced = 0; + offset = -1; + directreads = false; + + for (;;) { + mtx_lock(&sync_lock); + /* A non-negative offset means a pass was under way when sync_inprogress was cleared. */ if (offset >= 0 && !sync_inprogress) { + gettimeofday(&tend, NULL); + timersub(&tend, &tstart, &tdiff); + pjdlog_info("Synchronization interrupted after %#.0T. " + "%NB synchronized so far.", &tdiff, + (intmax_t)synced); + event_send(res, EVENT_SYNCINTR); + } + while (!sync_inprogress) { + dorewind = true; + synced = 0; + cv_wait(&sync_cond, &sync_lock); + } + mtx_unlock(&sync_lock); + /* + * Obtain offset at which we should synchronize. + * Rewind synchronization if needed. + */ + mtx_lock(&res->hr_amp_lock); + if (dorewind) + activemap_sync_rewind(res->hr_amp); + offset = activemap_sync_offset(res->hr_amp, &length, &syncext); + if (syncext != -1) { + /* + * We synchronized entire syncext extent, we can mark + * it as clean now. + */ + if (activemap_extent_complete(res->hr_amp, syncext)) + (void)hast_activemap_flush(res); + else + mtx_unlock(&res->hr_amp_lock); + } else { + mtx_unlock(&res->hr_amp_lock); + } + if (dorewind) { + dorewind = false; + if (offset == -1) + pjdlog_info("Nodes are in sync."); + else { + pjdlog_info("Synchronization started. 
%NB to go.", + (intmax_t)(res->hr_extentsize * + activemap_ndirty(res->hr_amp))); + event_send(res, EVENT_SYNCSTART); + gettimeofday(&tstart, NULL); + } + } + if (offset == -1) { + sync_stop(); + pjdlog_debug(1, "Nothing to synchronize."); + /* + * Synchronization complete, make both localcnt and + * remotecnt equal. 
+ */ + ncomp = 1; + rw_rlock(&hio_remote_lock[ncomp]); + if (ISCONNECTED(res, ncomp)) { + if (synced > 0) { + int64_t bps; + + gettimeofday(&tend, NULL); + timersub(&tend, &tstart, &tdiff); + bps = (int64_t)((double)synced / + ((double)tdiff.tv_sec + + (double)tdiff.tv_usec / 1000000)); + pjdlog_info("Synchronization complete. " + "%NB synchronized in %#.0lT (%NB/sec).", + (intmax_t)synced, &tdiff, + (intmax_t)bps); + event_send(res, EVENT_SYNCDONE); + } + mtx_lock(&metadata_lock); + if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) + directreads = true; + res->hr_syncsrc = HAST_SYNCSRC_UNDEF; + res->hr_primary_localcnt = + res->hr_secondary_remotecnt; + res->hr_primary_remotecnt = + res->hr_secondary_localcnt; + pjdlog_debug(1, + "Setting localcnt to %ju and remotecnt to %ju.", + (uintmax_t)res->hr_primary_localcnt, + (uintmax_t)res->hr_primary_remotecnt); + (void)metadata_write(res); + mtx_unlock(&metadata_lock); + } + rw_unlock(&hio_remote_lock[ncomp]); + if (directreads) { + directreads = false; + enable_direct_reads(res); + } + continue; + } + pjdlog_debug(2, "sync: Taking free request."); + QUEUE_TAKE2(hio, free); + pjdlog_debug(2, "sync: (%p) Got free request.", hio); + /* + * Lock the range we are going to synchronize. We don't want + * race where someone writes between our read and write. + */ + for (;;) { + mtx_lock(&range_lock); + if (rangelock_islocked(range_regular, offset, length)) { + pjdlog_debug(2, + "sync: Range offset=%jd length=%jd locked.", + (intmax_t)offset, (intmax_t)length); + range_sync_wait = true; + cv_wait(&range_sync_cond, &range_lock); + range_sync_wait = false; + mtx_unlock(&range_lock); + continue; + } + if (rangelock_add(range_sync, offset, length) == -1) { + mtx_unlock(&range_lock); + pjdlog_debug(2, + "sync: Range offset=%jd length=%jd is already locked, waiting.", + (intmax_t)offset, (intmax_t)length); + sleep(1); + continue; + } + mtx_unlock(&range_lock); + break; + } + /* + * First read the data from synchronization source. 
+ */ + SYNCREQ(hio); + ggio = &hio->hio_ggio; + ggio->gctl_cmd = BIO_READ; + ggio->gctl_offset = offset; + ggio->gctl_length = length; + ggio->gctl_error = 0; + hio->hio_done = false; + hio->hio_replication = res->hr_replication; + for (ii = 0; ii < ncomps; ii++) + hio->hio_errors[ii] = EINVAL; + reqlog(LOG_DEBUG, 2, ggio, "sync: (%p) Sending sync request: ", + hio); + pjdlog_debug(2, "sync: (%p) Moving request to the send queue.", + hio); + mtx_lock(&metadata_lock); + if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) { + /* + * This range is up-to-date on local component, + * so handle request locally. + */ + /* Local component is 0 for now. */ + ncomp = 0; + } else /* if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) */ { + PJDLOG_ASSERT(res->hr_syncsrc == HAST_SYNCSRC_SECONDARY); + /* + * This range is out-of-date on local component, + * so send request to the remote node. + */ + /* Remote component is 1 for now. */ + ncomp = 1; + } + mtx_unlock(&metadata_lock); + refcnt_init(&hio->hio_countdown, 1); + QUEUE_INSERT1(hio, send, ncomp); + + /* + * Let's wait for READ to finish. + */ + mtx_lock(&sync_lock); + while (!ISSYNCREQDONE(hio)) + cv_wait(&sync_cond, &sync_lock); + mtx_unlock(&sync_lock); + + if (hio->hio_errors[ncomp] != 0) { + pjdlog_error("Unable to read synchronization data: %s.", + strerror(hio->hio_errors[ncomp])); + goto free_queue; + } + + /* + * We read the data from synchronization source, now write it + * to synchronization target. + */ + SYNCREQ(hio); + ggio->gctl_cmd = BIO_WRITE; + for (ii = 0; ii < ncomps; ii++) + hio->hio_errors[ii] = EINVAL; + reqlog(LOG_DEBUG, 2, ggio, "sync: (%p) Sending sync request: ", + hio); + pjdlog_debug(2, "sync: (%p) Moving request to the send queue.", + hio); + mtx_lock(&metadata_lock); + if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) { + /* + * This range is up-to-date on local component, + * so we update remote component. + */ + /* Remote component is 1 for now. 
*/ + ncomp = 1; + } else /* if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) */ { + PJDLOG_ASSERT(res->hr_syncsrc == HAST_SYNCSRC_SECONDARY); + /* + * This range is out-of-date on local component, + * so we update it. + */ + /* Local component is 0 for now. */ + ncomp = 0; + } + mtx_unlock(&metadata_lock); + + pjdlog_debug(2, "sync: (%p) Moving request to the send queue.", + hio); + refcnt_init(&hio->hio_countdown, 1); + QUEUE_INSERT1(hio, send, ncomp); + + /* + * Let's wait for WRITE to finish. + */ + mtx_lock(&sync_lock); + while (!ISSYNCREQDONE(hio)) + cv_wait(&sync_cond, &sync_lock); + mtx_unlock(&sync_lock); + + if (hio->hio_errors[ncomp] != 0) { + pjdlog_error("Unable to write synchronization data: %s.", + strerror(hio->hio_errors[ncomp])); + goto free_queue; + } + + synced += length; +free_queue: + /* Reached on success and on failure: release the sync range lock and wake a regular writer waiting for it. */ mtx_lock(&range_lock); + rangelock_del(range_sync, offset, length); + if (range_regular_wait) + cv_signal(&range_regular_cond); + mtx_unlock(&range_lock); + pjdlog_debug(2, "sync: (%p) Moving request to the free queue.", + hio); + QUEUE_INSERT2(hio, free); + } + /* NOTREACHED */ + return (NULL); +} + /* + * Apply a reloaded configuration to the running primary worker. + * res must be the primary resource and identical to the global gres + * (both asserted). The remoteaddr, sourceaddr, replication, checksum, + * compression, timeout, exec and metaflush entries of nv must all be + * present (nv_assert); each is compared with the value stored in gres + * and differences are recorded in a local bit mask. A changed timeout + * is pushed to the connected sockets unless an address changed as well; + * a changed remote or source address closes the remote connections with + * remote_close(). 
+ */ +void +primary_config_reload(struct hast_resource *res, struct nv *nv) +{ + unsigned int ii, ncomps; + int modified, vint; + const char *vstr; + + pjdlog_info("Reloading configuration..."); + + PJDLOG_ASSERT(res->hr_role == HAST_ROLE_PRIMARY); + PJDLOG_ASSERT(gres == res); + nv_assert(nv, "remoteaddr"); + nv_assert(nv, "sourceaddr"); + nv_assert(nv, "replication"); + nv_assert(nv, "checksum"); + nv_assert(nv, "compression"); + nv_assert(nv, "timeout"); + nv_assert(nv, "exec"); + nv_assert(nv, "metaflush"); + + ncomps = HAST_NCOMPONENTS; + /* Bits of the local modified mask, one per configuration value; they are undefined again before the function returns. */ +#define MODIFIED_REMOTEADDR 0x01 +#define MODIFIED_SOURCEADDR 0x02 +#define MODIFIED_REPLICATION 0x04 +#define MODIFIED_CHECKSUM 0x08 +#define MODIFIED_COMPRESSION 0x10 +#define MODIFIED_TIMEOUT 0x20 +#define MODIFIED_EXEC 0x40 +#define MODIFIED_METAFLUSH 0x80 + modified = 0; + + vstr = nv_get_string(nv, "remoteaddr"); + if 
(strcmp(gres->hr_remoteaddr, vstr) != 0) { + /* + * Don't copy res->hr_remoteaddr to gres just yet. + * We want remote_close() to log disconnect from the old + * addresses, not from the new ones. + */ + modified |= MODIFIED_REMOTEADDR; + } + vstr = nv_get_string(nv, "sourceaddr"); + if (strcmp(gres->hr_sourceaddr, vstr) != 0) { + strlcpy(gres->hr_sourceaddr, vstr, sizeof(gres->hr_sourceaddr)); + modified |= MODIFIED_SOURCEADDR; + } + vint = nv_get_int32(nv, "replication"); + if (gres->hr_replication != vint) { + gres->hr_replication = vint; + modified |= MODIFIED_REPLICATION; + } + vint = nv_get_int32(nv, "checksum"); + if (gres->hr_checksum != vint) { + gres->hr_checksum = vint; + modified |= MODIFIED_CHECKSUM; + } + vint = nv_get_int32(nv, "compression"); + if (gres->hr_compression != vint) { + gres->hr_compression = vint; + modified |= MODIFIED_COMPRESSION; + } + vint = nv_get_int32(nv, "timeout"); + if (gres->hr_timeout != vint) { + gres->hr_timeout = vint; + modified |= MODIFIED_TIMEOUT; + } + vstr = nv_get_string(nv, "exec"); + if (strcmp(gres->hr_exec, vstr) != 0) { + strlcpy(gres->hr_exec, vstr, sizeof(gres->hr_exec)); + modified |= MODIFIED_EXEC; + } + vint = nv_get_int32(nv, "metaflush"); + if (gres->hr_metaflush != vint) { + gres->hr_metaflush = vint; + modified |= MODIFIED_METAFLUSH; + } + + /* + * Change timeout for connected sockets. + * Don't bother if we need to reconnect. 
+ */ + if ((modified & MODIFIED_TIMEOUT) != 0 && + (modified & (MODIFIED_REMOTEADDR | MODIFIED_SOURCEADDR)) == 0) { + for (ii = 0; ii < ncomps; ii++) { + if (!ISREMOTE(ii)) + continue; + rw_rlock(&hio_remote_lock[ii]); + if (!ISCONNECTED(gres, ii)) { + rw_unlock(&hio_remote_lock[ii]); + continue; + } + rw_unlock(&hio_remote_lock[ii]); + /* NOTE(review): the connections are used after the lock is dropped -- presumably safe in this context, but confirm that no other thread can close them here. */ if (proto_timeout(gres->hr_remotein, + gres->hr_timeout) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to set connection timeout"); + } + if (proto_timeout(gres->hr_remoteout, + gres->hr_timeout) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to set connection timeout"); + } + } + } + if ((modified & (MODIFIED_REMOTEADDR | MODIFIED_SOURCEADDR)) != 0) { + for (ii = 0; ii < ncomps; ii++) { + if (!ISREMOTE(ii)) + continue; + remote_close(gres, ii); + } + /* The new remote address is stored only now, after remote_close() has run against the old one. */ if (modified & MODIFIED_REMOTEADDR) { + vstr = nv_get_string(nv, "remoteaddr"); + strlcpy(gres->hr_remoteaddr, vstr, + sizeof(gres->hr_remoteaddr)); + } + } +#undef MODIFIED_REMOTEADDR +#undef MODIFIED_SOURCEADDR +#undef MODIFIED_REPLICATION +#undef MODIFIED_CHECKSUM +#undef MODIFIED_COMPRESSION +#undef MODIFIED_TIMEOUT +#undef MODIFIED_EXEC +#undef MODIFIED_METAFLUSH + + pjdlog_info("Configuration reloaded successfully."); +} + /* + * Check the connection of one component and try to re-establish it. + * Returns immediately for components for which ISREMOTE() is false and + * for resources for which real_remote() is false. If the component is + * connected nothing else is done. Otherwise init_remote() is called + * without holding the lock; on success the new connections are stored + * in res under the write lock and sync_start() is called. 
+ */ +static void +guard_one(struct hast_resource *res, unsigned int ncomp) +{ + struct proto_conn *in, *out; + + if (!ISREMOTE(ncomp)) + return; + + rw_rlock(&hio_remote_lock[ncomp]); + + if (!real_remote(res)) { + rw_unlock(&hio_remote_lock[ncomp]); + return; + } + + if (ISCONNECTED(res, ncomp)) { + PJDLOG_ASSERT(res->hr_remotein != NULL); + PJDLOG_ASSERT(res->hr_remoteout != NULL); + rw_unlock(&hio_remote_lock[ncomp]); + pjdlog_debug(2, "remote_guard: Connection to %s is ok.", + res->hr_remoteaddr); + return; + } + + PJDLOG_ASSERT(res->hr_remotein == NULL); + PJDLOG_ASSERT(res->hr_remoteout == NULL); + /* + * Upgrade the lock. It doesn't have to be atomic as no other thread + * can change connection status from disconnected to connected. 
+ */ + rw_unlock(&hio_remote_lock[ncomp]); + pjdlog_debug(2, "remote_guard: Reconnecting to %s.", + res->hr_remoteaddr); + in = out = NULL; + if (init_remote(res, &in, &out) == 0) { + rw_wlock(&hio_remote_lock[ncomp]); + PJDLOG_ASSERT(res->hr_remotein == NULL); + PJDLOG_ASSERT(res->hr_remoteout == NULL); + PJDLOG_ASSERT(in != NULL && out != NULL); + res->hr_remotein = in; + res->hr_remoteout = out; + rw_unlock(&hio_remote_lock[ncomp]); + pjdlog_info("Successfully reconnected to %s.", + res->hr_remoteaddr); + sync_start(); + } else { + /* Both connections should be NULL. */ + PJDLOG_ASSERT(res->hr_remotein == NULL); + PJDLOG_ASSERT(res->hr_remoteout == NULL); + PJDLOG_ASSERT(in == NULL && out == NULL); + pjdlog_debug(2, "remote_guard: Reconnect to %s failed.", + res->hr_remoteaddr); + } +} + +/* + * Thread guards remote connections and reconnects when needed, handles + * signals, etc. + * It waits in sigtimedwait() for SIGINT or SIGTERM with a timeout of + * HAST_KEEPALIVE seconds. A received termination signal makes the worker + * exit through primary_exitx(); otherwise, once the worker is fully + * started, every component is checked with guard_one() after at least + * HAST_KEEPALIVE seconds have passed since the previous check. + */ +static void * +guard_thread(void *arg) +{ + struct hast_resource *res = arg; + unsigned int ii, ncomps; + struct timespec timeout; + time_t lastcheck, now; + sigset_t mask; + int signo; + + ncomps = HAST_NCOMPONENTS; + lastcheck = time(NULL); + + PJDLOG_VERIFY(sigemptyset(&mask) == 0); + PJDLOG_VERIFY(sigaddset(&mask, SIGINT) == 0); + PJDLOG_VERIFY(sigaddset(&mask, SIGTERM) == 0); + + timeout.tv_sec = HAST_KEEPALIVE; + timeout.tv_nsec = 0; + /* No signal has been received yet, so the first pass takes the default branch. 
*/ signo = -1; + + for (;;) { + switch (signo) { + case SIGINT: + case SIGTERM: + sigexit_received = true; + primary_exitx(EX_OK, + "Termination signal received, exiting."); + break; + default: + break; + } + + /* + * Don't check connections until we fully started, + * as we may still be looping, waiting for remote node + * to switch from primary to secondary. 
+ */ + if (fullystarted) { + pjdlog_debug(2, "remote_guard: Checking connections."); + now = time(NULL); + if (lastcheck + HAST_KEEPALIVE <= now) { + for (ii = 0; ii < ncomps; ii++) + guard_one(res, ii); + lastcheck = now; + } + } + signo = sigtimedwait(&mask, NULL, &timeout); + } + /* NOTREACHED */ + return (NULL); +} diff --git a/sbin/hastd/proto.c b/sbin/hastd/proto.c new file mode 100644 index 0000000..73487c0 --- /dev/null +++ b/sbin/hastd/proto.c @@ -0,0 +1,446 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/socket.h> + +#include <errno.h> +#include <stdint.h> +#include <string.h> +#include <strings.h> + /* NOTE(review): malloc() and free() are used below but stdlib.h is not included directly; presumably it arrives through one of the project headers -- confirm. */ +#include "pjdlog.h" +#include "proto.h" +#include "proto_impl.h" + /* + * Connection handle. pc_magic holds PROTO_CONN_MAGIC while the handle is + * valid, pc_proto points to the protocol implementation, pc_ctx is the + * context returned by that implementation and pc_side is one of the + * PROTO_SIDE_* values. + */ +#define PROTO_CONN_MAGIC 0x907041c +struct proto_conn { + int pc_magic; + struct proto *pc_proto; + void *pc_ctx; + int pc_side; +#define PROTO_SIDE_CLIENT 0 +#define PROTO_SIDE_SERVER_LISTEN 1 +#define PROTO_SIDE_SERVER_WORK 2 +}; + /* List of registered protocols; the default protocol is kept at the tail. */ +static TAILQ_HEAD(, proto) protos = TAILQ_HEAD_INITIALIZER(protos); + /* + * Register a protocol implementation. + * A non-default protocol is inserted at the head of the list and the + * default one at the tail, so the default is tried last when the list is + * walked in order. Only one default protocol may be registered (asserted). + */ +void +proto_register(struct proto *proto, bool isdefault) +{ + static bool seen_default = false; + + if (!isdefault) + TAILQ_INSERT_HEAD(&protos, proto, prt_next); + else { + PJDLOG_ASSERT(!seen_default); + seen_default = true; + TAILQ_INSERT_TAIL(&protos, proto, prt_next); + } +} + /* + * Allocate a connection handle for the given protocol and side. + * Returns NULL when malloc() fails. pc_ctx is not initialised here; the + * caller is expected to set it. 
+ */ +static struct proto_conn * +proto_alloc(struct proto *proto, int side) +{ + struct proto_conn *conn; + + PJDLOG_ASSERT(proto != NULL); + PJDLOG_ASSERT(side == PROTO_SIDE_CLIENT || + side == PROTO_SIDE_SERVER_LISTEN || + side == PROTO_SIDE_SERVER_WORK); + + conn = malloc(sizeof(*conn)); + if (conn != NULL) { + conn->pc_proto = proto; + conn->pc_side = side; + conn->pc_magic = PROTO_CONN_MAGIC; + } + return (conn); +} + /* + * Validate, wipe and free a connection handle. + * The handle is zeroed first, which also clears pc_magic so a stale + * pointer fails the magic assertions. The protocol context (pc_ctx) is + * not released here. + */ +static void +proto_free(struct proto_conn *conn) +{ + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_side == PROTO_SIDE_CLIENT || + conn->pc_side == PROTO_SIDE_SERVER_LISTEN || + conn->pc_side == PROTO_SIDE_SERVER_WORK); + PJDLOG_ASSERT(conn->pc_proto != NULL); + + bzero(conn, sizeof(*conn)); + free(conn); +} + /* + * Common part of proto_client() and proto_server(). + * Offers the address to every registered protocol in list order until + * one of them does not answer -1. On success a new handle is stored in + * *connp and 0 is returned. On failure -1 is returned and errno is set: + * EINVAL when no protocol recognised the address, the positive value + * returned by the protocol when it reported an error, or ENOMEM when the + * handle could not be allocated. + */ +static int +proto_common_setup(const char *srcaddr, const char *dstaddr, + struct proto_conn **connp, int side) +{ + struct proto *proto; + struct proto_conn *conn; + void *ctx; + int ret; + + PJDLOG_ASSERT(side == PROTO_SIDE_CLIENT || + side == PROTO_SIDE_SERVER_LISTEN); + + TAILQ_FOREACH(proto, &protos, prt_next) { + 
if (side == PROTO_SIDE_CLIENT) { + if (proto->prt_client == NULL) + ret = -1; + else + ret = proto->prt_client(srcaddr, dstaddr, &ctx); + } else /* if (side == PROTO_SIDE_SERVER_LISTEN) */ { + if (proto->prt_server == NULL) + ret = -1; + else + ret = proto->prt_server(dstaddr, &ctx); + } + /* + * ret == 0 - success + * ret == -1 - dstaddr is not for this protocol + * ret > 0 - right protocol, but an error occurred + */ + if (ret >= 0) + break; + } + if (proto == NULL) { + /* Unrecognized address. */ + errno = EINVAL; + return (-1); + } + if (ret > 0) { + /* An error occurred. */ + errno = ret; + return (-1); + } + conn = proto_alloc(proto, side); + if (conn == NULL) { + /* Give the context back to the protocol, if it offers a close method, so it is not leaked. */ if (proto->prt_close != NULL) + proto->prt_close(ctx); + errno = ENOMEM; + return (-1); + } + conn->pc_ctx = ctx; + *connp = conn; + + return (0); +} + /* + * Create a client-side connection handle for dstaddr, optionally bound + * to srcaddr. Returns 0 and stores the handle in *connp, or -1 with errno + * set as described by proto_common_setup(). + */ +int +proto_client(const char *srcaddr, const char *dstaddr, + struct proto_conn **connp) +{ + + return (proto_common_setup(srcaddr, dstaddr, connp, PROTO_SIDE_CLIENT)); +} + /* + * Connect a client-side handle through the prt_connect method of its + * protocol. timeout must be at least -1 (asserted); its unit and the + * meaning of -1 are defined by the protocol implementation and are not + * visible here. Returns 0 on success, or -1 with errno set to the value + * returned by the protocol. 
+ */ +int +proto_connect(struct proto_conn *conn, int timeout) +{ + int ret; + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_side == PROTO_SIDE_CLIENT); + PJDLOG_ASSERT(conn->pc_proto != NULL); + PJDLOG_ASSERT(conn->pc_proto->prt_connect != NULL); + PJDLOG_ASSERT(timeout >= -1); + + ret = conn->pc_proto->prt_connect(conn->pc_ctx, timeout); + if (ret != 0) { + errno = ret; + return (-1); + } + + return (0); +} + /* + * Call the prt_connect_wait method of a client-side handle. Unlike + * proto_connect(), timeout must be non-negative (asserted). Returns 0 on + * success, or -1 with errno set to the value returned by the protocol. + */ +int +proto_connect_wait(struct proto_conn *conn, int timeout) +{ + int ret; + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_side == PROTO_SIDE_CLIENT); + PJDLOG_ASSERT(conn->pc_proto != NULL); + PJDLOG_ASSERT(conn->pc_proto->prt_connect_wait != NULL); + PJDLOG_ASSERT(timeout >= 0); + + ret = conn->pc_proto->prt_connect_wait(conn->pc_ctx, timeout); + if (ret != 0) { + errno = ret; + return (-1); + } + + return (0); +} + /* + * Create a listening server-side handle for addr (no source address is + * used). Returns 0 and stores the handle in *connp, or -1 with errno set + * as described by proto_common_setup(). + */ +int +proto_server(const char 
*addr, struct proto_conn **connp) +{ + + return (proto_common_setup(NULL, addr, connp, PROTO_SIDE_SERVER_LISTEN)); +} + +int +proto_accept(struct proto_conn *conn, struct proto_conn **newconnp) +{ + struct proto_conn *newconn; + int ret; + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_side == PROTO_SIDE_SERVER_LISTEN); + PJDLOG_ASSERT(conn->pc_proto != NULL); + PJDLOG_ASSERT(conn->pc_proto->prt_accept != NULL); + + newconn = proto_alloc(conn->pc_proto, PROTO_SIDE_SERVER_WORK); + if (newconn == NULL) + return (-1); + + ret = conn->pc_proto->prt_accept(conn->pc_ctx, &newconn->pc_ctx); + if (ret != 0) { + proto_free(newconn); + errno = ret; + return (-1); + } + + *newconnp = newconn; + + return (0); +} + +int +proto_send(const struct proto_conn *conn, const void *data, size_t size) +{ + int ret; + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_proto != NULL); + PJDLOG_ASSERT(conn->pc_proto->prt_send != NULL); + + ret = conn->pc_proto->prt_send(conn->pc_ctx, data, size, -1); + if (ret != 0) { + errno = ret; + return (-1); + } + return (0); +} + +int +proto_recv(const struct proto_conn *conn, void *data, size_t size) +{ + int ret; + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_proto != NULL); + PJDLOG_ASSERT(conn->pc_proto->prt_recv != NULL); + + ret = conn->pc_proto->prt_recv(conn->pc_ctx, data, size, NULL); + if (ret != 0) { + errno = ret; + return (-1); + } + return (0); +} + +int +proto_connection_send(const struct proto_conn *conn, struct proto_conn *mconn) +{ + const char *protoname; + int ret, fd; + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_proto != NULL); + PJDLOG_ASSERT(conn->pc_proto->prt_send != NULL); + PJDLOG_ASSERT(mconn != NULL); + PJDLOG_ASSERT(mconn->pc_magic == PROTO_CONN_MAGIC); + 
PJDLOG_ASSERT(mconn->pc_proto != NULL); + fd = proto_descriptor(mconn); + PJDLOG_ASSERT(fd >= 0); + protoname = mconn->pc_proto->prt_name; + PJDLOG_ASSERT(protoname != NULL); + + ret = conn->pc_proto->prt_send(conn->pc_ctx, protoname, + strlen(protoname) + 1, fd); + proto_close(mconn); + if (ret != 0) { + errno = ret; + return (-1); + } + return (0); +} + +int +proto_connection_recv(const struct proto_conn *conn, bool client, + struct proto_conn **newconnp) +{ + char protoname[128]; + struct proto *proto; + struct proto_conn *newconn; + int ret, fd; + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_proto != NULL); + PJDLOG_ASSERT(conn->pc_proto->prt_recv != NULL); + PJDLOG_ASSERT(newconnp != NULL); + + bzero(protoname, sizeof(protoname)); + + ret = conn->pc_proto->prt_recv(conn->pc_ctx, protoname, + sizeof(protoname) - 1, &fd); + if (ret != 0) { + errno = ret; + return (-1); + } + + PJDLOG_ASSERT(fd >= 0); + + TAILQ_FOREACH(proto, &protos, prt_next) { + if (strcmp(proto->prt_name, protoname) == 0) + break; + } + if (proto == NULL) { + errno = EINVAL; + return (-1); + } + + newconn = proto_alloc(proto, + client ? 
PROTO_SIDE_CLIENT : PROTO_SIDE_SERVER_WORK); + if (newconn == NULL) + return (-1); + PJDLOG_ASSERT(newconn->pc_proto->prt_wrap != NULL); + ret = newconn->pc_proto->prt_wrap(fd, client, &newconn->pc_ctx); + if (ret != 0) { + proto_free(newconn); + errno = ret; + return (-1); + } + + *newconnp = newconn; + + return (0); +} + +int +proto_descriptor(const struct proto_conn *conn) +{ + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_proto != NULL); + PJDLOG_ASSERT(conn->pc_proto->prt_descriptor != NULL); + + return (conn->pc_proto->prt_descriptor(conn->pc_ctx)); +} + +bool +proto_address_match(const struct proto_conn *conn, const char *addr) +{ + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_proto != NULL); + PJDLOG_ASSERT(conn->pc_proto->prt_address_match != NULL); + + return (conn->pc_proto->prt_address_match(conn->pc_ctx, addr)); +} + +void +proto_local_address(const struct proto_conn *conn, char *addr, size_t size) +{ + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_proto != NULL); + PJDLOG_ASSERT(conn->pc_proto->prt_local_address != NULL); + + conn->pc_proto->prt_local_address(conn->pc_ctx, addr, size); +} + +void +proto_remote_address(const struct proto_conn *conn, char *addr, size_t size) +{ + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_proto != NULL); + PJDLOG_ASSERT(conn->pc_proto->prt_remote_address != NULL); + + conn->pc_proto->prt_remote_address(conn->pc_ctx, addr, size); +} + +int +proto_timeout(const struct proto_conn *conn, int timeout) +{ + struct timeval tv; + int fd; + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_proto != NULL); + + fd = proto_descriptor(conn); + if (fd == -1) + return (-1); + + tv.tv_sec = timeout; + tv.tv_usec = 0; + if 
(setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)) == -1) + return (-1); + if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1) + return (-1); + + return (0); +} + +void +proto_close(struct proto_conn *conn) +{ + + PJDLOG_ASSERT(conn != NULL); + PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC); + PJDLOG_ASSERT(conn->pc_proto != NULL); + PJDLOG_ASSERT(conn->pc_proto->prt_close != NULL); + + conn->pc_proto->prt_close(conn->pc_ctx); + proto_free(conn); +} diff --git a/sbin/hastd/proto.h b/sbin/hastd/proto.h new file mode 100644 index 0000000..1a60e5b --- /dev/null +++ b/sbin/hastd/proto.h @@ -0,0 +1,61 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef	_PROTO_H_
#define	_PROTO_H_

#include <stdbool.h>	/* bool */
#include <stdlib.h>	/* size_t */

/* Opaque connection handle; the layout is private to proto.c. */
struct proto_conn;

/*
 * Public protocol-independent connection API.
 * The functions returning int return 0 on success and -1 on failure
 * (proto_descriptor() returns the descriptor itself).
 */

/* Create a client handle for dstaddr; srcaddr is passed to the protocol. */
int proto_client(const char *srcaddr, const char *dstaddr,
    struct proto_conn **connp);
/* Start connecting a client handle; timeout must be >= -1. */
int proto_connect(struct proto_conn *conn, int timeout);
/* Wait for a started connect to finish; timeout must be >= 0. */
int proto_connect_wait(struct proto_conn *conn, int timeout);
/* Create a listening handle for addr. */
int proto_server(const char *addr, struct proto_conn **connp);
/* Accept a connection on a listening handle into *newconnp. */
int proto_accept(struct proto_conn *conn, struct proto_conn **newconnp);
/* Send/receive size bytes of data. */
int proto_send(const struct proto_conn *conn, const void *data, size_t size);
int proto_recv(const struct proto_conn *conn, void *data, size_t size);
/* Pass mconn's descriptor over conn; mconn is always closed afterwards. */
int proto_connection_send(const struct proto_conn *conn,
    struct proto_conn *mconn);
/* Receive a connection sent with proto_connection_send(). */
int proto_connection_recv(const struct proto_conn *conn, bool client,
    struct proto_conn **newconnp);
/* Descriptor reported by the protocol for this connection. */
int proto_descriptor(const struct proto_conn *conn);
/* Ask the protocol whether addr matches this connection. */
bool proto_address_match(const struct proto_conn *conn, const char *addr);
/* Have the protocol store the local/remote address in addr[size]. */
void proto_local_address(const struct proto_conn *conn, char *addr,
    size_t size);
void proto_remote_address(const struct proto_conn *conn, char *addr,
    size_t size);
/* Set send and receive timeouts (seconds) on the connection's descriptor. */
int proto_timeout(const struct proto_conn *conn, int timeout);
/* Close the protocol context and free the handle. */
void proto_close(struct proto_conn *conn);

#endif	/* !_PROTO_H_ */
diff --git a/sbin/hastd/proto_common.c b/sbin/hastd/proto_common.c new file mode 100644 index 0000000..843366b --- /dev/null +++ b/sbin/hastd/proto_common.c @@ -0,0 +1,232 @@ +/*- + 
* Copyright (c) 2009-2010 The FreeBSD Foundation + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/socket.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdlib.h> +#include <strings.h> +#include <unistd.h> + +#include "pjdlog.h" +#include "proto_impl.h" + +/* Maximum size of packet we want to use when sending data. 
*/ +#ifndef MAX_SEND_SIZE +#define MAX_SEND_SIZE 32768 +#endif + +static bool +blocking_socket(int sock) +{ + int flags; + + flags = fcntl(sock, F_GETFL); + PJDLOG_ASSERT(flags >= 0); + return ((flags & O_NONBLOCK) == 0); +} + +static int +proto_descriptor_send(int sock, int fd) +{ + unsigned char ctrl[CMSG_SPACE(sizeof(fd))]; + struct msghdr msg; + struct cmsghdr *cmsg; + + PJDLOG_ASSERT(sock >= 0); + PJDLOG_ASSERT(fd >= 0); + + bzero(&msg, sizeof(msg)); + bzero(&ctrl, sizeof(ctrl)); + + msg.msg_iov = NULL; + msg.msg_iovlen = 0; + msg.msg_control = ctrl; + msg.msg_controllen = sizeof(ctrl); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(fd)); + bcopy(&fd, CMSG_DATA(cmsg), sizeof(fd)); + + if (sendmsg(sock, &msg, 0) == -1) + return (errno); + + return (0); +} + +int +proto_common_send(int sock, const unsigned char *data, size_t size, int fd) +{ + ssize_t done; + size_t sendsize; + int errcount = 0; + + PJDLOG_ASSERT(sock >= 0); + + if (data == NULL) { + /* The caller is just trying to decide about direction. */ + + PJDLOG_ASSERT(size == 0); + + if (shutdown(sock, SHUT_RD) == -1) + return (errno); + return (0); + } + + PJDLOG_ASSERT(data != NULL); + PJDLOG_ASSERT(size > 0); + + do { + sendsize = size < MAX_SEND_SIZE ? size : MAX_SEND_SIZE; + done = send(sock, data, sendsize, MSG_NOSIGNAL); + if (done == 0) { + return (ENOTCONN); + } else if (done == -1) { + if (errno == EINTR) + continue; + if (errno == ENOBUFS) { + /* + * If there are no buffers we retry. + * After each try we increase delay before the + * next one and we give up after fifteen times. + * This gives 11s of total wait time. 
+ */ + if (errcount == 15) { + pjdlog_warning("Getting ENOBUFS errors for 11s on send(), giving up."); + } else { + if (errcount == 0) + pjdlog_warning("Got ENOBUFS error on send(), retrying for a bit."); + errcount++; + usleep(100000 * errcount); + continue; + } + } + /* + * If this is blocking socket and we got EAGAIN, this + * means the request timed out. Translate errno to + * ETIMEDOUT, to give administrator a hint to + * eventually increase timeout. + */ + if (errno == EAGAIN && blocking_socket(sock)) + errno = ETIMEDOUT; + return (errno); + } + data += done; + size -= done; + } while (size > 0); + if (errcount > 0) { + pjdlog_info("Data sent successfully after %d ENOBUFS error%s.", + errcount, errcount == 1 ? "" : "s"); + } + + if (fd == -1) + return (0); + return (proto_descriptor_send(sock, fd)); +} + +static int +proto_descriptor_recv(int sock, int *fdp) +{ + unsigned char ctrl[CMSG_SPACE(sizeof(*fdp))]; + struct msghdr msg; + struct cmsghdr *cmsg; + + PJDLOG_ASSERT(sock >= 0); + PJDLOG_ASSERT(fdp != NULL); + + bzero(&msg, sizeof(msg)); + bzero(&ctrl, sizeof(ctrl)); + + msg.msg_iov = NULL; + msg.msg_iovlen = 0; + msg.msg_control = ctrl; + msg.msg_controllen = sizeof(ctrl); + + if (recvmsg(sock, &msg, 0) == -1) + return (errno); + + cmsg = CMSG_FIRSTHDR(&msg); + if (cmsg == NULL || cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) { + return (EINVAL); + } + bcopy(CMSG_DATA(cmsg), fdp, sizeof(*fdp)); + + return (0); +} + +int +proto_common_recv(int sock, unsigned char *data, size_t size, int *fdp) +{ + ssize_t done; + + PJDLOG_ASSERT(sock >= 0); + + if (data == NULL) { + /* The caller is just trying to decide about direction. 
*/ + + PJDLOG_ASSERT(size == 0); + + if (shutdown(sock, SHUT_WR) == -1) + return (errno); + return (0); + } + + PJDLOG_ASSERT(data != NULL); + PJDLOG_ASSERT(size > 0); + + do { + done = recv(sock, data, size, MSG_WAITALL); + } while (done == -1 && errno == EINTR); + if (done == 0) { + return (ENOTCONN); + } else if (done == -1) { + /* + * If this is blocking socket and we got EAGAIN, this + * means the request timed out. Translate errno to + * ETIMEDOUT, to give administrator a hint to + * eventually increase timeout. + */ + if (errno == EAGAIN && blocking_socket(sock)) + errno = ETIMEDOUT; + return (errno); + } + if (fdp == NULL) + return (0); + return (proto_descriptor_recv(sock, fdp)); +} diff --git a/sbin/hastd/proto_impl.h b/sbin/hastd/proto_impl.h new file mode 100644 index 0000000..d62f26f --- /dev/null +++ b/sbin/hastd/proto_impl.h @@ -0,0 +1,79 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef	_PROTO_IMPL_H_
#define	_PROTO_IMPL_H_

#include <sys/queue.h>

#include <stdbool.h>	/* bool */
#include <stdlib.h>	/* size_t */

/* Marks a function to be run automatically before main(). */
#define	__constructor	__attribute__((constructor))

/*
 * Callback types implemented by each protocol. The void * argument is the
 * protocol-private context; the void ** argument is where a newly created
 * context is stored.
 */
typedef int prt_client_t(const char *, const char *, void **);
typedef int prt_connect_t(void *, int);
typedef int prt_connect_wait_t(void *, int);
typedef int prt_server_t(const char *, void **);
typedef int prt_accept_t(void *, void **);
typedef int prt_wrap_t(int, bool, void **);
typedef int prt_send_t(void *, const unsigned char *, size_t, int);
typedef int prt_recv_t(void *, unsigned char *, size_t, int *);
typedef int prt_descriptor_t(const void *);
typedef bool prt_address_match_t(const void *, const char *);
typedef void prt_local_address_t(const void *, char *, size_t);
typedef void prt_remote_address_t(const void *, char *, size_t);
typedef void prt_close_t(void *);

/*
 * Description of one protocol: its name plus the callbacks it provides.
 * A protocol may leave callbacks it does not support set to NULL.
 */
struct proto {
	const char		*prt_name;	/* Protocol name. */
	prt_client_t		*prt_client;
	prt_connect_t		*prt_connect;
	prt_connect_wait_t	*prt_connect_wait;
	prt_server_t		*prt_server;
	prt_accept_t		*prt_accept;
	prt_wrap_t		*prt_wrap;
	prt_send_t		*prt_send;
	prt_recv_t		*prt_recv;
	prt_descriptor_t	*prt_descriptor;
	prt_address_match_t	*prt_address_match;
	prt_local_address_t	*prt_local_address;
	prt_remote_address_t	*prt_remote_address;
	prt_close_t		*prt_close;
	TAILQ_ENTRY(proto)	 prt_next;	/* Linkage in the protocol list. */
};

/* Add a protocol to the list of known protocols. */
void proto_register(struct proto *proto, bool isdefault);

/*
 * Socket send/receive helpers shared by protocol implementations; fd / fdp
 * optionally carry a descriptor to pass along with the data.
 */
int proto_common_send(int sock, const unsigned char *data, size_t size, int fd);
int proto_common_recv(int sock, unsigned char *data, size_t size, int *fdp);

#endif	/* !_PROTO_IMPL_H_ */
diff --git a/sbin/hastd/proto_socketpair.c b/sbin/hastd/proto_socketpair.c new file mode 100644 index 0000000..d13caa9 --- /dev/null +++ b/sbin/hastd/proto_socketpair.c @@ -0,0 +1,237 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/socket.h> + +#include <errno.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include "pjdlog.h" +#include "proto_impl.h" + +#define SP_CTX_MAGIC 0x50c3741 +struct sp_ctx { + int sp_magic; + int sp_fd[2]; + int sp_side; +#define SP_SIDE_UNDEF 0 +#define SP_SIDE_CLIENT 1 +#define SP_SIDE_SERVER 2 +}; + +static void sp_close(void *ctx); + +static int +sp_client(const char *srcaddr, const char *dstaddr, void **ctxp) +{ + struct sp_ctx *spctx; + int ret; + + if (strcmp(dstaddr, "socketpair://") != 0) + return (-1); + + PJDLOG_ASSERT(srcaddr == NULL); + + spctx = malloc(sizeof(*spctx)); + if (spctx == NULL) + return (errno); + + if (socketpair(PF_UNIX, SOCK_STREAM, 0, spctx->sp_fd) == -1) { + ret = errno; + free(spctx); + return (ret); + } + + spctx->sp_side = SP_SIDE_UNDEF; + spctx->sp_magic = SP_CTX_MAGIC; + *ctxp = spctx; + + return (0); +} + +static int +sp_send(void *ctx, const unsigned char *data, size_t size, int fd) +{ + struct sp_ctx *spctx = ctx; + int sock; + + PJDLOG_ASSERT(spctx != NULL); + PJDLOG_ASSERT(spctx->sp_magic == SP_CTX_MAGIC); + + switch (spctx->sp_side) { + case SP_SIDE_UNDEF: + /* + * If the first operation done by the caller is proto_send(), + * we assume this is the client. + */ + /* FALLTHROUGH */ + spctx->sp_side = SP_SIDE_CLIENT; + /* Close other end. */ + close(spctx->sp_fd[1]); + spctx->sp_fd[1] = -1; + case SP_SIDE_CLIENT: + PJDLOG_ASSERT(spctx->sp_fd[0] >= 0); + sock = spctx->sp_fd[0]; + break; + case SP_SIDE_SERVER: + PJDLOG_ASSERT(spctx->sp_fd[1] >= 0); + sock = spctx->sp_fd[1]; + break; + default: + PJDLOG_ABORT("Invalid socket side (%d).", spctx->sp_side); + } + + /* Someone is just trying to decide about side. 
*/ + if (data == NULL) + return (0); + + return (proto_common_send(sock, data, size, fd)); +} + +static int +sp_recv(void *ctx, unsigned char *data, size_t size, int *fdp) +{ + struct sp_ctx *spctx = ctx; + int fd; + + PJDLOG_ASSERT(spctx != NULL); + PJDLOG_ASSERT(spctx->sp_magic == SP_CTX_MAGIC); + + switch (spctx->sp_side) { + case SP_SIDE_UNDEF: + /* + * If the first operation done by the caller is proto_recv(), + * we assume this is the server. + */ + /* FALLTHROUGH */ + spctx->sp_side = SP_SIDE_SERVER; + /* Close other end. */ + close(spctx->sp_fd[0]); + spctx->sp_fd[0] = -1; + case SP_SIDE_SERVER: + PJDLOG_ASSERT(spctx->sp_fd[1] >= 0); + fd = spctx->sp_fd[1]; + break; + case SP_SIDE_CLIENT: + PJDLOG_ASSERT(spctx->sp_fd[0] >= 0); + fd = spctx->sp_fd[0]; + break; + default: + PJDLOG_ABORT("Invalid socket side (%d).", spctx->sp_side); + } + + /* Someone is just trying to decide about side. */ + if (data == NULL) + return (0); + + return (proto_common_recv(fd, data, size, fdp)); +} + +static int +sp_descriptor(const void *ctx) +{ + const struct sp_ctx *spctx = ctx; + + PJDLOG_ASSERT(spctx != NULL); + PJDLOG_ASSERT(spctx->sp_magic == SP_CTX_MAGIC); + PJDLOG_ASSERT(spctx->sp_side == SP_SIDE_CLIENT || + spctx->sp_side == SP_SIDE_SERVER); + + switch (spctx->sp_side) { + case SP_SIDE_CLIENT: + PJDLOG_ASSERT(spctx->sp_fd[0] >= 0); + return (spctx->sp_fd[0]); + case SP_SIDE_SERVER: + PJDLOG_ASSERT(spctx->sp_fd[1] >= 0); + return (spctx->sp_fd[1]); + } + + PJDLOG_ABORT("Invalid socket side (%d).", spctx->sp_side); +} + +static void +sp_close(void *ctx) +{ + struct sp_ctx *spctx = ctx; + + PJDLOG_ASSERT(spctx != NULL); + PJDLOG_ASSERT(spctx->sp_magic == SP_CTX_MAGIC); + + switch (spctx->sp_side) { + case SP_SIDE_UNDEF: + PJDLOG_ASSERT(spctx->sp_fd[0] >= 0); + close(spctx->sp_fd[0]); + spctx->sp_fd[0] = -1; + PJDLOG_ASSERT(spctx->sp_fd[1] >= 0); + close(spctx->sp_fd[1]); + spctx->sp_fd[1] = -1; + break; + case SP_SIDE_CLIENT: + PJDLOG_ASSERT(spctx->sp_fd[0] >= 0); + 
close(spctx->sp_fd[0]); + spctx->sp_fd[0] = -1; + PJDLOG_ASSERT(spctx->sp_fd[1] == -1); + break; + case SP_SIDE_SERVER: + PJDLOG_ASSERT(spctx->sp_fd[1] >= 0); + close(spctx->sp_fd[1]); + spctx->sp_fd[1] = -1; + PJDLOG_ASSERT(spctx->sp_fd[0] == -1); + break; + default: + PJDLOG_ABORT("Invalid socket side (%d).", spctx->sp_side); + } + + spctx->sp_magic = 0; + free(spctx); +} + +static struct proto sp_proto = { + .prt_name = "socketpair", + .prt_client = sp_client, + .prt_send = sp_send, + .prt_recv = sp_recv, + .prt_descriptor = sp_descriptor, + .prt_close = sp_close +}; + +static __constructor void +sp_ctor(void) +{ + + proto_register(&sp_proto, false); +} diff --git a/sbin/hastd/proto_tcp.c b/sbin/hastd/proto_tcp.c new file mode 100644 index 0000000..6dc0661 --- /dev/null +++ b/sbin/hastd/proto_tcp.c @@ -0,0 +1,637 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> /* MAXHOSTNAMELEN */ +#include <sys/socket.h> + +#include <arpa/inet.h> + +#include <netinet/in.h> +#include <netinet/tcp.h> + +#include <errno.h> +#include <fcntl.h> +#include <netdb.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include "pjdlog.h" +#include "proto_impl.h" +#include "subr.h" + +#define TCP_CTX_MAGIC 0x7c41c +struct tcp_ctx { + int tc_magic; + struct sockaddr_storage tc_sa; + int tc_fd; + int tc_side; +#define TCP_SIDE_CLIENT 0 +#define TCP_SIDE_SERVER_LISTEN 1 +#define TCP_SIDE_SERVER_WORK 2 +}; + +static int tcp_connect_wait(void *ctx, int timeout); +static void tcp_close(void *ctx); + +/* + * Function converts the given string to unsigned number. + */ +static int +numfromstr(const char *str, intmax_t minnum, intmax_t maxnum, intmax_t *nump) +{ + intmax_t digit, num; + + if (str[0] == '\0') + goto invalid; /* Empty string. */ + num = 0; + for (; *str != '\0'; str++) { + if (*str < '0' || *str > '9') + goto invalid; /* Non-digit character. */ + digit = *str - '0'; + if (num > num * 10 + digit) + goto invalid; /* Overflow. */ + num = num * 10 + digit; + if (num > maxnum) + goto invalid; /* Too big. */ + } + if (num < minnum) + goto invalid; /* Too small. 
*/ + *nump = num; + return (0); +invalid: + errno = EINVAL; + return (-1); +} + +static int +tcp_addr(const char *addr, int defport, struct sockaddr_storage *sap) +{ + char iporhost[MAXHOSTNAMELEN], portstr[6]; + struct addrinfo hints; + struct addrinfo *res; + const char *pp; + intmax_t port; + size_t size; + int error; + + if (addr == NULL) + return (-1); + + bzero(&hints, sizeof(hints)); + hints.ai_flags = AI_ADDRCONFIG | AI_NUMERICSERV; + hints.ai_family = PF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = IPPROTO_TCP; + + if (strncasecmp(addr, "tcp4://", 7) == 0) { + addr += 7; + hints.ai_family = PF_INET; + } else if (strncasecmp(addr, "tcp6://", 7) == 0) { + addr += 7; + hints.ai_family = PF_INET6; + } else if (strncasecmp(addr, "tcp://", 6) == 0) { + addr += 6; + } else { + /* + * Because TCP is the default assume IP or host is given without + * prefix. + */ + } + + /* + * Extract optional port. + * There are three cases to consider. + * 1. hostname with port, eg. freefall.freebsd.org:8457 + * 2. IPv4 address with port, eg. 192.168.0.101:8457 + * 3. IPv6 address with port, eg. [fe80::1]:8457 + * We discover IPv6 address by checking for two colons and if port is + * given, the address has to start with [. + */ + pp = NULL; + if (strchr(addr, ':') != strrchr(addr, ':')) { + if (addr[0] == '[') + pp = strrchr(addr, ':'); + } else { + pp = strrchr(addr, ':'); + } + if (pp == NULL) { + /* Port not given, use the default. */ + port = defport; + } else { + if (numfromstr(pp + 1, 1, 65535, &port) == -1) + return (errno); + } + (void)snprintf(portstr, sizeof(portstr), "%jd", (intmax_t)port); + /* Extract host name or IP address. 
*/ + if (pp == NULL) { + size = sizeof(iporhost); + if (strlcpy(iporhost, addr, size) >= size) + return (ENAMETOOLONG); + } else if (addr[0] == '[' && pp[-1] == ']') { + size = (size_t)(pp - addr - 2 + 1); + if (size > sizeof(iporhost)) + return (ENAMETOOLONG); + (void)strlcpy(iporhost, addr + 1, size); + } else { + size = (size_t)(pp - addr + 1); + if (size > sizeof(iporhost)) + return (ENAMETOOLONG); + (void)strlcpy(iporhost, addr, size); + } + + error = getaddrinfo(iporhost, portstr, &hints, &res); + if (error != 0) { + pjdlog_debug(1, "getaddrinfo(%s, %s) failed: %s.", iporhost, + portstr, gai_strerror(error)); + return (EINVAL); + } + if (res == NULL) + return (ENOENT); + + memcpy(sap, res->ai_addr, res->ai_addrlen); + + freeaddrinfo(res); + + return (0); +} + +static int +tcp_setup_new(const char *addr, int side, void **ctxp) +{ + struct tcp_ctx *tctx; + int ret, nodelay; + + PJDLOG_ASSERT(addr != NULL); + PJDLOG_ASSERT(side == TCP_SIDE_CLIENT || + side == TCP_SIDE_SERVER_LISTEN); + PJDLOG_ASSERT(ctxp != NULL); + + tctx = malloc(sizeof(*tctx)); + if (tctx == NULL) + return (errno); + + /* Parse given address. */ + if ((ret = tcp_addr(addr, PROTO_TCP_DEFAULT_PORT, &tctx->tc_sa)) != 0) { + free(tctx); + return (ret); + } + + PJDLOG_ASSERT(tctx->tc_sa.ss_family != AF_UNSPEC); + + tctx->tc_fd = socket(tctx->tc_sa.ss_family, SOCK_STREAM, 0); + if (tctx->tc_fd == -1) { + ret = errno; + free(tctx); + return (ret); + } + + PJDLOG_ASSERT(tctx->tc_sa.ss_family != AF_UNSPEC); + + /* Socket settings. 
*/ + nodelay = 1; + if (setsockopt(tctx->tc_fd, IPPROTO_TCP, TCP_NODELAY, &nodelay, + sizeof(nodelay)) == -1) { + pjdlog_errno(LOG_WARNING, "Unable to set TCP_NOELAY"); + } + + tctx->tc_side = side; + tctx->tc_magic = TCP_CTX_MAGIC; + *ctxp = tctx; + + return (0); +} + +static int +tcp_setup_wrap(int fd, int side, void **ctxp) +{ + struct tcp_ctx *tctx; + + PJDLOG_ASSERT(fd >= 0); + PJDLOG_ASSERT(side == TCP_SIDE_CLIENT || + side == TCP_SIDE_SERVER_WORK); + PJDLOG_ASSERT(ctxp != NULL); + + tctx = malloc(sizeof(*tctx)); + if (tctx == NULL) + return (errno); + + tctx->tc_fd = fd; + tctx->tc_sa.ss_family = AF_UNSPEC; + tctx->tc_side = side; + tctx->tc_magic = TCP_CTX_MAGIC; + *ctxp = tctx; + + return (0); +} + +static int +tcp_client(const char *srcaddr, const char *dstaddr, void **ctxp) +{ + struct tcp_ctx *tctx; + struct sockaddr_storage sa; + int ret; + + ret = tcp_setup_new(dstaddr, TCP_SIDE_CLIENT, ctxp); + if (ret != 0) + return (ret); + tctx = *ctxp; + if (srcaddr == NULL) + return (0); + ret = tcp_addr(srcaddr, 0, &sa); + if (ret != 0) { + tcp_close(tctx); + return (ret); + } + if (bind(tctx->tc_fd, (struct sockaddr *)&sa, sa.ss_len) == -1) { + ret = errno; + tcp_close(tctx); + return (ret); + } + return (0); +} + +static int +tcp_connect(void *ctx, int timeout) +{ + struct tcp_ctx *tctx = ctx; + int error, flags; + + PJDLOG_ASSERT(tctx != NULL); + PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC); + PJDLOG_ASSERT(tctx->tc_side == TCP_SIDE_CLIENT); + PJDLOG_ASSERT(tctx->tc_fd >= 0); + PJDLOG_ASSERT(tctx->tc_sa.ss_family != AF_UNSPEC); + PJDLOG_ASSERT(timeout >= -1); + + flags = fcntl(tctx->tc_fd, F_GETFL); + if (flags == -1) { + pjdlog_common(LOG_DEBUG, 1, errno, "fcntl(F_GETFL) failed"); + return (errno); + } + /* + * We make socket non-blocking so we can handle connection timeout + * manually. 
+ */ + flags |= O_NONBLOCK; + if (fcntl(tctx->tc_fd, F_SETFL, flags) == -1) { + pjdlog_common(LOG_DEBUG, 1, errno, + "fcntl(F_SETFL, O_NONBLOCK) failed"); + return (errno); + } + + if (connect(tctx->tc_fd, (struct sockaddr *)&tctx->tc_sa, + tctx->tc_sa.ss_len) == 0) { + if (timeout == -1) + return (0); + error = 0; + goto done; + } + if (errno != EINPROGRESS) { + error = errno; + pjdlog_common(LOG_DEBUG, 1, errno, "connect() failed"); + goto done; + } + if (timeout == -1) + return (0); + return (tcp_connect_wait(ctx, timeout)); +done: + flags &= ~O_NONBLOCK; + if (fcntl(tctx->tc_fd, F_SETFL, flags) == -1) { + if (error == 0) + error = errno; + pjdlog_common(LOG_DEBUG, 1, errno, + "fcntl(F_SETFL, ~O_NONBLOCK) failed"); + } + return (error); +} + +static int +tcp_connect_wait(void *ctx, int timeout) +{ + struct tcp_ctx *tctx = ctx; + struct timeval tv; + fd_set fdset; + socklen_t esize; + int error, flags, ret; + + PJDLOG_ASSERT(tctx != NULL); + PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC); + PJDLOG_ASSERT(tctx->tc_side == TCP_SIDE_CLIENT); + PJDLOG_ASSERT(tctx->tc_fd >= 0); + PJDLOG_ASSERT(timeout >= 0); + + tv.tv_sec = timeout; + tv.tv_usec = 0; +again: + FD_ZERO(&fdset); + FD_SET(tctx->tc_fd, &fdset); + ret = select(tctx->tc_fd + 1, NULL, &fdset, NULL, &tv); + if (ret == 0) { + error = ETIMEDOUT; + goto done; + } else if (ret == -1) { + if (errno == EINTR) + goto again; + error = errno; + pjdlog_common(LOG_DEBUG, 1, errno, "select() failed"); + goto done; + } + PJDLOG_ASSERT(ret > 0); + PJDLOG_ASSERT(FD_ISSET(tctx->tc_fd, &fdset)); + esize = sizeof(error); + if (getsockopt(tctx->tc_fd, SOL_SOCKET, SO_ERROR, &error, + &esize) == -1) { + error = errno; + pjdlog_common(LOG_DEBUG, 1, errno, + "getsockopt(SO_ERROR) failed"); + goto done; + } + if (error != 0) { + pjdlog_common(LOG_DEBUG, 1, error, + "getsockopt(SO_ERROR) returned error"); + goto done; + } + error = 0; +done: + flags = fcntl(tctx->tc_fd, F_GETFL); + if (flags == -1) { + if (error == 0) + error = 
errno; + pjdlog_common(LOG_DEBUG, 1, errno, "fcntl(F_GETFL) failed"); + return (error); + } + flags &= ~O_NONBLOCK; + if (fcntl(tctx->tc_fd, F_SETFL, flags) == -1) { + if (error == 0) + error = errno; + pjdlog_common(LOG_DEBUG, 1, errno, + "fcntl(F_SETFL, ~O_NONBLOCK) failed"); + } + return (error); +} + +static int +tcp_server(const char *addr, void **ctxp) +{ + struct tcp_ctx *tctx; + int ret, val; + + ret = tcp_setup_new(addr, TCP_SIDE_SERVER_LISTEN, ctxp); + if (ret != 0) + return (ret); + + tctx = *ctxp; + + val = 1; + /* Ignore failure. */ + (void)setsockopt(tctx->tc_fd, SOL_SOCKET, SO_REUSEADDR, &val, + sizeof(val)); + + PJDLOG_ASSERT(tctx->tc_sa.ss_family != AF_UNSPEC); + + if (bind(tctx->tc_fd, (struct sockaddr *)&tctx->tc_sa, + tctx->tc_sa.ss_len) == -1) { + ret = errno; + tcp_close(tctx); + return (ret); + } + if (listen(tctx->tc_fd, 8) == -1) { + ret = errno; + tcp_close(tctx); + return (ret); + } + + return (0); +} + +static int +tcp_accept(void *ctx, void **newctxp) +{ + struct tcp_ctx *tctx = ctx; + struct tcp_ctx *newtctx; + socklen_t fromlen; + int ret; + + PJDLOG_ASSERT(tctx != NULL); + PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC); + PJDLOG_ASSERT(tctx->tc_side == TCP_SIDE_SERVER_LISTEN); + PJDLOG_ASSERT(tctx->tc_fd >= 0); + PJDLOG_ASSERT(tctx->tc_sa.ss_family != AF_UNSPEC); + + newtctx = malloc(sizeof(*newtctx)); + if (newtctx == NULL) + return (errno); + + fromlen = tctx->tc_sa.ss_len; + newtctx->tc_fd = accept(tctx->tc_fd, (struct sockaddr *)&tctx->tc_sa, + &fromlen); + if (newtctx->tc_fd == -1) { + ret = errno; + free(newtctx); + return (ret); + } + + newtctx->tc_side = TCP_SIDE_SERVER_WORK; + newtctx->tc_magic = TCP_CTX_MAGIC; + *newctxp = newtctx; + + return (0); +} + +static int +tcp_wrap(int fd, bool client, void **ctxp) +{ + + return (tcp_setup_wrap(fd, + client ? 
TCP_SIDE_CLIENT : TCP_SIDE_SERVER_WORK, ctxp)); +} + +static int +tcp_send(void *ctx, const unsigned char *data, size_t size, int fd) +{ + struct tcp_ctx *tctx = ctx; + + PJDLOG_ASSERT(tctx != NULL); + PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC); + PJDLOG_ASSERT(tctx->tc_fd >= 0); + PJDLOG_ASSERT(fd == -1); + + return (proto_common_send(tctx->tc_fd, data, size, -1)); +} + +static int +tcp_recv(void *ctx, unsigned char *data, size_t size, int *fdp) +{ + struct tcp_ctx *tctx = ctx; + + PJDLOG_ASSERT(tctx != NULL); + PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC); + PJDLOG_ASSERT(tctx->tc_fd >= 0); + PJDLOG_ASSERT(fdp == NULL); + + return (proto_common_recv(tctx->tc_fd, data, size, NULL)); +} + +static int +tcp_descriptor(const void *ctx) +{ + const struct tcp_ctx *tctx = ctx; + + PJDLOG_ASSERT(tctx != NULL); + PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC); + + return (tctx->tc_fd); +} + +static bool +tcp_address_match(const void *ctx, const char *addr) +{ + const struct tcp_ctx *tctx = ctx; + struct sockaddr_storage sa1, sa2; + socklen_t salen; + + PJDLOG_ASSERT(tctx != NULL); + PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC); + + if (tcp_addr(addr, PROTO_TCP_DEFAULT_PORT, &sa1) != 0) + return (false); + + salen = sizeof(sa2); + if (getpeername(tctx->tc_fd, (struct sockaddr *)&sa2, &salen) == -1) + return (false); + + if (sa1.ss_family != sa2.ss_family || sa1.ss_len != sa2.ss_len) + return (false); + + switch (sa1.ss_family) { + case AF_INET: + { + struct sockaddr_in *sin1, *sin2; + + sin1 = (struct sockaddr_in *)&sa1; + sin2 = (struct sockaddr_in *)&sa2; + + return (memcmp(&sin1->sin_addr, &sin2->sin_addr, + sizeof(sin1->sin_addr)) == 0); + } + case AF_INET6: + { + struct sockaddr_in6 *sin1, *sin2; + + sin1 = (struct sockaddr_in6 *)&sa1; + sin2 = (struct sockaddr_in6 *)&sa2; + + return (memcmp(&sin1->sin6_addr, &sin2->sin6_addr, + sizeof(sin1->sin6_addr)) == 0); + } + default: + return (false); + } +} + +static void +tcp_local_address(const void *ctx, char *addr, 
size_t size) +{ + const struct tcp_ctx *tctx = ctx; + struct sockaddr_storage sa; + socklen_t salen; + + PJDLOG_ASSERT(tctx != NULL); + PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC); + + salen = sizeof(sa); + if (getsockname(tctx->tc_fd, (struct sockaddr *)&sa, &salen) == -1) { + PJDLOG_VERIFY(strlcpy(addr, "N/A", size) < size); + return; + } + PJDLOG_VERIFY(snprintf(addr, size, "tcp://%S", &sa) < (ssize_t)size); +} + +static void +tcp_remote_address(const void *ctx, char *addr, size_t size) +{ + const struct tcp_ctx *tctx = ctx; + struct sockaddr_storage sa; + socklen_t salen; + + PJDLOG_ASSERT(tctx != NULL); + PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC); + + salen = sizeof(sa); + if (getpeername(tctx->tc_fd, (struct sockaddr *)&sa, &salen) == -1) { + PJDLOG_VERIFY(strlcpy(addr, "N/A", size) < size); + return; + } + PJDLOG_VERIFY(snprintf(addr, size, "tcp://%S", &sa) < (ssize_t)size); +} + +static void +tcp_close(void *ctx) +{ + struct tcp_ctx *tctx = ctx; + + PJDLOG_ASSERT(tctx != NULL); + PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC); + + if (tctx->tc_fd >= 0) + close(tctx->tc_fd); + tctx->tc_magic = 0; + free(tctx); +} + +static struct proto tcp_proto = { + .prt_name = "tcp", + .prt_client = tcp_client, + .prt_connect = tcp_connect, + .prt_connect_wait = tcp_connect_wait, + .prt_server = tcp_server, + .prt_accept = tcp_accept, + .prt_wrap = tcp_wrap, + .prt_send = tcp_send, + .prt_recv = tcp_recv, + .prt_descriptor = tcp_descriptor, + .prt_address_match = tcp_address_match, + .prt_local_address = tcp_local_address, + .prt_remote_address = tcp_remote_address, + .prt_close = tcp_close +}; + +static __constructor void +tcp_ctor(void) +{ + + proto_register(&tcp_proto, true); +} diff --git a/sbin/hastd/proto_uds.c b/sbin/hastd/proto_uds.c new file mode 100644 index 0000000..087b788 --- /dev/null +++ b/sbin/hastd/proto_uds.c @@ -0,0 +1,361 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. 
+ * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* UDS - UNIX Domain Socket */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include <errno.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include "pjdlog.h" +#include "proto_impl.h" + +#define UDS_CTX_MAGIC 0xd541c +struct uds_ctx { + int uc_magic; + struct sockaddr_un uc_sun; + int uc_fd; + int uc_side; +#define UDS_SIDE_CLIENT 0 +#define UDS_SIDE_SERVER_LISTEN 1 +#define UDS_SIDE_SERVER_WORK 2 + pid_t uc_owner; +}; + +static void uds_close(void *ctx); + +static int +uds_addr(const char *addr, struct sockaddr_un *sunp) +{ + + if (addr == NULL) + return (-1); + + if (strncasecmp(addr, "uds://", 6) == 0) + addr += 6; + else if (strncasecmp(addr, "unix://", 7) == 0) + addr += 7; + else if (addr[0] == '/' && /* If it starts from /... */ + strstr(addr, "://") == NULL)/* ...and there is no prefix... */ + ; /* ...we assume its us. */ + else + return (-1); + + sunp->sun_family = AF_UNIX; + if (strlcpy(sunp->sun_path, addr, sizeof(sunp->sun_path)) >= + sizeof(sunp->sun_path)) { + return (ENAMETOOLONG); + } + sunp->sun_len = SUN_LEN(sunp); + + return (0); +} + +static int +uds_common_setup(const char *addr, void **ctxp, int side) +{ + struct uds_ctx *uctx; + int ret; + + uctx = malloc(sizeof(*uctx)); + if (uctx == NULL) + return (errno); + + /* Parse given address. 
*/ + if ((ret = uds_addr(addr, &uctx->uc_sun)) != 0) { + free(uctx); + return (ret); + } + + uctx->uc_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (uctx->uc_fd == -1) { + ret = errno; + free(uctx); + return (ret); + } + + uctx->uc_side = side; + uctx->uc_owner = 0; + uctx->uc_magic = UDS_CTX_MAGIC; + *ctxp = uctx; + + return (0); +} + +static int +uds_client(const char *srcaddr, const char *dstaddr, void **ctxp) +{ + int ret; + + ret = uds_common_setup(dstaddr, ctxp, UDS_SIDE_CLIENT); + if (ret != 0) + return (ret); + + PJDLOG_ASSERT(srcaddr == NULL); + + return (0); +} + +static int +uds_connect(void *ctx, int timeout) +{ + struct uds_ctx *uctx = ctx; + + PJDLOG_ASSERT(uctx != NULL); + PJDLOG_ASSERT(uctx->uc_magic == UDS_CTX_MAGIC); + PJDLOG_ASSERT(uctx->uc_side == UDS_SIDE_CLIENT); + PJDLOG_ASSERT(uctx->uc_fd >= 0); + PJDLOG_ASSERT(timeout >= -1); + + if (connect(uctx->uc_fd, (struct sockaddr *)&uctx->uc_sun, + sizeof(uctx->uc_sun)) == -1) { + return (errno); + } + + return (0); +} + +static int +uds_connect_wait(void *ctx, int timeout) +{ + struct uds_ctx *uctx = ctx; + + PJDLOG_ASSERT(uctx != NULL); + PJDLOG_ASSERT(uctx->uc_magic == UDS_CTX_MAGIC); + PJDLOG_ASSERT(uctx->uc_side == UDS_SIDE_CLIENT); + PJDLOG_ASSERT(uctx->uc_fd >= 0); + PJDLOG_ASSERT(timeout >= 0); + + return (0); +} + +static int +uds_server(const char *addr, void **ctxp) +{ + struct uds_ctx *uctx; + int ret; + + ret = uds_common_setup(addr, ctxp, UDS_SIDE_SERVER_LISTEN); + if (ret != 0) + return (ret); + + uctx = *ctxp; + + (void)unlink(uctx->uc_sun.sun_path); + if (bind(uctx->uc_fd, (struct sockaddr *)&uctx->uc_sun, + sizeof(uctx->uc_sun)) == -1) { + ret = errno; + uds_close(uctx); + return (ret); + } + uctx->uc_owner = getpid(); + if (listen(uctx->uc_fd, 8) == -1) { + ret = errno; + uds_close(uctx); + return (ret); + } + + return (0); +} + +static int +uds_accept(void *ctx, void **newctxp) +{ + struct uds_ctx *uctx = ctx; + struct uds_ctx *newuctx; + socklen_t fromlen; + int ret; + + 
PJDLOG_ASSERT(uctx != NULL); + PJDLOG_ASSERT(uctx->uc_magic == UDS_CTX_MAGIC); + PJDLOG_ASSERT(uctx->uc_side == UDS_SIDE_SERVER_LISTEN); + PJDLOG_ASSERT(uctx->uc_fd >= 0); + + newuctx = malloc(sizeof(*newuctx)); + if (newuctx == NULL) + return (errno); + + fromlen = sizeof(newuctx->uc_sun); + newuctx->uc_fd = accept(uctx->uc_fd, + (struct sockaddr *)&newuctx->uc_sun, &fromlen); + if (newuctx->uc_fd == -1) { + ret = errno; + free(newuctx); + return (ret); + } + + newuctx->uc_side = UDS_SIDE_SERVER_WORK; + newuctx->uc_magic = UDS_CTX_MAGIC; + *newctxp = newuctx; + + return (0); +} + +static int +uds_send(void *ctx, const unsigned char *data, size_t size, int fd) +{ + struct uds_ctx *uctx = ctx; + + PJDLOG_ASSERT(uctx != NULL); + PJDLOG_ASSERT(uctx->uc_magic == UDS_CTX_MAGIC); + PJDLOG_ASSERT(uctx->uc_fd >= 0); + + return (proto_common_send(uctx->uc_fd, data, size, fd)); +} + +static int +uds_recv(void *ctx, unsigned char *data, size_t size, int *fdp) +{ + struct uds_ctx *uctx = ctx; + + PJDLOG_ASSERT(uctx != NULL); + PJDLOG_ASSERT(uctx->uc_magic == UDS_CTX_MAGIC); + PJDLOG_ASSERT(uctx->uc_fd >= 0); + + return (proto_common_recv(uctx->uc_fd, data, size, fdp)); +} + +static int +uds_descriptor(const void *ctx) +{ + const struct uds_ctx *uctx = ctx; + + PJDLOG_ASSERT(uctx != NULL); + PJDLOG_ASSERT(uctx->uc_magic == UDS_CTX_MAGIC); + + return (uctx->uc_fd); +} + +static void +uds_local_address(const void *ctx, char *addr, size_t size) +{ + const struct uds_ctx *uctx = ctx; + struct sockaddr_un sun; + socklen_t sunlen; + + PJDLOG_ASSERT(uctx != NULL); + PJDLOG_ASSERT(uctx->uc_magic == UDS_CTX_MAGIC); + PJDLOG_ASSERT(addr != NULL); + + sunlen = sizeof(sun); + if (getsockname(uctx->uc_fd, (struct sockaddr *)&sun, &sunlen) == -1) { + PJDLOG_VERIFY(strlcpy(addr, "N/A", size) < size); + return; + } + PJDLOG_ASSERT(sun.sun_family == AF_UNIX); + if (sun.sun_path[0] == '\0') { + PJDLOG_VERIFY(strlcpy(addr, "N/A", size) < size); + return; + } + PJDLOG_VERIFY(snprintf(addr, size, 
"uds://%s", sun.sun_path) < (ssize_t)size); +} + +static void +uds_remote_address(const void *ctx, char *addr, size_t size) +{ + const struct uds_ctx *uctx = ctx; + struct sockaddr_un sun; + socklen_t sunlen; + + PJDLOG_ASSERT(uctx != NULL); + PJDLOG_ASSERT(uctx->uc_magic == UDS_CTX_MAGIC); + PJDLOG_ASSERT(addr != NULL); + + sunlen = sizeof(sun); + if (getpeername(uctx->uc_fd, (struct sockaddr *)&sun, &sunlen) == -1) { + PJDLOG_VERIFY(strlcpy(addr, "N/A", size) < size); + return; + } + PJDLOG_ASSERT(sun.sun_family == AF_UNIX); + if (sun.sun_path[0] == '\0') { + PJDLOG_VERIFY(strlcpy(addr, "N/A", size) < size); + return; + } + snprintf(addr, size, "uds://%s", sun.sun_path); +} + +static void +uds_close(void *ctx) +{ + struct uds_ctx *uctx = ctx; + + PJDLOG_ASSERT(uctx != NULL); + PJDLOG_ASSERT(uctx->uc_magic == UDS_CTX_MAGIC); + + if (uctx->uc_fd >= 0) + close(uctx->uc_fd); + /* + * Unlink the socket only if we are the owner and this is descriptor + * we listen on. + */ + if (uctx->uc_side == UDS_SIDE_SERVER_LISTEN && + uctx->uc_owner == getpid()) { + PJDLOG_ASSERT(uctx->uc_sun.sun_path[0] != '\0'); + if (unlink(uctx->uc_sun.sun_path) == -1) { + pjdlog_errno(LOG_WARNING, + "Unable to unlink socket file %s", + uctx->uc_sun.sun_path); + } + } + uctx->uc_owner = 0; + uctx->uc_magic = 0; + free(uctx); +} + +static struct proto uds_proto = { + .prt_name = "uds", + .prt_client = uds_client, + .prt_connect = uds_connect, + .prt_connect_wait = uds_connect_wait, + .prt_server = uds_server, + .prt_accept = uds_accept, + .prt_send = uds_send, + .prt_recv = uds_recv, + .prt_descriptor = uds_descriptor, + .prt_local_address = uds_local_address, + .prt_remote_address = uds_remote_address, + .prt_close = uds_close +}; + +static __constructor void +uds_ctor(void) +{ + + proto_register(&uds_proto, false); +} diff --git a/sbin/hastd/rangelock.c b/sbin/hastd/rangelock.c new file mode 100644 index 0000000..e14c5b8 --- /dev/null +++ b/sbin/hastd/rangelock.c @@ -0,0 +1,141 @@ +/*- + * 
Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/queue.h> + +#include <stdbool.h> +#include <stdlib.h> +#include <unistd.h> + +#include <pjdlog.h> + +#include "rangelock.h" + +#ifndef PJDLOG_ASSERT +#include <assert.h> +#define PJDLOG_ASSERT(...) assert(__VA_ARGS__) +#endif + +#define RANGELOCKS_MAGIC 0x94310c +struct rangelocks { + int rls_magic; /* Magic value. */ + TAILQ_HEAD(, rlock) rls_locks; /* List of locked ranges. 
*/ +}; + +struct rlock { + off_t rl_start; + off_t rl_end; + TAILQ_ENTRY(rlock) rl_next; +}; + +int +rangelock_init(struct rangelocks **rlsp) +{ + struct rangelocks *rls; + + PJDLOG_ASSERT(rlsp != NULL); + + rls = malloc(sizeof(*rls)); + if (rls == NULL) + return (-1); + + TAILQ_INIT(&rls->rls_locks); + + rls->rls_magic = RANGELOCKS_MAGIC; + *rlsp = rls; + + return (0); +} + +void +rangelock_free(struct rangelocks *rls) +{ + struct rlock *rl; + + PJDLOG_ASSERT(rls->rls_magic == RANGELOCKS_MAGIC); + + rls->rls_magic = 0; + + while ((rl = TAILQ_FIRST(&rls->rls_locks)) != NULL) { + TAILQ_REMOVE(&rls->rls_locks, rl, rl_next); + free(rl); + } + free(rls); +} + +int +rangelock_add(struct rangelocks *rls, off_t offset, off_t length) +{ + struct rlock *rl; + + PJDLOG_ASSERT(rls->rls_magic == RANGELOCKS_MAGIC); + + rl = malloc(sizeof(*rl)); + if (rl == NULL) + return (-1); + rl->rl_start = offset; + rl->rl_end = offset + length; + TAILQ_INSERT_TAIL(&rls->rls_locks, rl, rl_next); + return (0); +} + +void +rangelock_del(struct rangelocks *rls, off_t offset, off_t length) +{ + struct rlock *rl; + + PJDLOG_ASSERT(rls->rls_magic == RANGELOCKS_MAGIC); + + TAILQ_FOREACH(rl, &rls->rls_locks, rl_next) { + if (rl->rl_start == offset && rl->rl_end == offset + length) + break; + } + PJDLOG_ASSERT(rl != NULL); + TAILQ_REMOVE(&rls->rls_locks, rl, rl_next); + free(rl); +} + +bool +rangelock_islocked(struct rangelocks *rls, off_t offset, off_t length) +{ + struct rlock *rl; + off_t end; + + PJDLOG_ASSERT(rls->rls_magic == RANGELOCKS_MAGIC); + + end = offset + length; + TAILQ_FOREACH(rl, &rls->rls_locks, rl_next) { + if (rl->rl_start < end && rl->rl_end > offset) + break; + } + return (rl != NULL); +} diff --git a/sbin/hastd/rangelock.h b/sbin/hastd/rangelock.h new file mode 100644 index 0000000..2ad9895 --- /dev/null +++ b/sbin/hastd/rangelock.h @@ -0,0 +1,46 @@ +/*- + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. 
+ * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _RANGELOCK_H_ +#define _RANGELOCK_H_ + +#include <stdbool.h> +#include <unistd.h> + +struct rangelocks; + +int rangelock_init(struct rangelocks **rlsp); +void rangelock_free(struct rangelocks *rls); +int rangelock_add(struct rangelocks *rls, off_t offset, off_t length); +void rangelock_del(struct rangelocks *rls, off_t offset, off_t length); +bool rangelock_islocked(struct rangelocks *rls, off_t offset, off_t length); + +#endif /* !_RANGELOCK_H_ */ diff --git a/sbin/hastd/refcnt.h b/sbin/hastd/refcnt.h new file mode 100644 index 0000000..1246043 --- /dev/null +++ b/sbin/hastd/refcnt.h @@ -0,0 +1,66 @@ +/*- + * Copyright (c) 2005 John Baldwin <jhb@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __REFCNT_H__ +#define __REFCNT_H__ + +#include <machine/atomic.h> + +#include "pjdlog.h" + +typedef unsigned int refcnt_t; + +static __inline void +refcnt_init(refcnt_t *count, unsigned int v) +{ + + *count = v; +} + +static __inline void +refcnt_acquire(refcnt_t *count) +{ + + atomic_add_acq_int(count, 1); +} + +static __inline unsigned int +refcnt_release(refcnt_t *count) +{ + unsigned int old; + + /* XXX: Should this have a rel membar? */ + old = atomic_fetchadd_int(count, -1); + PJDLOG_ASSERT(old > 0); + return (old - 1); +} + +#endif /* ! __REFCNT_H__ */ diff --git a/sbin/hastd/secondary.c b/sbin/hastd/secondary.c new file mode 100644 index 0000000..067c5d9 --- /dev/null +++ b/sbin/hastd/secondary.c @@ -0,0 +1,915 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/time.h> +#include <sys/bio.h> +#include <sys/disk.h> +#include <sys/stat.h> + +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <libgeom.h> +#include <pthread.h> +#include <signal.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> + +#include <activemap.h> +#include <nv.h> +#include <pjdlog.h> + +#include "control.h" +#include "event.h" +#include "hast.h" +#include "hast_proto.h" +#include "hastd.h" +#include "hooks.h" +#include "metadata.h" +#include "proto.h" +#include "subr.h" +#include "synch.h" + +struct hio { + uint64_t hio_seq; + int hio_error; + void *hio_data; + uint8_t hio_cmd; + uint64_t hio_offset; + uint64_t hio_length; + bool hio_memsync; + TAILQ_ENTRY(hio) hio_next; +}; + +static struct hast_resource *gres; + +/* + * Free list holds unused structures. 
When free list is empty, we have to wait + * until some in-progress requests are freed. + */ +static TAILQ_HEAD(, hio) hio_free_list; +static pthread_mutex_t hio_free_list_lock; +static pthread_cond_t hio_free_list_cond; +/* + * Disk thread (the one that does I/O requests) takes requests from this list. + */ +static TAILQ_HEAD(, hio) hio_disk_list; +static pthread_mutex_t hio_disk_list_lock; +static pthread_cond_t hio_disk_list_cond; +/* + * Thread that sends requests back to primary takes requests from this list. + */ +static TAILQ_HEAD(, hio) hio_send_list; +static pthread_mutex_t hio_send_list_lock; +static pthread_cond_t hio_send_list_cond; + +/* + * Maximum number of outstanding I/O requests. + */ +#define HAST_HIO_MAX 256 + +static void *recv_thread(void *arg); +static void *disk_thread(void *arg); +static void *send_thread(void *arg); + +#define QUEUE_INSERT(name, hio) do { \ + bool _wakeup; \ + \ + mtx_lock(&hio_##name##_list_lock); \ + _wakeup = TAILQ_EMPTY(&hio_##name##_list); \ + TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_next); \ + mtx_unlock(&hio_##name##_list_lock); \ + if (_wakeup) \ + cv_broadcast(&hio_##name##_list_cond); \ +} while (0) +#define QUEUE_TAKE(name, hio) do { \ + mtx_lock(&hio_##name##_list_lock); \ + while (((hio) = TAILQ_FIRST(&hio_##name##_list)) == NULL) { \ + cv_wait(&hio_##name##_list_cond, \ + &hio_##name##_list_lock); \ + } \ + TAILQ_REMOVE(&hio_##name##_list, (hio), hio_next); \ + mtx_unlock(&hio_##name##_list_lock); \ +} while (0) + +static void +hio_clear(struct hio *hio) +{ + + hio->hio_seq = 0; + hio->hio_error = 0; + hio->hio_cmd = HIO_UNDEF; + hio->hio_offset = 0; + hio->hio_length = 0; + hio->hio_memsync = false; +} + +static void +hio_copy(const struct hio *srchio, struct hio *dsthio) +{ + + /* + * We don't copy hio_error, hio_data and hio_next fields. 
+ */ + + dsthio->hio_seq = srchio->hio_seq; + dsthio->hio_cmd = srchio->hio_cmd; + dsthio->hio_offset = srchio->hio_offset; + dsthio->hio_length = srchio->hio_length; + dsthio->hio_memsync = srchio->hio_memsync; +} + +static void +init_environment(void) +{ + struct hio *hio; + unsigned int ii; + + /* + * Initialize lists, their locks and theirs condition variables. + */ + TAILQ_INIT(&hio_free_list); + mtx_init(&hio_free_list_lock); + cv_init(&hio_free_list_cond); + TAILQ_INIT(&hio_disk_list); + mtx_init(&hio_disk_list_lock); + cv_init(&hio_disk_list_cond); + TAILQ_INIT(&hio_send_list); + mtx_init(&hio_send_list_lock); + cv_init(&hio_send_list_cond); + + /* + * Allocate requests pool and initialize requests. + */ + for (ii = 0; ii < HAST_HIO_MAX; ii++) { + hio = malloc(sizeof(*hio)); + if (hio == NULL) { + pjdlog_exitx(EX_TEMPFAIL, + "Unable to allocate memory (%zu bytes) for hio request.", + sizeof(*hio)); + } + hio->hio_data = malloc(MAXPHYS); + if (hio->hio_data == NULL) { + pjdlog_exitx(EX_TEMPFAIL, + "Unable to allocate memory (%zu bytes) for gctl_data.", + (size_t)MAXPHYS); + } + hio_clear(hio); + TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_next); + } +} + +static void +init_local(struct hast_resource *res) +{ + + if (metadata_read(res, true) == -1) + exit(EX_NOINPUT); +} + +static void +init_remote(struct hast_resource *res, struct nv *nvin) +{ + uint64_t resuid; + struct nv *nvout; + unsigned char *map; + size_t mapsize; + +#ifdef notyet + /* Setup direction. 
*/ + if (proto_send(res->hr_remoteout, NULL, 0) == -1) + pjdlog_errno(LOG_WARNING, "Unable to set connection direction"); +#endif + + nvout = nv_alloc(); + nv_add_int64(nvout, (int64_t)res->hr_datasize, "datasize"); + nv_add_int32(nvout, (int32_t)res->hr_extentsize, "extentsize"); + resuid = nv_get_uint64(nvin, "resuid"); + res->hr_primary_localcnt = nv_get_uint64(nvin, "localcnt"); + res->hr_primary_remotecnt = nv_get_uint64(nvin, "remotecnt"); + nv_add_uint64(nvout, res->hr_secondary_localcnt, "localcnt"); + nv_add_uint64(nvout, res->hr_secondary_remotecnt, "remotecnt"); + mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize - + METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize); + map = malloc(mapsize); + if (map == NULL) { + pjdlog_exitx(EX_TEMPFAIL, + "Unable to allocate memory (%zu bytes) for activemap.", + mapsize); + } + /* + * When we work as primary and secondary is missing we will increase + * localcnt in our metadata. When secondary is connected and synced + * we make localcnt be equal to remotecnt, which means nodes are more + * or less in sync. + * Split-brain condition is when both nodes are not able to communicate + * and are both configured as primary nodes. In turn, they can both + * make incompatible changes to the data and we have to detect that. + * Under split-brain condition we will increase our localcnt on first + * write and remote node will increase its localcnt on first write. + * When we connect we can see that primary's localcnt is greater than + * our remotecnt (primary was modified while we weren't watching) and + * our localcnt is greater than primary's remotecnt (we were modified + * while primary wasn't watching). + * There are many possible combinations which are all gathered below. + * Don't pay too much attention to the exact numbers; what matters + * is how they compare. We compare secondary's local with primary's + * remote and secondary's remote with primary's local. 
+ * Note that every case where primary's localcnt is smaller than + * secondary's remotecnt and where secondary's localcnt is smaller than + * primary's remotecnt should be impossible in practice. We will perform + * full synchronization then. Those cases are marked with an asterisk. + * Regular synchronization means that only extents marked as dirty are + * synchronized. + * + * SECONDARY METADATA PRIMARY METADATA + * local=3 remote=3 local=2 remote=2* ?! Full sync from secondary. + * local=3 remote=3 local=2 remote=3* ?! Full sync from primary. + * local=3 remote=3 local=2 remote=4* ?! Full sync from primary. + * local=3 remote=3 local=3 remote=2 Primary is out-of-date, + * regular sync from secondary. + * local=3 remote=3 local=3 remote=3 Regular sync just in case. + * local=3 remote=3 local=3 remote=4* ?! Full sync from primary. + * local=3 remote=3 local=4 remote=2 Split-brain condition. + * local=3 remote=3 local=4 remote=3 Secondary out-of-date, + * regular sync from primary. + * local=3 remote=3 local=4 remote=4* ?! Full sync from primary. + */ + if (res->hr_resuid == 0) { + /* + * Provider is used for the first time. If the primary node has + * done no writes yet either (we will find the "virgin" argument) + * then there is no need to synchronize anything. If the primary + * node has already done writes we have to synchronize everything. 
+ */ + PJDLOG_ASSERT(res->hr_secondary_localcnt == 0); + res->hr_resuid = resuid; + if (metadata_write(res) == -1) + exit(EX_NOINPUT); + if (nv_exists(nvin, "virgin")) { + free(map); + map = NULL; + mapsize = 0; + } else { + memset(map, 0xff, mapsize); + } + nv_add_int8(nvout, 1, "virgin"); + nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); + } else if (res->hr_resuid != resuid) { + char errmsg[256]; + + free(map); + (void)snprintf(errmsg, sizeof(errmsg), + "Resource unique ID mismatch (primary=%ju, secondary=%ju).", + (uintmax_t)resuid, (uintmax_t)res->hr_resuid); + pjdlog_error("%s", errmsg); + nv_add_string(nvout, errmsg, "errmsg"); + if (hast_proto_send(res, res->hr_remotein, nvout, + NULL, 0) == -1) { + pjdlog_exit(EX_TEMPFAIL, + "Unable to send response to %s", + res->hr_remoteaddr); + } + nv_free(nvout); + exit(EX_CONFIG); + } else if ( + /* Is primary out-of-date? */ + (res->hr_secondary_localcnt > res->hr_primary_remotecnt && + res->hr_secondary_remotecnt == res->hr_primary_localcnt) || + /* Are the nodes more or less in sync? */ + (res->hr_secondary_localcnt == res->hr_primary_remotecnt && + res->hr_secondary_remotecnt == res->hr_primary_localcnt) || + /* Is secondary out-of-date? */ + (res->hr_secondary_localcnt == res->hr_primary_remotecnt && + res->hr_secondary_remotecnt < res->hr_primary_localcnt)) { + /* + * Nodes are more or less in sync or one of the nodes is + * out-of-date. + * It doesn't matter at this point which one, we just have to + * send out local bitmap to the remote node. + */ + if (pread(res->hr_localfd, map, mapsize, METADATA_SIZE) != + (ssize_t)mapsize) { + pjdlog_exit(LOG_ERR, "Unable to read activemap"); + } + if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && + res->hr_secondary_remotecnt == res->hr_primary_localcnt) { + /* Primary is out-of-date, sync from secondary. */ + nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc"); + } else { + /* + * Secondary is out-of-date or counts match. + * Sync from primary. 
+ */ + nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); + } + } else if (res->hr_secondary_localcnt > res->hr_primary_remotecnt && + res->hr_primary_localcnt > res->hr_secondary_remotecnt) { + /* + * Not good, we have a split-brain condition. + */ + free(map); + pjdlog_error("Split-brain detected, exiting."); + nv_add_string(nvout, "Split-brain condition!", "errmsg"); + if (hast_proto_send(res, res->hr_remotein, nvout, + NULL, 0) == -1) { + pjdlog_exit(EX_TEMPFAIL, + "Unable to send response to %s", + res->hr_remoteaddr); + } + nv_free(nvout); + /* Exit on split-brain. */ + event_send(res, EVENT_SPLITBRAIN); + exit(EX_CONFIG); + } else /* if (res->hr_secondary_localcnt < res->hr_primary_remotecnt || + res->hr_primary_localcnt < res->hr_secondary_remotecnt) */ { + /* + * This should never happen in practice, but we will perform + * full synchronization. + */ + PJDLOG_ASSERT(res->hr_secondary_localcnt < res->hr_primary_remotecnt || + res->hr_primary_localcnt < res->hr_secondary_remotecnt); + mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize - + METADATA_SIZE, res->hr_extentsize, + res->hr_local_sectorsize); + memset(map, 0xff, mapsize); + if (res->hr_secondary_localcnt > res->hr_primary_remotecnt) { + /* In this one of five cases sync from secondary. */ + nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc"); + } else { + /* For the remaining four cases sync from primary. 
*/ + nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc"); + } + pjdlog_warning("This should never happen, asking for full synchronization (primary(local=%ju, remote=%ju), secondary(local=%ju, remote=%ju)).", + (uintmax_t)res->hr_primary_localcnt, + (uintmax_t)res->hr_primary_remotecnt, + (uintmax_t)res->hr_secondary_localcnt, + (uintmax_t)res->hr_secondary_remotecnt); + } + nv_add_uint32(nvout, (uint32_t)mapsize, "mapsize"); + if (hast_proto_send(res, res->hr_remotein, nvout, map, mapsize) == -1) { + pjdlog_exit(EX_TEMPFAIL, "Unable to send activemap to %s", + res->hr_remoteaddr); + } + if (map != NULL) + free(map); + nv_free(nvout); +#ifdef notyet + /* Setup direction. */ + if (proto_recv(res->hr_remotein, NULL, 0) == -1) + pjdlog_errno(LOG_WARNING, "Unable to set connection direction"); +#endif +} + +void +hastd_secondary(struct hast_resource *res, struct nv *nvin) +{ + sigset_t mask; + pthread_t td; + pid_t pid; + int error, mode, debuglevel; + + /* + * Create communication channel between parent and child. + */ + if (proto_client(NULL, "socketpair://", &res->hr_ctrl) == -1) { + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, + "Unable to create control sockets between parent and child"); + } + /* + * Create communication channel between child and parent. + */ + if (proto_client(NULL, "socketpair://", &res->hr_event) == -1) { + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, + "Unable to create event sockets between child and parent"); + } + + pid = fork(); + if (pid == -1) { + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, "Unable to fork"); + } + + if (pid > 0) { + /* This is parent. */ + proto_close(res->hr_remotein); + res->hr_remotein = NULL; + proto_close(res->hr_remoteout); + res->hr_remoteout = NULL; + /* Declare that we are receiver. */ + proto_recv(res->hr_event, NULL, 0); + /* Declare that we are sender. 
*/ + proto_send(res->hr_ctrl, NULL, 0); + res->hr_workerpid = pid; + return; + } + + gres = res; + mode = pjdlog_mode_get(); + debuglevel = pjdlog_debug_get(); + + /* Declare that we are sender. */ + proto_send(res->hr_event, NULL, 0); + /* Declare that we are receiver. */ + proto_recv(res->hr_ctrl, NULL, 0); + descriptors_cleanup(res); + + descriptors_assert(res, mode); + + pjdlog_init(mode); + pjdlog_debug_set(debuglevel); + pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); + setproctitle("%s (%s)", res->hr_name, role2str(res->hr_role)); + + PJDLOG_VERIFY(sigemptyset(&mask) == 0); + PJDLOG_VERIFY(sigprocmask(SIG_SETMASK, &mask, NULL) == 0); + + /* Error in setting timeout is not critical, but why should it fail? */ + if (proto_timeout(res->hr_remotein, 2 * HAST_KEEPALIVE) == -1) + pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); + if (proto_timeout(res->hr_remoteout, res->hr_timeout) == -1) + pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); + + init_local(res); + init_environment(); + + if (drop_privs(res) != 0) + exit(EX_CONFIG); + pjdlog_info("Privileges successfully dropped."); + + /* + * Create the control thread before sending any event to the parent, + * as we can deadlock when parent sends control request to worker, + * but worker has no control thread started yet, so parent waits. + * In the meantime worker sends an event to the parent, but parent + * is unable to handle the event, because it waits for control + * request response. + */ + error = pthread_create(&td, NULL, ctrl_thread, res); + PJDLOG_ASSERT(error == 0); + + init_remote(res, nvin); + event_send(res, EVENT_CONNECT); + + error = pthread_create(&td, NULL, recv_thread, res); + PJDLOG_ASSERT(error == 0); + error = pthread_create(&td, NULL, disk_thread, res); + PJDLOG_ASSERT(error == 0); + (void)send_thread(res); +} + +static void +reqlog(int loglevel, int debuglevel, int error, struct hio *hio, + const char *fmt, ...) 
+{ + char msg[1024]; + va_list ap; + int len; + + va_start(ap, fmt); + len = vsnprintf(msg, sizeof(msg), fmt, ap); + va_end(ap); + if ((size_t)len < sizeof(msg)) { + switch (hio->hio_cmd) { + case HIO_READ: + (void)snprintf(msg + len, sizeof(msg) - len, + "READ(%ju, %ju).", (uintmax_t)hio->hio_offset, + (uintmax_t)hio->hio_length); + break; + case HIO_DELETE: + (void)snprintf(msg + len, sizeof(msg) - len, + "DELETE(%ju, %ju).", (uintmax_t)hio->hio_offset, + (uintmax_t)hio->hio_length); + break; + case HIO_FLUSH: + (void)snprintf(msg + len, sizeof(msg) - len, "FLUSH."); + break; + case HIO_WRITE: + (void)snprintf(msg + len, sizeof(msg) - len, + "WRITE(%ju, %ju).", (uintmax_t)hio->hio_offset, + (uintmax_t)hio->hio_length); + break; + case HIO_KEEPALIVE: + (void)snprintf(msg + len, sizeof(msg) - len, "KEEPALIVE."); + break; + default: + (void)snprintf(msg + len, sizeof(msg) - len, + "UNKNOWN(%u).", (unsigned int)hio->hio_cmd); + break; + } + } + pjdlog_common(loglevel, debuglevel, error, "%s", msg); +} + +static int +requnpack(struct hast_resource *res, struct hio *hio, struct nv *nv) +{ + + hio->hio_cmd = nv_get_uint8(nv, "cmd"); + if (hio->hio_cmd == 0) { + pjdlog_error("Header contains no 'cmd' field."); + hio->hio_error = EINVAL; + goto end; + } + if (hio->hio_cmd != HIO_KEEPALIVE) { + hio->hio_seq = nv_get_uint64(nv, "seq"); + if (hio->hio_seq == 0) { + pjdlog_error("Header contains no 'seq' field."); + hio->hio_error = EINVAL; + goto end; + } + } + switch (hio->hio_cmd) { + case HIO_FLUSH: + case HIO_KEEPALIVE: + break; + case HIO_WRITE: + hio->hio_memsync = nv_exists(nv, "memsync"); + /* FALLTHROUGH */ + case HIO_READ: + case HIO_DELETE: + hio->hio_offset = nv_get_uint64(nv, "offset"); + if (nv_error(nv) != 0) { + pjdlog_error("Header is missing 'offset' field."); + hio->hio_error = EINVAL; + goto end; + } + hio->hio_length = nv_get_uint64(nv, "length"); + if (nv_error(nv) != 0) { + pjdlog_error("Header is missing 'length' field."); + hio->hio_error = EINVAL; + 
goto end; + } + if (hio->hio_length == 0) { + pjdlog_error("Data length is zero."); + hio->hio_error = EINVAL; + goto end; + } + if (hio->hio_cmd != HIO_DELETE && hio->hio_length > MAXPHYS) { + pjdlog_error("Data length is too large (%ju > %ju).", + (uintmax_t)hio->hio_length, (uintmax_t)MAXPHYS); + hio->hio_error = EINVAL; + goto end; + } + if ((hio->hio_offset % res->hr_local_sectorsize) != 0) { + pjdlog_error("Offset %ju is not multiple of sector size.", + (uintmax_t)hio->hio_offset); + hio->hio_error = EINVAL; + goto end; + } + if ((hio->hio_length % res->hr_local_sectorsize) != 0) { + pjdlog_error("Length %ju is not multiple of sector size.", + (uintmax_t)hio->hio_length); + hio->hio_error = EINVAL; + goto end; + } + /* + * Compare without adding offset and length, so that a huge + * offset received from the wire cannot wrap around and slip + * past the range check. + */ + if (hio->hio_length > (uint64_t)res->hr_datasize || + hio->hio_offset > + (uint64_t)res->hr_datasize - hio->hio_length) { + pjdlog_error("Data offset is too large (%ju > %ju).", + (uintmax_t)(hio->hio_offset + hio->hio_length), + (uintmax_t)res->hr_datasize); + hio->hio_error = EINVAL; + goto end; + } + break; + default: + pjdlog_error("Header contains invalid 'cmd' (%hhu).", + hio->hio_cmd); + hio->hio_error = EINVAL; + goto end; + } + hio->hio_error = 0; +end: + return (hio->hio_error); +} +/* + * Log the formatted message together with errno at LOG_ERR, send + * EVENT_DISCONNECT through event_send() and exit with the given exit + * code, which must not be EX_OK. + */ +static __dead2 void +secondary_exit(int exitcode, const char *fmt, ...) +{ + va_list ap; + + PJDLOG_ASSERT(exitcode != EX_OK); + va_start(ap, fmt); + pjdlogv_errno(LOG_ERR, fmt, ap); + va_end(ap); + event_send(gres, EVENT_DISCONNECT); + exit(exitcode); +} + +/* + * Thread receives requests from the primary node. 
+ */ +static void * +recv_thread(void *arg) +{ + struct hast_resource *res = arg; + struct hio *hio, *mshio; + struct nv *nv; + + for (;;) { + pjdlog_debug(2, "recv: Taking free request."); + QUEUE_TAKE(free, hio); + pjdlog_debug(2, "recv: (%p) Got request.", hio); + if (hast_proto_recv_hdr(res->hr_remotein, &nv) == -1) { + secondary_exit(EX_TEMPFAIL, + "Unable to receive request header"); + } + if (requnpack(res, hio, nv) != 0) { + nv_free(nv); + pjdlog_debug(2, + "recv: (%p) Moving request to the send queue.", + hio); + QUEUE_INSERT(send, hio); + continue; + } + switch (hio->hio_cmd) { + case HIO_READ: + res->hr_stat_read++; + break; + case HIO_WRITE: + res->hr_stat_write++; + break; + case HIO_DELETE: + res->hr_stat_delete++; + break; + case HIO_FLUSH: + res->hr_stat_flush++; + break; + case HIO_KEEPALIVE: + break; + default: + PJDLOG_ABORT("Unexpected command (cmd=%hhu).", + hio->hio_cmd); + } + reqlog(LOG_DEBUG, 2, -1, hio, + "recv: (%p) Got request header: ", hio); + if (hio->hio_cmd == HIO_KEEPALIVE) { + nv_free(nv); + pjdlog_debug(2, + "recv: (%p) Moving request to the free queue.", + hio); + hio_clear(hio); + QUEUE_INSERT(free, hio); + continue; + } else if (hio->hio_cmd == HIO_WRITE) { + if (hast_proto_recv_data(res, res->hr_remotein, nv, + hio->hio_data, MAXPHYS) == -1) { + secondary_exit(EX_TEMPFAIL, + "Unable to receive request data"); + } + if (hio->hio_memsync) { + /* + * For memsync requests we expect two replies. + * Clone the hio so we can handle both of them. + */ + pjdlog_debug(2, "recv: Taking free request."); + QUEUE_TAKE(free, mshio); + pjdlog_debug(2, "recv: (%p) Got request.", + mshio); + hio_copy(hio, mshio); + mshio->hio_error = 0; + /* + * We want to keep 'memsync' tag only on the + * request going onto send queue (mshio). 
+ */ + hio->hio_memsync = false; + pjdlog_debug(2, + "recv: (%p) Moving memsync request to the send queue.", + mshio); + QUEUE_INSERT(send, mshio); + } + } + nv_free(nv); + pjdlog_debug(2, "recv: (%p) Moving request to the disk queue.", + hio); + QUEUE_INSERT(disk, hio); + } + /* NOTREACHED */ + return (NULL); +} + +/* + * Thread reads from or writes to local component and also handles DELETE and + * FLUSH requests. + */ +static void * +disk_thread(void *arg) +{ + struct hast_resource *res = arg; + struct hio *hio; + ssize_t ret; + bool clear_activemap, logerror; + + clear_activemap = true; + + for (;;) { + pjdlog_debug(2, "disk: Taking request."); + QUEUE_TAKE(disk, hio); + while (clear_activemap) { + unsigned char *map; + size_t mapsize; + + /* + * When first request is received, it means that primary + * already received our activemap, merged it and stored + * locally. We can now safely clear our activemap. + */ + mapsize = + activemap_calc_ondisk_size(res->hr_local_mediasize - + METADATA_SIZE, res->hr_extentsize, + res->hr_local_sectorsize); + map = calloc(1, mapsize); + if (map == NULL) { + pjdlog_warning("Unable to allocate memory to clear local activemap."); + break; + } + if (pwrite(res->hr_localfd, map, mapsize, + METADATA_SIZE) != (ssize_t)mapsize) { + pjdlog_errno(LOG_WARNING, + "Unable to store cleared activemap"); + free(map); + res->hr_stat_activemap_write_error++; + break; + } + free(map); + clear_activemap = false; + pjdlog_debug(1, "Local activemap cleared."); + break; + } + reqlog(LOG_DEBUG, 2, -1, hio, "disk: (%p) Got request: ", hio); + logerror = true; + /* Handle the actual request. 
*/ + switch (hio->hio_cmd) { + case HIO_READ: + ret = pread(res->hr_localfd, hio->hio_data, + hio->hio_length, + hio->hio_offset + res->hr_localoff); + if (ret == -1) + hio->hio_error = errno; + else if (ret != (int64_t)hio->hio_length) + hio->hio_error = EIO; + else + hio->hio_error = 0; + break; + case HIO_WRITE: + ret = pwrite(res->hr_localfd, hio->hio_data, + hio->hio_length, + hio->hio_offset + res->hr_localoff); + if (ret == -1) + hio->hio_error = errno; + else if (ret != (int64_t)hio->hio_length) + hio->hio_error = EIO; + else + hio->hio_error = 0; + break; + case HIO_DELETE: + ret = g_delete(res->hr_localfd, + hio->hio_offset + res->hr_localoff, + hio->hio_length); + if (ret == -1) + hio->hio_error = errno; + else + hio->hio_error = 0; + break; + case HIO_FLUSH: + if (!res->hr_localflush) { + ret = -1; + hio->hio_error = EOPNOTSUPP; + logerror = false; + break; + } + ret = g_flush(res->hr_localfd); + if (ret == -1) { + if (errno == EOPNOTSUPP) + res->hr_localflush = false; + hio->hio_error = errno; + } else { + hio->hio_error = 0; + } + break; + default: + PJDLOG_ABORT("Unexpected command (cmd=%hhu).", + hio->hio_cmd); + } + if (logerror && hio->hio_error != 0) { + reqlog(LOG_ERR, 0, hio->hio_error, hio, + "Request failed: "); + } + pjdlog_debug(2, "disk: (%p) Moving request to the send queue.", + hio); + QUEUE_INSERT(send, hio); + } + /* NOTREACHED */ + return (NULL); +} + +/* + * Thread sends requests back to primary node. + */ +static void * +send_thread(void *arg) +{ + struct hast_resource *res = arg; + struct nv *nvout; + struct hio *hio; + void *data; + size_t length; + + for (;;) { + pjdlog_debug(2, "send: Taking request."); + QUEUE_TAKE(send, hio); + reqlog(LOG_DEBUG, 2, -1, hio, "send: (%p) Got request: ", hio); + nvout = nv_alloc(); + /* Copy sequence number. 
*/ + nv_add_uint64(nvout, hio->hio_seq, "seq"); + if (hio->hio_memsync) { + PJDLOG_ASSERT(hio->hio_cmd == HIO_WRITE); + nv_add_int8(nvout, 1, "received"); + } + switch (hio->hio_cmd) { + case HIO_READ: + if (hio->hio_error == 0) { + data = hio->hio_data; + length = hio->hio_length; + break; + } + /* + * We send no data in case of an error. + */ + /* FALLTHROUGH */ + case HIO_DELETE: + case HIO_FLUSH: + case HIO_WRITE: + data = NULL; + length = 0; + break; + default: + PJDLOG_ABORT("Unexpected command (cmd=%hhu).", + hio->hio_cmd); + } + if (hio->hio_error != 0) { + switch (hio->hio_cmd) { + case HIO_READ: + res->hr_stat_read_error++; + break; + case HIO_WRITE: + res->hr_stat_write_error++; + break; + case HIO_DELETE: + res->hr_stat_delete_error++; + break; + case HIO_FLUSH: + res->hr_stat_flush_error++; + break; + } + nv_add_int16(nvout, hio->hio_error, "error"); + } + if (hast_proto_send(res, res->hr_remoteout, nvout, data, + length) == -1) { + secondary_exit(EX_TEMPFAIL, "Unable to send reply"); + } + nv_free(nvout); + pjdlog_debug(2, "send: (%p) Moving request to the free queue.", + hio); + hio_clear(hio); + QUEUE_INSERT(free, hio); + } + /* NOTREACHED */ + return (NULL); +} diff --git a/sbin/hastd/subr.c b/sbin/hastd/subr.c new file mode 100644 index 0000000..0e9930b --- /dev/null +++ b/sbin/hastd/subr.c @@ -0,0 +1,299 @@ +/*- + * Copyright (c) 2010 The FreeBSD Foundation + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/disk.h> +#include <sys/ioctl.h> +#include <sys/jail.h> +#include <sys/stat.h> +#ifdef HAVE_CAPSICUM +#include <sys/capability.h> +#include <geom/gate/g_gate.h> +#endif + +#include <errno.h> +#include <fcntl.h> +#include <pwd.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include <pjdlog.h> + +#include "hast.h" +#include "subr.h" + +int +vsnprlcat(char *str, size_t size, const char *fmt, va_list ap) +{ + size_t len; + + len = strlen(str); + return (vsnprintf(str + len, size - len, fmt, ap)); +} + +int +snprlcat(char *str, size_t size, const char *fmt, ...) 
+{ + va_list ap; + int result; + + va_start(ap, fmt); + result = vsnprlcat(str, size, fmt, ap); + va_end(ap); + return (result); +} +/* + * Open the local provider if it is not open yet and record its media size + * and sector size in the resource. Character devices are queried with + * DIOCGMEDIASIZE and DIOCGSECTORSIZE; regular files use st_size and a + * fixed 512-byte sector. Returns 0 on success and -1 on failure. + */ +int +provinfo(struct hast_resource *res, bool dowrite) +{ + struct stat sb; + + PJDLOG_ASSERT(res->hr_localpath != NULL && + res->hr_localpath[0] != '\0'); + + if (res->hr_localfd == -1) { + res->hr_localfd = open(res->hr_localpath, + dowrite ? O_RDWR : O_RDONLY); + if (res->hr_localfd == -1) { + pjdlog_errno(LOG_ERR, "Unable to open %s", + res->hr_localpath); + return (-1); + } + } + if (fstat(res->hr_localfd, &sb) == -1) { + pjdlog_errno(LOG_ERR, "Unable to stat %s", res->hr_localpath); + return (-1); + } + if (S_ISCHR(sb.st_mode)) { + /* + * If this is a character device, it is most likely a GEOM + * provider. + */ + if (ioctl(res->hr_localfd, DIOCGMEDIASIZE, + &res->hr_local_mediasize) == -1) { + pjdlog_errno(LOG_ERR, + "Unable to obtain provider %s mediasize", + res->hr_localpath); + return (-1); + } + if (ioctl(res->hr_localfd, DIOCGSECTORSIZE, + &res->hr_local_sectorsize) == -1) { + pjdlog_errno(LOG_ERR, + "Unable to obtain provider %s sectorsize", + res->hr_localpath); + return (-1); + } + } else if (S_ISREG(sb.st_mode)) { + /* + * We also support regular files for which we hardcode + * sector size of 512 bytes. + */ + res->hr_local_mediasize = sb.st_size; + res->hr_local_sectorsize = 512; + } else { + /* + * We support no other file types. 
+ */ + pjdlog_error("%s is neither GEOM provider nor regular file.", + res->hr_localpath); + errno = EFTYPE; + return (-1); + } + return (0); +} +/* + * Map a HAST_ROLE_* value to its lower-case name; any other value yields + * "unknown". + */ +const char * +role2str(int role) +{ + + switch (role) { + case HAST_ROLE_INIT: + return ("init"); + case HAST_ROLE_PRIMARY: + return ("primary"); + case HAST_ROLE_SECONDARY: + return ("secondary"); + } + return ("unknown"); +} +/* + * Drop privileges to HAST_USER: confine the process with jail(2), falling + * back to chroot(2) into the user's home directory, then switch the group + * list, gid and uid and, when built with HAVE_CAPSICUM, try to enter + * capability mode. Returns 0 on success and -1 on failure. + */ +int +drop_privs(const struct hast_resource *res) +{ + char jailhost[sizeof(res->hr_name) * 2]; + struct jail jailst; + struct passwd *pw; + uid_t ruid, euid, suid; + gid_t rgid, egid, sgid; + gid_t gidset[1]; + bool capsicum, jailed; + + /* + * According to getpwnam(3) we have to clear errno before calling the + * function to be able to distinguish between an error and missing + * entry (which is not treated as error by getpwnam(3)). 
+ */ + errno = 0; + pw = getpwnam(HAST_USER); + if (pw == NULL) { + if (errno != 0) { + pjdlog_errno(LOG_ERR, + "Unable to find info about '%s' user", HAST_USER); + return (-1); + } else { + pjdlog_error("'%s' user doesn't exist.", HAST_USER); + errno = ENOENT; + return (-1); + } + } + + bzero(&jailst, sizeof(jailst)); + jailst.version = JAIL_API_VERSION; + jailst.path = pw->pw_dir; + if (res == NULL) { + (void)snprintf(jailhost, sizeof(jailhost), "hastctl"); + } else { + (void)snprintf(jailhost, sizeof(jailhost), "hastd: %s (%s)", + res->hr_name, role2str(res->hr_role)); + } + jailst.hostname = jailhost; + jailst.jailname = NULL; + jailst.ip4s = 0; + jailst.ip4 = NULL; + jailst.ip6s = 0; + jailst.ip6 = NULL; + if (jail(&jailst) >= 0) { + jailed = true; + } else { + jailed = false; + pjdlog_errno(LOG_WARNING, + "Unable to jail to directory to %s", pw->pw_dir); + if (chroot(pw->pw_dir) == -1) { + pjdlog_errno(LOG_ERR, + "Unable to change root directory to %s", + pw->pw_dir); + return (-1); + } + } + PJDLOG_VERIFY(chdir("/") == 0); + gidset[0] = pw->pw_gid; + if (setgroups(1, gidset) == -1) { + pjdlog_errno(LOG_ERR, "Unable to set groups to gid %u", + (unsigned 
int)pw->pw_gid); + return (-1); + } + if (setgid(pw->pw_gid) == -1) { + pjdlog_errno(LOG_ERR, "Unable to set gid to %u", + (unsigned int)pw->pw_gid); + return (-1); + } + if (setuid(pw->pw_uid) == -1) { + pjdlog_errno(LOG_ERR, "Unable to set uid to %u", + (unsigned int)pw->pw_uid); + return (-1); + } + +#ifdef HAVE_CAPSICUM + capsicum = (cap_enter() == 0); + if (!capsicum) { + pjdlog_common(LOG_DEBUG, 1, errno, + "Unable to sandbox using capsicum"); + } else if (res != NULL) { + cap_rights_t rights; + static const unsigned long geomcmds[] = { + DIOCGDELETE, + DIOCGFLUSH + }; + + PJDLOG_ASSERT(res->hr_role == HAST_ROLE_PRIMARY || + res->hr_role == HAST_ROLE_SECONDARY); + + cap_rights_init(&rights, CAP_FLOCK, CAP_IOCTL, CAP_PREAD, + CAP_PWRITE); + if (cap_rights_limit(res->hr_localfd, &rights) == -1) { + pjdlog_errno(LOG_ERR, + "Unable to limit capability rights on local descriptor"); + } + if (cap_ioctls_limit(res->hr_localfd, geomcmds, + sizeof(geomcmds) / sizeof(geomcmds[0])) == -1) { + pjdlog_errno(LOG_ERR, + "Unable to limit allowed GEOM ioctls"); + } + + if (res->hr_role == HAST_ROLE_PRIMARY) { + static const unsigned long ggatecmds[] = { + G_GATE_CMD_MODIFY, + G_GATE_CMD_START, + G_GATE_CMD_DONE, + G_GATE_CMD_DESTROY + }; + + cap_rights_init(&rights, CAP_IOCTL); + if (cap_rights_limit(res->hr_ggatefd, &rights) == -1) { + pjdlog_errno(LOG_ERR, + "Unable to limit capability rights to CAP_IOCTL on ggate descriptor"); + } + if (cap_ioctls_limit(res->hr_ggatefd, ggatecmds, + sizeof(ggatecmds) / sizeof(ggatecmds[0])) == -1) { + pjdlog_errno(LOG_ERR, + "Unable to limit allowed ggate ioctls"); + } + } + } +#else + capsicum = false; +#endif + + /* + * Better be sure that everything succeeded. 
+ */ + PJDLOG_VERIFY(getresuid(&ruid, &euid, &suid) == 0); + PJDLOG_VERIFY(ruid == pw->pw_uid); + PJDLOG_VERIFY(euid == pw->pw_uid); + PJDLOG_VERIFY(suid == pw->pw_uid); + PJDLOG_VERIFY(getresgid(&rgid, &egid, &sgid) == 0); + PJDLOG_VERIFY(rgid == pw->pw_gid); + PJDLOG_VERIFY(egid == pw->pw_gid); + PJDLOG_VERIFY(sgid == pw->pw_gid); + PJDLOG_VERIFY(getgroups(0, NULL) == 1); + PJDLOG_VERIFY(getgroups(1, gidset) == 1); + PJDLOG_VERIFY(gidset[0] == pw->pw_gid); + + pjdlog_debug(1, + "Privileges successfully dropped using %s%s+setgid+setuid.", + capsicum ? "capsicum+" : "", jailed ? "jail" : "chroot"); + + return (0); +} diff --git a/sbin/hastd/subr.h b/sbin/hastd/subr.h new file mode 100644 index 0000000..c765754 --- /dev/null +++ b/sbin/hastd/subr.h @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 2010 The FreeBSD Foundation + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SUBR_H_ +#define _SUBR_H_ + +#include <sys/types.h> +#include <stdbool.h> + +#include "hast.h" + +#define KEEP_ERRNO(work) do { \ + int _rerrno; \ + \ + _rerrno = errno; \ + work; \ + errno = _rerrno; \ +} while (0) + +int vsnprlcat(char *str, size_t size, const char *fmt, va_list ap); +int snprlcat(char *str, size_t size, const char *fmt, ...); + +int provinfo(struct hast_resource *res, bool dowrite); +const char *role2str(int role); +int drop_privs(const struct hast_resource *res); + +#endif /* !_SUBR_H_ */ diff --git a/sbin/hastd/synch.h b/sbin/hastd/synch.h new file mode 100644 index 0000000..65360fd --- /dev/null +++ b/sbin/hastd/synch.h @@ -0,0 +1,194 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + /* Static inline wrappers around pthread mutexes, read-write locks and condition variables. Each wrapper stores the pthread return value in a local and checks it with PJDLOG_ASSERT instead of returning it to the caller. */+#ifndef _SYNCH_H_ +#define _SYNCH_H_ + +#include <errno.h> +#include <pthread.h> +#include <pthread_np.h> /* NOTE(review): presumably pulled in for pthread_mutex_isowned_np(), used by mtx_owned() - confirm. */+#include <stdbool.h> +#include <time.h> + +#include <pjdlog.h> + /* If PJDLOG_ASSERT is not already defined at this point, map it onto the standard assert(). */+#ifndef PJDLOG_ASSERT +#include <assert.h> +#define PJDLOG_ASSERT(...) 
assert(__VA_ARGS__) +#endif + /* Mutex wrappers. In every wrapper of this header the pthread call is made outside PJDLOG_ASSERT; only the check of its result lives inside the assertion. */+static __inline void +mtx_init(pthread_mutex_t *lock) +{ /* Initialize *lock with default attributes (NULL attr); asserts that the call returned 0. */+ int error; + + error = pthread_mutex_init(lock, NULL); + PJDLOG_ASSERT(error == 0); +} +static __inline void +mtx_destroy(pthread_mutex_t *lock) +{ /* Destroy *lock; asserts that the call returned 0. */+ int error; + + error = pthread_mutex_destroy(lock); + PJDLOG_ASSERT(error == 0); +} +static __inline void +mtx_lock(pthread_mutex_t *lock) +{ /* Lock *lock via pthread_mutex_lock(); asserts that the call returned 0. */+ int error; + + error = pthread_mutex_lock(lock); + PJDLOG_ASSERT(error == 0); +} +static __inline bool +mtx_trylock(pthread_mutex_t *lock) +{ /* Non-blocking lock attempt. Returns true when pthread_mutex_trylock() returned 0 and false when it returned EBUSY; any other result fails the assertion. */+ int error; + + error = pthread_mutex_trylock(lock); + PJDLOG_ASSERT(error == 0 || error == EBUSY); + return (error == 0); +} +static __inline void +mtx_unlock(pthread_mutex_t *lock) +{ /* Unlock *lock; asserts that the call returned 0. */+ int error; + + error = pthread_mutex_unlock(lock); + PJDLOG_ASSERT(error == 0); +} +static __inline bool +mtx_owned(pthread_mutex_t *lock) +{ /* Returns true when pthread_mutex_isowned_np() reports a non-zero value for *lock. No assertion is involved here. */+ + return (pthread_mutex_isowned_np(lock) != 0); +} + /* Read-write lock wrappers; same pattern as the mutex wrappers. */+static __inline void +rw_init(pthread_rwlock_t *lock) +{ /* Initialize *lock with default attributes (NULL attr); asserts that the call returned 0. */+ int error; + + error = pthread_rwlock_init(lock, NULL); + PJDLOG_ASSERT(error == 0); +} +static __inline void +rw_destroy(pthread_rwlock_t *lock) +{ /* Destroy *lock; asserts that the call returned 0. */+ int error; + + error = pthread_rwlock_destroy(lock); + PJDLOG_ASSERT(error == 0); +} +static __inline void +rw_rlock(pthread_rwlock_t *lock) +{ /* Take *lock for reading (pthread_rwlock_rdlock); asserts that the call returned 0. */+ int error; + + error = pthread_rwlock_rdlock(lock); + PJDLOG_ASSERT(error == 0); +} +static __inline void +rw_wlock(pthread_rwlock_t *lock) +{ /* Take *lock for writing (pthread_rwlock_wrlock); asserts that the call returned 0. */+ int error; + + error = pthread_rwlock_wrlock(lock); + PJDLOG_ASSERT(error == 0); +} +static __inline void +rw_unlock(pthread_rwlock_t *lock) +{ /* Release *lock, whether it was taken with rw_rlock() or rw_wlock(); asserts that the call returned 0. */+ int error; + + error = pthread_rwlock_unlock(lock); + PJDLOG_ASSERT(error == 0); +} + /* Condition variable wrappers. */+static __inline void +cv_init(pthread_cond_t *cv) +{ /* Initialize *cv with an attribute object whose clock is set to CLOCK_MONOTONIC, the same clock cv_timedwait() reads to build its deadline. The attribute object is destroyed again once *cv is initialized. Every step asserts a 0 return. */+ pthread_condattr_t attr; + int error; + + error = pthread_condattr_init(&attr); + PJDLOG_ASSERT(error == 0); + error = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); + PJDLOG_ASSERT(error == 0); + error = pthread_cond_init(cv, &attr); + PJDLOG_ASSERT(error == 0); + error = 
pthread_condattr_destroy(&attr); + PJDLOG_ASSERT(error == 0); +} +static __inline void +cv_wait(pthread_cond_t *cv, pthread_mutex_t *lock) +{ /* Wait on *cv with *lock via pthread_cond_wait(); asserts that the call returned 0. */+ int error; + + error = pthread_cond_wait(cv, lock); + PJDLOG_ASSERT(error == 0); +} +static __inline bool +cv_timedwait(pthread_cond_t *cv, pthread_mutex_t *lock, int timeout) +{ /* Wait on *cv with *lock for at most timeout seconds. Returns true only when pthread_cond_timedwait() returned ETIMEDOUT, false when it returned 0; any other result fails the assertion. */+ struct timespec ts; + int error; + + if (timeout == 0) { /* A timeout of 0 means wait without a deadline; this path never reports a timeout. */+ cv_wait(cv, lock); + return (false); + } + + error = clock_gettime(CLOCK_MONOTONIC, &ts); + PJDLOG_ASSERT(error == 0); + ts.tv_sec += timeout; /* Deadline is the current CLOCK_MONOTONIC time plus timeout seconds. */+ error = pthread_cond_timedwait(cv, lock, &ts); + PJDLOG_ASSERT(error == 0 || error == ETIMEDOUT); + return (error == ETIMEDOUT); +} +static __inline void +cv_signal(pthread_cond_t *cv) +{ /* Call pthread_cond_signal() on *cv; asserts that the call returned 0. */+ int error; + + error = pthread_cond_signal(cv); + PJDLOG_ASSERT(error == 0); +} +static __inline void +cv_broadcast(pthread_cond_t *cv) +{ /* Call pthread_cond_broadcast() on *cv; asserts that the call returned 0. */+ int error; + + error = pthread_cond_broadcast(cv); + PJDLOG_ASSERT(error == 0); +} +#endif /* !_SYNCH_H_ */ diff --git a/sbin/hastd/token.l b/sbin/hastd/token.l new file mode 100644 index 0000000..e8f6760 --- /dev/null +++ b/sbin/hastd/token.l @@ -0,0 +1,86 @@ +%{ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net> + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + /* Scanner definition: the rules below map fixed keywords, decimal numbers, bare strings and braces to token codes (presumably the ones declared in y.tab.h - confirm), and skip comments and whitespace. */+#include <stdio.h> +#include <string.h> + +#include "hast.h" + +#include "y.tab.h" + +int depth; /* Brace nesting counter: incremented by the opening-brace rule and decremented by the closing-brace rule. */+int lineno; /* Incremented once for every newline the scanner matches. */+ +#define DP do { } while (0) /* DP expands to an empty statement and is the first statement of most rule actions. NOTE(review): looks like a disabled debug hook - confirm. */+#define YY_DECL int yylex(void) /* YY_DECL is the flex hook for the scanner prototype; here it matches the extern declaration that follows. */+ +extern int yylex(void); +%} + +%option noinput +%option nounput +%option noyywrap + +%% +control { DP; return CONTROL; } +pidfile { DP; return PIDFILE; } +listen { DP; return LISTEN; } +replication { DP; return REPLICATION; } +checksum { DP; return CHECKSUM; } +compression { DP; return COMPRESSION; } +timeout { DP; return TIMEOUT; } +exec { DP; return EXEC; } +metaflush { DP; return METAFLUSH; } +resource { DP; return RESOURCE; } +name { DP; return NAME; } +local { DP; return LOCAL; } +remote { DP; return REMOTE; } +source { DP; return SOURCE; } +on { DP; return ON; } +off { DP; return OFF; } +fullsync { DP; return FULLSYNC; } +memsync { DP; return MEMSYNC; } +async { DP; return ASYNC; } +none { DP; return NONE; } +crc32 { DP; return CRC32; } +sha256 { DP; return SHA256; } +hole { DP; return HOLE; } +lzf { DP; return LZF; } +[0-9]+ { DP; yylval.num = atoi(yytext); /* NOTE(review): atoi() reports no errors, so a digit string too large for int is not detected here. */return NUM; } +[a-zA-Z0-9\.\-_/\:\[\]]+ { DP; yylval.str = strdup(yytext); /* NOTE(review): the strdup() result is not checked for NULL here, and nothing in this file frees it. */return STR; } +\{ { DP; depth++; return OB; } +\} { DP; 
depth--; return CB; } +#.*$ /* ignore comments */; +\n { lineno++; } +[ \t]+ /* ignore whitespace */; +%% |