summary | refs | log | tree | commit | diff | stats
path: root/sbin
diff options
context:
space:
mode:
Diffstat (limited to 'sbin')
-rw-r--r-- sbin/hastd/Makefile 10
-rw-r--r-- sbin/hastd/hast.conf.5 7
-rw-r--r-- sbin/hastd/hast.h 3
-rw-r--r-- sbin/hastd/hast_proto.c 8
-rw-r--r-- sbin/hastd/hastd.c 57
-rw-r--r-- sbin/hastd/metadata.c 2
-rw-r--r-- sbin/hastd/parse.y 32
-rw-r--r-- sbin/hastd/primary.c 10
-rw-r--r-- sbin/hastd/proto.c 26
-rw-r--r-- sbin/hastd/proto.h 1
-rw-r--r-- sbin/hastd/proto_common.c 4
-rw-r--r-- sbin/hastd/proto_tcp4.c 78
-rw-r--r-- sbin/hastd/secondary.c 6
-rw-r--r-- sbin/hastd/token.l 1
14 files changed, 214 insertions, 31 deletions
diff --git a/sbin/hastd/Makefile b/sbin/hastd/Makefile
index 4311807..0b0721e 100644
--- a/sbin/hastd/Makefile
+++ b/sbin/hastd/Makefile
@@ -27,9 +27,13 @@ CFLAGS+=-DINET6
# This is needed to have WARNS > 1.
CFLAGS+=-DYY_NO_UNPUT
-DPADD= ${LIBCRYPTO} ${LIBGEOM} ${LIBBSDXML} ${LIBSBUF} ${LIBL} \
- ${LIBPTHREAD} ${LIBUTIL}
-LDADD= -lcrypto -lgeom -lbsdxml -lsbuf -ll -lpthread -lutil
+DPADD= ${LIBGEOM} ${LIBBSDXML} ${LIBSBUF} ${LIBL} ${LIBPTHREAD} ${LIBUTIL}
+LDADD= -lgeom -lbsdxml -lsbuf -ll -lpthread -lutil
+.if ${MK_OPENSSL} != "no"
+DPADD+= ${LIBCRYPTO}
+LDADD+= -lcrypto
+CFLAGS+=-DHAVE_CRYPTO
+.endif
YFLAGS+=-v
diff --git a/sbin/hastd/hast.conf.5 b/sbin/hastd/hast.conf.5
index 5734ee8..1ccd479 100644
--- a/sbin/hastd/hast.conf.5
+++ b/sbin/hastd/hast.conf.5
@@ -58,6 +58,7 @@ file is following:
control <addr>
listen <addr>
replication <mode>
+timeout <seconds>
on <node> {
# Node section
@@ -76,6 +77,7 @@ resource <name> {
replication <mode>
name <name>
local <path>
+ timeout <seconds>
on <node> {
# Resource-node section
@@ -194,6 +196,11 @@ The
.Ic async
replication mode is currently not implemented.
.El
+.It Ic timeout Aq seconds
+.Pp
+Connection timeout in seconds.
+The default value is
+.Va 5 .
.It Ic name Aq name
.Pp
GEOM provider name that will appear as
diff --git a/sbin/hastd/hast.h b/sbin/hastd/hast.h
index c5220b5..2230afb 100644
--- a/sbin/hastd/hast.h
+++ b/sbin/hastd/hast.h
@@ -75,6 +75,7 @@
#define HIO_DELETE 3
#define HIO_FLUSH 4
+#define HAST_TIMEOUT 5
#define HAST_CONFIG "/etc/hast.conf"
#define HAST_CONTROL "/var/run/hastctl"
#define HASTD_PORT 8457
@@ -148,6 +149,8 @@ struct hast_resource {
/* Token to verify both in and out connection are coming from
the same node (not necessarily from the same address). */
unsigned char hr_token[HAST_TOKEN_SIZE];
+ /* Connection timeout. */
+ int hr_timeout;
/* Resource unique identifier. */
uint64_t hr_resuid;
diff --git a/sbin/hastd/hast_proto.c b/sbin/hastd/hast_proto.c
index 6e66006..348dfc8 100644
--- a/sbin/hastd/hast_proto.c
+++ b/sbin/hastd/hast_proto.c
@@ -37,7 +37,9 @@ __FBSDID("$FreeBSD$");
#include <string.h>
#include <strings.h>
+#ifdef HAVE_CRYPTO
#include <openssl/sha.h>
+#endif
#include <hast.h>
#include <ebuf.h>
@@ -67,14 +69,18 @@ static int compression_send(struct hast_resource *res, struct nv *nv,
void **datap, size_t *sizep, bool *freedatap);
static int compression_recv(struct hast_resource *res, struct nv *nv,
void **datap, size_t *sizep, bool *freedatap);
+#ifdef HAVE_CRYPTO
static int checksum_send(struct hast_resource *res, struct nv *nv,
void **datap, size_t *sizep, bool *freedatap);
static int checksum_recv(struct hast_resource *res, struct nv *nv,
void **datap, size_t *sizep, bool *freedatap);
+#endif
static struct hast_pipe_stage pipeline[] = {
{ "compression", compression_send, compression_recv },
+#ifdef HAVE_CRYPTO
{ "checksum", checksum_send, checksum_recv }
+#endif
};
static int
@@ -161,6 +167,7 @@ compression_recv(struct hast_resource *res, struct nv *nv, void **datap,
return (0);
}
+#ifdef HAVE_CRYPTO
static int
checksum_send(struct hast_resource *res, struct nv *nv, void **datap,
size_t *sizep, bool *freedatap __unused)
@@ -221,6 +228,7 @@ checksum_recv(struct hast_resource *res, struct nv *nv, void **datap,
return (0);
}
+#endif /* HAVE_CRYPTO */
/*
* Send the given nv structure via conn.
diff --git a/sbin/hastd/hastd.c b/sbin/hastd/hastd.c
index 957885d..27b9bba 100644
--- a/sbin/hastd/hastd.c
+++ b/sbin/hastd/hastd.c
@@ -108,6 +108,22 @@ g_gate_load(void)
}
static void
+child_exit_log(unsigned int pid, int status)
+{
+
+ if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+ pjdlog_debug(1, "Worker process exited gracefully (pid=%u).",
+ pid);
+ } else if (WIFSIGNALED(status)) {
+ pjdlog_error("Worker process killed (pid=%u, signal=%d).",
+ pid, WTERMSIG(status));
+ } else {
+ pjdlog_error("Worker process exited ungracefully (pid=%u, exitcode=%d).",
+ pid, WIFEXITED(status) ? WEXITSTATUS(status) : -1);
+ }
+}
+
+static void
child_exit(void)
{
struct hast_resource *res;
@@ -129,20 +145,25 @@ child_exit(void)
}
pjdlog_prefix_set("[%s] (%s) ", res->hr_name,
role2str(res->hr_role));
- if (WEXITSTATUS(status) == 0) {
- pjdlog_debug(1,
- "Worker process exited gracefully (pid=%u).",
- (unsigned int)pid);
- } else {
- pjdlog_error("Worker process failed (pid=%u, status=%d).",
- (unsigned int)pid, WEXITSTATUS(status));
- }
+ child_exit_log(pid, status);
proto_close(res->hr_ctrl);
res->hr_workerpid = 0;
if (res->hr_role == HAST_ROLE_PRIMARY) {
- sleep(1);
- pjdlog_info("Restarting worker process.");
- hastd_primary(res);
+ /*
+ * Restart child process if it was killed by signal
+ * or exited because of temporary problem.
+ */
+ if (WIFSIGNALED(status) ||
+ (WIFEXITED(status) &&
+ WEXITSTATUS(status) == EX_TEMPFAIL)) {
+ sleep(1);
+ pjdlog_info("Restarting worker process.");
+ hastd_primary(res);
+ } else {
+ res->hr_role = HAST_ROLE_INIT;
+ pjdlog_info("Changing resource role back to %s.",
+ role2str(res->hr_role));
+ }
}
pjdlog_prefix_set("%s", "");
}
@@ -181,6 +202,10 @@ listen_accept(void)
proto_remote_address(conn, raddr, sizeof(raddr));
pjdlog_info("Connection from %s to %s.", laddr, raddr);
+ /* Error in setting timeout is not critical, but why should it fail? */
+ if (proto_timeout(conn, HAST_TIMEOUT) < 0)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+
nvin = nvout = nverr = NULL;
/*
@@ -290,18 +315,12 @@ listen_accept(void)
/* Wait for it to exit. */
else if ((pid = waitpid(res->hr_workerpid,
&status, 0)) != res->hr_workerpid) {
+ /* We can only log the problem. */
pjdlog_errno(LOG_ERR,
"Waiting for worker process (pid=%u) failed",
(unsigned int)res->hr_workerpid);
- /* See above. */
- } else if (status != 0) {
- pjdlog_error("Worker process (pid=%u) exited ungracefully: status=%d.",
- (unsigned int)res->hr_workerpid, status);
- /* See above. */
} else {
- pjdlog_debug(1,
- "Worker process (pid=%u) exited gracefully.",
- (unsigned int)res->hr_workerpid);
+ child_exit_log(res->hr_workerpid, status);
}
res->hr_workerpid = 0;
} else if (res->hr_remotein != NULL) {
diff --git a/sbin/hastd/metadata.c b/sbin/hastd/metadata.c
index 9bca66b..7a138e8 100644
--- a/sbin/hastd/metadata.c
+++ b/sbin/hastd/metadata.c
@@ -117,7 +117,7 @@ metadata_read(struct hast_resource *res, bool openrw)
}
str = nv_get_string(nv, "resource");
- if (strcmp(str, res->hr_name) != 0) {
+ if (str != NULL && strcmp(str, res->hr_name) != 0) {
pjdlog_error("Provider %s is not part of resource %s.",
res->hr_localpath, res->hr_name);
nv_free(nv);
diff --git a/sbin/hastd/parse.y b/sbin/hastd/parse.y
index 6755320..840a844 100644
--- a/sbin/hastd/parse.y
+++ b/sbin/hastd/parse.y
@@ -58,6 +58,7 @@ static bool mynode;
static char depth0_control[HAST_ADDRSIZE];
static char depth0_listen[HAST_ADDRSIZE];
static int depth0_replication;
+static int depth0_timeout;
static char depth1_provname[PATH_MAX];
static char depth1_localpath[PATH_MAX];
@@ -115,6 +116,7 @@ yy_config_parse(const char *config)
curres = NULL;
mynode = false;
+ depth0_timeout = HAST_TIMEOUT;
depth0_replication = HAST_REPLICATION_MEMSYNC;
strlcpy(depth0_control, HAST_CONTROL, sizeof(depth0_control));
strlcpy(depth0_listen, HASTD_LISTEN, sizeof(depth0_listen));
@@ -154,6 +156,13 @@ yy_config_parse(const char *config)
*/
curres->hr_replication = depth0_replication;
}
+ if (curres->hr_timeout == -1) {
+ /*
+ * Timeout is not set at resource-level.
+ * Use global or default setting.
+ */
+ curres->hr_timeout = depth0_timeout;
+ }
}
return (&lconfig);
@@ -171,7 +180,7 @@ yy_config_free(struct hastd_config *config)
}
%}
-%token CONTROL LISTEN PORT REPLICATION EXTENTSIZE RESOURCE NAME LOCAL REMOTE ON
+%token CONTROL LISTEN PORT REPLICATION TIMEOUT EXTENTSIZE RESOURCE NAME LOCAL REMOTE ON
%token FULLSYNC MEMSYNC ASYNC
%token NUM STR OB CB
@@ -200,6 +209,8 @@ statement:
|
replication_statement
|
+ timeout_statement
+ |
node_statement
|
resource_statement
@@ -281,6 +292,22 @@ replication_type:
ASYNC { $$ = HAST_REPLICATION_ASYNC; }
;
+timeout_statement: TIMEOUT NUM
+ {
+ switch (depth) {
+ case 0:
+ depth0_timeout = $2;
+ break;
+ case 1:
+ if (curres != NULL)
+ curres->hr_timeout = $2;
+ break;
+ default:
+ assert(!"timeout at wrong depth level");
+ }
+ }
+ ;
+
node_statement: ON node_start OB node_entries CB
{
mynode = false;
@@ -389,6 +416,7 @@ resource_start: STR
curres->hr_role = HAST_ROLE_INIT;
curres->hr_previous_role = HAST_ROLE_INIT;
curres->hr_replication = -1;
+ curres->hr_timeout = -1;
curres->hr_provname[0] = '\0';
curres->hr_localpath[0] = '\0';
curres->hr_localfd = -1;
@@ -405,6 +433,8 @@ resource_entries:
resource_entry:
replication_statement
|
+ timeout_statement
+ |
name_statement
|
local_statement
diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c
index 0915154..9f2b2c7 100644
--- a/sbin/hastd/primary.c
+++ b/sbin/hastd/primary.c
@@ -480,7 +480,7 @@ init_remote(struct hast_resource *res, struct proto_conn **inp,
/* Prepare outgoing connection with remote node. */
if (proto_client(res->hr_remoteaddr, &out) < 0) {
- primary_exit(EX_OSERR, "Unable to create connection to %s",
+ primary_exit(EX_TEMPFAIL, "Unable to create connection to %s",
res->hr_remoteaddr);
}
/* Try to connect, but accept failure. */
@@ -489,6 +489,9 @@ init_remote(struct hast_resource *res, struct proto_conn **inp,
res->hr_remoteaddr);
goto close;
}
+ /* Error in setting timeout is not critical, but why should it fail? */
+ if (proto_timeout(out, res->hr_timeout) < 0)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
/*
* First handshake step.
* Setup outgoing connection with remote node.
@@ -552,6 +555,9 @@ init_remote(struct hast_resource *res, struct proto_conn **inp,
res->hr_remoteaddr);
goto close;
}
+ /* Error in setting timeout is not critical, but why should it fail? */
+ if (proto_timeout(in, res->hr_timeout) < 0)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
nvout = nv_alloc();
nv_add_string(nvout, res->hr_name, "resource");
nv_add_uint8_array(nvout, res->hr_token, sizeof(res->hr_token),
@@ -739,7 +745,7 @@ hastd_primary(struct hast_resource *res)
pid = fork();
if (pid < 0) {
KEEP_ERRNO((void)pidfile_remove(pfh));
- primary_exit(EX_OSERR, "Unable to fork");
+ primary_exit(EX_TEMPFAIL, "Unable to fork");
}
if (pid > 0) {
diff --git a/sbin/hastd/proto.c b/sbin/hastd/proto.c
index 103f20c..531e7e5 100644
--- a/sbin/hastd/proto.c
+++ b/sbin/hastd/proto.c
@@ -30,7 +30,9 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include <sys/types.h>
#include <sys/queue.h>
+#include <sys/socket.h>
#include <assert.h>
#include <errno.h>
@@ -247,6 +249,30 @@ proto_remote_address(const struct proto_conn *conn, char *addr, size_t size)
conn->pc_proto->hp_remote_address(conn->pc_ctx, addr, size);
}
+int
+proto_timeout(const struct proto_conn *conn, int timeout)
+{
+ struct timeval tv;
+ int fd;
+
+ assert(conn != NULL);
+ assert(conn->pc_magic == PROTO_CONN_MAGIC);
+ assert(conn->pc_proto != NULL);
+
+ fd = proto_descriptor(conn);
+ if (fd < 0)
+ return (-1);
+
+ tv.tv_sec = timeout;
+ tv.tv_usec = 0;
+ if (setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)) < 0)
+ return (-1);
+ if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
+ return (-1);
+
+ return (0);
+}
+
void
proto_close(struct proto_conn *conn)
{
diff --git a/sbin/hastd/proto.h b/sbin/hastd/proto.h
index cb196d8..13248ba 100644
--- a/sbin/hastd/proto.h
+++ b/sbin/hastd/proto.h
@@ -49,6 +49,7 @@ void proto_local_address(const struct proto_conn *conn, char *addr,
size_t size);
void proto_remote_address(const struct proto_conn *conn, char *addr,
size_t size);
+int proto_timeout(const struct proto_conn *conn, int timeout);
void proto_close(struct proto_conn *conn);
#endif /* !_PROTO_H_ */
diff --git a/sbin/hastd/proto_common.c b/sbin/hastd/proto_common.c
index 22102d8..131d30e 100644
--- a/sbin/hastd/proto_common.c
+++ b/sbin/hastd/proto_common.c
@@ -58,7 +58,7 @@ proto_common_send(int fd, const unsigned char *data, size_t size)
if (done == 0)
return (ENOTCONN);
else if (done < 0) {
- if (errno == EAGAIN)
+ if (errno == EINTR)
continue;
return (errno);
}
@@ -76,7 +76,7 @@ proto_common_recv(int fd, unsigned char *data, size_t size)
do {
done = recv(fd, data, size, MSG_WAITALL);
- } while (done == -1 && errno == EAGAIN);
+ } while (done == -1 && errno == EINTR);
if (done == 0)
return (ENOTCONN);
else if (done < 0)
diff --git a/sbin/hastd/proto_tcp4.c b/sbin/hastd/proto_tcp4.c
index 2fba996..5af82d5 100644
--- a/sbin/hastd/proto_tcp4.c
+++ b/sbin/hastd/proto_tcp4.c
@@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$");
#include <assert.h>
#include <errno.h>
+#include <fcntl.h>
#include <netdb.h>
#include <stdbool.h>
#include <stdint.h>
@@ -47,6 +48,7 @@ __FBSDID("$FreeBSD$");
#include "hast.h"
#include "pjdlog.h"
#include "proto_impl.h"
+#include "subr.h"
#define TCP4_CTX_MAGIC 0x7c441c
struct tcp4_ctx {
@@ -222,18 +224,88 @@ static int
tcp4_connect(void *ctx)
{
struct tcp4_ctx *tctx = ctx;
+ struct timeval tv;
+ fd_set fdset;
+ socklen_t esize;
+ int error, flags, ret;
assert(tctx != NULL);
assert(tctx->tc_magic == TCP4_CTX_MAGIC);
assert(tctx->tc_side == TCP4_SIDE_CLIENT);
assert(tctx->tc_fd >= 0);
- if (connect(tctx->tc_fd, (struct sockaddr *)&tctx->tc_sin,
- sizeof(tctx->tc_sin)) < 0) {
+ flags = fcntl(tctx->tc_fd, F_GETFL);
+ if (flags == -1) {
+ KEEP_ERRNO(pjdlog_common(LOG_DEBUG, 1, errno,
+ "fcntl(F_GETFL) failed"));
+ return (errno);
+ }
+ /*
+ * We make socket non-blocking so we have decided about connection
+ * timeout.
+ */
+ flags |= O_NONBLOCK;
+ if (fcntl(tctx->tc_fd, F_SETFL, flags) == -1) {
+ KEEP_ERRNO(pjdlog_common(LOG_DEBUG, 1, errno,
+ "fcntl(F_SETFL, O_NONBLOCK) failed"));
return (errno);
}
- return (0);
+ if (connect(tctx->tc_fd, (struct sockaddr *)&tctx->tc_sin,
+ sizeof(tctx->tc_sin)) == 0) {
+ error = 0;
+ goto done;
+ }
+ if (errno != EINPROGRESS) {
+ error = errno;
+ pjdlog_common(LOG_DEBUG, 1, errno, "connect() failed");
+ goto done;
+ }
+ /*
+ * Connection can't be established immediately, let's wait
+ * for HAST_TIMEOUT seconds.
+ */
+ tv.tv_sec = HAST_TIMEOUT;
+ tv.tv_usec = 0;
+again:
+ FD_ZERO(&fdset);
+ FD_SET(tctx->tc_fd, &fdset);
+ ret = select(tctx->tc_fd + 1, NULL, &fdset, NULL, &tv);
+ if (ret == 0) {
+ error = ETIMEDOUT;
+ goto done;
+ } else if (ret == -1) {
+ if (errno == EINTR)
+ goto again;
+ error = errno;
+ pjdlog_common(LOG_DEBUG, 1, errno, "select() failed");
+ goto done;
+ }
+ assert(ret > 0);
+ assert(FD_ISSET(tctx->tc_fd, &fdset));
+ esize = sizeof(error);
+ if (getsockopt(tctx->tc_fd, SOL_SOCKET, SO_ERROR, &error,
+ &esize) == -1) {
+ error = errno;
+ pjdlog_common(LOG_DEBUG, 1, errno,
+ "getsockopt(SO_ERROR) failed");
+ goto done;
+ }
+ if (error != 0) {
+ pjdlog_common(LOG_DEBUG, 1, error,
+ "getsockopt(SO_ERROR) returned error");
+ goto done;
+ }
+ error = 0;
+done:
+ flags &= ~O_NONBLOCK;
+ if (fcntl(tctx->tc_fd, F_SETFL, flags) == -1) {
+ if (error == 0)
+ error = errno;
+ pjdlog_common(LOG_DEBUG, 1, errno,
+ "fcntl(F_SETFL, ~O_NONBLOCK) failed");
+ }
+ return (error);
}
static int
diff --git a/sbin/hastd/secondary.c b/sbin/hastd/secondary.c
index 6af95b5..b189b7e 100644
--- a/sbin/hastd/secondary.c
+++ b/sbin/hastd/secondary.c
@@ -337,6 +337,12 @@ hastd_secondary(struct hast_resource *res, struct nv *nvin)
setproctitle("%s (secondary)", res->hr_name);
+ /* Error in setting timeout is not critical, but why should it fail? */
+ if (proto_timeout(res->hr_remotein, 0) < 0)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+ if (proto_timeout(res->hr_remoteout, res->hr_timeout) < 0)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+
init_local(res);
init_remote(res, nvin);
init_environment();
diff --git a/sbin/hastd/token.l b/sbin/hastd/token.l
index 7b80384..e5d4ca1 100644
--- a/sbin/hastd/token.l
+++ b/sbin/hastd/token.l
@@ -48,6 +48,7 @@ control { DP; return CONTROL; }
listen { DP; return LISTEN; }
port { DP; return PORT; }
replication { DP; return REPLICATION; }
+timeout { DP; return TIMEOUT; }
resource { DP; return RESOURCE; }
name { DP; return NAME; }
local { DP; return LOCAL; }
OpenPOWER on IntegriCloud