summaryrefslogtreecommitdiffstats
path: root/sbin
diff options
context:
space:
mode:
authorpjd <pjd@FreeBSD.org>2010-04-29 15:36:32 +0000
committerpjd <pjd@FreeBSD.org>2010-04-29 15:36:32 +0000
commit0dcf2ac7ada163fca882dcaa5c89ddfcef286bf9 (patch)
tree8e157b7fbed074af6570c80464a3a00cc302ae60 /sbin
parent37a9ab0c6732ace50747a5e47b4eae9c67b72bba (diff)
downloadFreeBSD-src-0dcf2ac7ada163fca882dcaa5c89ddfcef286bf9.zip
FreeBSD-src-0dcf2ac7ada163fca882dcaa5c89ddfcef286bf9.tar.gz
Fix a problem where hastd will get stuck in recv(2) after sending a request to
the secondary, which died between send(2) and recv(2). Do it by adding a timeout to recv(2) for the primary incoming and outgoing sockets and the secondary outgoing socket. Reported by: Mikolaj Golub <to.my.trociny@gmail.com> Tested by: Mikolaj Golub <to.my.trociny@gmail.com> MFC after: 3 days
Diffstat (limited to 'sbin')
-rw-r--r--sbin/hastd/hast.conf.57
-rw-r--r--sbin/hastd/hast.h3
-rw-r--r--sbin/hastd/hastd.c4
-rw-r--r--sbin/hastd/parse.y32
-rw-r--r--sbin/hastd/primary.c6
-rw-r--r--sbin/hastd/proto.c26
-rw-r--r--sbin/hastd/proto.h1
-rw-r--r--sbin/hastd/proto_common.c4
-rw-r--r--sbin/hastd/secondary.c6
-rw-r--r--sbin/hastd/token.l1
10 files changed, 87 insertions, 3 deletions
diff --git a/sbin/hastd/hast.conf.5 b/sbin/hastd/hast.conf.5
index 5734ee8..1ccd479 100644
--- a/sbin/hastd/hast.conf.5
+++ b/sbin/hastd/hast.conf.5
@@ -58,6 +58,7 @@ file is following:
control <addr>
listen <addr>
replication <mode>
+timeout <seconds>
on <node> {
# Node section
@@ -76,6 +77,7 @@ resource <name> {
replication <mode>
name <name>
local <path>
+ timeout <seconds>
on <node> {
# Resource-node section
@@ -194,6 +196,11 @@ The
.Ic async
replication mode is currently not implemented.
.El
+.It Ic timeout Aq seconds
+.Pp
+Connection timeout in seconds.
+The default value is
+.Va 5 .
.It Ic name Aq name
.Pp
GEOM provider name that will appear as
diff --git a/sbin/hastd/hast.h b/sbin/hastd/hast.h
index c5220b5..2230afb 100644
--- a/sbin/hastd/hast.h
+++ b/sbin/hastd/hast.h
@@ -75,6 +75,7 @@
#define HIO_DELETE 3
#define HIO_FLUSH 4
+#define HAST_TIMEOUT 5
#define HAST_CONFIG "/etc/hast.conf"
#define HAST_CONTROL "/var/run/hastctl"
#define HASTD_PORT 8457
@@ -148,6 +149,8 @@ struct hast_resource {
/* Token to verify both in and out connection are coming from
the same node (not necessarily from the same address). */
unsigned char hr_token[HAST_TOKEN_SIZE];
+ /* Connection timeout. */
+ int hr_timeout;
/* Resource unique identifier. */
uint64_t hr_resuid;
diff --git a/sbin/hastd/hastd.c b/sbin/hastd/hastd.c
index 7b2a8e2..7b1e228 100644
--- a/sbin/hastd/hastd.c
+++ b/sbin/hastd/hastd.c
@@ -187,6 +187,10 @@ listen_accept(void)
proto_remote_address(conn, raddr, sizeof(raddr));
pjdlog_info("Connection from %s to %s.", laddr, raddr);
+ /* Error in setting timeout is not critical, but why should it fail? */
+ if (proto_timeout(conn, HAST_TIMEOUT) < 0)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+
nvin = nvout = nverr = NULL;
/*
diff --git a/sbin/hastd/parse.y b/sbin/hastd/parse.y
index 6755320..840a844 100644
--- a/sbin/hastd/parse.y
+++ b/sbin/hastd/parse.y
@@ -58,6 +58,7 @@ static bool mynode;
static char depth0_control[HAST_ADDRSIZE];
static char depth0_listen[HAST_ADDRSIZE];
static int depth0_replication;
+static int depth0_timeout;
static char depth1_provname[PATH_MAX];
static char depth1_localpath[PATH_MAX];
@@ -115,6 +116,7 @@ yy_config_parse(const char *config)
curres = NULL;
mynode = false;
+ depth0_timeout = HAST_TIMEOUT;
depth0_replication = HAST_REPLICATION_MEMSYNC;
strlcpy(depth0_control, HAST_CONTROL, sizeof(depth0_control));
strlcpy(depth0_listen, HASTD_LISTEN, sizeof(depth0_listen));
@@ -154,6 +156,13 @@ yy_config_parse(const char *config)
*/
curres->hr_replication = depth0_replication;
}
+ if (curres->hr_timeout == -1) {
+ /*
+ * Timeout is not set at resource-level.
+ * Use global or default setting.
+ */
+ curres->hr_timeout = depth0_timeout;
+ }
}
return (&lconfig);
@@ -171,7 +180,7 @@ yy_config_free(struct hastd_config *config)
}
%}
-%token CONTROL LISTEN PORT REPLICATION EXTENTSIZE RESOURCE NAME LOCAL REMOTE ON
+%token CONTROL LISTEN PORT REPLICATION TIMEOUT EXTENTSIZE RESOURCE NAME LOCAL REMOTE ON
%token FULLSYNC MEMSYNC ASYNC
%token NUM STR OB CB
@@ -200,6 +209,8 @@ statement:
|
replication_statement
|
+ timeout_statement
+ |
node_statement
|
resource_statement
@@ -281,6 +292,22 @@ replication_type:
ASYNC { $$ = HAST_REPLICATION_ASYNC; }
;
+timeout_statement: TIMEOUT NUM
+ {
+ switch (depth) {
+ case 0:
+ depth0_timeout = $2;
+ break;
+ case 1:
+ if (curres != NULL)
+ curres->hr_timeout = $2;
+ break;
+ default:
+ assert(!"timeout at wrong depth level");
+ }
+ }
+ ;
+
node_statement: ON node_start OB node_entries CB
{
mynode = false;
@@ -389,6 +416,7 @@ resource_start: STR
curres->hr_role = HAST_ROLE_INIT;
curres->hr_previous_role = HAST_ROLE_INIT;
curres->hr_replication = -1;
+ curres->hr_timeout = -1;
curres->hr_provname[0] = '\0';
curres->hr_localpath[0] = '\0';
curres->hr_localfd = -1;
@@ -405,6 +433,8 @@ resource_entries:
resource_entry:
replication_statement
|
+ timeout_statement
+ |
name_statement
|
local_statement
diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c
index 73f722b..9f2b2c7 100644
--- a/sbin/hastd/primary.c
+++ b/sbin/hastd/primary.c
@@ -489,6 +489,9 @@ init_remote(struct hast_resource *res, struct proto_conn **inp,
res->hr_remoteaddr);
goto close;
}
+ /* Error in setting timeout is not critical, but why should it fail? */
+ if (proto_timeout(out, res->hr_timeout) < 0)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
/*
* First handshake step.
* Setup outgoing connection with remote node.
@@ -552,6 +555,9 @@ init_remote(struct hast_resource *res, struct proto_conn **inp,
res->hr_remoteaddr);
goto close;
}
+ /* Error in setting timeout is not critical, but why should it fail? */
+ if (proto_timeout(in, res->hr_timeout) < 0)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
nvout = nv_alloc();
nv_add_string(nvout, res->hr_name, "resource");
nv_add_uint8_array(nvout, res->hr_token, sizeof(res->hr_token),
diff --git a/sbin/hastd/proto.c b/sbin/hastd/proto.c
index 103f20c..531e7e5 100644
--- a/sbin/hastd/proto.c
+++ b/sbin/hastd/proto.c
@@ -30,7 +30,9 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include <sys/types.h>
#include <sys/queue.h>
+#include <sys/socket.h>
#include <assert.h>
#include <errno.h>
@@ -247,6 +249,30 @@ proto_remote_address(const struct proto_conn *conn, char *addr, size_t size)
conn->pc_proto->hp_remote_address(conn->pc_ctx, addr, size);
}
+int
+proto_timeout(const struct proto_conn *conn, int timeout)
+{
+ struct timeval tv;
+ int fd;
+
+ assert(conn != NULL);
+ assert(conn->pc_magic == PROTO_CONN_MAGIC);
+ assert(conn->pc_proto != NULL);
+
+ fd = proto_descriptor(conn);
+ if (fd < 0)
+ return (-1);
+
+ tv.tv_sec = timeout;
+ tv.tv_usec = 0;
+ if (setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)) < 0)
+ return (-1);
+ if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
+ return (-1);
+
+ return (0);
+}
+
void
proto_close(struct proto_conn *conn)
{
diff --git a/sbin/hastd/proto.h b/sbin/hastd/proto.h
index cb196d8..13248ba 100644
--- a/sbin/hastd/proto.h
+++ b/sbin/hastd/proto.h
@@ -49,6 +49,7 @@ void proto_local_address(const struct proto_conn *conn, char *addr,
size_t size);
void proto_remote_address(const struct proto_conn *conn, char *addr,
size_t size);
+int proto_timeout(const struct proto_conn *conn, int timeout);
void proto_close(struct proto_conn *conn);
#endif /* !_PROTO_H_ */
diff --git a/sbin/hastd/proto_common.c b/sbin/hastd/proto_common.c
index 22102d8..131d30e 100644
--- a/sbin/hastd/proto_common.c
+++ b/sbin/hastd/proto_common.c
@@ -58,7 +58,7 @@ proto_common_send(int fd, const unsigned char *data, size_t size)
if (done == 0)
return (ENOTCONN);
else if (done < 0) {
- if (errno == EAGAIN)
+ if (errno == EINTR)
continue;
return (errno);
}
@@ -76,7 +76,7 @@ proto_common_recv(int fd, unsigned char *data, size_t size)
do {
done = recv(fd, data, size, MSG_WAITALL);
- } while (done == -1 && errno == EAGAIN);
+ } while (done == -1 && errno == EINTR);
if (done == 0)
return (ENOTCONN);
else if (done < 0)
diff --git a/sbin/hastd/secondary.c b/sbin/hastd/secondary.c
index 6af95b5..b189b7e 100644
--- a/sbin/hastd/secondary.c
+++ b/sbin/hastd/secondary.c
@@ -337,6 +337,12 @@ hastd_secondary(struct hast_resource *res, struct nv *nvin)
setproctitle("%s (secondary)", res->hr_name);
+ /* Error in setting timeout is not critical, but why should it fail? */
+ if (proto_timeout(res->hr_remotein, 0) < 0)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+ if (proto_timeout(res->hr_remoteout, res->hr_timeout) < 0)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+
init_local(res);
init_remote(res, nvin);
init_environment();
diff --git a/sbin/hastd/token.l b/sbin/hastd/token.l
index 7b80384..e5d4ca1 100644
--- a/sbin/hastd/token.l
+++ b/sbin/hastd/token.l
@@ -48,6 +48,7 @@ control { DP; return CONTROL; }
listen { DP; return LISTEN; }
port { DP; return PORT; }
replication { DP; return REPLICATION; }
+timeout { DP; return TIMEOUT; }
resource { DP; return RESOURCE; }
name { DP; return NAME; }
local { DP; return LOCAL; }
OpenPOWER on IntegriCloud