summaryrefslogtreecommitdiffstats
path: root/sbin
diff options
context:
space:
mode:
author: pjd <pjd@FreeBSD.org> 2011-04-20 18:43:28 +0000
committer: pjd <pjd@FreeBSD.org> 2011-04-20 18:43:28 +0000
commit: e272ff3c6fd30c936cd408f5ec6356eb947d9831 (patch)
tree: 518b413604555317bc5b158f47838e0526930e07 /sbin
parent: edbcc76bc635c467669f309687713da38fa131bf (diff)
download: FreeBSD-src-e272ff3c6fd30c936cd408f5ec6356eb947d9831.zip
FreeBSD-src-e272ff3c6fd30c936cd408f5ec6356eb947d9831.tar.gz
When we become primary, we connect to the remote and expect it to be in
the secondary role. It is possible that the remote node is primary, but only because there was a role change and it didn't finish cleaning up (unmounting file systems, etc.). If we detect such a situation, wait for the remote node to switch its role to secondary before accepting I/Os. If we don't wait for it in that case, we will most likely cause split-brain.
MFC after: 1 week
Diffstat (limited to 'sbin')
-rw-r--r-- sbin/hastd/hastd.c 7
-rw-r--r-- sbin/hastd/primary.c 62
2 files changed, 56 insertions, 13 deletions
diff --git a/sbin/hastd/hastd.c b/sbin/hastd/hastd.c
index 444dd63..7f87dd6 100644
--- a/sbin/hastd/hastd.c
+++ b/sbin/hastd/hastd.c
@@ -736,6 +736,13 @@ listen_accept(void)
nv_add_stringf(nverr, "errmsg",
"Remote node acts as %s for the resource and not as %s.",
role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY));
+ if (res->hr_role == HAST_ROLE_PRIMARY) {
+ /*
+ * If we act as primary, request the other side to wait
+ * for us for a bit, as we might be finishing cleanups.
+ */
+ nv_add_uint8(nverr, 1, "wait");
+ }
goto fail;
}
/* Does token (if exists) match? */
diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c
index 751929e..3363fcb 100644
--- a/sbin/hastd/primary.c
+++ b/sbin/hastd/primary.c
@@ -219,6 +219,7 @@ static pthread_cond_t range_regular_cond;
static struct rangelocks *range_sync;
static bool range_sync_wait;
static pthread_cond_t range_sync_cond;
+static bool fullystarted;
static void *ggate_recv_thread(void *arg);
static void *local_send_thread(void *arg);
@@ -524,7 +525,7 @@ primary_connect(struct hast_resource *res, struct proto_conn **connp)
return (0);
}
-static bool
+static int
init_remote(struct hast_resource *res, struct proto_conn **inp,
struct proto_conn **outp)
{
@@ -537,6 +538,7 @@ init_remote(struct hast_resource *res, struct proto_conn **inp,
int64_t datasize;
uint32_t mapsize;
size_t size;
+ int error;
PJDLOG_ASSERT((inp == NULL && outp == NULL) || (inp != NULL && outp != NULL));
PJDLOG_ASSERT(real_remote(res));
@@ -545,7 +547,9 @@ init_remote(struct hast_resource *res, struct proto_conn **inp,
errmsg = NULL;
if (primary_connect(res, &out) == -1)
- return (false);
+ return (ECONNREFUSED);
+
+ error = ECONNABORTED;
/*
* First handshake step.
@@ -577,6 +581,8 @@ init_remote(struct hast_resource *res, struct proto_conn **inp,
errmsg = nv_get_string(nvin, "errmsg");
if (errmsg != NULL) {
pjdlog_warning("%s", errmsg);
+ if (nv_exists(nvin, "wait"))
+ error = EBUSY;
nv_free(nvin);
goto close;
}
@@ -734,14 +740,14 @@ init_remote(struct hast_resource *res, struct proto_conn **inp,
res->hr_remoteout = out;
}
event_send(res, EVENT_CONNECT);
- return (true);
+ return (0);
close:
if (errmsg != NULL && strcmp(errmsg, "Split-brain condition!") == 0)
event_send(res, EVENT_SPLITBRAIN);
proto_close(out);
if (in != NULL)
proto_close(in);
- return (false);
+ return (error);
}
static void
@@ -920,8 +926,30 @@ hastd_primary(struct hast_resource *res)
*/
error = pthread_create(&td, NULL, ctrl_thread, res);
PJDLOG_ASSERT(error == 0);
- if (real_remote(res) && init_remote(res, NULL, NULL))
- sync_start();
+ if (real_remote(res)) {
+ error = init_remote(res, NULL, NULL);
+ if (error == 0) {
+ sync_start();
+ } else if (error == EBUSY) {
+ time_t start = time(NULL);
+
+ pjdlog_warning("Waiting for remote node to become %s for %ds.",
+ role2str(HAST_ROLE_SECONDARY),
+ res->hr_timeout);
+ for (;;) {
+ sleep(1);
+ error = init_remote(res, NULL, NULL);
+ if (error != EBUSY)
+ break;
+ if (time(NULL) > start + res->hr_timeout)
+ break;
+ }
+ if (error == EBUSY) {
+ pjdlog_warning("Remote node is still %s, starting anyway.",
+ role2str(HAST_ROLE_PRIMARY));
+ }
+ }
+ }
error = pthread_create(&td, NULL, ggate_recv_thread, res);
PJDLOG_ASSERT(error == 0);
error = pthread_create(&td, NULL, local_send_thread, res);
@@ -932,6 +960,7 @@ hastd_primary(struct hast_resource *res)
PJDLOG_ASSERT(error == 0);
error = pthread_create(&td, NULL, ggate_send_thread, res);
PJDLOG_ASSERT(error == 0);
+ fullystarted = true;
(void)sync_thread(res);
}
@@ -2095,7 +2124,7 @@ guard_one(struct hast_resource *res, unsigned int ncomp)
pjdlog_debug(2, "remote_guard: Reconnecting to %s.",
res->hr_remoteaddr);
in = out = NULL;
- if (init_remote(res, &in, &out)) {
+ if (init_remote(res, &in, &out) == 0) {
rw_wlock(&hio_remote_lock[ncomp]);
PJDLOG_ASSERT(res->hr_remotein == NULL);
PJDLOG_ASSERT(res->hr_remoteout == NULL);
@@ -2153,12 +2182,19 @@ guard_thread(void *arg)
break;
}
- pjdlog_debug(2, "remote_guard: Checking connections.");
- now = time(NULL);
- if (lastcheck + HAST_KEEPALIVE <= now) {
- for (ii = 0; ii < ncomps; ii++)
- guard_one(res, ii);
- lastcheck = now;
+ /*
+ * Don't check connections until we are fully started,
+ * as we may still be looping, waiting for remote node
+ * to switch from primary to secondary.
+ */
+ if (fullystarted) {
+ pjdlog_debug(2, "remote_guard: Checking connections.");
+ now = time(NULL);
+ if (lastcheck + HAST_KEEPALIVE <= now) {
+ for (ii = 0; ii < ncomps; ii++)
+ guard_one(res, ii);
+ lastcheck = now;
+ }
}
signo = sigtimedwait(&mask, NULL, &timeout);
}
OpenPOWER on IntegriCloud