diff options
author | pjd <pjd@FreeBSD.org> | 2010-08-27 14:26:37 +0000 |
---|---|---|
committer | pjd <pjd@FreeBSD.org> | 2010-08-27 14:26:37 +0000 |
commit | ead19aaef1acf3ba476246765d4d409eba5c3c02 (patch) | |
tree | 2938d5286dc8ce260b3a171cb3a8c8ed2f970e44 /sbin/hastd/primary.c | |
parent | 8729a28322f63b97a52f73de5cc9f822a1645b51 (diff) | |
download | FreeBSD-src-ead19aaef1acf3ba476246765d4d409eba5c3c02.zip FreeBSD-src-ead19aaef1acf3ba476246765d4d409eba5c3c02.tar.gz |
Implement keepalive mechanism inside HAST protocol so we can detect secondary
node failures quickly for HAST resources that are rarely modified.
Remove XXX from a comment now that the guard thread never sleeps infinitely.
MFC after: 2 weeks
Obtained from: Wheel Systems Sp. z o.o. http://www.wheelsystems.com
Diffstat (limited to 'sbin/hastd/primary.c')
-rw-r--r-- | sbin/hastd/primary.c | 60 |
1 files changed, 46 insertions, 14 deletions
diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c index 52ac594..dd26f4a 100644 --- a/sbin/hastd/primary.c +++ b/sbin/hastd/primary.c @@ -151,7 +151,11 @@ static pthread_mutex_t metadata_lock; */ #define HAST_NCOMPONENTS 2 /* - * Number of seconds to sleep before next reconnect try. + * Number of seconds to sleep between keepalive packets. + */ +#define KEEPALIVE_SLEEP 10 +/* + * Number of seconds to sleep between reconnect retries. */ #define RECONNECT_SLEEP 5 @@ -886,11 +890,14 @@ remote_close(struct hast_resource *res, int ncomp) sync_stop(); /* - * Wake up guard thread, so it can immediately start reconnect. + * Wake up guard thread (if we are not called from within guard thread), + * so it can immediately start reconnect. */ - mtx_lock(&hio_guard_lock); - cv_signal(&hio_guard_cond); - mtx_unlock(&hio_guard_lock); + if (!mtx_owned(&hio_guard_lock)) { + mtx_lock(&hio_guard_lock); + cv_signal(&hio_guard_cond); + mtx_unlock(&hio_guard_lock); + } } /* @@ -1734,7 +1741,7 @@ sighandler(int sig) assert(!"invalid condition"); } /* - * XXX: Racy, but if we cannot obtain hio_guard_lock here, we don't + * Racy, but if we cannot obtain hio_guard_lock here, we don't * want to risk deadlock. */ unlock = mtx_trylock(&hio_guard_lock); @@ -1851,6 +1858,32 @@ failed: pjdlog_warning("Configuration not reloaded."); } +static void +keepalive_send(struct hast_resource *res, unsigned int ncomp) +{ + struct nv *nv; + + nv = nv_alloc(); + nv_add_uint8(nv, HIO_KEEPALIVE, "cmd"); + if (nv_error(nv) != 0) { + nv_free(nv); + pjdlog_debug(1, + "keepalive_send: Unable to prepare header to send."); + return; + } + if (hast_proto_send(res, res->hr_remoteout, nv, NULL, 0) < 0) { + pjdlog_common(LOG_DEBUG, 1, errno, + "keepalive_send: Unable to send request"); + nv_free(nv); + rw_unlock(&hio_remote_lock[ncomp]); + remote_close(res, ncomp); + rw_rlock(&hio_remote_lock[ncomp]); + return; + } + nv_free(nv); + pjdlog_debug(2, "keepalive_send: Request sent."); +} + /* * Thread guards remote connections and reconnects when needed, handles * signals, etc. @@ -1874,14 +1907,8 @@ guard_thread(void *arg) sighup_received = false; config_reload(); } - /* - * If all the connection will be fine, we will sleep until - * someone wakes us up. - * If any of the connections will be broken and we won't be - * able to connect, we will sleep only for RECONNECT_SLEEP - * seconds so we can retry soon. - */ - timeout = 0; + + timeout = KEEPALIVE_SLEEP; pjdlog_debug(2, "remote_guard: Checking connections."); mtx_lock(&hio_guard_lock); for (ii = 0; ii < ncomps; ii++) { @@ -1891,6 +1918,11 @@ guard_thread(void *arg) if (ISCONNECTED(res, ii)) { assert(res->hr_remotein != NULL); assert(res->hr_remoteout != NULL); + keepalive_send(res, ii); + } + if (ISCONNECTED(res, ii)) { + assert(res->hr_remotein != NULL); + assert(res->hr_remoteout != NULL); rw_unlock(&hio_remote_lock[ii]); pjdlog_debug(2, "remote_guard: Connection to %s is ok.", |