6 files changed, 118 insertions, 7 deletions
diff --git a/sbin/hastd/hast.conf.5 b/sbin/hastd/hast.conf.5
index f53e6e3..6410ae7 100644
--- a/sbin/hastd/hast.conf.5
+++ b/sbin/hastd/hast.conf.5
@@ -63,6 +63,7 @@ checksum <algorithm>
 compression <algorithm>
 timeout <seconds>
 exec <path>
+metaflush "on" | "off"
 
 on <node> {
 	# Node section
@@ -85,12 +86,14 @@ resource <name> {
 	local <path>
 	timeout <seconds>
 	exec <path>
+	metaflush "on" | "off"
 
 	on <node> {
 		# Resource-node section
 		name <name>
 		# Required
 		local <path>
+		metaflush "on" | "off"
 		# Required
 		remote <addr>
 		source <addr>
@@ -100,6 +103,7 @@ resource <name> {
 		name <name>
 		# Required
 		local <path>
+		metaflush "on" | "off"
 		# Required
 		remote <addr>
 		source <addr>
@@ -318,6 +322,25 @@ It can be one of:
 .Ar secondary ,
 .Ar primary .
 .Pp
+.It Ic metaflush on | off
+.Pp
+When set to
+.Va on ,
+flush the write cache of the local provider after every metadata (activemap) update.
+Flushing the write cache ensures that the provider will not reorder writes and
+that the metadata will be properly updated before the real data is stored.
+If the local provider does not support flushing the write cache (it returns
+.Er EOPNOTSUPP
+on the
+.Cm BIO_FLUSH
+request),
+.Nm hastd
+will disable
+.Ic metaflush
+automatically.
+The default value is
+.Va on .
+.Pp
 .It Ic name Aq name
 .Pp
 GEOM provider name that will appear as
diff --git a/sbin/hastd/hast.h b/sbin/hastd/hast.h
index a62b63a..7bfef4c 100644
--- a/sbin/hastd/hast.h
+++ b/sbin/hastd/hast.h
@@ -167,6 +167,8 @@ struct hast_resource {
 	off_t	hr_local_mediasize;
 	/* Sector size of local provider. */
 	unsigned int hr_local_sectorsize;
+	/* Flush write cache on metadata updates? */
+	int	hr_metaflush;
 
 	/* Descriptor for /dev/ggctl communication. */
 	int	hr_ggatefd;
diff --git a/sbin/hastd/hastd.c b/sbin/hastd/hastd.c
index 6518f0c..d21f7f6 100644
--- a/sbin/hastd/hastd.c
+++ b/sbin/hastd/hastd.c
@@ -386,6 +386,12 @@ resource_needs_restart(const struct hast_resource *res0,
 			return (true);
 		if (strcmp(res0->hr_exec, res1->hr_exec) != 0)
 			return (true);
+		/*
+		 * When metaflush has changed we don't really need restart,
+		 * but it is just easier this way.
+		 */
+		if (res0->hr_metaflush != res1->hr_metaflush)
+			return (true);
 	}
 	return (false);
 }
@@ -416,6 +422,8 @@ resource_needs_reload(const struct hast_resource *res0,
 		return (true);
 	if (strcmp(res0->hr_exec, res1->hr_exec) != 0)
 		return (true);
+	if (res0->hr_metaflush != res1->hr_metaflush)
+		return (true);
 	return (false);
 }
 
@@ -436,6 +444,7 @@ resource_reload(const struct hast_resource *res)
 	nv_add_int32(nvout, (int32_t)res->hr_compression, "compression");
 	nv_add_int32(nvout, (int32_t)res->hr_timeout, "timeout");
 	nv_add_string(nvout, res->hr_exec, "exec");
+	nv_add_int32(nvout, (int32_t)res->hr_metaflush, "metaflush");
 	if (nv_error(nvout) != 0) {
 		nv_free(nvout);
 		pjdlog_error("Unable to allocate header for reload message.");
@@ -591,12 +600,13 @@ hastd_reload(void)
 	 * recreating it.
 	 *
 	 * We do just reload (send SIGHUP to worker process) if we act as
-	 * PRIMARY, but only if remote address, replication mode, timeout or
-	 * execution path has changed. For those, there is no need to restart
-	 * worker process.
+	 * PRIMARY, but only if remote address, source address, replication
+	 * mode, timeout, execution path or metaflush has changed.
+	 * For those, there is no need to restart worker process.
 	 * If PRIMARY receives SIGHUP, it will reconnect if remote address or
-	 * replication mode has changed or simply set new timeout if only
-	 * timeout has changed.
+	 * source address has changed, set the new timeout if only the timeout
+	 * has changed, or update metaflush if only metaflush has
+	 * changed.
 	 */
 	TAILQ_FOREACH_SAFE(nres, &newcfg->hc_resources, hr_next, tres) {
 		TAILQ_FOREACH(cres, &cfg->hc_resources, hr_next) {
@@ -627,6 +637,7 @@ hastd_reload(void)
 			cres->hr_timeout = nres->hr_timeout;
 			strlcpy(cres->hr_exec, nres->hr_exec,
 			    sizeof(cres->hr_exec));
+			cres->hr_metaflush = nres->hr_metaflush;
 			if (cres->hr_workerpid != 0)
 				resource_reload(cres);
 		}
diff --git a/sbin/hastd/parse.y b/sbin/hastd/parse.y
index 01e8593a..e548a85 100644
--- a/sbin/hastd/parse.y
+++ b/sbin/hastd/parse.y
@@ -68,9 +68,11 @@ static int depth0_checksum;
 static int depth0_compression;
 static int depth0_timeout;
 static char depth0_exec[PATH_MAX];
+static int depth0_metaflush;
 
 static char depth1_provname[PATH_MAX];
 static char depth1_localpath[PATH_MAX];
+static int depth1_metaflush;
 
 extern void yyrestart(FILE *);
 
@@ -197,6 +199,7 @@ yy_config_parse(const char *config, bool exitonerror)
 	strlcpy(depth0_listen_tcp6, HASTD_LISTEN_TCP6,
 	    sizeof(depth0_listen_tcp6));
 	depth0_exec[0] = '\0';
+	depth0_metaflush = 1;
 
 	lconfig = calloc(1, sizeof(*lconfig));
 	if (lconfig == NULL) {
@@ -328,6 +331,13 @@ yy_config_parse(const char *config, bool exitonerror)
 			strlcpy(curres->hr_exec, depth0_exec,
 			    sizeof(curres->hr_exec));
 		}
+		if (curres->hr_metaflush == -1) {
+			/*
+			 * Metaflush is not set at resource-level.
+			 * Use global or default setting.
+			 */
+			curres->hr_metaflush = depth0_metaflush;
+		}
 	}
 
 	return (lconfig);
@@ -355,8 +365,8 @@ yy_config_free(struct hastd_config *config)
 }
 %}
 
-%token CONTROL LISTEN PORT REPLICATION CHECKSUM COMPRESSION
-%token TIMEOUT EXEC EXTENTSIZE RESOURCE NAME LOCAL REMOTE SOURCE ON
+%token CONTROL LISTEN PORT REPLICATION CHECKSUM COMPRESSION METAFLUSH
+%token TIMEOUT EXEC EXTENTSIZE RESOURCE NAME LOCAL REMOTE SOURCE ON OFF
 %token FULLSYNC MEMSYNC ASYNC NONE CRC32 SHA256 HOLE LZF
 %token NUM STR OB CB
 
@@ -364,6 +374,7 @@ yy_config_free(struct hastd_config *config)
 %type <num> replication_type
 %type <num> checksum_type
 %type <num> compression_type
+%type <num> boolean
 
 %union
 {
@@ -396,6 +407,8 @@ statement:
 	|
 	exec_statement
 	|
+	metaflush_statement
+	|
 	node_statement
 	|
 	resource_statement
@@ -585,6 +598,34 @@ exec_statement:		EXEC STR
 	}
 	;
 
+metaflush_statement:	METAFLUSH boolean
+	{
+		switch (depth) {
+		case 0:		/* Global section: default for all resources. */
+			depth0_metaflush = $2;
+			break;
+		case 1:		/* Resource section. */
+			PJDLOG_ASSERT(curres != NULL);
+			depth1_metaflush = $2;
+			break;
+		case 2:		/* Resource-node section. */
+			if (!mynode)	/* Presumably not our node; verify. */
+				break;
+			PJDLOG_ASSERT(curres != NULL);
+			curres->hr_metaflush = $2;	/* Wins over depth 0/1. */
+			break;
+		default:
+			PJDLOG_ABORT("metaflush at wrong depth level");
+		}
+	}
+	;
+
+boolean:		/* Yields 1 for the ON token, 0 for OFF. */
+	ON		{ $$ = 1; }
+	|
+	OFF		{ $$ = 0; }
+	;
+
 node_statement:		ON node_start OB node_entries CB
 	{
 		mynode = false;
@@ -660,6 +701,13 @@ resource_statement:	RESOURCE resource_start OB resource_entries CB
 				strlcpy(curres->hr_localpath, depth1_localpath,
 				    sizeof(curres->hr_localpath));
 			}
+			if (curres->hr_metaflush == -1 && depth1_metaflush != -1) {
+				/*
+				 * Metaflush is not set at node-level,
+				 * but is set at resource-level, use it.
+				 */
+				curres->hr_metaflush = depth1_metaflush;
+			}
 
 			/*
 			 * If provider name is not given, use resource name
@@ -713,6 +761,7 @@ resource_start:	STR
 		 */
 		depth1_provname[0] = '\0';
 		depth1_localpath[0] = '\0';
+		depth1_metaflush = -1;
 		hadmynode = false;
 
 		curres = calloc(1, sizeof(*curres));
@@ -739,6 +788,7 @@ resource_start:	STR
 		curres->hr_provname[0] = '\0';
 		curres->hr_localpath[0] = '\0';
 		curres->hr_localfd = -1;
+		curres->hr_metaflush = -1;
 		curres->hr_remoteaddr[0] = '\0';
 		curres->hr_sourceaddr[0] = '\0';
 		curres->hr_ggateunit = -1;
@@ -761,6 +811,8 @@ resource_entry:
 	|
 	exec_statement
 	|
+	metaflush_statement
+	|
 	name_statement
 	|
 	local_statement
@@ -869,6 +921,8 @@ resource_node_entry:
 	remote_statement
 	|
 	source_statement
+	|
+	metaflush_statement
 	;
 
 remote_statement:	REMOTE remote_str
diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c
index ebb758e..08c3329 100644
--- a/sbin/hastd/primary.c
+++ b/sbin/hastd/primary.c
@@ -296,6 +296,17 @@ hast_activemap_flush(struct hast_resource *res)
 		pjdlog_errno(LOG_ERR, "Unable to flush activemap to disk");
 		return (-1);
 	}
+	if (res->hr_metaflush == 1 && g_flush(res->hr_localfd) == -1) {
+		if (errno == EOPNOTSUPP) {
+			pjdlog_warning("The %s provider doesn't support flushing write cache. Disabling it.",
+			    res->hr_localpath);
+			res->hr_metaflush = 0;
+		} else {
+			pjdlog_errno(LOG_ERR,
+			    "Unable to flush disk cache on activemap update");
+			return (-1);
+		}
+	}
 	return (0);
 }
 
@@ -1999,6 +2010,7 @@ primary_config_reload(struct hast_resource *res, struct nv *nv)
 	nv_assert(nv, "compression");
 	nv_assert(nv, "timeout");
 	nv_assert(nv, "exec");
+	nv_assert(nv, "metaflush");
 
 	ncomps = HAST_NCOMPONENTS;
 
@@ -2009,6 +2021,7 @@ primary_config_reload(struct hast_resource *res, struct nv *nv)
 #define MODIFIED_COMPRESSION	0x10
 #define MODIFIED_TIMEOUT	0x20
 #define MODIFIED_EXEC		0x40
+#define MODIFIED_METAFLUSH	0x80
 	modified = 0;
 
 	vstr = nv_get_string(nv, "remoteaddr");
@@ -2050,6 +2063,11 @@ primary_config_reload(struct hast_resource *res, struct nv *nv)
 		strlcpy(gres->hr_exec, vstr, sizeof(gres->hr_exec));
 		modified |= MODIFIED_EXEC;
 	}
+	vint = nv_get_int32(nv, "metaflush");
+	if (gres->hr_metaflush != vint) {
+		gres->hr_metaflush = vint;
+		modified |= MODIFIED_METAFLUSH;
+	}
 
 	/*
 	 * Change timeout for connected sockets.
@@ -2099,6 +2117,7 @@ primary_config_reload(struct hast_resource *res, struct nv *nv)
 #undef	MODIFIED_COMPRESSION
 #undef	MODIFIED_TIMEOUT
 #undef	MODIFIED_EXEC
+#undef	MODIFIED_METAFLUSH
 
 	pjdlog_info("Configuration reloaded successfully.");
 }
diff --git a/sbin/hastd/token.l b/sbin/hastd/token.l
index 67c1e13..3a868d7 100644
--- a/sbin/hastd/token.l
+++ b/sbin/hastd/token.l
@@ -53,12 +53,14 @@ checksum		{ DP; return CHECKSUM; }
 compression		{ DP; return COMPRESSION; }
 timeout			{ DP; return TIMEOUT; }
 exec			{ DP; return EXEC; }
+metaflush		{ DP; return METAFLUSH; }
 resource		{ DP; return RESOURCE; }
 name			{ DP; return NAME; }
 local			{ DP; return LOCAL; }
 remote			{ DP; return REMOTE; }
 source			{ DP; return SOURCE; }
 on			{ DP; return ON; }
+off			{ DP; return OFF; }
 fullsync		{ DP; return FULLSYNC; }
 memsync			{ DP; return MEMSYNC; }
 async			{ DP; return ASYNC; }