From 2896ee37ccc1f9acb244c9b02becb74a43661009 Mon Sep 17 00:00:00 2001
From: David Teigland
Date: Mon, 27 Nov 2006 11:31:22 -0600
Subject: [DLM] fix add_requestqueue checking nodes list

Requests that arrive after recovery has started are saved in the
requestqueue and processed after recovery is done.  Some of these requests
are purged during recovery if they are from nodes that have been removed.
We move the purging of the requests (dlm_purge_requestqueue) to later in
the recovery sequence which allows the routine saving requests
(dlm_add_requestqueue) to avoid filtering out requests by nodeid since the
same will be done by the purge.  The current code has add_requestqueue
filtering by nodeid but doesn't hold any locks when accessing the list of
current nodes.  This also means that we need to call the purge routine
when the lockspace is being shut down since the add routine will not be
rejecting requests itself any more.

Signed-off-by: David Teigland
Signed-off-by: Steven Whitehouse
---
 fs/dlm/lockspace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/dlm/lockspace.c')

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f8842ca..791388b 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -22,6 +22,7 @@
 #include "memory.h"
 #include "lock.h"
 #include "recover.h"
+#include "requestqueue.h"
 
 #ifdef CONFIG_DLM_DEBUG
 int dlm_create_debug_file(struct dlm_ls *ls);
@@ -684,6 +685,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 	 * Free structures on any other lists
 	 */
 
+	dlm_purge_requestqueue(ls);
 	kfree(ls->ls_recover_args);
 	dlm_clear_free_entries(ls);
 	dlm_clear_members(ls);
--
cgit v1.1
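The deferred-purge pattern in the patch above can be modeled outside the
kernel. What follows is a minimal user-space C sketch, not the actual DLM
code from fs/dlm/requestqueue.c: requests are queued unconditionally while
recovery runs, and membership filtering happens only at purge time. The
struct layout, the single global list, and the helper is_removed_node()
are assumptions made to keep the demo self-contained; the real routines
run under the lockspace's locks.

/* Sketch of the requestqueue pattern: queue everything during recovery,
 * filter by node membership only when purging. Illustrative only. */
#include <stdio.h>
#include <stdlib.h>

struct rq_entry {
	int nodeid;                /* sender of the saved request */
	struct rq_entry *next;
};

static struct rq_entry *requestqueue;

/* Save a request that arrived mid-recovery; no membership check here,
 * since filtering is deferred to the purge. */
static void add_requestqueue(int nodeid)
{
	struct rq_entry *e = malloc(sizeof(*e));
	e->nodeid = nodeid;
	e->next = requestqueue;
	requestqueue = e;
}

/* Stand-in for a membership check like dlm_is_removed(); the kernel
 * consults the members list under the proper lock. */
static int is_removed_node(int nodeid)
{
	return nodeid == 2;        /* pretend node 2 left the cluster */
}

/* Drop saved requests from departed nodes; called late in recovery and
 * again when the lockspace is released. */
static void purge_requestqueue(void)
{
	struct rq_entry **p = &requestqueue;

	while (*p) {
		if (is_removed_node((*p)->nodeid)) {
			struct rq_entry *dead = *p;
			*p = dead->next;
			free(dead);
		} else {
			p = &(*p)->next;
		}
	}
}

int main(void)
{
	struct rq_entry *e;

	add_requestqueue(1);
	add_requestqueue(2);       /* from a removed node; will be purged */
	add_requestqueue(3);
	purge_requestqueue();

	for (e = requestqueue; e; e = e->next)
		printf("kept request from node %d\n", e->nodeid);
	return 0;
}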
From 98f176fb32f33795b6d0f83856008b932123ab38 Mon Sep 17 00:00:00 2001
From: David Teigland
Date: Mon, 27 Nov 2006 13:19:28 -0600
Subject: [DLM] don't accept replies to old recovery messages

We often abort a recovery after sending a status request to a remote node.
We want to ignore any potential status reply we get from the remote node.
If we get one of these unwanted replies, we've often moved on to the next
recovery message and incremented the message sequence counter, so the
reply will be ignored due to the seq number.  In some cases, we've not
moved on to the next message so the seq number of the reply we want to
ignore is still correct, causing the reply to be accepted.  The next
recovery message will then mistake this old reply as a new one.

To fix this, we add the flag RCOM_WAIT to indicate when we can accept a
new reply.  We clear this flag if we abort recovery while waiting for a
reply.  Before the flag is set again (to allow new replies) we know that
any old replies will be rejected due to their sequence number.  We also
initialize the recovery-message sequence number to a random value when a
lockspace is first created.  This makes it clear when messages are being
rejected from an old instance of a lockspace that has since been
recreated.

Signed-off-by: David Teigland
Signed-off-by: Steven Whitehouse
---
 fs/dlm/lockspace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/dlm/lockspace.c')

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 791388b..59012b0 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -479,6 +479,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	ls->ls_recoverd_task = NULL;
 	mutex_init(&ls->ls_recoverd_active);
 	spin_lock_init(&ls->ls_recover_lock);
+	spin_lock_init(&ls->ls_rcom_spin);
+	get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t));
 	ls->ls_recover_status = 0;
 	ls->ls_recover_seq = 0;
 	ls->ls_recover_args = NULL;
--
cgit v1.1
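To make the two-part acceptance check concrete (the RCOM_WAIT flag plus
the sequence number), here is a user-space C sketch of the scheme the
commit message describes. RCOM_WAIT and the rcom_seq field correspond to
the kernel's flag and ls->ls_rcom_seq, but the helper functions, the
single-threaded structure, and the use of rand() in place of
get_random_bytes() are assumptions made to keep the demo self-contained.

/* Sketch of the recovery-reply gating scheme: a reply is accepted only
 * while a wait window is open AND its seq matches. Illustrative only. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

#define RCOM_WAIT 0x1

struct lockspace {
	unsigned int flags;
	uint64_t rcom_seq;     /* seq of the request we are waiting on */
};

/* Start with a random sequence so replies addressed to an older
 * incarnation of this lockspace are visibly rejected. The kernel uses
 * get_random_bytes(); rand() is a user-space stand-in. */
static void init_lockspace(struct lockspace *ls)
{
	ls->flags = 0;
	srand((unsigned)time(NULL));
	ls->rcom_seq = ((uint64_t)rand() << 32) | (uint64_t)rand();
}

/* Send a status request: bump the seq and open the reply window. */
static uint64_t send_rcom(struct lockspace *ls)
{
	ls->rcom_seq++;
	ls->flags |= RCOM_WAIT;
	return ls->rcom_seq;
}

/* Recovery aborted while waiting: close the window. A straggler reply
 * now fails the RCOM_WAIT test even if its seq still matches. */
static void abort_recovery(struct lockspace *ls)
{
	ls->flags &= ~RCOM_WAIT;
}

/* Accept a reply only if we are still waiting AND the seq matches. */
static int accept_reply(struct lockspace *ls, uint64_t reply_seq)
{
	return (ls->flags & RCOM_WAIT) && reply_seq == ls->rcom_seq;
}

int main(void)
{
	struct lockspace ls;
	uint64_t seq;

	init_lockspace(&ls);

	seq = send_rcom(&ls);
	abort_recovery(&ls);   /* recovery aborted before the reply came */
	printf("old reply accepted: %d\n", accept_reply(&ls, seq)); /* 0 */

	seq = send_rcom(&ls);  /* next recovery attempt */
	printf("new reply accepted: %d\n", accept_reply(&ls, seq)); /* 1 */
	return 0;
}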