Diffstat (limited to 'fs/dlm')
-rw-r--r-- | fs/dlm/Kconfig                                            |   20
-rw-r--r-- | fs/dlm/Makefile                                           |    4
-rw-r--r-- | fs/dlm/dlm_internal.h                                     |    4
-rw-r--r-- | fs/dlm/lock.c                                             |   16
-rw-r--r-- | fs/dlm/lockspace.c                                        |    4
-rw-r--r-- | fs/dlm/lowcomms-sctp.c (renamed from fs/dlm/lowcomms.c)   |  264
-rw-r--r-- | fs/dlm/lowcomms-tcp.c                                     | 1189
-rw-r--r-- | fs/dlm/lowcomms.h                                         |    2
-rw-r--r-- | fs/dlm/main.c                                             |   10
-rw-r--r-- | fs/dlm/member.c                                           |    8
-rw-r--r-- | fs/dlm/rcom.c                                             |   58
-rw-r--r-- | fs/dlm/recover.c                                          |    1
-rw-r--r-- | fs/dlm/recoverd.c                                         |   44
-rw-r--r-- | fs/dlm/requestqueue.c                                     |   26
-rw-r--r-- | fs/dlm/requestqueue.h                                     |    2
15 files changed, 1463 insertions, 189 deletions
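
Most of the change splits lowcomms into SCTP and TCP transports selected through a new Kconfig choice; both variants share the same circular receive buffer, whose CBUF_* macros are turned into small inline helpers in the diffs below. For reference, here is a standalone, compilable sketch of those helpers: the function bodies follow the patch, while the demo in main() and the 4096-byte buffer size are illustrative only.

    /* Circular receive-buffer helpers as introduced by this patch
     * (formerly the CBUF_* macros); the size must be a power of two. */
    #include <stdio.h>

    struct cbuf {
            unsigned int base;      /* offset of first unconsumed byte */
            unsigned int len;       /* amount of valid data */
            unsigned int mask;      /* buffer size - 1 */
    };

    static void cbuf_init(struct cbuf *cb, int size)
    {
            cb->base = cb->len = 0;
            cb->mask = size - 1;
    }

    static void cbuf_add(struct cbuf *cb, int n)
    {
            cb->len += n;           /* n bytes were appended at cbuf_data() */
    }

    static int cbuf_data(struct cbuf *cb)
    {
            return (cb->base + cb->len) & cb->mask; /* next free offset */
    }

    static void cbuf_eat(struct cbuf *cb, int n)
    {
            cb->len -= n;           /* n bytes consumed from the front */
            cb->base += n;
            cb->base &= cb->mask;
    }

    static int cbuf_empty(struct cbuf *cb)
    {
            return cb->len == 0;
    }

    int main(void)
    {
            struct cbuf cb;

            cbuf_init(&cb, 4096);   /* PAGE_CACHE_SIZE-style power of two */
            cbuf_add(&cb, 100);     /* pretend 100 bytes were received */
            cbuf_eat(&cb, 60);      /* pretend midcomms consumed 60 bytes */
            printf("base=%u len=%u next=%d empty=%d\n",
                   cb.base, cb.len, cbuf_data(&cb), cbuf_empty(&cb));
            return 0;
    }
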
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 81b2c64..b5654a2 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -1,14 +1,32 @@ menu "Distributed Lock Manager" - depends on INET && IP_SCTP && EXPERIMENTAL + depends on EXPERIMENTAL && INET config DLM tristate "Distributed Lock Manager (DLM)" depends on IPV6 || IPV6=n select CONFIGFS_FS + select IP_SCTP if DLM_SCTP help A general purpose distributed lock manager for kernel or userspace applications. +choice + prompt "Select DLM communications protocol" + depends on DLM + default DLM_TCP + help + The DLM Can use TCP or SCTP for it's network communications. + SCTP supports multi-homed operations whereas TCP doesn't. + However, SCTP seems to have stability problems at the moment. + +config DLM_TCP + bool "TCP/IP" + +config DLM_SCTP + bool "SCTP" + +endchoice + config DLM_DEBUG bool "DLM debugging" depends on DLM diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile index 1832e02..6538894 100644 --- a/fs/dlm/Makefile +++ b/fs/dlm/Makefile @@ -4,7 +4,6 @@ dlm-y := ast.o \ dir.o \ lock.o \ lockspace.o \ - lowcomms.o \ main.o \ member.o \ memory.o \ @@ -17,3 +16,6 @@ dlm-y := ast.o \ util.o dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o +dlm-$(CONFIG_DLM_TCP) += lowcomms-tcp.o + +dlm-$(CONFIG_DLM_SCTP) += lowcomms-sctp.o
\ No newline at end of file diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 1e5cd67..1ee8195 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -471,6 +471,7 @@ struct dlm_ls { char *ls_recover_buf; int ls_recover_nodeid; /* for debugging */ uint64_t ls_rcom_seq; + spinlock_t ls_rcom_spin; struct list_head ls_recover_list; spinlock_t ls_recover_list_lock; int ls_recover_list_count; @@ -488,7 +489,8 @@ struct dlm_ls { #define LSFL_RUNNING 1 #define LSFL_RECOVERY_STOP 2 #define LSFL_RCOM_READY 3 -#define LSFL_UEVENT_WAIT 4 +#define LSFL_RCOM_WAIT 4 +#define LSFL_UEVENT_WAIT 5 /* much of this is just saving user space pointers associated with the lock that we pass back to the user lib with an ast */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 3f2befa..30878de 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -2372,6 +2372,7 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) { lkb->lkb_exflags = ms->m_exflags; + lkb->lkb_sbflags = ms->m_sbflags; lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | (ms->m_flags & 0x0000FFFF); } @@ -3028,10 +3029,17 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery) while (1) { if (dlm_locking_stopped(ls)) { - if (!recovery) - dlm_add_requestqueue(ls, nodeid, hd); - error = -EINTR; - goto out; + if (recovery) { + error = -EINTR; + goto out; + } + error = dlm_add_requestqueue(ls, nodeid, hd); + if (error == -EAGAIN) + continue; + else { + error = -EINTR; + goto out; + } } if (lock_recovery_try(ls)) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index f8842ca..59012b0 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -22,6 +22,7 @@ #include "memory.h" #include "lock.h" #include "recover.h" +#include "requestqueue.h" #ifdef CONFIG_DLM_DEBUG int dlm_create_debug_file(struct dlm_ls *ls); @@ -478,6 +479,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace, ls->ls_recoverd_task = NULL; mutex_init(&ls->ls_recoverd_active); spin_lock_init(&ls->ls_recover_lock); + spin_lock_init(&ls->ls_rcom_spin); + get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t)); ls->ls_recover_status = 0; ls->ls_recover_seq = 0; ls->ls_recover_args = NULL; @@ -684,6 +687,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) * Free structures on any other lists */ + dlm_purge_requestqueue(ls); kfree(ls->ls_recover_args); dlm_clear_free_entries(ls); dlm_clear_members(ls); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms-sctp.c index 6da6b14..fe158d7 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms-sctp.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -75,13 +75,13 @@ struct nodeinfo { }; static DEFINE_IDR(nodeinfo_idr); -static struct rw_semaphore nodeinfo_lock; -static int max_nodeid; +static DECLARE_RWSEM(nodeinfo_lock); +static int max_nodeid; struct cbuf { - unsigned base; - unsigned len; - unsigned mask; + unsigned int base; + unsigned int len; + unsigned int mask; }; /* Just the one of these, now. 
But this struct keeps @@ -90,9 +90,9 @@ struct cbuf { #define CF_READ_PENDING 1 struct connection { - struct socket *sock; + struct socket *sock; unsigned long flags; - struct page *rx_page; + struct page *rx_page; atomic_t waiting_requests; struct cbuf cb; int eagain_flag; @@ -102,36 +102,40 @@ struct connection { struct writequeue_entry { struct list_head list; - struct page *page; + struct page *page; int offset; int len; int end; int users; - struct nodeinfo *ni; + struct nodeinfo *ni; }; -#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0) -#define CBUF_EMPTY(cb) ((cb)->len == 0) -#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1)) -#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask) +static void cbuf_add(struct cbuf *cb, int n) +{ + cb->len += n; +} -#define CBUF_INIT(cb, size) \ -do { \ - (cb)->base = (cb)->len = 0; \ - (cb)->mask = ((size)-1); \ -} while(0) +static int cbuf_data(struct cbuf *cb) +{ + return ((cb->base + cb->len) & cb->mask); +} -#define CBUF_EAT(cb, n) \ -do { \ - (cb)->len -= (n); \ - (cb)->base += (n); \ - (cb)->base &= (cb)->mask; \ -} while(0) +static void cbuf_init(struct cbuf *cb, int size) +{ + cb->base = cb->len = 0; + cb->mask = size-1; +} +static void cbuf_eat(struct cbuf *cb, int n) +{ + cb->len -= n; + cb->base += n; + cb->base &= cb->mask; +} /* List of nodes which have writes pending */ -static struct list_head write_nodes; -static spinlock_t write_nodes_lock; +static LIST_HEAD(write_nodes); +static DEFINE_SPINLOCK(write_nodes_lock); /* Maximum number of incoming messages to process before * doing a schedule() @@ -141,8 +145,7 @@ static spinlock_t write_nodes_lock; /* Manage daemons */ static struct task_struct *recv_task; static struct task_struct *send_task; -static wait_queue_head_t lowcomms_recv_wait; -static atomic_t accepting; +static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_wait); /* The SCTP connection */ static struct connection sctp_con; @@ -161,11 +164,11 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) return error; if (dlm_local_addr[0]->ss_family == AF_INET) { - struct sockaddr_in *in4 = (struct sockaddr_in *) &addr; + struct sockaddr_in *in4 = (struct sockaddr_in *) &addr; struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr; ret4->sin_addr.s_addr = in4->sin_addr.s_addr; } else { - struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr; memcpy(&ret6->sin6_addr, &in6->sin6_addr, sizeof(in6->sin6_addr)); @@ -174,6 +177,8 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) return 0; } +/* If alloc is 0 here we will not attempt to allocate a new + nodeinfo struct */ static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc) { struct nodeinfo *ni; @@ -184,44 +189,45 @@ static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc) ni = idr_find(&nodeinfo_idr, nodeid); up_read(&nodeinfo_lock); - if (!ni && alloc) { - down_write(&nodeinfo_lock); + if (ni || !alloc) + return ni; - ni = idr_find(&nodeinfo_idr, nodeid); - if (ni) - goto out_up; + down_write(&nodeinfo_lock); - r = idr_pre_get(&nodeinfo_idr, alloc); - if (!r) - goto out_up; + ni = idr_find(&nodeinfo_idr, nodeid); + if (ni) + goto out_up; - ni = kmalloc(sizeof(struct nodeinfo), alloc); - if (!ni) - goto out_up; + r = idr_pre_get(&nodeinfo_idr, alloc); + if (!r) + goto out_up; - r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n); - if (r) { - kfree(ni); - ni = NULL; - goto out_up; - } - if 
(n != nodeid) { - idr_remove(&nodeinfo_idr, n); - kfree(ni); - ni = NULL; - goto out_up; - } - memset(ni, 0, sizeof(struct nodeinfo)); - spin_lock_init(&ni->lock); - INIT_LIST_HEAD(&ni->writequeue); - spin_lock_init(&ni->writequeue_lock); - ni->nodeid = nodeid; - - if (nodeid > max_nodeid) - max_nodeid = nodeid; - out_up: - up_write(&nodeinfo_lock); + ni = kmalloc(sizeof(struct nodeinfo), alloc); + if (!ni) + goto out_up; + + r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n); + if (r) { + kfree(ni); + ni = NULL; + goto out_up; } + if (n != nodeid) { + idr_remove(&nodeinfo_idr, n); + kfree(ni); + ni = NULL; + goto out_up; + } + memset(ni, 0, sizeof(struct nodeinfo)); + spin_lock_init(&ni->lock); + INIT_LIST_HEAD(&ni->writequeue); + spin_lock_init(&ni->writequeue_lock); + ni->nodeid = nodeid; + + if (nodeid > max_nodeid) + max_nodeid = nodeid; +out_up: + up_write(&nodeinfo_lock); return ni; } @@ -279,13 +285,13 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, in4_addr->sin_port = cpu_to_be16(port); memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero)); memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) - - sizeof(struct sockaddr_in)); + sizeof(struct sockaddr_in)); *addr_len = sizeof(struct sockaddr_in); } else { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; in6_addr->sin6_port = cpu_to_be16(port); memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) - - sizeof(struct sockaddr_in6)); + sizeof(struct sockaddr_in6)); *addr_len = sizeof(struct sockaddr_in6); } } @@ -324,7 +330,7 @@ static void send_shutdown(sctp_assoc_t associd) cmsg->cmsg_type = SCTP_SNDRCV; cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); outmessage.msg_controllen = cmsg->cmsg_len; - sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + sinfo = CMSG_DATA(cmsg); memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); sinfo->sinfo_flags |= MSG_EOF; @@ -387,7 +393,7 @@ static void process_sctp_notification(struct msghdr *msg, char *buf) if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) { log_print("COMM_UP for invalid assoc ID %d", - (int)sn->sn_assoc_change.sac_assoc_id); + (int)sn->sn_assoc_change.sac_assoc_id); init_failed(); return; } @@ -398,15 +404,18 @@ static void process_sctp_notification(struct msghdr *msg, char *buf) fs = get_fs(); set_fs(get_ds()); ret = sctp_con.sock->ops->getsockopt(sctp_con.sock, - IPPROTO_SCTP, SCTP_PRIMARY_ADDR, - (char*)&prim, &prim_len); + IPPROTO_SCTP, + SCTP_PRIMARY_ADDR, + (char*)&prim, + &prim_len); set_fs(fs); if (ret < 0) { struct nodeinfo *ni; log_print("getsockopt/sctp_primary_addr on " "new assoc %d failed : %d", - (int)sn->sn_assoc_change.sac_assoc_id, ret); + (int)sn->sn_assoc_change.sac_assoc_id, + ret); /* Retry INIT later */ ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id); @@ -426,12 +435,10 @@ static void process_sctp_notification(struct msghdr *msg, char *buf) return; /* Save the assoc ID */ - spin_lock(&ni->lock); ni->assoc_id = sn->sn_assoc_change.sac_assoc_id; - spin_unlock(&ni->lock); log_print("got new/restarted association %d nodeid %d", - (int)sn->sn_assoc_change.sac_assoc_id, nodeid); + (int)sn->sn_assoc_change.sac_assoc_id, nodeid); /* Send any pending writes */ clear_bit(NI_INIT_PENDING, &ni->flags); @@ -507,13 +514,12 @@ static int receive_from_sock(void) sctp_con.rx_page = alloc_page(GFP_ATOMIC); if (sctp_con.rx_page == NULL) goto out_resched; - CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE); + cbuf_init(&sctp_con.cb, PAGE_CACHE_SIZE); } memset(&incmsg, 0, sizeof(incmsg)); memset(&msgname, 0, 
sizeof(msgname)); - memset(incmsg, 0, sizeof(incmsg)); msg.msg_name = &msgname; msg.msg_namelen = sizeof(msgname); msg.msg_flags = 0; @@ -532,17 +538,17 @@ static int receive_from_sock(void) * iov[0] is the bit of the circular buffer between the current end * point (cb.base + cb.len) and the end of the buffer. */ - iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb); + iov[0].iov_len = sctp_con.cb.base - cbuf_data(&sctp_con.cb); iov[0].iov_base = page_address(sctp_con.rx_page) + - CBUF_DATA(&sctp_con.cb); + cbuf_data(&sctp_con.cb); iov[1].iov_len = 0; /* * iov[1] is the bit of the circular buffer between the start of the * buffer and the start of the currently used section (cb.base) */ - if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) { - iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb); + if (cbuf_data(&sctp_con.cb) >= sctp_con.cb.base) { + iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&sctp_con.cb); iov[1].iov_len = sctp_con.cb.base; iov[1].iov_base = page_address(sctp_con.rx_page); msg.msg_iovlen = 2; @@ -557,7 +563,7 @@ static int receive_from_sock(void) msg.msg_control = incmsg; msg.msg_controllen = sizeof(incmsg); cmsg = CMSG_FIRSTHDR(&msg); - sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + sinfo = CMSG_DATA(cmsg); if (msg.msg_flags & MSG_NOTIFICATION) { process_sctp_notification(&msg, page_address(sctp_con.rx_page)); @@ -583,29 +589,29 @@ static int receive_from_sock(void) if (r == 1) return 0; - CBUF_ADD(&sctp_con.cb, ret); + cbuf_add(&sctp_con.cb, ret); ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid), page_address(sctp_con.rx_page), sctp_con.cb.base, sctp_con.cb.len, PAGE_CACHE_SIZE); if (ret < 0) goto out_close; - CBUF_EAT(&sctp_con.cb, ret); + cbuf_eat(&sctp_con.cb, ret); - out: +out: ret = 0; goto out_ret; - out_resched: +out_resched: lowcomms_data_ready(sctp_con.sock->sk, 0); ret = 0; - schedule(); + cond_resched(); goto out_ret; - out_close: +out_close: if (ret != -EAGAIN) log_print("error reading from sctp socket: %d", ret); - out_ret: +out_ret: return ret; } @@ -619,10 +625,12 @@ static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num) set_fs(get_ds()); if (num == 1) result = sctp_con.sock->ops->bind(sctp_con.sock, - (struct sockaddr *) addr, addr_len); + (struct sockaddr *) addr, + addr_len); else result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP, - SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len); + SCTP_SOCKOPT_BINDX_ADD, + (char *)addr, addr_len); set_fs(fs); if (result < 0) @@ -719,10 +727,10 @@ static int init_sock(void) return 0; - create_delsock: +create_delsock: sock_release(sock); sctp_con.sock = NULL; - out: +out: return result; } @@ -756,16 +764,13 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) int users = 0; struct nodeinfo *ni; - if (!atomic_read(&accepting)) - return NULL; - ni = nodeid2nodeinfo(nodeid, allocation); if (!ni) return NULL; spin_lock(&ni->writequeue_lock); e = list_entry(ni->writequeue.prev, struct writequeue_entry, list); - if (((struct list_head *) e == &ni->writequeue) || + if ((&e->list == &ni->writequeue) || (PAGE_CACHE_SIZE - e->end < len)) { e = NULL; } else { @@ -776,7 +781,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) spin_unlock(&ni->writequeue_lock); if (e) { - got_one: + got_one: if (users == 0) kmap(e->page); *ppc = page_address(e->page) + offset; @@ -803,9 +808,6 @@ void dlm_lowcomms_commit_buffer(void *arg) int users; struct nodeinfo *ni = e->ni; - if (!atomic_read(&accepting)) - 
return; - spin_lock(&ni->writequeue_lock); users = --e->users; if (users) @@ -822,7 +824,7 @@ void dlm_lowcomms_commit_buffer(void *arg) } return; - out: +out: spin_unlock(&ni->writequeue_lock); return; } @@ -878,7 +880,7 @@ static void initiate_association(int nodeid) cmsg->cmsg_level = IPPROTO_SCTP; cmsg->cmsg_type = SCTP_SNDRCV; cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); - sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + sinfo = CMSG_DATA(cmsg); memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid); @@ -892,7 +894,7 @@ static void initiate_association(int nodeid) } /* Send a message */ -static int send_to_sock(struct nodeinfo *ni) +static void send_to_sock(struct nodeinfo *ni) { int ret = 0; struct writequeue_entry *e; @@ -903,13 +905,13 @@ static int send_to_sock(struct nodeinfo *ni) struct sctp_sndrcvinfo *sinfo; struct kvec iov; - /* See if we need to init an association before we start + /* See if we need to init an association before we start sending precious messages */ spin_lock(&ni->lock); if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) { spin_unlock(&ni->lock); initiate_association(ni->nodeid); - return 0; + return; } spin_unlock(&ni->lock); @@ -923,7 +925,7 @@ static int send_to_sock(struct nodeinfo *ni) cmsg->cmsg_level = IPPROTO_SCTP; cmsg->cmsg_type = SCTP_SNDRCV; cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); - sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + sinfo = CMSG_DATA(cmsg); memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid); sinfo->sinfo_assoc_id = ni->assoc_id; @@ -955,7 +957,7 @@ static int send_to_sock(struct nodeinfo *ni) goto send_error; } else { /* Don't starve people filling buffers */ - schedule(); + cond_resched(); } spin_lock(&ni->writequeue_lock); @@ -964,15 +966,16 @@ static int send_to_sock(struct nodeinfo *ni) if (e->len == 0 && e->users == 0) { list_del(&e->list); + kunmap(e->page); free_entry(e); continue; } } spin_unlock(&ni->writequeue_lock); - out: - return ret; +out: + return; - send_error: +send_error: log_print("Error sending to node %d %d", ni->nodeid, ret); spin_lock(&ni->lock); if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) { @@ -982,7 +985,7 @@ static int send_to_sock(struct nodeinfo *ni) } else spin_unlock(&ni->lock); - return ret; + return; } /* Try to send any messages that are pending */ @@ -994,7 +997,7 @@ static void process_output_queue(void) spin_lock_bh(&write_nodes_lock); list_for_each_safe(list, temp, &write_nodes) { struct nodeinfo *ni = - list_entry(list, struct nodeinfo, write_list); + list_entry(list, struct nodeinfo, write_list); clear_bit(NI_WRITE_PENDING, &ni->flags); list_del(&ni->write_list); @@ -1106,7 +1109,7 @@ static int dlm_recvd(void *data) set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&lowcomms_recv_wait, &wait); if (!test_bit(CF_READ_PENDING, &sctp_con.flags)) - schedule(); + cond_resched(); remove_wait_queue(&lowcomms_recv_wait, &wait); set_current_state(TASK_RUNNING); @@ -1118,12 +1121,12 @@ static int dlm_recvd(void *data) /* Don't starve out everyone else */ if (++count >= MAX_RX_MSG_COUNT) { - schedule(); + cond_resched(); count = 0; } } while (!kthread_should_stop() && ret >=0); } - schedule(); + cond_resched(); } return 0; @@ -1138,7 +1141,7 @@ static int dlm_sendd(void *data) while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); if (write_list_empty()) - schedule(); + cond_resched(); set_current_state(TASK_RUNNING); 
if (sctp_con.eagain_flag) { @@ -1166,7 +1169,7 @@ static int daemons_start(void) p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); error = IS_ERR(p); - if (error) { + if (error) { log_print("can't start dlm_recvd %d", error); return error; } @@ -1174,7 +1177,7 @@ static int daemons_start(void) p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); error = IS_ERR(p); - if (error) { + if (error) { log_print("can't start dlm_sendd %d", error); kthread_stop(recv_task); return error; @@ -1197,43 +1200,28 @@ int dlm_lowcomms_start(void) error = daemons_start(); if (error) goto fail_sock; - atomic_set(&accepting, 1); return 0; - fail_sock: +fail_sock: close_connection(); return error; } -/* Set all the activity flags to prevent any socket activity. */ - void dlm_lowcomms_stop(void) { - atomic_set(&accepting, 0); + int i; + sctp_con.flags = 0x7; daemons_stop(); clean_writequeues(); close_connection(); dealloc_nodeinfo(); max_nodeid = 0; -} -int dlm_lowcomms_init(void) -{ - init_waitqueue_head(&lowcomms_recv_wait); - spin_lock_init(&write_nodes_lock); - INIT_LIST_HEAD(&write_nodes); - init_rwsem(&nodeinfo_lock); - return 0; -} - -void dlm_lowcomms_exit(void) -{ - int i; + dlm_local_count = 0; + dlm_local_nodeid = 0; for (i = 0; i < dlm_local_count; i++) kfree(dlm_local_addr[i]); - dlm_local_count = 0; - dlm_local_nodeid = 0; } diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms-tcp.c new file mode 100644 index 0000000..8f2791f --- /dev/null +++ b/fs/dlm/lowcomms-tcp.c @@ -0,0 +1,1189 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* + * lowcomms.c + * + * This is the "low-level" comms layer. + * + * It is responsible for sending/receiving messages + * from other nodes in the cluster. + * + * Cluster nodes are referred to by their nodeids. nodeids are + * simply 32 bit numbers to the locking module - if they need to + * be expanded for the cluster infrastructure then that is it's + * responsibility. It is this layer's + * responsibility to resolve these into IP address or + * whatever it needs for inter-node communication. + * + * The comms level is two kernel threads that deal mainly with + * the receiving of messages from other nodes and passing them + * up to the mid-level comms layer (which understands the + * message format) for execution by the locking core, and + * a send thread which does all the setting up of connections + * to remote nodes and the sending of data. Threads are not allowed + * to send their own data because it may cause them to wait in times + * of high load. Also, this way, the sending thread can collect together + * messages bound for one node and send them in one block. + * + * I don't see any problem with the recv thread executing the locking + * code on behalf of remote processes as the locking code is + * short, efficient and never waits. 
+ * + */ + + +#include <asm/ioctls.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <linux/pagemap.h> + +#include "dlm_internal.h" +#include "lowcomms.h" +#include "midcomms.h" +#include "config.h" + +struct cbuf { + unsigned int base; + unsigned int len; + unsigned int mask; +}; + +#define NODE_INCREMENT 32 +static void cbuf_add(struct cbuf *cb, int n) +{ + cb->len += n; +} + +static int cbuf_data(struct cbuf *cb) +{ + return ((cb->base + cb->len) & cb->mask); +} + +static void cbuf_init(struct cbuf *cb, int size) +{ + cb->base = cb->len = 0; + cb->mask = size-1; +} + +static void cbuf_eat(struct cbuf *cb, int n) +{ + cb->len -= n; + cb->base += n; + cb->base &= cb->mask; +} + +static bool cbuf_empty(struct cbuf *cb) +{ + return cb->len == 0; +} + +/* Maximum number of incoming messages to process before + doing a cond_resched() +*/ +#define MAX_RX_MSG_COUNT 25 + +struct connection { + struct socket *sock; /* NULL if not connected */ + uint32_t nodeid; /* So we know who we are in the list */ + struct rw_semaphore sock_sem; /* Stop connect races */ + struct list_head read_list; /* On this list when ready for reading */ + struct list_head write_list; /* On this list when ready for writing */ + struct list_head state_list; /* On this list when ready to connect */ + unsigned long flags; /* bit 1,2 = We are on the read/write lists */ +#define CF_READ_PENDING 1 +#define CF_WRITE_PENDING 2 +#define CF_CONNECT_PENDING 3 +#define CF_IS_OTHERCON 4 + struct list_head writequeue; /* List of outgoing writequeue_entries */ + struct list_head listenlist; /* List of allocated listening sockets */ + spinlock_t writequeue_lock; + int (*rx_action) (struct connection *); /* What to do when active */ + struct page *rx_page; + struct cbuf cb; + int retries; + atomic_t waiting_requests; +#define MAX_CONNECT_RETRIES 3 + struct connection *othercon; +}; +#define sock2con(x) ((struct connection *)(x)->sk_user_data) + +/* An entry waiting to be sent */ +struct writequeue_entry { + struct list_head list; + struct page *page; + int offset; + int len; + int end; + int users; + struct connection *con; +}; + +static struct sockaddr_storage dlm_local_addr; + +/* Manage daemons */ +static struct task_struct *recv_task; +static struct task_struct *send_task; + +static wait_queue_t lowcomms_send_waitq_head; +static DECLARE_WAIT_QUEUE_HEAD(lowcomms_send_waitq); +static wait_queue_t lowcomms_recv_waitq_head; +static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_waitq); + +/* An array of pointers to connections, indexed by NODEID */ +static struct connection **connections; +static DECLARE_MUTEX(connections_lock); +static kmem_cache_t *con_cache; +static int conn_array_size; + +/* List of sockets that have reads pending */ +static LIST_HEAD(read_sockets); +static DEFINE_SPINLOCK(read_sockets_lock); + +/* List of sockets which have writes pending */ +static LIST_HEAD(write_sockets); +static DEFINE_SPINLOCK(write_sockets_lock); + +/* List of sockets which have connects pending */ +static LIST_HEAD(state_sockets); +static DEFINE_SPINLOCK(state_sockets_lock); + +static struct connection *nodeid2con(int nodeid, gfp_t allocation) +{ + struct connection *con = NULL; + + down(&connections_lock); + if (nodeid >= conn_array_size) { + int new_size = nodeid + NODE_INCREMENT; + struct connection **new_conns; + + new_conns = kzalloc(sizeof(struct connection *) * + new_size, allocation); + if (!new_conns) + goto finish; + + memcpy(new_conns, connections, sizeof(struct connection *) * conn_array_size); + conn_array_size = new_size; + 
kfree(connections); + connections = new_conns; + + } + + con = connections[nodeid]; + if (con == NULL && allocation) { + con = kmem_cache_zalloc(con_cache, allocation); + if (!con) + goto finish; + + con->nodeid = nodeid; + init_rwsem(&con->sock_sem); + INIT_LIST_HEAD(&con->writequeue); + spin_lock_init(&con->writequeue_lock); + + connections[nodeid] = con; + } + +finish: + up(&connections_lock); + return con; +} + +/* Data available on socket or listen socket received a connect */ +static void lowcomms_data_ready(struct sock *sk, int count_unused) +{ + struct connection *con = sock2con(sk); + + atomic_inc(&con->waiting_requests); + if (test_and_set_bit(CF_READ_PENDING, &con->flags)) + return; + + spin_lock_bh(&read_sockets_lock); + list_add_tail(&con->read_list, &read_sockets); + spin_unlock_bh(&read_sockets_lock); + + wake_up_interruptible(&lowcomms_recv_waitq); +} + +static void lowcomms_write_space(struct sock *sk) +{ + struct connection *con = sock2con(sk); + + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + return; + + spin_lock_bh(&write_sockets_lock); + list_add_tail(&con->write_list, &write_sockets); + spin_unlock_bh(&write_sockets_lock); + + wake_up_interruptible(&lowcomms_send_waitq); +} + +static inline void lowcomms_connect_sock(struct connection *con) +{ + if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) + return; + + spin_lock_bh(&state_sockets_lock); + list_add_tail(&con->state_list, &state_sockets); + spin_unlock_bh(&state_sockets_lock); + + wake_up_interruptible(&lowcomms_send_waitq); +} + +static void lowcomms_state_change(struct sock *sk) +{ + if (sk->sk_state == TCP_ESTABLISHED) + lowcomms_write_space(sk); +} + +/* Make a socket active */ +static int add_sock(struct socket *sock, struct connection *con) +{ + con->sock = sock; + + /* Install a data_ready callback */ + con->sock->sk->sk_data_ready = lowcomms_data_ready; + con->sock->sk->sk_write_space = lowcomms_write_space; + con->sock->sk->sk_state_change = lowcomms_state_change; + + return 0; +} + +/* Add the port number to an IP6 or 4 sockaddr and return the address + length */ +static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, + int *addr_len) +{ + saddr->ss_family = dlm_local_addr.ss_family; + if (saddr->ss_family == AF_INET) { + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; + in4_addr->sin_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in); + } else { + struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; + in6_addr->sin6_port = cpu_to_be16(port); + *addr_len = sizeof(struct sockaddr_in6); + } +} + +/* Close a remote connection and tidy up */ +static void close_connection(struct connection *con, bool and_other) +{ + down_write(&con->sock_sem); + + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } + if (con->othercon && and_other) { + /* Will only re-enter once. */ + close_connection(con->othercon, false); + } + if (con->rx_page) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + con->retries = 0; + up_write(&con->sock_sem); +} + +/* Data received from remote end */ +static int receive_from_sock(struct connection *con) +{ + int ret = 0; + struct msghdr msg; + struct iovec iov[2]; + mm_segment_t fs; + unsigned len; + int r; + int call_again_soon = 0; + + down_read(&con->sock_sem); + + if (con->sock == NULL) + goto out; + if (con->rx_page == NULL) { + /* + * This doesn't need to be atomic, but I think it should + * improve performance if it is. 
+ */ + con->rx_page = alloc_page(GFP_ATOMIC); + if (con->rx_page == NULL) + goto out_resched; + cbuf_init(&con->cb, PAGE_CACHE_SIZE); + } + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = iov; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_flags = 0; + + /* + * iov[0] is the bit of the circular buffer between the current end + * point (cb.base + cb.len) and the end of the buffer. + */ + iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); + iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); + iov[1].iov_len = 0; + + /* + * iov[1] is the bit of the circular buffer between the start of the + * buffer and the start of the currently used section (cb.base) + */ + if (cbuf_data(&con->cb) >= con->cb.base) { + iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb); + iov[1].iov_len = con->cb.base; + iov[1].iov_base = page_address(con->rx_page); + msg.msg_iovlen = 2; + } + len = iov[0].iov_len + iov[1].iov_len; + + fs = get_fs(); + set_fs(get_ds()); + r = ret = sock_recvmsg(con->sock, &msg, len, + MSG_DONTWAIT | MSG_NOSIGNAL); + set_fs(fs); + + if (ret <= 0) + goto out_close; + if (ret == len) + call_again_soon = 1; + cbuf_add(&con->cb, ret); + ret = dlm_process_incoming_buffer(con->nodeid, + page_address(con->rx_page), + con->cb.base, con->cb.len, + PAGE_CACHE_SIZE); + if (ret == -EBADMSG) { + printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, " + "iov_len=%u, iov_base[0]=%p, read=%d\n", + page_address(con->rx_page), con->cb.base, con->cb.len, + len, iov[0].iov_base, r); + } + if (ret < 0) + goto out_close; + cbuf_eat(&con->cb, ret); + + if (cbuf_empty(&con->cb) && !call_again_soon) { + __free_page(con->rx_page); + con->rx_page = NULL; + } + +out: + if (call_again_soon) + goto out_resched; + up_read(&con->sock_sem); + return 0; + +out_resched: + lowcomms_data_ready(con->sock->sk, 0); + up_read(&con->sock_sem); + cond_resched(); + return 0; + +out_close: + up_read(&con->sock_sem); + if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) { + close_connection(con, false); + /* Reconnect when there is something to send */ + } + + return ret; +} + +/* Listening socket is busy, accept a connection */ +static int accept_from_sock(struct connection *con) +{ + int result; + struct sockaddr_storage peeraddr; + struct socket *newsock; + int len; + int nodeid; + struct connection *newcon; + + memset(&peeraddr, 0, sizeof(peeraddr)); + result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &newsock); + if (result < 0) + return -ENOMEM; + + down_read(&con->sock_sem); + + result = -ENOTCONN; + if (con->sock == NULL) + goto accept_err; + + newsock->type = con->sock->type; + newsock->ops = con->sock->ops; + + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK); + if (result < 0) + goto accept_err; + + /* Get the connected socket's peer */ + memset(&peeraddr, 0, sizeof(peeraddr)); + if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, + &len, 2)) { + result = -ECONNABORTED; + goto accept_err; + } + + /* Get the new node's NODEID */ + make_sockaddr(&peeraddr, 0, &len); + if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { + printk("dlm: connect from non cluster node\n"); + sock_release(newsock); + up_read(&con->sock_sem); + return -1; + } + + log_print("got connection from %d", nodeid); + + /* Check to see if we already have a connection to this node. This + * could happen if the two nodes initiate a connection at roughly + * the same time and the connections cross on the wire. 
+ * TEMPORARY FIX: + * In this case we store the incoming one in "othercon" + */ + newcon = nodeid2con(nodeid, GFP_KERNEL); + if (!newcon) { + result = -ENOMEM; + goto accept_err; + } + down_write(&newcon->sock_sem); + if (newcon->sock) { + struct connection *othercon = newcon->othercon; + + if (!othercon) { + othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL); + if (!othercon) { + printk("dlm: failed to allocate incoming socket\n"); + up_write(&newcon->sock_sem); + result = -ENOMEM; + goto accept_err; + } + othercon->nodeid = nodeid; + othercon->rx_action = receive_from_sock; + init_rwsem(&othercon->sock_sem); + set_bit(CF_IS_OTHERCON, &othercon->flags); + newcon->othercon = othercon; + } + othercon->sock = newsock; + newsock->sk->sk_user_data = othercon; + add_sock(newsock, othercon); + } + else { + newsock->sk->sk_user_data = newcon; + newcon->rx_action = receive_from_sock; + add_sock(newsock, newcon); + + } + + up_write(&newcon->sock_sem); + + /* + * Add it to the active queue in case we got data + * beween processing the accept adding the socket + * to the read_sockets list + */ + lowcomms_data_ready(newsock->sk, 0); + up_read(&con->sock_sem); + + return 0; + +accept_err: + up_read(&con->sock_sem); + sock_release(newsock); + + if (result != -EAGAIN) + printk("dlm: error accepting connection from node: %d\n", result); + return result; +} + +/* Connect a new socket to its peer */ +static void connect_to_sock(struct connection *con) +{ + int result = -EHOSTUNREACH; + struct sockaddr_storage saddr; + int addr_len; + struct socket *sock; + + if (con->nodeid == 0) { + log_print("attempt to connect sock 0 foiled"); + return; + } + + down_write(&con->sock_sem); + if (con->retries++ > MAX_CONNECT_RETRIES) + goto out; + + /* Some odd races can cause double-connects, ignore them */ + if (con->sock) { + result = 0; + goto out; + } + + /* Create a socket to communicate with */ + result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (result < 0) + goto out_err; + + memset(&saddr, 0, sizeof(saddr)); + if (dlm_nodeid_to_addr(con->nodeid, &saddr)) + goto out_err; + + sock->sk->sk_user_data = con; + con->rx_action = receive_from_sock; + + make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len); + + add_sock(sock, con); + + log_print("connecting to %d", con->nodeid); + result = + sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, + O_NONBLOCK); + if (result == -EINPROGRESS) + result = 0; + if (result == 0) + goto out; + +out_err: + if (con->sock) { + sock_release(con->sock); + con->sock = NULL; + } + /* + * Some errors are fatal and this list might need adjusting. For other + * errors we try again until the max number of retries is reached. 
+ */ + if (result != -EHOSTUNREACH && result != -ENETUNREACH && + result != -ENETDOWN && result != EINVAL + && result != -EPROTONOSUPPORT) { + lowcomms_connect_sock(con); + result = 0; + } +out: + up_write(&con->sock_sem); + return; +} + +static struct socket *create_listen_sock(struct connection *con, + struct sockaddr_storage *saddr) +{ + struct socket *sock = NULL; + mm_segment_t fs; + int result = 0; + int one = 1; + int addr_len; + + if (dlm_local_addr.ss_family == AF_INET) + addr_len = sizeof(struct sockaddr_in); + else + addr_len = sizeof(struct sockaddr_in6); + + /* Create a socket to communicate with */ + result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); + if (result < 0) { + printk("dlm: Can't create listening comms socket\n"); + goto create_out; + } + + fs = get_fs(); + set_fs(get_ds()); + result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&one, sizeof(one)); + set_fs(fs); + if (result < 0) { + printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n", + result); + } + sock->sk->sk_user_data = con; + con->rx_action = accept_from_sock; + con->sock = sock; + + /* Bind to our port */ + make_sockaddr(saddr, dlm_config.tcp_port, &addr_len); + result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); + if (result < 0) { + printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port); + sock_release(sock); + sock = NULL; + con->sock = NULL; + goto create_out; + } + + fs = get_fs(); + set_fs(get_ds()); + + result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&one, sizeof(one)); + set_fs(fs); + if (result < 0) { + printk("dlm: Set keepalive failed: %d\n", result); + } + + result = sock->ops->listen(sock, 5); + if (result < 0) { + printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port); + sock_release(sock); + sock = NULL; + goto create_out; + } + +create_out: + return sock; +} + + +/* Listen on all interfaces */ +static int listen_for_all(void) +{ + struct socket *sock = NULL; + struct connection *con = nodeid2con(0, GFP_KERNEL); + int result = -EINVAL; + + /* We don't support multi-homed hosts */ + set_bit(CF_IS_OTHERCON, &con->flags); + + sock = create_listen_sock(con, &dlm_local_addr); + if (sock) { + add_sock(sock, con); + result = 0; + } + else { + result = -EADDRINUSE; + } + + return result; +} + + + +static struct writequeue_entry *new_writequeue_entry(struct connection *con, + gfp_t allocation) +{ + struct writequeue_entry *entry; + + entry = kmalloc(sizeof(struct writequeue_entry), allocation); + if (!entry) + return NULL; + + entry->page = alloc_page(allocation); + if (!entry->page) { + kfree(entry); + return NULL; + } + + entry->offset = 0; + entry->len = 0; + entry->end = 0; + entry->users = 0; + entry->con = con; + + return entry; +} + +void *dlm_lowcomms_get_buffer(int nodeid, int len, + gfp_t allocation, char **ppc) +{ + struct connection *con; + struct writequeue_entry *e; + int offset = 0; + int users = 0; + + con = nodeid2con(nodeid, allocation); + if (!con) + return NULL; + + e = list_entry(con->writequeue.prev, struct writequeue_entry, list); + if ((&e->list == &con->writequeue) || + (PAGE_CACHE_SIZE - e->end < len)) { + e = NULL; + } else { + offset = e->end; + e->end += len; + users = e->users++; + } + spin_unlock(&con->writequeue_lock); + + if (e) { + got_one: + if (users == 0) + kmap(e->page); + *ppc = page_address(e->page) + offset; + return e; + } + + e = new_writequeue_entry(con, allocation); + if (e) { + spin_lock(&con->writequeue_lock); + offset = e->end; + e->end += len; + 
users = e->users++; + list_add_tail(&e->list, &con->writequeue); + spin_unlock(&con->writequeue_lock); + goto got_one; + } + return NULL; +} + +void dlm_lowcomms_commit_buffer(void *mh) +{ + struct writequeue_entry *e = (struct writequeue_entry *)mh; + struct connection *con = e->con; + int users; + + users = --e->users; + if (users) + goto out; + e->len = e->end - e->offset; + kunmap(e->page); + spin_unlock(&con->writequeue_lock); + + if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) { + spin_lock_bh(&write_sockets_lock); + list_add_tail(&con->write_list, &write_sockets); + spin_unlock_bh(&write_sockets_lock); + + wake_up_interruptible(&lowcomms_send_waitq); + } + return; + +out: + spin_unlock(&con->writequeue_lock); + return; +} + +static void free_entry(struct writequeue_entry *e) +{ + __free_page(e->page); + kfree(e); +} + +/* Send a message */ +static void send_to_sock(struct connection *con) +{ + int ret = 0; + ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int); + const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + struct writequeue_entry *e; + int len, offset; + + down_read(&con->sock_sem); + if (con->sock == NULL) + goto out_connect; + + sendpage = con->sock->ops->sendpage; + + spin_lock(&con->writequeue_lock); + for (;;) { + e = list_entry(con->writequeue.next, struct writequeue_entry, + list); + if ((struct list_head *) e == &con->writequeue) + break; + + len = e->len; + offset = e->offset; + BUG_ON(len == 0 && e->users == 0); + spin_unlock(&con->writequeue_lock); + + ret = 0; + if (len) { + ret = sendpage(con->sock, e->page, offset, len, + msg_flags); + if (ret == -EAGAIN || ret == 0) + goto out; + if (ret <= 0) + goto send_error; + } + else { + /* Don't starve people filling buffers */ + cond_resched(); + } + + spin_lock(&con->writequeue_lock); + e->offset += ret; + e->len -= ret; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + kunmap(e->page); + free_entry(e); + continue; + } + } + spin_unlock(&con->writequeue_lock); +out: + up_read(&con->sock_sem); + return; + +send_error: + up_read(&con->sock_sem); + close_connection(con, false); + lowcomms_connect_sock(con); + return; + +out_connect: + up_read(&con->sock_sem); + lowcomms_connect_sock(con); + return; +} + +static void clean_one_writequeue(struct connection *con) +{ + struct list_head *list; + struct list_head *temp; + + spin_lock(&con->writequeue_lock); + list_for_each_safe(list, temp, &con->writequeue) { + struct writequeue_entry *e = + list_entry(list, struct writequeue_entry, list); + list_del(&e->list); + free_entry(e); + } + spin_unlock(&con->writequeue_lock); +} + +/* Called from recovery when it knows that a node has + left the cluster */ +int dlm_lowcomms_close(int nodeid) +{ + struct connection *con; + + if (!connections) + goto out; + + log_print("closing connection to node %d", nodeid); + con = nodeid2con(nodeid, 0); + if (con) { + clean_one_writequeue(con); + close_connection(con, true); + atomic_set(&con->waiting_requests, 0); + } + return 0; + +out: + return -1; +} + +/* API send message call, may queue the request */ +/* N.B. 
This is the old interface - use the new one for new calls */ +int lowcomms_send_message(int nodeid, char *buf, int len, gfp_t allocation) +{ + struct writequeue_entry *e; + char *b; + + e = dlm_lowcomms_get_buffer(nodeid, len, allocation, &b); + if (e) { + memcpy(b, buf, len); + dlm_lowcomms_commit_buffer(e); + return 0; + } + return -ENOBUFS; +} + +/* Look for activity on active sockets */ +static void process_sockets(void) +{ + struct list_head *list; + struct list_head *temp; + int count = 0; + + spin_lock_bh(&read_sockets_lock); + list_for_each_safe(list, temp, &read_sockets) { + + struct connection *con = + list_entry(list, struct connection, read_list); + list_del(&con->read_list); + clear_bit(CF_READ_PENDING, &con->flags); + + spin_unlock_bh(&read_sockets_lock); + + /* This can reach zero if we are processing requests + * as they come in. + */ + if (atomic_read(&con->waiting_requests) == 0) { + spin_lock_bh(&read_sockets_lock); + continue; + } + + do { + con->rx_action(con); + + /* Don't starve out everyone else */ + if (++count >= MAX_RX_MSG_COUNT) { + cond_resched(); + count = 0; + } + + } while (!atomic_dec_and_test(&con->waiting_requests) && + !kthread_should_stop()); + + spin_lock_bh(&read_sockets_lock); + } + spin_unlock_bh(&read_sockets_lock); +} + +/* Try to send any messages that are pending + */ +static void process_output_queue(void) +{ + struct list_head *list; + struct list_head *temp; + + spin_lock_bh(&write_sockets_lock); + list_for_each_safe(list, temp, &write_sockets) { + struct connection *con = + list_entry(list, struct connection, write_list); + clear_bit(CF_WRITE_PENDING, &con->flags); + list_del(&con->write_list); + + spin_unlock_bh(&write_sockets_lock); + send_to_sock(con); + spin_lock_bh(&write_sockets_lock); + } + spin_unlock_bh(&write_sockets_lock); +} + +static void process_state_queue(void) +{ + struct list_head *list; + struct list_head *temp; + + spin_lock_bh(&state_sockets_lock); + list_for_each_safe(list, temp, &state_sockets) { + struct connection *con = + list_entry(list, struct connection, state_list); + list_del(&con->state_list); + clear_bit(CF_CONNECT_PENDING, &con->flags); + spin_unlock_bh(&state_sockets_lock); + + connect_to_sock(con); + spin_lock_bh(&state_sockets_lock); + } + spin_unlock_bh(&state_sockets_lock); +} + + +/* Discard all entries on the write queues */ +static void clean_writequeues(void) +{ + int nodeid; + + for (nodeid = 1; nodeid < conn_array_size; nodeid++) { + struct connection *con = nodeid2con(nodeid, 0); + + if (con) + clean_one_writequeue(con); + } +} + +static int read_list_empty(void) +{ + int status; + + spin_lock_bh(&read_sockets_lock); + status = list_empty(&read_sockets); + spin_unlock_bh(&read_sockets_lock); + + return status; +} + +/* DLM Transport comms receive daemon */ +static int dlm_recvd(void *data) +{ + init_waitqueue_entry(&lowcomms_recv_waitq_head, current); + add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (read_list_empty()) + cond_resched(); + set_current_state(TASK_RUNNING); + + process_sockets(); + } + + return 0; +} + +static int write_and_state_lists_empty(void) +{ + int status; + + spin_lock_bh(&write_sockets_lock); + status = list_empty(&write_sockets); + spin_unlock_bh(&write_sockets_lock); + + spin_lock_bh(&state_sockets_lock); + if (list_empty(&state_sockets) == 0) + status = 0; + spin_unlock_bh(&state_sockets_lock); + + return status; +} + +/* DLM Transport send daemon */ +static int 
dlm_sendd(void *data) +{ + init_waitqueue_entry(&lowcomms_send_waitq_head, current); + add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (write_and_state_lists_empty()) + cond_resched(); + set_current_state(TASK_RUNNING); + + process_state_queue(); + process_output_queue(); + } + + return 0; +} + +static void daemons_stop(void) +{ + kthread_stop(recv_task); + kthread_stop(send_task); +} + +static int daemons_start(void) +{ + struct task_struct *p; + int error; + + p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); + error = IS_ERR(p); + if (error) { + log_print("can't start dlm_recvd %d", error); + return error; + } + recv_task = p; + + p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); + error = IS_ERR(p); + if (error) { + log_print("can't start dlm_sendd %d", error); + kthread_stop(recv_task); + return error; + } + send_task = p; + + return 0; +} + +/* + * Return the largest buffer size we can cope with. + */ +int lowcomms_max_buffer_size(void) +{ + return PAGE_CACHE_SIZE; +} + +void dlm_lowcomms_stop(void) +{ + int i; + + /* Set all the flags to prevent any + socket activity. + */ + for (i = 0; i < conn_array_size; i++) { + if (connections[i]) + connections[i]->flags |= 0xFF; + } + + daemons_stop(); + clean_writequeues(); + + for (i = 0; i < conn_array_size; i++) { + if (connections[i]) { + close_connection(connections[i], true); + if (connections[i]->othercon) + kmem_cache_free(con_cache, connections[i]->othercon); + kmem_cache_free(con_cache, connections[i]); + } + } + + kfree(connections); + connections = NULL; + + kmem_cache_destroy(con_cache); +} + +/* This is quite likely to sleep... */ +int dlm_lowcomms_start(void) +{ + int error = 0; + + error = -ENOMEM; + connections = kzalloc(sizeof(struct connection *) * + NODE_INCREMENT, GFP_KERNEL); + if (!connections) + goto out; + + conn_array_size = NODE_INCREMENT; + + if (dlm_our_addr(&dlm_local_addr, 0)) { + log_print("no local IP address has been set"); + goto fail_free_conn; + } + if (!dlm_our_addr(&dlm_local_addr, 1)) { + log_print("This dlm comms module does not support multi-homed clustering"); + goto fail_free_conn; + } + + con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), + __alignof__(struct connection), 0, + NULL, NULL); + if (!con_cache) + goto fail_free_conn; + + + /* Start listening */ + error = listen_for_all(); + if (error) + goto fail_unlisten; + + error = daemons_start(); + if (error) + goto fail_unlisten; + + return 0; + +fail_unlisten: + close_connection(connections[0], false); + kmem_cache_free(con_cache, connections[0]); + kmem_cache_destroy(con_cache); + +fail_free_conn: + kfree(connections); + +out: + return error; +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. 
+ * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 2d045e0..a9a9618 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -14,8 +14,6 @@ #ifndef __LOWCOMMS_DOT_H__ #define __LOWCOMMS_DOT_H__ -int dlm_lowcomms_init(void); -void dlm_lowcomms_exit(void); int dlm_lowcomms_start(void); void dlm_lowcomms_stop(void); int dlm_lowcomms_close(int nodeid); diff --git a/fs/dlm/main.c b/fs/dlm/main.c index a8da8dc..162fbae 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -16,7 +16,6 @@ #include "lock.h" #include "user.h" #include "memory.h" -#include "lowcomms.h" #include "config.h" #ifdef CONFIG_DLM_DEBUG @@ -47,20 +46,14 @@ static int __init init_dlm(void) if (error) goto out_config; - error = dlm_lowcomms_init(); - if (error) - goto out_debug; - error = dlm_user_init(); if (error) - goto out_lowcomms; + goto out_debug; printk("DLM (built %s %s) installed\n", __DATE__, __TIME__); return 0; - out_lowcomms: - dlm_lowcomms_exit(); out_debug: dlm_unregister_debugfs(); out_config: @@ -76,7 +69,6 @@ static int __init init_dlm(void) static void __exit exit_dlm(void) { dlm_user_exit(); - dlm_lowcomms_exit(); dlm_config_exit(); dlm_memory_exit(); dlm_lockspace_exit(); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index a3f7de7..85e2897 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -186,6 +186,14 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) struct dlm_member *memb, *safe; int i, error, found, pos = 0, neg = 0, low = -1; + /* previously removed members that we've not finished removing need to + count as a negative change so the "neg" recovery steps will happen */ + + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { + log_debug(ls, "prev removed member %d", memb->nodeid); + neg++; + } + /* move departed members from ls_nodes to ls_nodes_gone */ list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 518239a..4cc31be 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -90,13 +90,28 @@ static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid) return 0; } +static void allow_sync_reply(struct dlm_ls *ls, uint64_t *new_seq) +{ + spin_lock(&ls->ls_rcom_spin); + *new_seq = ++ls->ls_rcom_seq; + set_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + spin_unlock(&ls->ls_rcom_spin); +} + +static void disallow_sync_reply(struct dlm_ls *ls) +{ + spin_lock(&ls->ls_rcom_spin); + clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); + clear_bit(LSFL_RCOM_READY, &ls->ls_flags); + spin_unlock(&ls->ls_rcom_spin); +} + int dlm_rcom_status(struct dlm_ls *ls, int nodeid) { struct dlm_rcom *rc; struct dlm_mhandle *mh; int error = 0; - memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); ls->ls_recover_nodeid = nodeid; if (nodeid == dlm_our_nodeid()) { @@ -108,12 +123,14 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh); if (error) goto out; - rc->rc_id = ++ls->ls_rcom_seq; + + allow_sync_reply(ls, &rc->rc_id); + memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); send_rcom(ls, mh, rc); error = dlm_wait_function(ls, &rcom_response); - clear_bit(LSFL_RCOM_READY, &ls->ls_flags); + disallow_sync_reply(ls); if (error) goto out; @@ -150,14 +167,21 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) { - if (rc_in->rc_id != 
ls->ls_rcom_seq) { - log_debug(ls, "reject old reply %d got %llx wanted %llx", - rc_in->rc_type, rc_in->rc_id, ls->ls_rcom_seq); - return; + spin_lock(&ls->ls_rcom_spin); + if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) || + rc_in->rc_id != ls->ls_rcom_seq) { + log_debug(ls, "reject reply %d from %d seq %llx expect %llx", + rc_in->rc_type, rc_in->rc_header.h_nodeid, + (unsigned long long)rc_in->rc_id, + (unsigned long long)ls->ls_rcom_seq); + goto out; } memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length); set_bit(LSFL_RCOM_READY, &ls->ls_flags); + clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); wake_up(&ls->ls_wait_general); + out: + spin_unlock(&ls->ls_rcom_spin); } static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) @@ -171,7 +195,6 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) struct dlm_mhandle *mh; int error = 0, len = sizeof(struct dlm_rcom); - memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); ls->ls_recover_nodeid = nodeid; if (nodeid == dlm_our_nodeid()) { @@ -185,12 +208,14 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) if (error) goto out; memcpy(rc->rc_buf, last_name, last_len); - rc->rc_id = ++ls->ls_rcom_seq; + + allow_sync_reply(ls, &rc->rc_id); + memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); send_rcom(ls, mh, rc); error = dlm_wait_function(ls, &rcom_response); - clear_bit(LSFL_RCOM_READY, &ls->ls_flags); + disallow_sync_reply(ls); out: return error; } @@ -370,9 +395,10 @@ static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; + struct rcom_config *rf; struct dlm_mhandle *mh; char *mb; - int mb_len = sizeof(struct dlm_rcom); + int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb); if (!mh) @@ -391,6 +417,9 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rc->rc_id = rc_in->rc_id; rc->rc_result = -ESRCH; + rf = (struct rcom_config *) rc->rc_buf; + rf->rf_lvblen = -1; + dlm_rcom_out(rc); dlm_lowcomms_commit_buffer(mh); @@ -412,9 +441,10 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid) ls = dlm_find_lockspace_global(hd->h_lockspace); if (!ls) { - log_print("lockspace %x from %d not found", - hd->h_lockspace, nodeid); - send_ls_not_ready(nodeid, rc); + log_print("lockspace %x from %d type %x not found", + hd->h_lockspace, nodeid, rc->rc_type); + if (rc->rc_type == DLM_RCOM_STATUS) + send_ls_not_ready(nodeid, rc); return; } diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index a5e6d18..cf9f683 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -252,6 +252,7 @@ static void recover_list_clear(struct dlm_ls *ls) spin_lock(&ls->ls_recover_list_lock); list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) { list_del_init(&r->res_recover_list); + r->res_recover_locks_count = 0; dlm_put_rsb(r); ls->ls_recover_list_count--; } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 362e3ef..650536aa 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -45,7 +45,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) unsigned long start; int error, neg = 0; - log_debug(ls, "recover %llx", rv->seq); + log_debug(ls, "recover %llx", (unsigned long long)rv->seq); mutex_lock(&ls->ls_recoverd_active); @@ -94,14 +94,6 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) } /* - * Purge directory-related requests 
that are saved in requestqueue. - * All dir requests from before recovery are invalid now due to the dir - * rebuild and will be resent by the requesting nodes. - */ - - dlm_purge_requestqueue(ls); - - /* * Wait for all nodes to complete directory rebuild. */ @@ -164,10 +156,31 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) */ dlm_recover_rsbs(ls); + } else { + /* + * Other lockspace members may be going through the "neg" steps + * while also adding us to the lockspace, in which case they'll + * be doing the recover_locks (RS_LOCKS) barrier. + */ + dlm_set_recover_status(ls, DLM_RS_LOCKS); + + error = dlm_recover_locks_wait(ls); + if (error) { + log_error(ls, "recover_locks_wait failed %d", error); + goto fail; + } } dlm_release_root_list(ls); + /* + * Purge directory-related requests that are saved in requestqueue. + * All dir requests from before recovery are invalid now due to the dir + * rebuild and will be resent by the requesting nodes. + */ + + dlm_purge_requestqueue(ls); + dlm_set_recover_status(ls, DLM_RS_DONE); error = dlm_recover_done_wait(ls); if (error) { @@ -199,7 +212,8 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_astd_wake(); - log_debug(ls, "recover %llx done: %u ms", rv->seq, + log_debug(ls, "recover %llx done: %u ms", + (unsigned long long)rv->seq, jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); @@ -207,11 +221,16 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) fail: dlm_release_root_list(ls); - log_debug(ls, "recover %llx error %d", rv->seq, error); + log_debug(ls, "recover %llx error %d", + (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); return error; } +/* The dlm_ls_start() that created the rv we take here may already have been + stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP + flag set. */ + static void do_ls_recovery(struct dlm_ls *ls) { struct dlm_recover *rv = NULL; @@ -219,7 +238,8 @@ static void do_ls_recovery(struct dlm_ls *ls) spin_lock(&ls->ls_recover_lock); rv = ls->ls_recover_args; ls->ls_recover_args = NULL; - clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); + if (rv && ls->ls_recover_seq == rv->seq) + clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); spin_unlock(&ls->ls_recover_lock); if (rv) { diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 7b2b089..65008d7 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -30,26 +30,36 @@ struct rq_entry { * lockspace is enabled on some while still suspended on others. */ -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) +int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) { struct rq_entry *e; int length = hd->h_length; - - if (dlm_is_removed(ls, nodeid)) - return; + int rv = 0; e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL); if (!e) { log_print("dlm_add_requestqueue: out of memory\n"); - return; + return 0; } e->nodeid = nodeid; memcpy(e->request, hd, length); + /* We need to check dlm_locking_stopped() after taking the mutex to + avoid a race where dlm_recoverd enables locking and runs + process_requestqueue between our earlier dlm_locking_stopped check + and this addition to the requestqueue. 
*/ + mutex_lock(&ls->ls_requestqueue_mutex); - list_add_tail(&e->list, &ls->ls_requestqueue); + if (dlm_locking_stopped(ls)) + list_add_tail(&e->list, &ls->ls_requestqueue); + else { + log_debug(ls, "dlm_add_requestqueue skip from %d", nodeid); + kfree(e); + rv = -EAGAIN; + } mutex_unlock(&ls->ls_requestqueue_mutex); + return rv; } int dlm_process_requestqueue(struct dlm_ls *ls) @@ -120,6 +130,10 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) { uint32_t type = ms->m_type; + /* the ls is being cleaned up and freed by release_lockspace */ + if (!ls->ls_count) + return 1; + if (dlm_is_removed(ls, nodeid)) return 1; diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h index 349f0d2..6a53ea0 100644 --- a/fs/dlm/requestqueue.h +++ b/fs/dlm/requestqueue.h @@ -13,7 +13,7 @@ #ifndef __REQUESTQUEUE_DOT_H__ #define __REQUESTQUEUE_DOT_H__ -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd); +int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd); int dlm_process_requestqueue(struct dlm_ls *ls); void dlm_wait_requestqueue(struct dlm_ls *ls); void dlm_purge_requestqueue(struct dlm_ls *ls); |
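
The rcom.c hunks above also tighten how synchronous recovery replies are matched: the sequence number is bumped and a wait flag set under the new ls_rcom_spin lock before a status/names request is sent, and a reply is accepted only while that flag is set and its id equals the outstanding sequence number. A small userspace model of that pattern is sketched below; the lockspace struct, the pthread spinlock standing in for ls_rcom_spin, the plain int flags standing in for the LSFL_RCOM_WAIT/LSFL_RCOM_READY bits, and the values used in main() are stand-ins rather than the kernel API.

    /* Userspace model of the sequence-checked sync-reply handling. */
    #include <stdio.h>
    #include <stdint.h>
    #include <pthread.h>

    struct lockspace {
            pthread_spinlock_t rcom_spin;   /* stands in for ls_rcom_spin */
            uint64_t rcom_seq;              /* stands in for ls_rcom_seq */
            int rcom_wait;                  /* models LSFL_RCOM_WAIT */
            int rcom_ready;                 /* models LSFL_RCOM_READY */
    };

    static void allow_sync_reply(struct lockspace *ls, uint64_t *new_seq)
    {
            pthread_spin_lock(&ls->rcom_spin);
            *new_seq = ++ls->rcom_seq;      /* stamp the outgoing request */
            ls->rcom_wait = 1;
            pthread_spin_unlock(&ls->rcom_spin);
    }

    static void disallow_sync_reply(struct lockspace *ls)
    {
            pthread_spin_lock(&ls->rcom_spin);
            ls->rcom_wait = 0;
            ls->rcom_ready = 0;
            pthread_spin_unlock(&ls->rcom_spin);
    }

    static void receive_sync_reply(struct lockspace *ls, uint64_t reply_id)
    {
            pthread_spin_lock(&ls->rcom_spin);
            if (!ls->rcom_wait || reply_id != ls->rcom_seq) {
                    printf("reject reply seq %llx expect %llx\n",
                           (unsigned long long)reply_id,
                           (unsigned long long)ls->rcom_seq);
                    goto out;
            }
            ls->rcom_ready = 1;     /* reply matched the outstanding request */
            ls->rcom_wait = 0;
    out:
            pthread_spin_unlock(&ls->rcom_spin);
    }

    int main(void)
    {
            struct lockspace ls = { .rcom_seq = 0x1234 };
            uint64_t id;

            pthread_spin_init(&ls.rcom_spin, PTHREAD_PROCESS_PRIVATE);
            allow_sync_reply(&ls, &id);
            receive_sync_reply(&ls, id - 1);        /* stale reply: rejected */
            receive_sync_reply(&ls, id);            /* matching reply: accepted */
            printf("ready=%d\n", ls.rcom_ready);
            disallow_sync_reply(&ls);
            return 0;
    }
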