From 38815b780285a4957852c5c9dbe94991c0b26c56 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Mar 2011 16:55:21 -0800 Subject: libceph: fix handling of short returns from get_user_pages get_user_pages() can return fewer pages than we ask for. We were returning a bogus pointer/error code in that case. Instead, loop until we get all the pages we want or get an error we can return to the caller. Signed-off-by: Sage Weil --- net/ceph/pagevec.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 1a040e6..cd9c21d 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -16,22 +16,30 @@ struct page **ceph_get_direct_page_vector(const char __user *data, int num_pages, bool write_page) { struct page **pages; - int rc; + int got = 0; + int rc = 0; pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); if (!pages) return ERR_PTR(-ENOMEM); down_read(¤t->mm->mmap_sem); - rc = get_user_pages(current, current->mm, (unsigned long)data, - num_pages, write_page, 0, pages, NULL); + while (got < num_pages) { + rc = get_user_pages(current, current->mm, + (unsigned long)data + ((unsigned long)got * PAGE_SIZE), + num_pages - got, write_page, 0, pages + got, NULL); + if (rc < 0) + break; + BUG_ON(rc == 0); + got += rc; + } up_read(¤t->mm->mmap_sem); - if (rc < num_pages) + if (rc < 0) goto fail; return pages; fail: - ceph_put_page_vector(pages, rc > 0 ? rc : 0, false); + ceph_put_page_vector(pages, got, false); return ERR_PTR(rc); } EXPORT_SYMBOL(ceph_get_direct_page_vector); -- cgit v1.1 From 692d20f576fb26f62c83f80dbf3ea899998391b7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Mar 2011 12:14:53 -0800 Subject: libceph: retry after authorization failure If we mark the connection CLOSED we will give up trying to reconnect to this server instance. That is appropriate for things like a protocol version mismatch that won't change until the server is restarted, at which point we'll get a new addr and reconnect. An authorization failure like this is probably due to the server not properly rotating it's secret keys, however, and should be treated as transient so that the normal backoff and retry behavior kicks in. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 35b36b8..6bd5025 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1248,8 +1248,6 @@ static int process_connect(struct ceph_connection *con) con->auth_retry); if (con->auth_retry == 2) { con->error_msg = "connect authorization failure"; - reset_connection(con); - set_bit(CLOSED, &con->state); return -1; } con->auth_retry = 1; -- cgit v1.1 From 60bf8bf8815e6adea4c1d0423578c3b8000e2ec8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Mar 2011 12:24:28 -0800 Subject: libceph: fix msgr backoff With commit f363e45f we replaced a bunch of hacky workqueue mutual exclusion logic with the WQ_NON_REENTRANT flag. One pieces of fallout is that the exponential backoff breaks in certain cases: * con_work attempts to connect. * we get an immediate failure, and the socket state change handler queues immediate work. * con_work calls con_fault, we decide to back off, but can't queue delayed work. In this case, we add a BACKOFF bit to make con_work reschedule delayed work next time it runs (which should be immediately). Signed-off-by: Sage Weil --- net/ceph/messenger.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 6bd5025..46fbc42 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1949,6 +1949,19 @@ static void con_work(struct work_struct *work) work.work); mutex_lock(&con->mutex); + if (test_and_clear_bit(BACKOFF, &con->state)) { + dout("con_work %p backing off\n", con); + if (queue_delayed_work(ceph_msgr_wq, &con->work, + round_jiffies_relative(con->delay))) { + dout("con_work %p backoff %lu\n", con, con->delay); + mutex_unlock(&con->mutex); + return; + } else { + con->ops->put(con); + dout("con_work %p FAILED to back off %lu\n", con, + con->delay); + } + } if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ dout("con_work CLOSED\n"); @@ -2017,11 +2030,24 @@ static void ceph_fault(struct ceph_connection *con) con->delay = BASE_DELAY_INTERVAL; else if (con->delay < MAX_DELAY_INTERVAL) con->delay *= 2; - dout("fault queueing %p delay %lu\n", con, con->delay); con->ops->get(con); if (queue_delayed_work(ceph_msgr_wq, &con->work, - round_jiffies_relative(con->delay)) == 0) + round_jiffies_relative(con->delay))) { + dout("fault queued %p delay %lu\n", con, con->delay); + } else { con->ops->put(con); + dout("fault failed to queue %p delay %lu, backoff\n", + con, con->delay); + /* + * In many cases we see a socket state change + * while con_work is running and end up + * queuing (non-delayed) work, such that we + * can't backoff with a delay. Set a flag so + * that when con_work restarts we schedule the + * delay then. + */ + set_bit(BACKOFF, &con->state); + } } out_unlock: -- cgit v1.1 From e76661d0a59e53e5cc4dccbe4b755d1dc8a968ec Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Mar 2011 10:10:15 -0800 Subject: libceph: fix msgr keepalive flag There was some broken keepalive code using a dead variable. Shift to using the proper bit flag. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 46fbc42..3252ea9 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -336,7 +336,6 @@ static void reset_connection(struct ceph_connection *con) ceph_msg_put(con->out_msg); con->out_msg = NULL; } - con->out_keepalive_pending = false; con->in_seq = 0; con->in_seq_acked = 0; } @@ -2019,10 +2018,10 @@ static void ceph_fault(struct ceph_connection *con) /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); - /* If there are no messages in the queue, place the connection - * in a STANDBY state (i.e., don't try to reconnect just yet). */ - if (list_empty(&con->out_queue) && !con->out_keepalive_pending) { - dout("fault setting STANDBY\n"); + /* If there are no messages queued or keepalive pending, place + * the connection in a STANDBY state */ + if (list_empty(&con->out_queue) && + !test_bit(KEEPALIVE_PENDING, &con->state)) { set_bit(STANDBY, &con->state); } else { /* retry after a delay. */ -- cgit v1.1 From e00de341fdb76c955703b4438100f9933c452b7f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Mar 2011 12:25:05 -0800 Subject: libceph: fix msgr standby handling The standby logic used to be pretty dependent on the work requeueing behavior that changed when we switched to WQ_NON_REENTRANT. It was also very fragile. Restructure things so that: - We clear WRITE_PENDING when we set STANDBY. This ensures we will requeue work when we wake up later. - con_work backs off if STANDBY is set. There is nothing to do if we are in standby. - clear_standby() helper is called by both con_send() and con_keepalive(), the two actions that can wake us up again. Move the connect_seq++ logic here. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3252ea9..05f3578 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1712,14 +1712,6 @@ more: /* open the socket first? */ if (con->sock == NULL) { - /* - * if we were STANDBY and are reconnecting _this_ - * connection, bump connect_seq now. Always bump - * global_seq. - */ - if (test_and_clear_bit(STANDBY, &con->state)) - con->connect_seq++; - prepare_write_banner(msgr, con); prepare_write_connect(msgr, con, 1); prepare_read_banner(con); @@ -1962,6 +1954,10 @@ static void con_work(struct work_struct *work) } } + if (test_bit(STANDBY, &con->state)) { + dout("con_work %p STANDBY\n", con); + goto done; + } if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ dout("con_work CLOSED\n"); con_close_socket(con); @@ -2022,6 +2018,8 @@ static void ceph_fault(struct ceph_connection *con) * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && !test_bit(KEEPALIVE_PENDING, &con->state)) { + dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); + clear_bit(WRITE_PENDING, &con->state); set_bit(STANDBY, &con->state); } else { /* retry after a delay. */ @@ -2117,6 +2115,19 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr) } EXPORT_SYMBOL(ceph_messenger_destroy); +static void clear_standby(struct ceph_connection *con) +{ + /* come back from STANDBY? */ + if (test_and_clear_bit(STANDBY, &con->state)) { + mutex_lock(&con->mutex); + dout("clear_standby %p and ++connect_seq\n", con); + con->connect_seq++; + WARN_ON(test_bit(WRITE_PENDING, &con->state)); + WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state)); + mutex_unlock(&con->mutex); + } +} + /* * Queue up an outgoing message on the given connection. */ @@ -2149,6 +2160,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ + clear_standby(con); if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); } @@ -2214,6 +2226,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) */ void ceph_con_keepalive(struct ceph_connection *con) { + dout("con_keepalive %p\n", con); + clear_standby(con); if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); -- cgit v1.1