summaryrefslogtreecommitdiffstats
path: root/drivers/block/drbd/drbd_worker.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/drbd/drbd_worker.c')
-rw-r--r--drivers/block/drbd/drbd_worker.c1237
1 files changed, 717 insertions, 520 deletions
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 6bce2cc..424dc7b 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -38,16 +38,13 @@
#include "drbd_int.h"
#include "drbd_req.h"
-static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
-static int w_make_resync_request(struct drbd_conf *mdev,
- struct drbd_work *w, int cancel);
-
+static int w_make_ov_request(struct drbd_work *w, int cancel);
/* endio handlers:
* drbd_md_io_complete (defined here)
- * drbd_endio_pri (defined here)
- * drbd_endio_sec (defined here)
+ * drbd_request_endio (defined here)
+ * drbd_peer_request_endio (defined here)
* bm_async_io_complete (defined in drbd_bitmap.c)
*
* For all these callbacks, note the following:
@@ -60,7 +57,7 @@ static int w_make_resync_request(struct drbd_conf *mdev,
/* About the global_state_lock
Each state transition on an device holds a read lock. In case we have
- to evaluate the sync after dependencies, we grab a write lock, because
+ to evaluate the resync after dependencies, we grab a write lock, because
we need stable states on all devices for that. */
rwlock_t global_state_lock;
@@ -98,97 +95,93 @@ void drbd_md_io_complete(struct bio *bio, int error)
/* reads on behalf of the partner,
* "submitted" by the receiver
*/
-void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
+void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
unsigned long flags = 0;
- struct drbd_conf *mdev = e->mdev;
-
- D_ASSERT(e->block_id != ID_VACANT);
+ struct drbd_conf *mdev = peer_req->w.mdev;
- spin_lock_irqsave(&mdev->req_lock, flags);
- mdev->read_cnt += e->size >> 9;
- list_del(&e->w.list);
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
+ mdev->read_cnt += peer_req->i.size >> 9;
+ list_del(&peer_req->w.list);
if (list_empty(&mdev->read_ee))
wake_up(&mdev->ee_wait);
- if (test_bit(__EE_WAS_ERROR, &e->flags))
- __drbd_chk_io_error(mdev, DRBD_IO_ERROR);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
+ __drbd_chk_io_error(mdev, DRBD_READ_ERROR);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
- drbd_queue_work(&mdev->data.work, &e->w);
+ drbd_queue_work(&mdev->tconn->sender_work, &peer_req->w);
put_ldev(mdev);
}
/* writes on behalf of the partner, or resync writes,
* "submitted" by the receiver, final stage. */
-static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
+static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
unsigned long flags = 0;
- struct drbd_conf *mdev = e->mdev;
- sector_t e_sector;
+ struct drbd_conf *mdev = peer_req->w.mdev;
+ struct drbd_interval i;
int do_wake;
- int is_syncer_req;
+ u64 block_id;
int do_al_complete_io;
- D_ASSERT(e->block_id != ID_VACANT);
-
- /* after we moved e to done_ee,
+ /* after we moved peer_req to done_ee,
* we may no longer access it,
* it may be freed/reused already!
* (as soon as we release the req_lock) */
- e_sector = e->sector;
- do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
- is_syncer_req = is_syncer_block_id(e->block_id);
+ i = peer_req->i;
+ do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
+ block_id = peer_req->block_id;
- spin_lock_irqsave(&mdev->req_lock, flags);
- mdev->writ_cnt += e->size >> 9;
- list_del(&e->w.list); /* has been on active_ee or sync_ee */
- list_add_tail(&e->w.list, &mdev->done_ee);
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
+ mdev->writ_cnt += peer_req->i.size >> 9;
+ list_move_tail(&peer_req->w.list, &mdev->done_ee);
- /* No hlist_del_init(&e->collision) here, we did not send the Ack yet,
- * neither did we wake possibly waiting conflicting requests.
- * done from "drbd_process_done_ee" within the appropriate w.cb
- * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
+ /*
+ * Do not remove from the write_requests tree here: we did not send the
+ * Ack yet and did not wake possibly waiting conflicting requests.
+ * Removed from the tree from "drbd_process_done_ee" within the
+ * appropriate w.cb (e_end_block/e_end_resync_block) or from
+ * _drbd_clear_done_ee.
+ */
- do_wake = is_syncer_req
- ? list_empty(&mdev->sync_ee)
- : list_empty(&mdev->active_ee);
+ do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee);
- if (test_bit(__EE_WAS_ERROR, &e->flags))
- __drbd_chk_io_error(mdev, DRBD_IO_ERROR);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
+ __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
- if (is_syncer_req)
- drbd_rs_complete_io(mdev, e_sector);
+ if (block_id == ID_SYNCER)
+ drbd_rs_complete_io(mdev, i.sector);
if (do_wake)
wake_up(&mdev->ee_wait);
if (do_al_complete_io)
- drbd_al_complete_io(mdev, e_sector);
+ drbd_al_complete_io(mdev, &i);
- wake_asender(mdev);
+ wake_asender(mdev->tconn);
put_ldev(mdev);
}
/* writes on behalf of the partner, or resync writes,
* "submitted" by the receiver.
*/
-void drbd_endio_sec(struct bio *bio, int error)
+void drbd_peer_request_endio(struct bio *bio, int error)
{
- struct drbd_epoch_entry *e = bio->bi_private;
- struct drbd_conf *mdev = e->mdev;
+ struct drbd_peer_request *peer_req = bio->bi_private;
+ struct drbd_conf *mdev = peer_req->w.mdev;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
int is_write = bio_data_dir(bio) == WRITE;
if (error && __ratelimit(&drbd_ratelimit_state))
dev_warn(DEV, "%s: error=%d s=%llus\n",
is_write ? "write" : "read", error,
- (unsigned long long)e->sector);
+ (unsigned long long)peer_req->i.sector);
if (!error && !uptodate) {
if (__ratelimit(&drbd_ratelimit_state))
dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
is_write ? "write" : "read",
- (unsigned long long)e->sector);
+ (unsigned long long)peer_req->i.sector);
/* strange behavior of some lower level drivers...
* fail the request by clearing the uptodate flag,
* but do not return any error?! */
@@ -196,24 +189,24 @@ void drbd_endio_sec(struct bio *bio, int error)
}
if (error)
- set_bit(__EE_WAS_ERROR, &e->flags);
+ set_bit(__EE_WAS_ERROR, &peer_req->flags);
bio_put(bio); /* no need for the bio anymore */
- if (atomic_dec_and_test(&e->pending_bios)) {
+ if (atomic_dec_and_test(&peer_req->pending_bios)) {
if (is_write)
- drbd_endio_write_sec_final(e);
+ drbd_endio_write_sec_final(peer_req);
else
- drbd_endio_read_sec_final(e);
+ drbd_endio_read_sec_final(peer_req);
}
}
/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
*/
-void drbd_endio_pri(struct bio *bio, int error)
+void drbd_request_endio(struct bio *bio, int error)
{
unsigned long flags;
struct drbd_request *req = bio->bi_private;
- struct drbd_conf *mdev = req->mdev;
+ struct drbd_conf *mdev = req->w.mdev;
struct bio_and_error m;
enum drbd_req_event what;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
@@ -227,53 +220,72 @@ void drbd_endio_pri(struct bio *bio, int error)
error = -EIO;
}
+
+ /* If this request was aborted locally before,
+ * but now was completed "successfully",
+ * chances are that this caused arbitrary data corruption.
+ *
+ * "aborting" requests, or force-detaching the disk, is intended for
+ * completely blocked/hung local backing devices which do no longer
+ * complete requests at all, not even do error completions. In this
+ * situation, usually a hard-reset and failover is the only way out.
+ *
+ * By "aborting", basically faking a local error-completion,
+ * we allow for a more graceful swichover by cleanly migrating services.
+ * Still the affected node has to be rebooted "soon".
+ *
+ * By completing these requests, we allow the upper layers to re-use
+ * the associated data pages.
+ *
+ * If later the local backing device "recovers", and now DMAs some data
+ * from disk into the original request pages, in the best case it will
+ * just put random data into unused pages; but typically it will corrupt
+ * meanwhile completely unrelated data, causing all sorts of damage.
+ *
+ * Which means delayed successful completion,
+ * especially for READ requests,
+ * is a reason to panic().
+ *
+ * We assume that a delayed *error* completion is OK,
+ * though we still will complain noisily about it.
+ */
+ if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ dev_emerg(DEV, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
+
+ if (!error)
+ panic("possible random memory corruption caused by delayed completion of aborted local request\n");
+ }
+
/* to avoid recursion in __req_mod */
if (unlikely(error)) {
what = (bio_data_dir(bio) == WRITE)
- ? write_completed_with_error
+ ? WRITE_COMPLETED_WITH_ERROR
: (bio_rw(bio) == READ)
- ? read_completed_with_error
- : read_ahead_completed_with_error;
+ ? READ_COMPLETED_WITH_ERROR
+ : READ_AHEAD_COMPLETED_WITH_ERROR;
} else
- what = completed_ok;
+ what = COMPLETED_OK;
bio_put(req->private_bio);
req->private_bio = ERR_PTR(error);
/* not req_mod(), we need irqsave here! */
- spin_lock_irqsave(&mdev->req_lock, flags);
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
__req_mod(req, what, &m);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
put_ldev(mdev);
if (m.bio)
complete_master_bio(mdev, &m);
}
-int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
- struct drbd_request *req = container_of(w, struct drbd_request, w);
-
- /* We should not detach for read io-error,
- * but try to WRITE the P_DATA_REPLY to the failed location,
- * to give the disk the chance to relocate that block */
-
- spin_lock_irq(&mdev->req_lock);
- if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
- _req_mod(req, read_retry_remote_canceled);
- spin_unlock_irq(&mdev->req_lock);
- return 1;
- }
- spin_unlock_irq(&mdev->req_lock);
-
- return w_send_read_req(mdev, w, 0);
-}
-
-void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
+void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm,
+ struct drbd_peer_request *peer_req, void *digest)
{
struct hash_desc desc;
struct scatterlist sg;
- struct page *page = e->pages;
+ struct page *page = peer_req->pages;
struct page *tmp;
unsigned len;
@@ -290,7 +302,7 @@ void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_e
page = tmp;
}
/* and now the last, possibly only partially used page */
- len = e->size & (PAGE_SIZE - 1);
+ len = peer_req->i.size & (PAGE_SIZE - 1);
sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
crypto_hash_update(&desc, &sg, sg.length);
crypto_hash_final(&desc, digest);
@@ -316,59 +328,58 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *
crypto_hash_final(&desc, digest);
}
-/* TODO merge common code with w_e_end_ov_req */
-int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+/* MAYBE merge common code with w_e_end_ov_req */
+static int w_e_send_csum(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
int digest_size;
void *digest;
- int ok = 1;
-
- D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
+ int err = 0;
if (unlikely(cancel))
goto out;
- if (likely((e->flags & EE_WAS_ERROR) != 0))
+ if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
goto out;
- digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+ digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
- sector_t sector = e->sector;
- unsigned int size = e->size;
- drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
- /* Free e and pages before send.
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
+ drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
+ /* Free peer_req and pages before send.
* In case we block on congestion, we could otherwise run into
* some distributed deadlock, if the other side blocks on
* congestion as well, because our receiver blocks in
- * drbd_pp_alloc due to pp_in_use > max_buffers. */
- drbd_free_ee(mdev, e);
- e = NULL;
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(mdev, peer_req);
+ peer_req = NULL;
inc_rs_pending(mdev);
- ok = drbd_send_drequest_csum(mdev, sector, size,
- digest, digest_size,
- P_CSUM_RS_REQUEST);
+ err = drbd_send_drequest_csum(mdev, sector, size,
+ digest, digest_size,
+ P_CSUM_RS_REQUEST);
kfree(digest);
} else {
dev_err(DEV, "kmalloc() of digest failed.\n");
- ok = 0;
+ err = -ENOMEM;
}
out:
- if (e)
- drbd_free_ee(mdev, e);
+ if (peer_req)
+ drbd_free_peer_req(mdev, peer_req);
- if (unlikely(!ok))
+ if (unlikely(err))
dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
- return ok;
+ return err;
}
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
{
- struct drbd_epoch_entry *e;
+ struct drbd_peer_request *peer_req;
if (!get_ldev(mdev))
return -EIO;
@@ -378,45 +389,47 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
- e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
- if (!e)
+ peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER /* unused */, sector,
+ size, GFP_TRY);
+ if (!peer_req)
goto defer;
- e->w.cb = w_e_send_csum;
- spin_lock_irq(&mdev->req_lock);
- list_add(&e->w.list, &mdev->read_ee);
- spin_unlock_irq(&mdev->req_lock);
+ peer_req->w.cb = w_e_send_csum;
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_add(&peer_req->w.list, &mdev->read_ee);
+ spin_unlock_irq(&mdev->tconn->req_lock);
atomic_add(size >> 9, &mdev->rs_sect_ev);
- if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
+ if (drbd_submit_peer_request(mdev, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
return 0;
/* If it failed because of ENOMEM, retry should help. If it failed
* because bio_add_page failed (probably broken lower level driver),
* retry may or may not help.
* If it does not, you may need to force disconnect. */
- spin_lock_irq(&mdev->req_lock);
- list_del(&e->w.list);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
defer:
put_ldev(mdev);
return -EAGAIN;
}
-int w_resync_timer(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_resync_timer(struct drbd_work *w, int cancel)
{
+ struct drbd_conf *mdev = w->mdev;
switch (mdev->state.conn) {
case C_VERIFY_S:
- w_make_ov_request(mdev, w, cancel);
+ w_make_ov_request(w, cancel);
break;
case C_SYNC_TARGET:
- w_make_resync_request(mdev, w, cancel);
+ w_make_resync_request(w, cancel);
break;
}
- return 1;
+ return 0;
}
void resync_timer_fn(unsigned long data)
@@ -424,7 +437,7 @@ void resync_timer_fn(unsigned long data)
struct drbd_conf *mdev = (struct drbd_conf *) data;
if (list_empty(&mdev->resync_work.list))
- drbd_queue_work(&mdev->data.work, &mdev->resync_work);
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->resync_work);
}
static void fifo_set(struct fifo_buffer *fb, int value)
@@ -456,8 +469,24 @@ static void fifo_add_val(struct fifo_buffer *fb, int value)
fb->values[i] += value;
}
+struct fifo_buffer *fifo_alloc(int fifo_size)
+{
+ struct fifo_buffer *fb;
+
+ fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
+ if (!fb)
+ return NULL;
+
+ fb->head_index = 0;
+ fb->size = fifo_size;
+ fb->total = 0;
+
+ return fb;
+}
+
static int drbd_rs_controller(struct drbd_conf *mdev)
{
+ struct disk_conf *dc;
unsigned int sect_in; /* Number of sectors that came in since the last turn */
unsigned int want; /* The number of sectors we want in the proxy */
int req_sect; /* Number of sectors to request in this turn */
@@ -466,38 +495,39 @@ static int drbd_rs_controller(struct drbd_conf *mdev)
int steps; /* Number of time steps to plan ahead */
int curr_corr;
int max_sect;
+ struct fifo_buffer *plan;
sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
mdev->rs_in_flight -= sect_in;
- spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */
+ dc = rcu_dereference(mdev->ldev->disk_conf);
+ plan = rcu_dereference(mdev->rs_plan_s);
- steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
+ steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
- want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
+ want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
} else { /* normal path */
- want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
- sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
+ want = dc->c_fill_target ? dc->c_fill_target :
+ sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
}
- correction = want - mdev->rs_in_flight - mdev->rs_planed;
+ correction = want - mdev->rs_in_flight - plan->total;
/* Plan ahead */
cps = correction / steps;
- fifo_add_val(&mdev->rs_plan_s, cps);
- mdev->rs_planed += cps * steps;
+ fifo_add_val(plan, cps);
+ plan->total += cps * steps;
/* What we do in this step */
- curr_corr = fifo_push(&mdev->rs_plan_s, 0);
- spin_unlock(&mdev->peer_seq_lock);
- mdev->rs_planed -= curr_corr;
+ curr_corr = fifo_push(plan, 0);
+ plan->total -= curr_corr;
req_sect = sect_in + curr_corr;
if (req_sect < 0)
req_sect = 0;
- max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
+ max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
if (req_sect > max_sect)
req_sect = max_sect;
@@ -513,22 +543,25 @@ static int drbd_rs_controller(struct drbd_conf *mdev)
static int drbd_rs_number_requests(struct drbd_conf *mdev)
{
int number;
- if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
+
+ rcu_read_lock();
+ if (rcu_dereference(mdev->rs_plan_s)->size) {
number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
} else {
- mdev->c_sync_rate = mdev->sync_conf.rate;
+ mdev->c_sync_rate = rcu_dereference(mdev->ldev->disk_conf)->resync_rate;
number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
}
+ rcu_read_unlock();
/* ignore the amount of pending requests, the resync controller should
* throttle down to incoming reply rate soon enough anyways. */
return number;
}
-static int w_make_resync_request(struct drbd_conf *mdev,
- struct drbd_work *w, int cancel)
+int w_make_resync_request(struct drbd_work *w, int cancel)
{
+ struct drbd_conf *mdev = w->mdev;
unsigned long bit;
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
@@ -538,12 +571,12 @@ static int w_make_resync_request(struct drbd_conf *mdev,
int i = 0;
if (unlikely(cancel))
- return 1;
+ return 0;
if (mdev->rs_total == 0) {
/* empty resync? */
drbd_resync_finished(mdev);
- return 1;
+ return 0;
}
if (!get_ldev(mdev)) {
@@ -552,7 +585,7 @@ static int w_make_resync_request(struct drbd_conf *mdev,
to continue resync with a broken disk makes no sense at
all */
dev_err(DEV, "Disk broke down during resync!\n");
- return 1;
+ return 0;
}
max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
@@ -562,15 +595,15 @@ static int w_make_resync_request(struct drbd_conf *mdev,
for (i = 0; i < number; i++) {
/* Stop generating RS requests, when half of the send buffer is filled */
- mutex_lock(&mdev->data.mutex);
- if (mdev->data.socket) {
- queued = mdev->data.socket->sk->sk_wmem_queued;
- sndbuf = mdev->data.socket->sk->sk_sndbuf;
+ mutex_lock(&mdev->tconn->data.mutex);
+ if (mdev->tconn->data.socket) {
+ queued = mdev->tconn->data.socket->sk->sk_wmem_queued;
+ sndbuf = mdev->tconn->data.socket->sk->sk_sndbuf;
} else {
queued = 1;
sndbuf = 0;
}
- mutex_unlock(&mdev->data.mutex);
+ mutex_unlock(&mdev->tconn->data.mutex);
if (queued > sndbuf / 2)
goto requeue;
@@ -581,7 +614,7 @@ next_sector:
if (bit == DRBD_END_OF_BITMAP) {
mdev->bm_resync_fo = drbd_bm_bits(mdev);
put_ldev(mdev);
- return 1;
+ return 0;
}
sector = BM_BIT_TO_SECT(bit);
@@ -640,11 +673,11 @@ next_sector:
/* adjust very last sectors, in case we are oddly sized */
if (sector + (size>>9) > capacity)
size = (capacity-sector)<<9;
- if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
+ if (mdev->tconn->agreed_pro_version >= 89 && mdev->tconn->csums_tfm) {
switch (read_for_csum(mdev, sector, size)) {
case -EIO: /* Disk failure */
put_ldev(mdev);
- return 0;
+ return -EIO;
case -EAGAIN: /* allocation failed, or ldev busy */
drbd_rs_complete_io(mdev, sector);
mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
@@ -657,13 +690,16 @@ next_sector:
BUG();
}
} else {
+ int err;
+
inc_rs_pending(mdev);
- if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
- sector, size, ID_SYNCER)) {
+ err = drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
+ sector, size, ID_SYNCER);
+ if (err) {
dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
dec_rs_pending(mdev);
put_ldev(mdev);
- return 0;
+ return err;
}
}
}
@@ -676,21 +712,23 @@ next_sector:
* until then resync "work" is "inactive" ...
*/
put_ldev(mdev);
- return 1;
+ return 0;
}
requeue:
mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
put_ldev(mdev);
- return 1;
+ return 0;
}
-static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static int w_make_ov_request(struct drbd_work *w, int cancel)
{
+ struct drbd_conf *mdev = w->mdev;
int number, i, size;
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+ bool stop_sector_reached = false;
if (unlikely(cancel))
return 1;
@@ -699,9 +737,17 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
sector = mdev->ov_position;
for (i = 0; i < number; i++) {
- if (sector >= capacity) {
+ if (sector >= capacity)
return 1;
- }
+
+ /* We check for "finished" only in the reply path:
+ * w_e_end_ov_reply().
+ * We need to send at least one request out. */
+ stop_sector_reached = i > 0
+ && verify_can_do_stop_sector(mdev)
+ && sector >= mdev->ov_stop_sector;
+ if (stop_sector_reached)
+ break;
size = BM_BLOCK_SIZE;
@@ -715,7 +761,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
size = (capacity-sector)<<9;
inc_rs_pending(mdev);
- if (!drbd_send_ov_request(mdev, sector, size)) {
+ if (drbd_send_ov_request(mdev, sector, size)) {
dec_rs_pending(mdev);
return 0;
}
@@ -725,56 +771,39 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
requeue:
mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
- mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
- return 1;
-}
-
-
-void start_resync_timer_fn(unsigned long data)
-{
- struct drbd_conf *mdev = (struct drbd_conf *) data;
-
- drbd_queue_work(&mdev->data.work, &mdev->start_resync_work);
-}
-
-int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
- if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
- dev_warn(DEV, "w_start_resync later...\n");
- mdev->start_resync_timer.expires = jiffies + HZ/10;
- add_timer(&mdev->start_resync_timer);
- return 1;
- }
-
- drbd_start_resync(mdev, C_SYNC_SOURCE);
- clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags);
+ if (i == 0 || !stop_sector_reached)
+ mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
return 1;
}
-int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_ov_finished(struct drbd_work *w, int cancel)
{
+ struct drbd_conf *mdev = w->mdev;
kfree(w);
- ov_oos_print(mdev);
+ ov_out_of_sync_print(mdev);
drbd_resync_finished(mdev);
- return 1;
+ return 0;
}
-static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static int w_resync_finished(struct drbd_work *w, int cancel)
{
+ struct drbd_conf *mdev = w->mdev;
kfree(w);
drbd_resync_finished(mdev);
- return 1;
+ return 0;
}
static void ping_peer(struct drbd_conf *mdev)
{
- clear_bit(GOT_PING_ACK, &mdev->flags);
- request_ping(mdev);
- wait_event(mdev->misc_wait,
- test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
+ struct drbd_tconn *tconn = mdev->tconn;
+
+ clear_bit(GOT_PING_ACK, &tconn->flags);
+ request_ping(tconn);
+ wait_event(tconn->ping_wait,
+ test_bit(GOT_PING_ACK, &tconn->flags) || mdev->state.conn < C_CONNECTED);
}
int drbd_resync_finished(struct drbd_conf *mdev)
@@ -799,7 +828,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
if (w) {
w->cb = w_resync_finished;
- drbd_queue_work(&mdev->data.work, w);
+ w->mdev = mdev;
+ drbd_queue_work(&mdev->tconn->sender_work, w);
return 1;
}
dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
@@ -808,7 +838,12 @@ int drbd_resync_finished(struct drbd_conf *mdev)
dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
if (dt <= 0)
dt = 1;
+
db = mdev->rs_total;
+ /* adjust for verify start and stop sectors, respective reached position */
+ if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
+ db -= mdev->ov_left;
+
dbdt = Bit2KB(db/dt);
mdev->rs_paused /= HZ;
@@ -817,8 +852,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
ping_peer(mdev);
- spin_lock_irq(&mdev->req_lock);
- os = mdev->state;
+ spin_lock_irq(&mdev->tconn->req_lock);
+ os = drbd_read_state(mdev);
verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
@@ -831,7 +866,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
ns.conn = C_CONNECTED;
dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
- verify_done ? "Online verify " : "Resync",
+ verify_done ? "Online verify" : "Resync",
dt + mdev->rs_paused, mdev->rs_paused, dbdt);
n_oos = drbd_bm_total_weight(mdev);
@@ -848,7 +883,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
khelper_cmd = "after-resync-target";
- if (mdev->csums_tfm && mdev->rs_total) {
+ if (mdev->tconn->csums_tfm && mdev->rs_total) {
const unsigned long s = mdev->rs_same_csum;
const unsigned long t = mdev->rs_total;
const int ratio =
@@ -906,13 +941,15 @@ int drbd_resync_finished(struct drbd_conf *mdev)
_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
out_unlock:
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
put_ldev(mdev);
out:
mdev->rs_total = 0;
mdev->rs_failed = 0;
mdev->rs_paused = 0;
- if (verify_done)
+
+ /* reset start sector, if we reached end of device */
+ if (verify_done && mdev->ov_left == 0)
mdev->ov_start_sector = 0;
drbd_md_sync(mdev);
@@ -924,19 +961,19 @@ out:
}
/* helper */
-static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_peer_request *peer_req)
{
- if (drbd_ee_has_active_page(e)) {
+ if (drbd_peer_req_has_active_page(peer_req)) {
/* This might happen if sendpage() has not finished */
- int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
atomic_add(i, &mdev->pp_in_use_by_net);
atomic_sub(i, &mdev->pp_in_use);
- spin_lock_irq(&mdev->req_lock);
- list_add_tail(&e->w.list, &mdev->net_ee);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_add_tail(&peer_req->w.list, &mdev->net_ee);
+ spin_unlock_irq(&mdev->tconn->req_lock);
wake_up(&drbd_pp_wait);
} else
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
}
/**
@@ -945,174 +982,177 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent
* @w: work object.
* @cancel: The connection will be closed anyways
*/
-int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_data_req(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
- int ok;
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
+ int err;
if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
- return 1;
+ return 0;
}
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
- ok = drbd_send_block(mdev, P_DATA_REPLY, e);
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ err = drbd_send_block(mdev, P_DATA_REPLY, peer_req);
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
- (unsigned long long)e->sector);
+ (unsigned long long)peer_req->i.sector);
- ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
+ err = drbd_send_ack(mdev, P_NEG_DREPLY, peer_req);
}
dec_unacked(mdev);
- move_to_net_ee_or_free(mdev, e);
+ move_to_net_ee_or_free(mdev, peer_req);
- if (unlikely(!ok))
+ if (unlikely(err))
dev_err(DEV, "drbd_send_block() failed\n");
- return ok;
+ return err;
}
/**
- * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
+ * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
* @mdev: DRBD device.
* @w: work object.
* @cancel: The connection will be closed anyways
*/
-int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
- int ok;
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
+ int err;
if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
- return 1;
+ return 0;
}
if (get_ldev_if_state(mdev, D_FAILED)) {
- drbd_rs_complete_io(mdev, e->sector);
+ drbd_rs_complete_io(mdev, peer_req->i.sector);
put_ldev(mdev);
}
if (mdev->state.conn == C_AHEAD) {
- ok = drbd_send_ack(mdev, P_RS_CANCEL, e);
- } else if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+ err = drbd_send_ack(mdev, P_RS_CANCEL, peer_req);
+ } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
inc_rs_pending(mdev);
- ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
+ err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Not sending RSDataReply, "
"partner DISKLESS!\n");
- ok = 1;
+ err = 0;
}
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
- (unsigned long long)e->sector);
+ (unsigned long long)peer_req->i.sector);
- ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+ err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);
/* update resync data with failure */
- drbd_rs_failed_io(mdev, e->sector, e->size);
+ drbd_rs_failed_io(mdev, peer_req->i.sector, peer_req->i.size);
}
dec_unacked(mdev);
- move_to_net_ee_or_free(mdev, e);
+ move_to_net_ee_or_free(mdev, peer_req);
- if (unlikely(!ok))
+ if (unlikely(err))
dev_err(DEV, "drbd_send_block() failed\n");
- return ok;
+ return err;
}
-int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
struct digest_info *di;
int digest_size;
void *digest = NULL;
- int ok, eq = 0;
+ int err, eq = 0;
if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
- return 1;
+ return 0;
}
if (get_ldev(mdev)) {
- drbd_rs_complete_io(mdev, e->sector);
+ drbd_rs_complete_io(mdev, peer_req->i.sector);
put_ldev(mdev);
}
- di = e->digest;
+ di = peer_req->digest;
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
/* quick hack to try to avoid a race against reconfiguration.
* a real fix would be much more involved,
* introducing more locking mechanisms */
- if (mdev->csums_tfm) {
- digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+ if (mdev->tconn->csums_tfm) {
+ digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
D_ASSERT(digest_size == di->digest_size);
digest = kmalloc(digest_size, GFP_NOIO);
}
if (digest) {
- drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
+ drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
eq = !memcmp(digest, di->digest, digest_size);
kfree(digest);
}
if (eq) {
- drbd_set_in_sync(mdev, e->sector, e->size);
+ drbd_set_in_sync(mdev, peer_req->i.sector, peer_req->i.size);
/* rs_same_csums unit is BM_BLOCK_SIZE */
- mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
- ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
+ mdev->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
+ err = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, peer_req);
} else {
inc_rs_pending(mdev);
- e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
- e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
+ peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+ peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
kfree(di);
- ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
+ err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
}
} else {
- ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+ err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
}
dec_unacked(mdev);
- move_to_net_ee_or_free(mdev, e);
+ move_to_net_ee_or_free(mdev, peer_req);
- if (unlikely(!ok))
+ if (unlikely(err))
dev_err(DEV, "drbd_send_block/ack() failed\n");
- return ok;
+ return err;
}
-/* TODO merge common code with w_e_send_csum */
-int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_ov_req(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
- sector_t sector = e->sector;
- unsigned int size = e->size;
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
int digest_size;
void *digest;
- int ok = 1;
+ int err = 0;
if (unlikely(cancel))
goto out;
- digest_size = crypto_hash_digestsize(mdev->verify_tfm);
+ digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (!digest) {
- ok = 0; /* terminate the connection in case the allocation failed */
+ err = 1; /* terminate the connection in case the allocation failed */
goto out;
}
- if (likely(!(e->flags & EE_WAS_ERROR)))
- drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
+ if (likely(!(peer_req->flags & EE_WAS_ERROR)))
+ drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);
else
memset(digest, 0, digest_size);
@@ -1120,25 +1160,23 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
* In case we block on congestion, we could otherwise run into
* some distributed deadlock, if the other side blocks on
* congestion as well, because our receiver blocks in
- * drbd_pp_alloc due to pp_in_use > max_buffers. */
- drbd_free_ee(mdev, e);
- e = NULL;
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(mdev, peer_req);
+ peer_req = NULL;
inc_rs_pending(mdev);
- ok = drbd_send_drequest_csum(mdev, sector, size,
- digest, digest_size,
- P_OV_REPLY);
- if (!ok)
+ err = drbd_send_drequest_csum(mdev, sector, size, digest, digest_size, P_OV_REPLY);
+ if (err)
dec_rs_pending(mdev);
kfree(digest);
out:
- if (e)
- drbd_free_ee(mdev, e);
+ if (peer_req)
+ drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
- return ok;
+ return err;
}
-void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
+void drbd_ov_out_of_sync_found(struct drbd_conf *mdev, sector_t sector, int size)
{
if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
mdev->ov_last_oos_size += size>>9;
@@ -1149,36 +1187,38 @@ void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
drbd_set_out_of_sync(mdev, sector, size);
}
-int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_ov_reply(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
struct digest_info *di;
void *digest;
- sector_t sector = e->sector;
- unsigned int size = e->size;
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
int digest_size;
- int ok, eq = 0;
+ int err, eq = 0;
+ bool stop_sector_reached = false;
if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
- return 1;
+ return 0;
}
/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
* the resync lru has been cleaned up already */
if (get_ldev(mdev)) {
- drbd_rs_complete_io(mdev, e->sector);
+ drbd_rs_complete_io(mdev, peer_req->i.sector);
put_ldev(mdev);
}
- di = e->digest;
+ di = peer_req->digest;
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
- digest_size = crypto_hash_digestsize(mdev->verify_tfm);
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
- drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
+ drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);
D_ASSERT(digest_size == di->digest_size);
eq = !memcmp(digest, di->digest, digest_size);
@@ -1186,19 +1226,19 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
}
}
- /* Free e and pages before send.
- * In case we block on congestion, we could otherwise run into
- * some distributed deadlock, if the other side blocks on
- * congestion as well, because our receiver blocks in
- * drbd_pp_alloc due to pp_in_use > max_buffers. */
- drbd_free_ee(mdev, e);
+ /* Free peer_req and pages before send.
+ * In case we block on congestion, we could otherwise run into
+ * some distributed deadlock, if the other side blocks on
+ * congestion as well, because our receiver blocks in
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(mdev, peer_req);
if (!eq)
- drbd_ov_oos_found(mdev, sector, size);
+ drbd_ov_out_of_sync_found(mdev, sector, size);
else
- ov_oos_print(mdev);
+ ov_out_of_sync_print(mdev);
- ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
- eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
+ err = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
+ eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
dec_unacked(mdev);
@@ -1208,73 +1248,102 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
if ((mdev->ov_left & 0x200) == 0x200)
drbd_advance_rs_marks(mdev, mdev->ov_left);
- if (mdev->ov_left == 0) {
- ov_oos_print(mdev);
+ stop_sector_reached = verify_can_do_stop_sector(mdev) &&
+ (sector + (size>>9)) >= mdev->ov_stop_sector;
+
+ if (mdev->ov_left == 0 || stop_sector_reached) {
+ ov_out_of_sync_print(mdev);
drbd_resync_finished(mdev);
}
- return ok;
+ return err;
}
-int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_prev_work_done(struct drbd_work *w, int cancel)
{
struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
+
complete(&b->done);
- return 1;
+ return 0;
}
-int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+/* FIXME
+ * We need to track the number of pending barrier acks,
+ * and to be able to wait for them.
+ * See also comment in drbd_adm_attach before drbd_suspend_io.
+ */
+int drbd_send_barrier(struct drbd_tconn *tconn)
{
- struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
- struct p_barrier *p = &mdev->data.sbuf.barrier;
- int ok = 1;
-
- /* really avoid racing with tl_clear. w.cb may have been referenced
- * just before it was reassigned and re-queued, so double check that.
- * actually, this race was harmless, since we only try to send the
- * barrier packet here, and otherwise do nothing with the object.
- * but compare with the head of w_clear_epoch */
- spin_lock_irq(&mdev->req_lock);
- if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
- cancel = 1;
- spin_unlock_irq(&mdev->req_lock);
- if (cancel)
- return 1;
+ struct p_barrier *p;
+ struct drbd_socket *sock;
- if (!drbd_get_data_sock(mdev))
- return 0;
- p->barrier = b->br_number;
- /* inc_ap_pending was done where this was queued.
- * dec_ap_pending will be done in got_BarrierAck
- * or (on connection loss) in w_clear_epoch. */
- ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
- (struct p_header80 *)p, sizeof(*p), 0);
- drbd_put_data_sock(mdev);
-
- return ok;
+ sock = &tconn->data;
+ p = conn_prepare_command(tconn, sock);
+ if (!p)
+ return -EIO;
+ p->barrier = tconn->send.current_epoch_nr;
+ p->pad = 0;
+ tconn->send.current_epoch_writes = 0;
+
+ return conn_send_command(tconn, sock, P_BARRIER, sizeof(*p), NULL, 0);
}
-int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_send_write_hint(struct drbd_work *w, int cancel)
{
+ struct drbd_conf *mdev = w->mdev;
+ struct drbd_socket *sock;
+
if (cancel)
- return 1;
- return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
+ return 0;
+ sock = &mdev->tconn->data;
+ if (!drbd_prepare_command(mdev, sock))
+ return -EIO;
+ return drbd_send_command(mdev, sock, P_UNPLUG_REMOTE, 0, NULL, 0);
}
-int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static void re_init_if_first_write(struct drbd_tconn *tconn, unsigned int epoch)
+{
+ if (!tconn->send.seen_any_write_yet) {
+ tconn->send.seen_any_write_yet = true;
+ tconn->send.current_epoch_nr = epoch;
+ tconn->send.current_epoch_writes = 0;
+ }
+}
+
+static void maybe_send_barrier(struct drbd_tconn *tconn, unsigned int epoch)
+{
+ /* re-init if first write on this connection */
+ if (!tconn->send.seen_any_write_yet)
+ return;
+ if (tconn->send.current_epoch_nr != epoch) {
+ if (tconn->send.current_epoch_writes)
+ drbd_send_barrier(tconn);
+ tconn->send.current_epoch_nr = epoch;
+ }
+}
+
+int w_send_out_of_sync(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
- int ok;
+ struct drbd_conf *mdev = w->mdev;
+ struct drbd_tconn *tconn = mdev->tconn;
+ int err;
if (unlikely(cancel)) {
- req_mod(req, send_canceled);
- return 1;
+ req_mod(req, SEND_CANCELED);
+ return 0;
}
- ok = drbd_send_oos(mdev, req);
- req_mod(req, oos_handed_to_network);
+ /* this time, no tconn->send.current_epoch_writes++;
+ * If it was sent, it was the closing barrier for the last
+ * replicated epoch, before we went into AHEAD mode.
+ * No more barriers will be sent, until we leave AHEAD mode again. */
+ maybe_send_barrier(tconn, req->epoch);
+
+ err = drbd_send_out_of_sync(mdev, req);
+ req_mod(req, OOS_HANDED_TO_NETWORK);
- return ok;
+ return err;
}
/**
@@ -1283,20 +1352,26 @@ int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
* @w: work object.
* @cancel: The connection will be closed anyways
*/
-int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_send_dblock(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
- int ok;
+ struct drbd_conf *mdev = w->mdev;
+ struct drbd_tconn *tconn = mdev->tconn;
+ int err;
if (unlikely(cancel)) {
- req_mod(req, send_canceled);
- return 1;
+ req_mod(req, SEND_CANCELED);
+ return 0;
}
- ok = drbd_send_dblock(mdev, req);
- req_mod(req, ok ? handed_over_to_network : send_failed);
+ re_init_if_first_write(tconn, req->epoch);
+ maybe_send_barrier(tconn, req->epoch);
+ tconn->send.current_epoch_writes++;
+
+ err = drbd_send_dblock(mdev, req);
+ req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
- return ok;
+ return err;
}
/**
@@ -1305,57 +1380,61 @@ int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
* @w: work object.
* @cancel: The connection will be closed anyways
*/
-int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_send_read_req(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
- int ok;
+ struct drbd_conf *mdev = w->mdev;
+ struct drbd_tconn *tconn = mdev->tconn;
+ int err;
if (unlikely(cancel)) {
- req_mod(req, send_canceled);
- return 1;
+ req_mod(req, SEND_CANCELED);
+ return 0;
}
- ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
- (unsigned long)req);
+ /* Even read requests may close a write epoch,
+ * if there was any yet. */
+ maybe_send_barrier(tconn, req->epoch);
- if (!ok) {
- /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
- * so this is probably redundant */
- if (mdev->state.conn >= C_CONNECTED)
- drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
- }
- req_mod(req, ok ? handed_over_to_network : send_failed);
+ err = drbd_send_drequest(mdev, P_DATA_REQUEST, req->i.sector, req->i.size,
+ (unsigned long)req);
+
+ req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
- return ok;
+ return err;
}
-int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_restart_disk_io(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
+ struct drbd_conf *mdev = w->mdev;
if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
- drbd_al_begin_io(mdev, req->sector);
- /* Calling drbd_al_begin_io() out of the worker might deadlocks
- theoretically. Practically it can not deadlock, since this is
- only used when unfreezing IOs. All the extents of the requests
- that made it into the TL are already active */
+ drbd_al_begin_io(mdev, &req->i);
drbd_req_make_private_bio(req, req->master_bio);
req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
generic_make_request(req->private_bio);
- return 1;
+ return 0;
}
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
struct drbd_conf *odev = mdev;
+ int resync_after;
while (1) {
- if (odev->sync_conf.after == -1)
+ if (!odev->ldev)
+ return 1;
+ rcu_read_lock();
+ resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+ rcu_read_unlock();
+ if (resync_after == -1)
+ return 1;
+ odev = minor_to_mdev(resync_after);
+ if (!expect(odev))
return 1;
- odev = minor_to_mdev(odev->sync_conf.after);
- ERR_IF(!odev) return 1;
if ((odev->state.conn >= C_SYNC_SOURCE &&
odev->state.conn <= C_PAUSED_SYNC_T) ||
odev->state.aftr_isp || odev->state.peer_isp ||
@@ -1375,16 +1454,15 @@ static int _drbd_pause_after(struct drbd_conf *mdev)
struct drbd_conf *odev;
int i, rv = 0;
- for (i = 0; i < minor_count; i++) {
- odev = minor_to_mdev(i);
- if (!odev)
- continue;
+ rcu_read_lock();
+ idr_for_each_entry(&minors, odev, i) {
if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
continue;
if (!_drbd_may_sync_now(odev))
rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
!= SS_NOTHING_TO_DO);
}
+ rcu_read_unlock();
return rv;
}
@@ -1400,10 +1478,8 @@ static int _drbd_resume_next(struct drbd_conf *mdev)
struct drbd_conf *odev;
int i, rv = 0;
- for (i = 0; i < minor_count; i++) {
- odev = minor_to_mdev(i);
- if (!odev)
- continue;
+ rcu_read_lock();
+ idr_for_each_entry(&minors, odev, i) {
if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
continue;
if (odev->state.aftr_isp) {
@@ -1413,6 +1489,7 @@ static int _drbd_resume_next(struct drbd_conf *mdev)
!= SS_NOTHING_TO_DO) ;
}
}
+ rcu_read_unlock();
return rv;
}
@@ -1430,57 +1507,86 @@ void suspend_other_sg(struct drbd_conf *mdev)
write_unlock_irq(&global_state_lock);
}
-static int sync_after_error(struct drbd_conf *mdev, int o_minor)
+/* caller must hold global_state_lock */
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
{
struct drbd_conf *odev;
+ int resync_after;
if (o_minor == -1)
return NO_ERROR;
if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
- return ERR_SYNC_AFTER;
+ return ERR_RESYNC_AFTER;
/* check for loops */
odev = minor_to_mdev(o_minor);
while (1) {
if (odev == mdev)
- return ERR_SYNC_AFTER_CYCLE;
+ return ERR_RESYNC_AFTER_CYCLE;
+ rcu_read_lock();
+ resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+ rcu_read_unlock();
/* dependency chain ends here, no cycles. */
- if (odev->sync_conf.after == -1)
+ if (resync_after == -1)
return NO_ERROR;
/* follow the dependency chain */
- odev = minor_to_mdev(odev->sync_conf.after);
+ odev = minor_to_mdev(resync_after);
}
}
-int drbd_alter_sa(struct drbd_conf *mdev, int na)
+/* caller must hold global_state_lock */
+void drbd_resync_after_changed(struct drbd_conf *mdev)
{
int changes;
- int retcode;
- write_lock_irq(&global_state_lock);
- retcode = sync_after_error(mdev, na);
- if (retcode == NO_ERROR) {
- mdev->sync_conf.after = na;
- do {
- changes = _drbd_pause_after(mdev);
- changes |= _drbd_resume_next(mdev);
- } while (changes);
- }
- write_unlock_irq(&global_state_lock);
- return retcode;
+ do {
+ changes = _drbd_pause_after(mdev);
+ changes |= _drbd_resume_next(mdev);
+ } while (changes);
}
void drbd_rs_controller_reset(struct drbd_conf *mdev)
{
+ struct fifo_buffer *plan;
+
atomic_set(&mdev->rs_sect_in, 0);
atomic_set(&mdev->rs_sect_ev, 0);
mdev->rs_in_flight = 0;
- mdev->rs_planed = 0;
- spin_lock(&mdev->peer_seq_lock);
- fifo_set(&mdev->rs_plan_s, 0);
- spin_unlock(&mdev->peer_seq_lock);
+
+ /* Updating the RCU protected object in place is necessary since
+ this function gets called from atomic context.
+ It is valid since all other updates also lead to an completely
+ empty fifo */
+ rcu_read_lock();
+ plan = rcu_dereference(mdev->rs_plan_s);
+ plan->total = 0;
+ fifo_set(plan, 0);
+ rcu_read_unlock();
+}
+
+void start_resync_timer_fn(unsigned long data)
+{
+ struct drbd_conf *mdev = (struct drbd_conf *) data;
+
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->start_resync_work);
+}
+
+int w_start_resync(struct drbd_work *w, int cancel)
+{
+ struct drbd_conf *mdev = w->mdev;
+
+ if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
+ dev_warn(DEV, "w_start_resync later...\n");
+ mdev->start_resync_timer.expires = jiffies + HZ/10;
+ add_timer(&mdev->start_resync_timer);
+ return 0;
+ }
+
+ drbd_start_resync(mdev, C_SYNC_SOURCE);
+ clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags);
+ return 0;
}
/**
@@ -1501,43 +1607,58 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
return;
}
- if (side == C_SYNC_TARGET) {
- /* Since application IO was locked out during C_WF_BITMAP_T and
- C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
- we check that we might make the data inconsistent. */
- r = drbd_khelper(mdev, "before-resync-target");
- r = (r >> 8) & 0xff;
- if (r > 0) {
- dev_info(DEV, "before-resync-target handler returned %d, "
- "dropping connection.\n", r);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return;
- }
- } else /* C_SYNC_SOURCE */ {
- r = drbd_khelper(mdev, "before-resync-source");
- r = (r >> 8) & 0xff;
- if (r > 0) {
- if (r == 3) {
- dev_info(DEV, "before-resync-source handler returned %d, "
- "ignoring. Old userland tools?", r);
- } else {
- dev_info(DEV, "before-resync-source handler returned %d, "
+ if (!test_bit(B_RS_H_DONE, &mdev->flags)) {
+ if (side == C_SYNC_TARGET) {
+ /* Since application IO was locked out during C_WF_BITMAP_T and
+ C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
+ we check that we might make the data inconsistent. */
+ r = drbd_khelper(mdev, "before-resync-target");
+ r = (r >> 8) & 0xff;
+ if (r > 0) {
+ dev_info(DEV, "before-resync-target handler returned %d, "
"dropping connection.\n", r);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
return;
}
+ } else /* C_SYNC_SOURCE */ {
+ r = drbd_khelper(mdev, "before-resync-source");
+ r = (r >> 8) & 0xff;
+ if (r > 0) {
+ if (r == 3) {
+ dev_info(DEV, "before-resync-source handler returned %d, "
+ "ignoring. Old userland tools?", r);
+ } else {
+ dev_info(DEV, "before-resync-source handler returned %d, "
+ "dropping connection.\n", r);
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ return;
+ }
+ }
}
}
- drbd_state_lock(mdev);
+ if (current == mdev->tconn->worker.task) {
+ /* The worker should not sleep waiting for state_mutex,
+ that can take long */
+ if (!mutex_trylock(mdev->state_mutex)) {
+ set_bit(B_RS_H_DONE, &mdev->flags);
+ mdev->start_resync_timer.expires = jiffies + HZ/5;
+ add_timer(&mdev->start_resync_timer);
+ return;
+ }
+ } else {
+ mutex_lock(mdev->state_mutex);
+ }
+ clear_bit(B_RS_H_DONE, &mdev->flags);
+
write_lock_irq(&global_state_lock);
if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
write_unlock_irq(&global_state_lock);
- drbd_state_unlock(mdev);
+ mutex_unlock(mdev->state_mutex);
return;
}
- ns.i = mdev->state.i;
+ ns = drbd_read_state(mdev);
ns.aftr_isp = !_drbd_may_sync_now(mdev);
@@ -1549,7 +1670,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
ns.pdsk = D_INCONSISTENT;
r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
- ns = mdev->state;
+ ns = drbd_read_state(mdev);
if (ns.conn < C_CONNECTED)
r = SS_UNKNOWN_ERROR;
@@ -1575,6 +1696,10 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
write_unlock_irq(&global_state_lock);
if (r == SS_SUCCESS) {
+ /* reset rs_last_bcast when a resync or verify is started,
+ * to deal with potential jiffies wrap. */
+ mdev->rs_last_bcast = jiffies - HZ;
+
dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
drbd_conn_str(ns.conn),
(unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
@@ -1589,10 +1714,10 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
* drbd_resync_finished from here in that case.
* We drbd_gen_and_send_sync_uuid here for protocol < 96,
* and from after_state_ch otherwise. */
- if (side == C_SYNC_SOURCE && mdev->agreed_pro_version < 96)
+ if (side == C_SYNC_SOURCE && mdev->tconn->agreed_pro_version < 96)
drbd_gen_and_send_sync_uuid(mdev);
- if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
+ if (mdev->tconn->agreed_pro_version < 95 && mdev->rs_total == 0) {
/* This still has a race (about when exactly the peers
* detect connection loss) that can lead to a full sync
* on next handshake. In 8.3.9 we fixed this with explicit
@@ -1603,10 +1728,16 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
* detect connection loss, then waiting for a ping
* response (implicit in drbd_resync_finished) reduces
* the race considerably, but does not solve it. */
- if (side == C_SYNC_SOURCE)
- schedule_timeout_interruptible(
- mdev->net_conf->ping_int * HZ +
- mdev->net_conf->ping_timeo*HZ/9);
+ if (side == C_SYNC_SOURCE) {
+ struct net_conf *nc;
+ int timeo;
+
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeo);
+ }
drbd_resync_finished(mdev);
}
@@ -1621,114 +1752,180 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
drbd_md_sync(mdev);
}
put_ldev(mdev);
- drbd_state_unlock(mdev);
+ mutex_unlock(mdev->state_mutex);
}
-int drbd_worker(struct drbd_thread *thi)
+/* If the resource already closed the current epoch, but we did not
+ * (because we have not yet seen new requests), we should send the
+ * corresponding barrier now. Must be checked within the same spinlock
+ * that is used to check for new requests. */
+bool need_to_send_barrier(struct drbd_tconn *connection)
{
- struct drbd_conf *mdev = thi->mdev;
- struct drbd_work *w = NULL;
- LIST_HEAD(work_list);
- int intr = 0, i;
+ if (!connection->send.seen_any_write_yet)
+ return false;
+
+ /* Skip barriers that do not contain any writes.
+ * This may happen during AHEAD mode. */
+ if (!connection->send.current_epoch_writes)
+ return false;
+
+ /* ->req_lock is held when requests are queued on
+ * connection->sender_work, and put into ->transfer_log.
+ * It is also held when ->current_tle_nr is increased.
+ * So either there are already new requests queued,
+ * and corresponding barriers will be send there.
+ * Or nothing new is queued yet, so the difference will be 1.
+ */
+ if (atomic_read(&connection->current_tle_nr) !=
+ connection->send.current_epoch_nr + 1)
+ return false;
+
+ return true;
+}
+
+bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
+{
+ spin_lock_irq(&queue->q_lock);
+ list_splice_init(&queue->q, work_list);
+ spin_unlock_irq(&queue->q_lock);
+ return !list_empty(work_list);
+}
- sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
+bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list)
+{
+ spin_lock_irq(&queue->q_lock);
+ if (!list_empty(&queue->q))
+ list_move(queue->q.next, work_list);
+ spin_unlock_irq(&queue->q_lock);
+ return !list_empty(work_list);
+}
- while (get_t_state(thi) == Running) {
- drbd_thread_current_set_cpu(mdev);
+void wait_for_work(struct drbd_tconn *connection, struct list_head *work_list)
+{
+ DEFINE_WAIT(wait);
+ struct net_conf *nc;
+ int uncork, cork;
- if (down_trylock(&mdev->data.work.s)) {
- mutex_lock(&mdev->data.mutex);
- if (mdev->data.socket && !mdev->net_conf->no_cork)
- drbd_tcp_uncork(mdev->data.socket);
- mutex_unlock(&mdev->data.mutex);
+ dequeue_work_item(&connection->sender_work, work_list);
+ if (!list_empty(work_list))
+ return;
- intr = down_interruptible(&mdev->data.work.s);
+ /* Still nothing to do?
+ * Maybe we still need to close the current epoch,
+ * even if no new requests are queued yet.
+ *
+ * Also, poke TCP, just in case.
+ * Then wait for new work (or signal). */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ uncork = nc ? nc->tcp_cork : 0;
+ rcu_read_unlock();
+ if (uncork) {
+ mutex_lock(&connection->data.mutex);
+ if (connection->data.socket)
+ drbd_tcp_uncork(connection->data.socket);
+ mutex_unlock(&connection->data.mutex);
+ }
- mutex_lock(&mdev->data.mutex);
- if (mdev->data.socket && !mdev->net_conf->no_cork)
- drbd_tcp_cork(mdev->data.socket);
- mutex_unlock(&mdev->data.mutex);
+ for (;;) {
+ int send_barrier;
+ prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_lock_irq(&connection->req_lock);
+ spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
+ /* dequeue single item only,
+ * we still use drbd_queue_work_front() in some places */
+ if (!list_empty(&connection->sender_work.q))
+ list_move(connection->sender_work.q.next, work_list);
+ spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
+ if (!list_empty(work_list) || signal_pending(current)) {
+ spin_unlock_irq(&connection->req_lock);
+ break;
}
+ send_barrier = need_to_send_barrier(connection);
+ spin_unlock_irq(&connection->req_lock);
+ if (send_barrier) {
+ drbd_send_barrier(connection);
+ connection->send.current_epoch_nr++;
+ }
+ schedule();
+ /* may be woken up for other things but new work, too,
+ * e.g. if the current epoch got closed.
+ * In which case we send the barrier above. */
+ }
+ finish_wait(&connection->sender_work.q_wait, &wait);
+
+ /* someone may have changed the config while we have been waiting above. */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ cork = nc ? nc->tcp_cork : 0;
+ rcu_read_unlock();
+ mutex_lock(&connection->data.mutex);
+ if (connection->data.socket) {
+ if (cork)
+ drbd_tcp_cork(connection->data.socket);
+ else if (!uncork)
+ drbd_tcp_uncork(connection->data.socket);
+ }
+ mutex_unlock(&connection->data.mutex);
+}
- if (intr) {
- D_ASSERT(intr == -EINTR);
+int drbd_worker(struct drbd_thread *thi)
+{
+ struct drbd_tconn *tconn = thi->tconn;
+ struct drbd_work *w = NULL;
+ struct drbd_conf *mdev;
+ LIST_HEAD(work_list);
+ int vnr;
+
+ while (get_t_state(thi) == RUNNING) {
+ drbd_thread_current_set_cpu(thi);
+
+ /* as long as we use drbd_queue_work_front(),
+ * we may only dequeue single work items here, not batches. */
+ if (list_empty(&work_list))
+ wait_for_work(tconn, &work_list);
+
+ if (signal_pending(current)) {
flush_signals(current);
- ERR_IF (get_t_state(thi) == Running)
+ if (get_t_state(thi) == RUNNING) {
+ conn_warn(tconn, "Worker got an unexpected signal\n");
continue;
+ }
break;
}
- if (get_t_state(thi) != Running)
+ if (get_t_state(thi) != RUNNING)
break;
- /* With this break, we have done a down() but not consumed
- the entry from the list. The cleanup code takes care of
- this... */
-
- w = NULL;
- spin_lock_irq(&mdev->data.work.q_lock);
- ERR_IF(list_empty(&mdev->data.work.q)) {
- /* something terribly wrong in our logic.
- * we were able to down() the semaphore,
- * but the list is empty... doh.
- *
- * what is the best thing to do now?
- * try again from scratch, restarting the receiver,
- * asender, whatnot? could break even more ugly,
- * e.g. when we are primary, but no good local data.
- *
- * I'll try to get away just starting over this loop.
- */
- spin_unlock_irq(&mdev->data.work.q_lock);
- continue;
- }
- w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
- list_del_init(&w->list);
- spin_unlock_irq(&mdev->data.work.q_lock);
-
- if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
- /* dev_warn(DEV, "worker: a callback failed! \n"); */
- if (mdev->state.conn >= C_CONNECTED)
- drbd_force_state(mdev,
- NS(conn, C_NETWORK_FAILURE));
+
+ while (!list_empty(&work_list)) {
+ w = list_first_entry(&work_list, struct drbd_work, list);
+ list_del_init(&w->list);
+ if (w->cb(w, tconn->cstate < C_WF_REPORT_PARAMS) == 0)
+ continue;
+ if (tconn->cstate >= C_WF_REPORT_PARAMS)
+ conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
}
}
- D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
- D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
-
- spin_lock_irq(&mdev->data.work.q_lock);
- i = 0;
- while (!list_empty(&mdev->data.work.q)) {
- list_splice_init(&mdev->data.work.q, &work_list);
- spin_unlock_irq(&mdev->data.work.q_lock);
+ do {
while (!list_empty(&work_list)) {
- w = list_entry(work_list.next, struct drbd_work, list);
+ w = list_first_entry(&work_list, struct drbd_work, list);
list_del_init(&w->list);
- w->cb(mdev, w, 1);
- i++; /* dead debugging code */
+ w->cb(w, 1);
}
-
- spin_lock_irq(&mdev->data.work.q_lock);
+ dequeue_work_batch(&tconn->sender_work, &work_list);
+ } while (!list_empty(&work_list));
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+ drbd_mdev_cleanup(mdev);
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ rcu_read_lock();
}
- sema_init(&mdev->data.work.s, 0);
- /* DANGEROUS race: if someone did queue his work within the spinlock,
- * but up() ed outside the spinlock, we could get an up() on the
- * semaphore without corresponding list entry.
- * So don't do that.
- */
- spin_unlock_irq(&mdev->data.work.q_lock);
-
- D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
- /* _drbd_set_state only uses stop_nowait.
- * wait here for the Exiting receiver. */
- drbd_thread_stop(&mdev->receiver);
- drbd_mdev_cleanup(mdev);
-
- dev_info(DEV, "worker terminated\n");
-
- clear_bit(DEVICE_DYING, &mdev->flags);
- clear_bit(CONFIG_PENDING, &mdev->flags);
- wake_up(&mdev->state_wait);
+ rcu_read_unlock();
return 0;
}
OpenPOWER on IntegriCloud