From a9e2ae39170d01937725e1fff2e606baaa71346c Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 24 Mar 2006 14:20:17 -0800 Subject: ocfs2: Better I/O error handling in heartbeat Propagate errors received in o2hb_bio_end_io() back to the heartbeat thread so it can skip re-arming the timer. Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/heartbeat.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'fs/ocfs2/cluster') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bff0f0d..21f38ac 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -153,6 +153,7 @@ struct o2hb_region { struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; + int wc_error; }; static void o2hb_write_timeout(void *arg) @@ -186,6 +187,7 @@ static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, { atomic_set(&wc->wc_num_reqs, num_ios); init_completion(&wc->wc_io_complete); + wc->wc_error = 0; } /* Used in error paths too */ @@ -218,8 +220,10 @@ static int o2hb_bio_end_io(struct bio *bio, { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (error) + if (error) { mlog(ML_ERROR, "IO Error %d\n", error); + wc->wc_error = error; + } if (bio->bi_size) return 1; @@ -390,6 +394,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, bail_and_wait: o2hb_wait_on_io(reg, &wc); + if (wc.wc_error && !status) + status = wc.wc_error; if (bios) { for(i = 0; i < num_bios; i++) @@ -790,20 +796,24 @@ static int o2hb_highest_node(unsigned long *nodes, return highest; } -static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) +static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct bio *write_bio; struct o2hb_bio_wait_ctxt write_wc; - if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes))) - return; + ret = o2nm_configured_node_map(configured_nodes, + sizeof(configured_nodes)); + if (ret) { + mlog_errno(ret); + return ret; + } highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); - return; + return -EINVAL; } /* No sense in reading the slots of nodes that don't exist @@ -813,7 +823,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_read_slots(reg, highest_node + 1); if (ret < 0) { mlog_errno(ret); - return; + return ret; } /* With an up to date view of the slots, we can check that no @@ -831,7 +841,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); if (ret < 0) { mlog_errno(ret); - return; + return ret; } i = -1; @@ -847,6 +857,15 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) */ o2hb_wait_on_io(reg, &write_wc); bio_put(write_bio); + if (write_wc.wc_error) { + /* Do not re-arm the write timeout on I/O error - we + * can't be sure that the new block ever made it to + * disk */ + mlog(ML_ERROR, "Write error %d on device \"%s\"\n", + write_wc.wc_error, reg->hr_dev_name); + return write_wc.wc_error; + } + o2hb_arm_write_timeout(reg); /* let the person who launched us know when things are steady */ @@ -854,6 +873,8 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) if (atomic_dec_and_test(®->hr_steady_iterations)) wake_up(&o2hb_steady_queue); } + + return 0; } /* Subtract b from a, storing the result in a. a *must* have a larger @@ -913,7 +934,10 @@ static int o2hb_thread(void *data) * likely to time itself out. */ do_gettimeofday(&before_hb); - o2hb_do_disk_heartbeat(reg); + i = 0; + do { + ret = o2hb_do_disk_heartbeat(reg); + } while (ret && ++i < 2); do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); -- cgit v1.1