diff options
Diffstat (limited to 'fs/nfs/nfs4filelayout.c')
-rw-r--r-- | fs/nfs/nfs4filelayout.c | 688 |
1 files changed, 453 insertions, 235 deletions
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 5acfd9e..e134029 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -82,29 +82,76 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } +static void filelayout_reset_write(struct nfs_write_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + struct rpc_task *task = &data->task; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, + data->task.tk_pid, + hdr->inode->i_sb->s_id, + (long long)NFS_FILEID(hdr->inode), + data->args.count, + (unsigned long long)data->args.offset); + + task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode, + &hdr->pages, + hdr->completion_ops); + } +} + +static void filelayout_reset_read(struct nfs_read_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + struct rpc_task *task = &data->task; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, + data->task.tk_pid, + hdr->inode->i_sb->s_id, + (long long)NFS_FILEID(hdr->inode), + data->args.count, + (unsigned long long)data->args.offset); + + task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode, + &hdr->pages, + hdr->completion_ops); + } +} + static int filelayout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, - int *reset) + struct pnfs_layout_segment *lseg) { - struct nfs_server *mds_server = NFS_SERVER(state->inode); + struct inode *inode = lseg->pls_layout->plh_inode; + struct nfs_server *mds_server = NFS_SERVER(inode); + struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); struct nfs_client *mds_client = mds_server->nfs_client; + struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; if (task->tk_status >= 0) return 0; - *reset = 0; switch (task->tk_status) { /* MDS state errors */ case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: + if (state == NULL) + break; nfs_remove_bad_delegation(state->inode); case -NFS4ERR_OPENMODE: + if (state == NULL) + break; nfs4_schedule_stateid_recovery(mds_server, state); goto wait_on_recovery; case -NFS4ERR_EXPIRED: - nfs4_schedule_stateid_recovery(mds_server, state); + if (state != NULL) + nfs4_schedule_stateid_recovery(mds_server, state); nfs4_schedule_lease_recovery(mds_client); goto wait_on_recovery; /* DS session errors */ @@ -118,7 +165,7 @@ static int filelayout_async_handle_error(struct rpc_task *task, dprintk("%s ERROR %d, Reset session. Exchangeid " "flags 0x%x\n", __func__, task->tk_status, clp->cl_exchange_flags); - nfs4_schedule_session_recovery(clp->cl_session); + nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); break; case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: @@ -127,11 +174,48 @@ static int filelayout_async_handle_error(struct rpc_task *task, break; case -NFS4ERR_RETRY_UNCACHED_REP: break; + /* Invalidate Layout errors */ + case -NFS4ERR_PNFS_NO_LAYOUT: + case -ESTALE: /* mapped NFS4ERR_STALE */ + case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ + case -EISDIR: /* mapped NFS4ERR_ISDIR */ + case -NFS4ERR_FHEXPIRED: + case -NFS4ERR_WRONG_TYPE: + dprintk("%s Invalid layout error %d\n", __func__, + task->tk_status); + /* + * Destroy layout so new i/o will get a new layout. + * Layout will not be destroyed until all current lseg + * references are put. Mark layout as invalid to resend failed + * i/o and all i/o waiting on the slot table to the MDS until + * layout is destroyed and a new valid layout is obtained. + */ + set_bit(NFS_LAYOUT_INVALID, + &NFS_I(inode)->layout->plh_flags); + pnfs_destroy_layout(NFS_I(inode)); + rpc_wake_up(&tbl->slot_tbl_waitq); + goto reset; + /* RPC connection errors */ + case -ECONNREFUSED: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -EIO: + case -ETIMEDOUT: + case -EPIPE: + dprintk("%s DS connection error %d\n", __func__, + task->tk_status); + if (!filelayout_test_devid_invalid(devid)) + _pnfs_return_layout(inode); + filelayout_mark_devid_invalid(devid); + rpc_wake_up(&tbl->slot_tbl_waitq); + nfs4_ds_disconnect(clp); + /* fall through */ default: - dprintk("%s DS error. Retry through MDS %d\n", __func__, +reset: + dprintk("%s Retry through MDS. Error %d\n", __func__, task->tk_status); - *reset = 1; - break; + return -NFS4ERR_RESET_TO_MDS; } out: task->tk_status = 0; @@ -148,18 +232,17 @@ wait_on_recovery: static int filelayout_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { - int reset = 0; + struct nfs_pgio_header *hdr = data->header; + int err; - dprintk("%s DS read\n", __func__); + err = filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, hdr->lseg); - if (filelayout_async_handle_error(task, data->args.context->state, - data->ds_clp, &reset) == -EAGAIN) { - dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", - __func__, data->ds_clp, data->ds_clp->cl_session); - if (reset) { - pnfs_set_lo_fail(data->lseg); - nfs4_reset_read(task, data); - } + switch (err) { + case -NFS4ERR_RESET_TO_MDS: + filelayout_reset_read(data); + return task->tk_status; + case -EAGAIN: rpc_restart_call_prepare(task); return -EAGAIN; } @@ -175,13 +258,15 @@ static int filelayout_read_done_cb(struct rpc_task *task, static void filelayout_set_layoutcommit(struct nfs_write_data *wdata) { - if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds || + struct nfs_pgio_header *hdr = wdata->header; + + if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || wdata->res.verf->committed == NFS_FILE_SYNC) return; pnfs_set_layoutcommit(wdata); - dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, - (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb); + dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, + (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } /* @@ -191,8 +276,14 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata) */ static void filelayout_read_prepare(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = (struct nfs_read_data *)data; + struct nfs_read_data *rdata = data; + if (filelayout_reset_to_mds(rdata->header->lseg)) { + dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); + filelayout_reset_read(rdata); + rpc_exit(task, 0); + return; + } rdata->read_done_cb = filelayout_read_done_cb; if (nfs41_setup_sequence(rdata->ds_clp->cl_session, @@ -205,42 +296,47 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) static void filelayout_read_call_done(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = (struct nfs_read_data *)data; + struct nfs_read_data *rdata = data; dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && + task->tk_status == 0) + return; + /* Note this may cause RPC to be resent */ - rdata->mds_ops->rpc_call_done(task, data); + rdata->header->mds_ops->rpc_call_done(task, data); } static void filelayout_read_count_stats(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = (struct nfs_read_data *)data; + struct nfs_read_data *rdata = data; - rpc_count_iostats(task, NFS_SERVER(rdata->inode)->client->cl_metrics); + rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics); } static void filelayout_read_release(void *data) { - struct nfs_read_data *rdata = (struct nfs_read_data *)data; + struct nfs_read_data *rdata = data; - put_lseg(rdata->lseg); - rdata->mds_ops->rpc_release(data); + nfs_put_client(rdata->ds_clp); + rdata->header->mds_ops->rpc_release(data); } static int filelayout_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) { - int reset = 0; - - if (filelayout_async_handle_error(task, data->args.context->state, - data->ds_clp, &reset) == -EAGAIN) { - dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", - __func__, data->ds_clp, data->ds_clp->cl_session); - if (reset) { - pnfs_set_lo_fail(data->lseg); - nfs4_reset_write(task, data); - } + struct nfs_pgio_header *hdr = data->header; + int err; + + err = filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, hdr->lseg); + + switch (err) { + case -NFS4ERR_RESET_TO_MDS: + filelayout_reset_write(data); + return task->tk_status; + case -EAGAIN: rpc_restart_call_prepare(task); return -EAGAIN; } @@ -250,7 +346,7 @@ static int filelayout_write_done_cb(struct rpc_task *task, } /* Fake up some data that will cause nfs_commit_release to retry the writes. */ -static void prepare_to_resend_writes(struct nfs_write_data *data) +static void prepare_to_resend_writes(struct nfs_commit_data *data) { struct nfs_page *first = nfs_list_entry(data->pages.next); @@ -261,19 +357,19 @@ static void prepare_to_resend_writes(struct nfs_write_data *data) } static int filelayout_commit_done_cb(struct rpc_task *task, - struct nfs_write_data *data) + struct nfs_commit_data *data) { - int reset = 0; - - if (filelayout_async_handle_error(task, data->args.context->state, - data->ds_clp, &reset) == -EAGAIN) { - dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", - __func__, data->ds_clp, data->ds_clp->cl_session); - if (reset) { - prepare_to_resend_writes(data); - pnfs_set_lo_fail(data->lseg); - } else - rpc_restart_call_prepare(task); + int err; + + err = filelayout_async_handle_error(task, NULL, data->ds_clp, + data->lseg); + + switch (err) { + case -NFS4ERR_RESET_TO_MDS: + prepare_to_resend_writes(data); + return -EAGAIN; + case -EAGAIN: + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -282,8 +378,14 @@ static int filelayout_commit_done_cb(struct rpc_task *task, static void filelayout_write_prepare(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = (struct nfs_write_data *)data; + struct nfs_write_data *wdata = data; + if (filelayout_reset_to_mds(wdata->header->lseg)) { + dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); + filelayout_reset_write(wdata); + rpc_exit(task, 0); + return; + } if (nfs41_setup_sequence(wdata->ds_clp->cl_session, &wdata->args.seq_args, &wdata->res.seq_res, task)) @@ -294,36 +396,66 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) static void filelayout_write_call_done(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = (struct nfs_write_data *)data; + struct nfs_write_data *wdata = data; + + if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && + task->tk_status == 0) + return; /* Note this may cause RPC to be resent */ - wdata->mds_ops->rpc_call_done(task, data); + wdata->header->mds_ops->rpc_call_done(task, data); } static void filelayout_write_count_stats(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = (struct nfs_write_data *)data; + struct nfs_write_data *wdata = data; - rpc_count_iostats(task, NFS_SERVER(wdata->inode)->client->cl_metrics); + rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics); } static void filelayout_write_release(void *data) { - struct nfs_write_data *wdata = (struct nfs_write_data *)data; + struct nfs_write_data *wdata = data; + + nfs_put_client(wdata->ds_clp); + wdata->header->mds_ops->rpc_release(data); +} + +static void filelayout_commit_prepare(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *wdata = data; - put_lseg(wdata->lseg); - wdata->mds_ops->rpc_release(data); + if (nfs41_setup_sequence(wdata->ds_clp->cl_session, + &wdata->args.seq_args, &wdata->res.seq_res, + task)) + return; + + rpc_call_start(task); +} + +static void filelayout_write_commit_done(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *wdata = data; + + /* Note this may cause RPC to be resent */ + wdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_commit_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *cdata = data; + + rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics); } -static void filelayout_commit_release(void *data) +static void filelayout_commit_release(void *calldata) { - struct nfs_write_data *wdata = (struct nfs_write_data *)data; + struct nfs_commit_data *data = calldata; - nfs_commit_release_pages(wdata); - if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding)) - nfs_commit_clear_lock(NFS_I(wdata->inode)); - put_lseg(wdata->lseg); - nfs_commitdata_release(wdata); + data->completion_ops->completion(data); + put_lseg(data->lseg); + nfs_put_client(data->ds_clp); + nfs_commitdata_release(data); } static const struct rpc_call_ops filelayout_read_call_ops = { @@ -341,16 +473,17 @@ static const struct rpc_call_ops filelayout_write_call_ops = { }; static const struct rpc_call_ops filelayout_commit_call_ops = { - .rpc_call_prepare = filelayout_write_prepare, - .rpc_call_done = filelayout_write_call_done, - .rpc_count_stats = filelayout_write_count_stats, + .rpc_call_prepare = filelayout_commit_prepare, + .rpc_call_done = filelayout_write_commit_done, + .rpc_count_stats = filelayout_commit_count_stats, .rpc_release = filelayout_commit_release, }; static enum pnfs_try_status filelayout_read_pagelist(struct nfs_read_data *data) { - struct pnfs_layout_segment *lseg = data->lseg; + struct nfs_pgio_header *hdr = data->header; + struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; loff_t offset = data->args.offset; u32 j, idx; @@ -358,25 +491,20 @@ filelayout_read_pagelist(struct nfs_read_data *data) int status; dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", - __func__, data->inode->i_ino, + __func__, hdr->inode->i_ino, data->args.pgbase, (size_t)data->args.count, offset); - if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags)) - return PNFS_NOT_ATTEMPTED; - /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); idx = nfs4_fl_calc_ds_index(lseg, j); ds = nfs4_fl_prepare_ds(lseg, idx); - if (!ds) { - /* Either layout fh index faulty, or ds connect failed */ - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + if (!ds) return PNFS_NOT_ATTEMPTED; - } - dprintk("%s USE DS: %s\n", __func__, ds->ds_remotestr); + dprintk("%s USE DS: %s cl_count %d\n", __func__, + ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); /* No multipath support. Use first DS */ + atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) @@ -386,8 +514,8 @@ filelayout_read_pagelist(struct nfs_read_data *data) data->mds_offset = offset; /* Perform an asynchronous read to ds */ - status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, - &filelayout_read_call_ops); + status = nfs_initiate_read(ds->ds_clp->cl_rpcclient, data, + &filelayout_read_call_ops, RPC_TASK_SOFTCONN); BUG_ON(status != 0); return PNFS_ATTEMPTED; } @@ -396,32 +524,26 @@ filelayout_read_pagelist(struct nfs_read_data *data) static enum pnfs_try_status filelayout_write_pagelist(struct nfs_write_data *data, int sync) { - struct pnfs_layout_segment *lseg = data->lseg; + struct nfs_pgio_header *hdr = data->header; + struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; loff_t offset = data->args.offset; u32 j, idx; struct nfs_fh *fh; int status; - if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags)) - return PNFS_NOT_ATTEMPTED; - /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); idx = nfs4_fl_calc_ds_index(lseg, j); ds = nfs4_fl_prepare_ds(lseg, idx); - if (!ds) { - printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n", - __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + if (!ds) return PNFS_NOT_ATTEMPTED; - } - dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__, - data->inode->i_ino, sync, (size_t) data->args.count, offset, - ds->ds_remotestr); + dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", + __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, + offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); data->write_done_cb = filelayout_write_done_cb; + atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) @@ -433,8 +555,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) data->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, - &filelayout_write_call_ops, sync); + status = nfs_initiate_write(ds->ds_clp->cl_rpcclient, data, + &filelayout_write_call_ops, sync, + RPC_TASK_SOFTCONN); BUG_ON(status != 0); return PNFS_ATTEMPTED; } @@ -650,10 +773,65 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg) dprintk("--> %s\n", __func__); nfs4_fl_put_deviceid(fl->dsaddr); - kfree(fl->commit_buckets); + /* This assumes a single RW lseg */ + if (lseg->pls_range.iomode == IOMODE_RW) { + struct nfs4_filelayout *flo; + + flo = FILELAYOUT_FROM_HDR(lseg->pls_layout); + flo->commit_info.nbuckets = 0; + kfree(flo->commit_info.buckets); + flo->commit_info.buckets = NULL; + } _filelayout_free_lseg(fl); } +static int +filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + gfp_t gfp_flags) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + struct pnfs_commit_bucket *buckets; + int size; + + if (fl->commit_through_mds) + return 0; + if (cinfo->ds->nbuckets != 0) { + /* This assumes there is only one IOMODE_RW lseg. What + * we really want to do is have a layout_hdr level + * dictionary of <multipath_list4, fh> keys, each + * associated with a struct list_head, populated by calls + * to filelayout_write_pagelist(). + * */ + return 0; + } + + size = (fl->stripe_type == STRIPE_SPARSE) ? + fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), + gfp_flags); + if (!buckets) + return -ENOMEM; + else { + int i; + + spin_lock(cinfo->lock); + if (cinfo->ds->nbuckets != 0) + kfree(buckets); + else { + cinfo->ds->buckets = buckets; + cinfo->ds->nbuckets = size; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&buckets[i].written); + INIT_LIST_HEAD(&buckets[i].committing); + } + } + spin_unlock(cinfo->lock); + return 0; + } +} + static struct pnfs_layout_segment * filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, @@ -673,29 +851,6 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, _filelayout_free_lseg(fl); return NULL; } - - /* This assumes there is only one IOMODE_RW lseg. What - * we really want to do is have a layout_hdr level - * dictionary of <multipath_list4, fh> keys, each - * associated with a struct list_head, populated by calls - * to filelayout_write_pagelist(). - * */ - if ((!fl->commit_through_mds) && (lgr->range.iomode == IOMODE_RW)) { - int i; - int size = (fl->stripe_type == STRIPE_SPARSE) ? - fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - - fl->commit_buckets = kcalloc(size, sizeof(struct nfs4_fl_commit_bucket), gfp_flags); - if (!fl->commit_buckets) { - filelayout_free_lseg(&fl->generic_hdr); - return NULL; - } - fl->number_of_buckets = size; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&fl->commit_buckets[i].written); - INIT_LIST_HEAD(&fl->commit_buckets[i].committing); - } - } return &fl->generic_hdr; } @@ -716,8 +871,8 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, !nfs_generic_pg_test(pgio, prev, req)) return false; - p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; - r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; + p_stripe = (u64)req_offset(prev); + r_stripe = (u64)req_offset(req); stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; do_div(p_stripe, stripe_unit); @@ -732,6 +887,16 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, { BUG_ON(pgio->pg_lseg != NULL); + if (req->wb_offset != req->wb_pgbase) { + /* + * Handling unaligned pages is difficult, because have to + * somehow split a req in two in certain cases in the + * pg.test code. Avoid this by just not using pnfs + * in this case. + */ + nfs_pageio_reset_read_mds(pgio); + return; + } pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, @@ -747,8 +912,13 @@ static void filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { + struct nfs_commit_info cinfo; + int status; + BUG_ON(pgio->pg_lseg != NULL); + if (req->wb_offset != req->wb_pgbase) + goto out_mds; pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, @@ -757,7 +927,17 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, GFP_NOFS); /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) - nfs_pageio_reset_write_mds(pgio); + goto out_mds; + nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); + status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); + if (status < 0) { + put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + goto out_mds; + } + return; +out_mds: + nfs_pageio_reset_write_mds(pgio); } static const struct nfs_pageio_ops filelayout_pg_read_ops = { @@ -784,43 +964,42 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) * If this will make the bucket empty, it will need to put the lseg reference. */ static void -filelayout_clear_request_commit(struct nfs_page *req) +filelayout_clear_request_commit(struct nfs_page *req, + struct nfs_commit_info *cinfo) { struct pnfs_layout_segment *freeme = NULL; - struct inode *inode = req->wb_context->dentry->d_inode; - spin_lock(&inode->i_lock); + spin_lock(cinfo->lock); if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) goto out; + cinfo->ds->nwritten--; if (list_is_singular(&req->wb_list)) { - struct pnfs_layout_segment *lseg; + struct pnfs_commit_bucket *bucket; - /* From here we can find the bucket, but for the moment, - * since there is only one relevant lseg... - */ - list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { - if (lseg->pls_range.iomode == IOMODE_RW) { - freeme = lseg; - break; - } - } + bucket = list_first_entry(&req->wb_list, + struct pnfs_commit_bucket, + written); + freeme = bucket->wlseg; + bucket->wlseg = NULL; } out: - nfs_request_remove_commit_list(req); - spin_unlock(&inode->i_lock); + nfs_request_remove_commit_list(req, cinfo); + spin_unlock(cinfo->lock); put_lseg(freeme); } static struct list_head * filelayout_choose_commit_list(struct nfs_page *req, - struct pnfs_layout_segment *lseg) + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) { struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); u32 i, j; struct list_head *list; + struct pnfs_commit_bucket *buckets; if (fl->commit_through_mds) - return &NFS_I(req->wb_context->dentry->d_inode)->commit_list; + return &cinfo->mds->list; /* Note that we are calling nfs4_fl_calc_j_index on each page * that ends up being committed to a data server. An attractive @@ -828,31 +1007,33 @@ filelayout_choose_commit_list(struct nfs_page *req, * to store the value calculated in filelayout_write_pagelist * and just use that here. */ - j = nfs4_fl_calc_j_index(lseg, - (loff_t)req->wb_index << PAGE_CACHE_SHIFT); + j = nfs4_fl_calc_j_index(lseg, req_offset(req)); i = select_bucket_index(fl, j); - list = &fl->commit_buckets[i].written; + buckets = cinfo->ds->buckets; + list = &buckets[i].written; if (list_empty(list)) { /* Non-empty buckets hold a reference on the lseg. That ref * is normally transferred to the COMMIT call and released * there. It could also be released if the last req is pulled * off due to a rewrite, in which case it will be done in - * filelayout_remove_commit_req + * filelayout_clear_request_commit */ - get_lseg(lseg); + buckets[i].wlseg = get_lseg(lseg); } set_bit(PG_COMMIT_TO_DS, &req->wb_flags); + cinfo->ds->nwritten++; return list; } static void filelayout_mark_request_commit(struct nfs_page *req, - struct pnfs_layout_segment *lseg) + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) { struct list_head *list; - list = filelayout_choose_commit_list(req, lseg); - nfs_request_add_commit_list(req, list); + list = filelayout_choose_commit_list(req, lseg, cinfo); + nfs_request_add_commit_list(req, list, cinfo); } static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) @@ -880,7 +1061,7 @@ select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) return flseg->fh_array[i]; } -static int filelayout_initiate_commit(struct nfs_write_data *data, int how) +static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) { struct pnfs_layout_segment *lseg = data->lseg; struct nfs4_pnfs_ds *ds; @@ -890,135 +1071,138 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how) idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); ds = nfs4_fl_prepare_ds(lseg, idx); if (!ds) { - printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n", - __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); prepare_to_resend_writes(data); filelayout_commit_release(data); return -EAGAIN; } - dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how); - data->write_done_cb = filelayout_commit_done_cb; + dprintk("%s ino %lu, how %d cl_count %d\n", __func__, + data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count)); + data->commit_done_cb = filelayout_commit_done_cb; + atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) data->args.fh = fh; - return nfs_initiate_commit(data, ds->ds_clp->cl_rpcclient, - &filelayout_commit_call_ops, how); -} - -/* - * This is only useful while we are using whole file layouts. - */ -static struct pnfs_layout_segment * -find_only_write_lseg_locked(struct inode *inode) -{ - struct pnfs_layout_segment *lseg; - - list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) - if (lseg->pls_range.iomode == IOMODE_RW) - return lseg; - return NULL; -} - -static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode) -{ - struct pnfs_layout_segment *rv; - - spin_lock(&inode->i_lock); - rv = find_only_write_lseg_locked(inode); - if (rv) - get_lseg(rv); - spin_unlock(&inode->i_lock); - return rv; + return nfs_initiate_commit(ds->ds_clp->cl_rpcclient, data, + &filelayout_commit_call_ops, how, + RPC_TASK_SOFTCONN); } static int -filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max, - spinlock_t *lock) +transfer_commit_list(struct list_head *src, struct list_head *dst, + struct nfs_commit_info *cinfo, int max) { - struct list_head *src = &bucket->written; - struct list_head *dst = &bucket->committing; struct nfs_page *req, *tmp; int ret = 0; list_for_each_entry_safe(req, tmp, src, wb_list) { if (!nfs_lock_request(req)) continue; - if (cond_resched_lock(lock)) + kref_get(&req->wb_kref); + if (cond_resched_lock(cinfo->lock)) list_safe_reset_next(req, tmp, wb_list); - nfs_request_remove_commit_list(req); + nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); nfs_list_add_request(req, dst); ret++; - if (ret == max) + if ((ret == max) && !cinfo->dreq) break; } return ret; } +static int +filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo, + int max) +{ + struct list_head *src = &bucket->written; + struct list_head *dst = &bucket->committing; + int ret; + + ret = transfer_commit_list(src, dst, cinfo, max); + if (ret) { + cinfo->ds->nwritten -= ret; + cinfo->ds->ncommitting += ret; + bucket->clseg = bucket->wlseg; + if (list_empty(src)) + bucket->wlseg = NULL; + else + get_lseg(bucket->clseg); + } + return ret; +} + /* Move reqs from written to committing lists, returning count of number moved. - * Note called with i_lock held. + * Note called with cinfo->lock held. */ -static int filelayout_scan_commit_lists(struct inode *inode, int max, - spinlock_t *lock) +static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo, + int max) { - struct pnfs_layout_segment *lseg; - struct nfs4_filelayout_segment *fl; int i, rv = 0, cnt; - lseg = find_only_write_lseg_locked(inode); - if (!lseg) - goto out_done; - fl = FILELAYOUT_LSEG(lseg); - if (fl->commit_through_mds) - goto out_done; - for (i = 0; i < fl->number_of_buckets && max != 0; i++) { - cnt = filelayout_scan_ds_commit_list(&fl->commit_buckets[i], - max, lock); + for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { + cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i], + cinfo, max); max -= cnt; rv += cnt; } -out_done: return rv; } +/* Pull everything off the committing lists and dump into @dst */ +static void filelayout_recover_commit_reqs(struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + struct pnfs_commit_bucket *b; + int i; + + /* NOTE cinfo->lock is NOT held, relying on fact that this is + * only called on single thread per dreq. + * Can't take the lock because need to do put_lseg + */ + for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { + if (transfer_commit_list(&b->written, dst, cinfo, 0)) { + BUG_ON(!list_empty(&b->written)); + put_lseg(b->wlseg); + b->wlseg = NULL; + } + } + cinfo->ds->nwritten = 0; +} + static unsigned int -alloc_ds_commits(struct inode *inode, struct list_head *list) +alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) { - struct pnfs_layout_segment *lseg; - struct nfs4_filelayout_segment *fl; - struct nfs_write_data *data; + struct pnfs_ds_commit_info *fl_cinfo; + struct pnfs_commit_bucket *bucket; + struct nfs_commit_data *data; int i, j; unsigned int nreq = 0; - /* Won't need this when non-whole file layout segments are supported - * instead we will use a pnfs_layout_hdr structure */ - lseg = find_only_write_lseg(inode); - if (!lseg) - return 0; - fl = FILELAYOUT_LSEG(lseg); - for (i = 0; i < fl->number_of_buckets; i++) { - if (list_empty(&fl->commit_buckets[i].committing)) + fl_cinfo = cinfo->ds; + bucket = fl_cinfo->buckets; + for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { + if (list_empty(&bucket->committing)) continue; data = nfs_commitdata_alloc(); if (!data) break; data->ds_commit_index = i; - data->lseg = lseg; + data->lseg = bucket->clseg; + bucket->clseg = NULL; list_add(&data->pages, list); nreq++; } /* Clean up on error */ - for (j = i; j < fl->number_of_buckets; j++) { - if (list_empty(&fl->commit_buckets[i].committing)) + for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) { + if (list_empty(&bucket->committing)) continue; - nfs_retry_commit(&fl->commit_buckets[i].committing, lseg); - put_lseg(lseg); /* associated with emptying bucket */ + nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); + put_lseg(bucket->clseg); + bucket->clseg = NULL; } - put_lseg(lseg); /* Caller will clean up entries put on list */ return nreq; } @@ -1026,9 +1210,9 @@ alloc_ds_commits(struct inode *inode, struct list_head *list) /* This follows nfs_commit_list pretty closely */ static int filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, - int how) + int how, struct nfs_commit_info *cinfo) { - struct nfs_write_data *data, *tmp; + struct nfs_commit_data *data, *tmp; LIST_HEAD(list); unsigned int nreq = 0; @@ -1039,30 +1223,34 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, list_add(&data->pages, &list); nreq++; } else - nfs_retry_commit(mds_pages, NULL); + nfs_retry_commit(mds_pages, NULL, cinfo); } - nreq += alloc_ds_commits(inode, &list); + nreq += alloc_ds_commits(cinfo, &list); if (nreq == 0) { - nfs_commit_clear_lock(NFS_I(inode)); + cinfo->completion_ops->error_cleanup(NFS_I(inode)); goto out; } - atomic_add(nreq, &NFS_I(inode)->commits_outstanding); + atomic_add(nreq, &cinfo->mds->rpcs_out); list_for_each_entry_safe(data, tmp, &list, pages) { list_del_init(&data->pages); if (!data->lseg) { - nfs_init_commit(data, mds_pages, NULL); - nfs_initiate_commit(data, NFS_CLIENT(inode), - data->mds_ops, how); + nfs_init_commit(data, mds_pages, NULL, cinfo); + nfs_initiate_commit(NFS_CLIENT(inode), data, + data->mds_ops, how, 0); } else { - nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index].committing, data->lseg); + struct pnfs_commit_bucket *buckets; + + buckets = cinfo->ds->buckets; + nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo); filelayout_initiate_commit(data, how); } } out: + cinfo->ds->ncommitting = 0; return PNFS_ATTEMPTED; } @@ -1072,17 +1260,47 @@ filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); } +static struct pnfs_layout_hdr * +filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct nfs4_filelayout *flo; + + flo = kzalloc(sizeof(*flo), gfp_flags); + return &flo->generic_hdr; +} + +static void +filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + kfree(FILELAYOUT_FROM_HDR(lo)); +} + +static struct pnfs_ds_commit_info * +filelayout_get_ds_info(struct inode *inode) +{ + struct pnfs_layout_hdr *layout = NFS_I(inode)->layout; + + if (layout == NULL) + return NULL; + else + return &FILELAYOUT_FROM_HDR(layout)->commit_info; +} + static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, + .alloc_layout_hdr = filelayout_alloc_layout_hdr, + .free_layout_hdr = filelayout_free_layout_hdr, .alloc_lseg = filelayout_alloc_lseg, .free_lseg = filelayout_free_lseg, .pg_read_ops = &filelayout_pg_read_ops, .pg_write_ops = &filelayout_pg_write_ops, + .get_ds_info = &filelayout_get_ds_info, .mark_request_commit = filelayout_mark_request_commit, .clear_request_commit = filelayout_clear_request_commit, .scan_commit_lists = filelayout_scan_commit_lists, + .recover_commit_reqs = filelayout_recover_commit_reqs, .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, |