diff options
55 files changed, 1748 insertions, 1500 deletions
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 8664417..6abdda2 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o CFLAGS_nfstrace.o += -I$(src) nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ - direct.o pagelist.o read.o symlink.o unlink.o \ + io.o direct.o pagelist.o read.o symlink.o unlink.o \ write.o namespace.o mount_clnt.o nfstrace.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index e5b8967..a69ef4e 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) if (!p) return -EIO; b->simple.nr_sigs = be32_to_cpup(p++); - if (!b->simple.nr_sigs) { - dprintk("no signature\n"); + if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) { + dprintk("Bad signature count: %d\n", b->simple.nr_sigs); return -EIO; } @@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) memcpy(&b->simple.sigs[i].sig, p, b->simple.sigs[i].sig_len); - b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; + b->simple.len += 8 + 4 + \ + (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2); } break; case PNFS_BLOCK_VOLUME_SLICE: @@ -104,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) p = xdr_inline_decode(xdr, 4); if (!p) return -EIO; + b->concat.volumes_count = be32_to_cpup(p++); + if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) { + dprintk("Too many volumes: %d\n", b->concat.volumes_count); + return -EIO; + } p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); if (!p) @@ -116,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) p = xdr_inline_decode(xdr, 8 + 4); if (!p) return -EIO; + p = xdr_decode_hyper(p, &b->stripe.chunk_size); b->stripe.volumes_count = be32_to_cpup(p++); + if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) { + dprintk("Too many volumes: %d\n", b->stripe.volumes_count); + return -EIO; + } p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); if (!p) @@ -224,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; + struct block_device *bdev; dev_t dev; dev = bl_resolve_deviceid(server, v, gfp_mask); if (!dev) return -EIO; - d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); - if (IS_ERR(d->bdev)) { + bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); + if (IS_ERR(bdev)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", - MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); - return PTR_ERR(d->bdev); + MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); + return PTR_ERR(bdev); } + d->bdev = bdev; d->len = i_size_read(d->bdev->bd_inode); @@ -287,44 +300,71 @@ bl_validate_designator(struct pnfs_block_volume *v) } } +/* + * Try to open the udev path for the WWN. At least on Debian the udev + * by-id path will always point to the dm-multipath device if one exists. + */ +static struct block_device * +bl_open_udev_path(struct pnfs_block_volume *v) +{ + struct block_device *bdev; + const char *devname; + + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN", + v->scsi.designator_len, v->scsi.designator); + if (!devname) + return ERR_PTR(-ENOMEM); + + bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); + if (IS_ERR(bdev)) { + pr_warn("pNFS: failed to open device %s (%ld)\n", + devname, PTR_ERR(bdev)); + } + + kfree(devname); + return bdev; +} + +/* + * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the + * wwn- links will only point to the first discovered SCSI device there. + */ +static struct block_device * +bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v) +{ + struct block_device *bdev; + const char *devname; + + devname = kasprintf(GFP_KERNEL, + "/dev/disk/by-id/dm-uuid-mpath-%d%*phN", + v->scsi.designator_type, + v->scsi.designator_len, v->scsi.designator); + if (!devname) + return ERR_PTR(-ENOMEM); + + bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); + kfree(devname); + return bdev; +} + static int bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; + struct block_device *bdev; const struct pr_ops *ops; - const char *devname; int error; if (!bl_validate_designator(v)) return -EINVAL; - switch (v->scsi.designator_len) { - case 8: - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN", - v->scsi.designator); - break; - case 12: - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN", - v->scsi.designator); - break; - case 16: - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN", - v->scsi.designator); - break; - default: - return -EINVAL; - } - - d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL); - if (IS_ERR(d->bdev)) { - pr_warn("pNFS: failed to open device %s (%ld)\n", - devname, PTR_ERR(d->bdev)); - kfree(devname); - return PTR_ERR(d->bdev); - } - - kfree(devname); + bdev = bl_open_dm_mpath_udev_path(v); + if (IS_ERR(bdev)) + bdev = bl_open_udev_path(v); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + d->bdev = bdev; d->len = i_size_read(d->bdev->bd_inode); d->map = bl_map_simple; @@ -352,7 +392,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return 0; out_blkdev_put: - blkdev_put(d->bdev, FMODE_READ); + blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE); return error; } diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 720b3ff..992bcb1 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be) return be; } +static void __ext_put_deviceids(struct list_head *head) +{ + struct pnfs_block_extent *be, *tmp; + + list_for_each_entry_safe(be, tmp, head, be_list) { + nfs4_put_deviceid_node(be->be_device); + kfree(be); + } +} + static void __ext_tree_insert(struct rb_root *root, struct pnfs_block_extent *new, bool merge_ok) @@ -163,7 +173,8 @@ free_new: } static int -__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) +__ext_tree_remove(struct rb_root *root, + sector_t start, sector_t end, struct list_head *tmp) { struct pnfs_block_extent *be; sector_t len1 = 0, len2 = 0; @@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) struct pnfs_block_extent *next = ext_tree_next(be); rb_erase(&be->be_node, root); - nfs4_put_deviceid_node(be->be_device); - kfree(be); + list_add_tail(&be->be_list, tmp); be = next; } @@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, sector_t end) { int err, err2; + LIST_HEAD(tmp); spin_lock(&bl->bl_ext_lock); - err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); if (rw) { - err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); + err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp); if (!err) err = err2; } spin_unlock(&bl->bl_ext_lock); + __ext_put_deviceids(&tmp); return err; } @@ -396,12 +408,13 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, sector_t end = start + len; struct pnfs_block_extent *be; int err = 0; + LIST_HEAD(tmp); spin_lock(&bl->bl_ext_lock); /* * First remove all COW extents or holes from written to range. */ - err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); if (err) goto out; @@ -459,6 +472,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, } out: spin_unlock(&bl->bl_ext_lock); + + __ext_put_deviceids(&tmp); return err; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index aaa2e8d..c92a75e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -119,27 +119,30 @@ out: * hashed by filehandle. */ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, - struct nfs_fh *fh, nfs4_stateid *stateid) + struct nfs_fh *fh) { struct nfs_server *server; + struct nfs_inode *nfsi; struct inode *ino; struct pnfs_layout_hdr *lo; +restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry(lo, &server->layouts, plh_layouts) { - if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) + nfsi = NFS_I(lo->plh_inode); + if (nfs_compare_fh(fh, &nfsi->fh)) continue; - if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) + if (nfsi->layout != lo) continue; ino = igrab(lo->plh_inode); if (!ino) break; spin_lock(&ino->i_lock); /* Is this layout in the process of being freed? */ - if (NFS_I(ino)->layout != lo) { + if (nfsi->layout != lo) { spin_unlock(&ino->i_lock); iput(ino); - break; + goto restart; } pnfs_get_layout_hdr(lo); spin_unlock(&ino->i_lock); @@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, } static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, - struct nfs_fh *fh, nfs4_stateid *stateid) + struct nfs_fh *fh) { struct pnfs_layout_hdr *lo; spin_lock(&clp->cl_lock); rcu_read_lock(); - lo = get_layout_by_fh_locked(clp, fh, stateid); + lo = get_layout_by_fh_locked(clp, fh); rcu_read_unlock(); spin_unlock(&clp->cl_lock); @@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, /* * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) */ -static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, +static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new) { u32 oldseq, newseq; - oldseq = be32_to_cpu(lo->plh_stateid.seqid); + /* Is the stateid still not initialised? */ + if (!pnfs_layout_is_valid(lo)) + return NFS4ERR_DELAY; + + /* Mismatched stateid? */ + if (!nfs4_stateid_match_other(&lo->plh_stateid, new)) + return NFS4ERR_BAD_STATEID; + newseq = be32_to_cpu(new->seqid); + /* Are we already in a layout recall situation? */ + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && + lo->plh_return_seq != 0) { + if (newseq < lo->plh_return_seq) + return NFS4ERR_OLD_STATEID; + if (newseq > lo->plh_return_seq) + return NFS4ERR_DELAY; + goto out; + } + /* Check that the stateid matches what we think it should be. */ + oldseq = be32_to_cpu(lo->plh_stateid.seqid); if (newseq > oldseq + 1) - return false; - return true; + return NFS4ERR_DELAY; + /* Crazy server! */ + if (newseq <= oldseq) + return NFS4ERR_OLD_STATEID; +out: + return NFS_OK; } static u32 initiate_file_draining(struct nfs_client *clp, @@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, u32 rv = NFS4ERR_NOMATCHING_LAYOUT; LIST_HEAD(free_me_list); - lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); + lo = get_layout_by_fh(clp, &args->cbl_fh); if (!lo) { trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, &args->cbl_stateid, -rv); @@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp, } ino = lo->plh_inode; + pnfs_layoutcommit_inode(ino, false); + spin_lock(&ino->i_lock); - if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) { - rv = NFS4ERR_DELAY; + rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); + if (rv != NFS_OK) goto unlock; - } pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); - spin_unlock(&ino->i_lock); - - pnfs_layoutcommit_inode(ino, false); - spin_lock(&ino->i_lock); /* * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) */ @@ -223,11 +245,13 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto unlock; } + /* Embrace your forgetfulness! */ + rv = NFS4ERR_NOMATCHING_LAYOUT; + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &args->cbl_range); } - pnfs_mark_layout_returned_if_empty(lo); unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d81f96a..656f68f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -925,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r if (hdr_arg.minorversion == 0) { cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) - return rpc_drop_reply; + goto out_invalidcred; } cps.minorversion = hdr_arg.minorversion; @@ -953,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; + +out_invalidcred: + pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n"); + return rpc_autherr_badcred; } /* diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 487c560..003ebce 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, */ struct nfs_client * nfs_get_client(const struct nfs_client_initdata *cl_init, - const struct rpc_timeout *timeparms, - const char *ip_addr, rpc_authflavor_t authflavour) { struct nfs_client *clp, *new = NULL; @@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new->cl_flags = cl_init->init_flags; - return rpc_ops->init_client(new, timeparms, ip_addr); + return rpc_ops->init_client(new, cl_init); } spin_unlock(&nn->nfs_client_lock); @@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values); * Create an RPC client handle */ int nfs_create_rpc_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, + const struct nfs_client_initdata *cl_init, rpc_authflavor_t flavor) { struct rpc_clnt *clnt = NULL; @@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp, .protocol = clp->cl_proto, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, - .timeout = timeparms, + .timeout = cl_init->timeparms, .servername = clp->cl_hostname, + .nodename = cl_init->nodename, .program = &nfs_program, .version = clp->rpc_ops->version, .authflavor = flavor, @@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); * nfs_init_client - Initialise an NFS2 or NFS3 client * * @clp: nfs_client to initialise - * @timeparms: timeout parameters for underlying RPC transport - * @ip_addr: IP presentation address (not used) + * @cl_init: Initialisation parameters * * Returns pointer to an NFS client, or an ERR_PTR value. */ struct nfs_client *nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr) + const struct nfs_client_initdata *cl_init) { int error; @@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); @@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_parsed_mount_data *data, struct nfs_subversion *nfs_mod) { + struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { .hostname = data->nfs_server.hostname, .addr = (const struct sockaddr *)&data->nfs_server.address, @@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server, .nfs_mod = nfs_mod, .proto = data->nfs_server.protocol, .net = data->net, + .timeparms = &timeparms, }; - struct rpc_timeout timeparms; struct nfs_client *clp; int error; @@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server, set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); + clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index baaa388..177fefb 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2252,21 +2252,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st return NULL; } -static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_access_entry *cache; - int err = -ENOENT; + bool retry = true; + int err; spin_lock(&inode->i_lock); - if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) - goto out_zap; - cache = nfs_access_search_rbtree(inode, cred); - if (cache == NULL) - goto out; - if (!nfs_have_delegated_attributes(inode) && - !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) - goto out_stale; + for(;;) { + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) + goto out_zap; + cache = nfs_access_search_rbtree(inode, cred); + err = -ENOENT; + if (cache == NULL) + goto out; + /* Found an entry, is our attribute cache valid? */ + if (!nfs_attribute_cache_expired(inode) && + !(nfsi->cache_validity & NFS_INO_INVALID_ATTR)) + break; + err = -ECHILD; + if (!may_block) + goto out; + if (!retry) + goto out_zap; + spin_unlock(&inode->i_lock); + err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (err) + return err; + spin_lock(&inode->i_lock); + retry = false; + } res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; @@ -2275,12 +2291,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str out: spin_unlock(&inode->i_lock); return err; -out_stale: - rb_erase(&cache->rb_node, &nfsi->access_cache); - list_del(&cache->lru); - spin_unlock(&inode->i_lock); - nfs_access_free_entry(cache); - return -ENOENT; out_zap: spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); @@ -2307,13 +2317,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, cache = NULL; if (cache == NULL) goto out; - if (!nfs_have_delegated_attributes(inode) && - !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode); + if (err) goto out; res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; - err = 0; out: rcu_read_unlock(); return err; @@ -2402,18 +2411,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask); static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) { struct nfs_access_entry cache; + bool may_block = (mask & MAY_NOT_BLOCK) == 0; int status; trace_nfs_access_enter(inode); status = nfs_access_get_cached_rcu(inode, cred, &cache); if (status != 0) - status = nfs_access_get_cached(inode, cred, &cache); + status = nfs_access_get_cached(inode, cred, &cache, may_block); if (status == 0) goto out_cached; status = -ECHILD; - if (mask & MAY_NOT_BLOCK) + if (!may_block) goto out; /* Be clever: ask server to check for all possible rights */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e6210ea..72b7d13 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -196,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, WARN_ON_ONCE(verfp->committed < 0); } +static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1, + const struct nfs_writeverf *v2) +{ + return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier); +} + /* * nfs_direct_cmp_hdr_verf - compare verifier for pgio header * @dreq - direct request possibly spanning multiple servers @@ -215,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, nfs_direct_set_hdr_verf(dreq, hdr); return 0; } - return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); + return nfs_direct_cmp_verf(verfp, &hdr->verf); } /* @@ -238,7 +244,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, if (verfp->committed < 0) return 1; - return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); + return nfs_direct_cmp_verf(verfp, &data->verf); } /** @@ -366,22 +372,10 @@ out: * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust * the iocb is still valid here if this is a synchronous request. */ -static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) +static void nfs_direct_complete(struct nfs_direct_req *dreq) { struct inode *inode = dreq->inode; - if (dreq->iocb && write) { - loff_t pos = dreq->iocb->ki_pos + dreq->count; - - spin_lock(&inode->i_lock); - if (i_size_read(inode) < pos) - i_size_write(inode, pos); - spin_unlock(&inode->i_lock); - } - - if (write) - nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_end(inode); if (dreq->iocb) { @@ -436,7 +430,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) } out_put: if (put_dreq(dreq)) - nfs_direct_complete(dreq, false); + nfs_direct_complete(dreq); hdr->release(hdr); } @@ -542,7 +536,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, } if (put_dreq(dreq)) - nfs_direct_complete(dreq, false); + nfs_direct_complete(dreq); return 0; } @@ -583,17 +577,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!count) goto out; - inode_lock(inode); - result = nfs_sync_mapping(mapping); - if (result) - goto out_unlock; - task_io_account_read(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (dreq == NULL) - goto out_unlock; + goto out; dreq->inode = inode; dreq->bytes_left = dreq->max_count = count; @@ -608,10 +597,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + nfs_start_io_direct(inode); + NFS_I(inode)->read_io += count; result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - inode_unlock(inode); + nfs_end_io_direct(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -619,13 +610,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += result; } - nfs_direct_req_release(dreq); - return result; - out_release: nfs_direct_req_release(dreq); -out_unlock: - inode_unlock(inode); out: return result; } @@ -657,6 +643,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); dreq->count = 0; + dreq->verf.committed = NFS_INVALID_STABLE_HOW; + nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); for (i = 0; i < dreq->mirror_count; i++) dreq->mirrors[i].count = 0; get_dreq(dreq); @@ -775,7 +763,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) nfs_direct_write_reschedule(dreq); break; default: - nfs_direct_complete(dreq, true); + nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping); + nfs_direct_complete(dreq); } } @@ -991,6 +980,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) { ssize_t result = -EINVAL; + size_t count; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; @@ -1001,34 +991,24 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, - iov_iter_count(iter)); + result = generic_write_checks(iocb, iter); + if (result <= 0) + return result; + count = result; + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; - inode_lock(inode); - - result = nfs_sync_mapping(mapping); - if (result) - goto out_unlock; - - if (mapping->nrpages) { - result = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - if (result) - goto out_unlock; - } - - task_io_account_write(iov_iter_count(iter)); + task_io_account_write(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (!dreq) - goto out_unlock; + goto out; dreq->inode = inode; - dreq->bytes_left = dreq->max_count = iov_iter_count(iter); + dreq->bytes_left = dreq->max_count = count; dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); @@ -1040,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + nfs_start_io_direct(inode); + result = nfs_direct_write_schedule_iovec(dreq, iter, pos); if (mapping->nrpages) { @@ -1047,30 +1029,19 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos >> PAGE_SHIFT, end); } - inode_unlock(inode); + nfs_end_io_direct(inode); if (!result) { result = nfs_direct_wait(dreq); if (result > 0) { - struct inode *inode = mapping->host; - iocb->ki_pos = pos + result; - spin_lock(&inode->i_lock); - if (i_size_read(inode) < iocb->ki_pos) - i_size_write(inode, iocb->ki_pos); - spin_unlock(&inode->i_lock); - /* XXX: should check the generic_write_sync retval */ generic_write_sync(iocb, result); } } - nfs_direct_req_release(dreq); - return result; - out_release: nfs_direct_req_release(dreq); -out_unlock: - inode_unlock(inode); +out: return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 717a8d6..7d62097 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); + nfs_start_io_read(inode); + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); if (result > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); } + nfs_end_io_read(inode); return result; } EXPORT_SYMBOL_GPL(nfs_file_read); @@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", filp, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); + nfs_start_io_read(inode); + res = nfs_revalidate_mapping(inode, filp->f_mapping); if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); if (res > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); } + nfs_end_io_read(inode); return res; } EXPORT_SYMBOL_GPL(nfs_file_splice_read); @@ -272,16 +276,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); - inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); - inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by @@ -359,19 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file, mapping->host->i_ino, len, (long long) pos); start: - /* - * Prevent starvation issues if someone is doing a consistency - * sync-to-disk - */ - ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) - return ret; - /* - * Wait for O_DIRECT to complete - */ - inode_dio_wait(mapping->host); - page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; @@ -432,7 +420,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, return status; NFS_I(mapping->host)->write_io += copied; - if (nfs_ctx_key_to_expire(ctx)) { + if (nfs_ctx_key_to_expire(ctx, mapping->host)) { status = nfs_wb_all(mapping->host); if (status < 0) return status; @@ -470,31 +458,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset, */ static int nfs_release_page(struct page *page, gfp_t gfp) { - struct address_space *mapping = page->mapping; - dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); - /* Always try to initiate a 'commit' if relevant, but only - * wait for it if the caller allows blocking. Even then, - * only wait 1 second and only if the 'bdi' is not congested. - * Waiting indefinitely can cause deadlocks when the NFS - * server is on this machine, when a new TCP connection is - * needed and in other rare cases. There is no particular - * need to wait extensively here. A short wait has the - * benefit that someone else can worry about the freezer. - */ - if (mapping) { - struct nfs_server *nfss = NFS_SERVER(mapping->host); - nfs_commit_inode(mapping->host, 0); - if (gfpflags_allow_blocking(gfp) && - !bdi_write_congested(&nfss->backing_dev_info)) { - wait_on_page_bit_killable_timeout(page, PG_private, - HZ); - if (PagePrivate(page)) - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } - } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; @@ -604,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) filp, filp->f_mapping->host->i_ino, (long long)page_offset(page)); + sb_start_pagefault(inode->i_sb); + /* make sure the cache has finished storing the page */ nfs_fscache_wait_on_page_write(NFS_I(inode), page); @@ -630,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out_unlock: unlock_page(page); out: + sb_end_pagefault(inode->i_sb); return ret; } @@ -645,7 +613,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode) ctx = nfs_file_open_context(filp); if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || - nfs_ctx_key_to_expire(ctx)) + nfs_ctx_key_to_expire(ctx, inode)) return 1; return 0; } @@ -656,23 +624,17 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(file); unsigned long written = 0; ssize_t result; - size_t count = iov_iter_count(from); result = nfs_key_timeout_notify(file, inode); if (result) return result; - if (iocb->ki_flags & IOCB_DIRECT) { - result = generic_write_checks(iocb, from); - if (result <= 0) - return result; + if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_write(iocb, from); - } dprintk("NFS: write(%pD2, %zu@%Ld)\n", - file, count, (long long) iocb->ki_pos); + file, iov_iter_count(from), (long long) iocb->ki_pos); - result = -EBUSY; if (IS_SWAPFILE(inode)) goto out_swapfile; /* @@ -684,28 +646,33 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - result = count; - if (!count) + nfs_start_io_write(inode); + result = generic_write_checks(iocb, from); + if (result > 0) { + current->backing_dev_info = inode_to_bdi(inode); + result = generic_perform_write(file, from, iocb->ki_pos); + current->backing_dev_info = NULL; + } + nfs_end_io_write(inode); + if (result <= 0) goto out; - result = generic_file_write_iter(iocb, from); - if (result > 0) - written = result; + written = generic_write_sync(iocb, result); + iocb->ki_pos += written; /* Return error values */ - if (result >= 0 && nfs_need_check_write(file, inode)) { + if (nfs_need_check_write(file, inode)) { int err = vfs_fsync(file, 0); if (err < 0) result = err; } - if (result > 0) - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); out: return result; out_swapfile: printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); - goto out; + return -EBUSY; } EXPORT_SYMBOL_GPL(nfs_file_write); @@ -780,11 +747,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) } static int -is_time_granular(struct timespec *ts) { - return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); -} - -static int do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; @@ -817,12 +779,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * This makes locking act as a cache coherency point. */ nfs_sync_mapping(filp->f_mapping); - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { - if (is_time_granular(&NFS_SERVER(inode)->time_delta)) - __nfs_revalidate_inode(NFS_SERVER(inode), inode); - else - nfs_zap_caches(inode); - } + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + nfs_zap_mapping(inode, filp->f_mapping); out: return status; } diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index aa59757..a3fc48b 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task, static void filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) { + loff_t end_offs = 0; if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || - hdr->res.verf->committed != NFS_DATA_SYNC) + hdr->res.verf->committed == NFS_FILE_SYNC) return; + if (hdr->res.verf->committed == NFS_DATA_SYNC) + end_offs = hdr->mds_offset + (loff_t)hdr->res.count; - pnfs_set_layoutcommit(hdr->inode, hdr->lseg, - hdr->mds_offset + hdr->res.count); + /* Note: if the write is unstable, don't set end_offs until commit */ + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -354,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task, } filelayout_set_layoutcommit(hdr); + + /* zero out the fattr */ + hdr->fattr.valid = 0; + if (task->tk_status >= 0) + nfs_writeback_update_inode(hdr); + return 0; } @@ -375,8 +384,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE) - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 0e8018b..e6206ea 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1325,15 +1325,16 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg) * we always send layoutcommit after DS writes. */ static void -ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) +ff_layout_set_layoutcommit(struct inode *inode, + struct pnfs_layout_segment *lseg, + loff_t end_offset) { - if (!ff_layout_need_layoutcommit(hdr->lseg)) + if (!ff_layout_need_layoutcommit(lseg)) return; - pnfs_set_layoutcommit(hdr->inode, hdr->lseg, - hdr->mds_offset + hdr->res.count); - dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, - (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); + pnfs_set_layoutcommit(inode, lseg, end_offset); + dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino, + (unsigned long long) NFS_I(inode)->layout->plh_lwb); } static bool @@ -1469,6 +1470,7 @@ static void ff_layout_read_release(void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + loff_t end_offs = 0; int err; trace_nfs4_pnfs_write(hdr, task->tk_status); @@ -1494,7 +1496,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task, if (hdr->res.verf->committed == NFS_FILE_SYNC || hdr->res.verf->committed == NFS_DATA_SYNC) - ff_layout_set_layoutcommit(hdr); + end_offs = hdr->mds_offset + (loff_t)hdr->res.count; + + /* Note: if the write is unstable, don't set end_offs until commit */ + ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); /* zero out fattr since we don't care DS attr at all */ hdr->fattr.valid = 0; @@ -1530,9 +1535,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE - && ff_layout_need_layoutcommit(data->lseg)) - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index dda689d..bf4ec5e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -662,9 +662,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - inode_lock(inode); - err = nfs_sync_inode(inode); - inode_unlock(inode); + err = filemap_write_and_wait(inode->i_mapping); if (err) goto out; } @@ -879,7 +877,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - list_add(&ctx->list, &nfsi->open_files); + if (ctx->mode & FMODE_WRITE) + list_add(&ctx->list, &nfsi->open_files); + else + list_add_tail(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); @@ -972,6 +973,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (NFS_STALE(inode)) goto out; + /* pNFS: Attributes aren't updated until we layoutcommit */ + if (S_ISREG(inode->i_mode)) { + status = pnfs_sync_inode(inode, false); + if (status) + goto out; + } + status = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -1122,14 +1130,12 @@ out: } /** - * __nfs_revalidate_mapping - Revalidate the pagecache + * nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping - * @may_lock - take inode->i_mutex? */ -static int __nfs_revalidate_mapping(struct inode *inode, - struct address_space *mapping, - bool may_lock) +int nfs_revalidate_mapping(struct inode *inode, + struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; @@ -1178,12 +1184,7 @@ static int __nfs_revalidate_mapping(struct inode *inode, nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); - if (may_lock) { - inode_lock(inode); - ret = nfs_invalidate_mapping(inode, mapping); - inode_unlock(inode); - } else - ret = nfs_invalidate_mapping(inode, mapping); + ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); @@ -1193,27 +1194,28 @@ out: return ret; } -/** - * nfs_revalidate_mapping - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - */ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +static bool nfs_file_has_writers(struct nfs_inode *nfsi) { - return __nfs_revalidate_mapping(inode, mapping, false); + struct inode *inode = &nfsi->vfs_inode; + + assert_spin_locked(&inode->i_lock); + + if (!S_ISREG(inode->i_mode)) + return false; + if (list_empty(&nfsi->open_files)) + return false; + /* Note: This relies on nfsi->open_files being ordered with writers + * being placed at the head of the list. + * See nfs_inode_attach_open_context() + */ + return (list_first_entry(&nfsi->open_files, + struct nfs_open_context, + list)->mode & FMODE_WRITE) == FMODE_WRITE; } -/** - * nfs_revalidate_mapping_protected - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - * - * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex - * while invalidating the mapping. - */ -int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) +static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) { - return __nfs_revalidate_mapping(inode, mapping, true); + return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); } static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) @@ -1280,22 +1282,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -EIO; - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && - inode->i_version != fattr->change_attr) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (!nfs_file_has_buffered_writers(nfsi)) { + /* Verify a few of the more important attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; - /* Verify a few of the more important attributes */ - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) - invalid |= NFS_INO_INVALID_ATTR; + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + invalid |= NFS_INO_INVALID_ATTR; - if (fattr->valid & NFS_ATTR_FATTR_SIZE) { - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) + invalid |= NFS_INO_INVALID_ATTR; + + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } } - if (nfsi->nrequests != 0) - invalid &= ~NFS_INO_REVAL_PAGECACHE; /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) @@ -1470,28 +1474,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } -/* - * Don't trust the change_attribute, mtime, ctime or size if - * a pnfs LAYOUTCOMMIT is outstanding - */ -static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, - struct nfs_fattr *fattr) -{ - if (pnfs_layoutcommit_outstanding(inode)) - fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | - NFS_ATTR_FATTR_MTIME | - NFS_ATTR_FATTR_CTIME | - NFS_ATTR_FATTR_SIZE); -} - static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { int ret; trace_nfs_refresh_inode_enter(inode); - nfs_inode_attrs_handle_layoutcommit(inode, fattr); - if (nfs_inode_attrs_need_update(inode, fattr)) ret = nfs_update_inode(inode, fattr); else @@ -1527,7 +1515,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { - unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + unsigned long invalid = NFS_INO_INVALID_ATTR; /* * Don't revalidate the pagecache if we hold a delegation, but do @@ -1676,6 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; + bool have_writers = nfs_file_has_buffered_writers(nfsi); bool cache_revalidated = true; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", @@ -1725,17 +1714,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Do atomic weak cache consistency updates */ invalid |= nfs_wcc_update_inode(inode, fattr); + if (pnfs_layoutcommit_outstanding(inode)) { + nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR; + cache_revalidated = false; + } + /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { if (inode->i_version != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); + /* Could it be a race with writeback? */ + if (!have_writers) { + invalid |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + } inode->i_version = fattr->change_attr; } } else { @@ -1768,9 +1765,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if ((nfsi->nrequests == 0) || new_isize > cur_isize) { + if (nfsi->nrequests == 0 || new_isize > cur_isize) { i_size_write(inode, new_isize); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (!have_writers) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5ea04d8..7ce5e02 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -66,13 +66,16 @@ struct nfs_clone_mount { struct nfs_client_initdata { unsigned long init_flags; - const char *hostname; - const struct sockaddr *addr; + const char *hostname; /* Hostname of the server */ + const struct sockaddr *addr; /* Address of the server */ + const char *nodename; /* Hostname of the client */ + const char *ip_addr; /* IP address of the client */ size_t addrlen; struct nfs_subversion *nfs_mod; int proto; u32 minorversion; struct net *net; + const struct rpc_timeout *timeparms; }; /* @@ -147,9 +150,8 @@ extern void nfs_umount(const struct nfs_mount_request *info); extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); -int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); +int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, - const struct rpc_timeout *, const char *, rpc_authflavor_t); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); @@ -184,7 +186,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *, rpc_authflavor_t); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); -extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, +extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, @@ -193,7 +195,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, rpc_authflavor_t au_flavor); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); -extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, +extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, rpc_authflavor_t au_flavor); @@ -338,8 +340,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern struct nfs_client *nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr); + const struct nfs_client_initdata *); /* dir.c */ extern void nfs_force_use_readdirplus(struct inode *dir); @@ -411,6 +412,19 @@ extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); +/* io.c */ +extern void nfs_start_io_read(struct inode *inode); +extern void nfs_end_io_read(struct inode *inode); +extern void nfs_start_io_write(struct inode *inode); +extern void nfs_end_io_write(struct inode *inode); +extern void nfs_start_io_direct(struct inode *inode); +extern void nfs_end_io_direct(struct inode *inode); + +static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) +{ + return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; +} + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, @@ -496,9 +510,29 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo, struct inode *inode, struct nfs_direct_req *dreq); int nfs_key_timeout_notify(struct file *filp, struct inode *inode); -bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); +int nfs_filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend); + +#ifdef CONFIG_NFS_V4_1 +static inline +void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) +{ + int i; + + for (i = 0; i < cinfo->nbuckets; i++) + cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; +} +#else +static inline +void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) +{ +} +#endif + + #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); @@ -506,6 +540,13 @@ extern int nfs_migrate_page(struct address_space *, #define nfs_migrate_page NULL #endif +static inline int +nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, + const struct nfs_write_verifier *v2) +{ + return memcmp(v1->data, v2->data, sizeof(v1->data)); +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, @@ -521,8 +562,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_pgio_header *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr); + const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, struct nfs_client **result, struct rpc_cred *cred); diff --git a/fs/nfs/io.c b/fs/nfs/io.c new file mode 100644 index 0000000..1fc5d1c --- /dev/null +++ b/fs/nfs/io.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016 Trond Myklebust + * + * I/O and data path helper functionality. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/rwsem.h> +#include <linux/fs.h> +#include <linux/nfs_fs.h> + +#include "internal.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(inode); + } +} + +/** + * nfs_start_io_read - declare the file is being used for buffered reads + * @inode - file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * NFS_INO_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +void +nfs_start_io_read(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + nfs_block_o_direct(nfsi, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * nfs_end_io_read - declare that the buffered read operation is done + * @inode - file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +nfs_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * nfs_start_io_write - declare the file is being used for buffered writes + * @inode - file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + */ +void +nfs_start_io_write(struct inode *inode) +{ + down_write(&inode->i_rwsem); + nfs_block_o_direct(NFS_I(inode), inode); +} + +/** + * nfs_end_io_write - declare that the buffered write operation is done + * @inode - file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void +nfs_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) +{ + if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + set_bit(NFS_INO_ODIRECT, &nfsi->flags); + nfs_wb_all(inode); + } +} + +/** + * nfs_end_io_direct - declare the file is being used for direct i/o + * @inode - file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the NFS_INO_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * NFS_INO_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +void +nfs_start_io_direct(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + nfs_block_buffered(nfsi, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * nfs_end_io_direct - declare that the direct i/o operation is done + * @inode - file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +nfs_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 9e9fa34..ee75354 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -76,19 +76,23 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, * low timeout interval so that if a connection is lost, we retry through * the MDS. */ -struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, +struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, rpc_authflavor_t au_flavor) { + struct rpc_timeout ds_timeout; + struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, + .nodename = mds_clp->cl_rpcclient->cl_nodename, + .ip_addr = mds_clp->cl_ipaddr, .nfs_mod = &nfs_v3, .proto = ds_proto, .net = mds_clp->cl_net, + .timeparms = &ds_timeout, }; - struct rpc_timeout ds_timeout; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; @@ -97,10 +101,12 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, return ERR_PTR(-EINVAL); cl_init.hostname = buf; + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) + set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + /* Use the MDS nfs_client cl_ipaddr. */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - au_flavor); + clp = nfs_get_client(&cl_init, au_flavor); return clp; } diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index aa03ed0..33da841 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -113,15 +113,17 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) return -EOPNOTSUPP; - nfs_wb_all(inode); inode_lock(inode); + err = nfs_sync_inode(inode); + if (err) + goto out_unlock; err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; - +out_unlock: inode_unlock(inode); return err; } @@ -154,11 +156,20 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, if (status) return status; + status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, + pos_src, pos_src + (loff_t)count - 1); + if (status) + return status; + status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, dst_lock, FMODE_WRITE); if (status) return status; + status = nfs_sync_inode(dst_inode); + if (status) + return status; + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == -ENOTSUPP) @@ -258,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep, if (status) return status; - nfs_wb_all(inode); + status = nfs_filemap_write_and_wait_range(inode->i_mapping, + offset, LLONG_MAX); + if (status) + return status; + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == -ENOTSUPP) @@ -336,8 +351,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) * Mark the bad layout state as invalid, then retry * with the current stateid. */ - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); + pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); } else diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6dc6f2a..8b26058 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -330,13 +330,21 @@ static int decode_write_response(struct xdr_stream *xdr, struct nfs42_write_res *res) { __be32 *p; - int stateids; p = xdr_inline_decode(xdr, 4 + 8 + 4); if (unlikely(!p)) goto out_overflow; - stateids = be32_to_cpup(p++); + /* + * We never use asynchronous mode, so warn if a server returns + * a stateid. + */ + if (unlikely(*p != 0)) { + pr_err_once("%s: server has set unrequested " + "asynchronous mode\n", __func__); + return -EREMOTEIO; + } + p++; p = xdr_decode_hyper(p, &res->count); res->verifier.committed = be32_to_cpup(p); return decode_verifier(xdr, &res->verifier.verifier); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 768456f..4be567a 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -185,6 +185,7 @@ struct nfs4_state { struct nfs4_exception { struct nfs4_state *state; struct inode *inode; + nfs4_stateid *stateid; long timeout; unsigned char delay : 1, recovering : 1, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 10410e8..8d7d08d 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) * Returns pointer to an NFS client, or an ERR_PTR value. */ struct nfs_client *nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr) + const struct nfs_client_initdata *cl_init) { char buf[INET6_ADDRSTRLEN + 1]; + const char *ip_addr = cl_init->ip_addr; struct nfs_client *old; int error; @@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); if (error < 0) goto error; @@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server, .hostname = hostname, .addr = addr, .addrlen = addrlen, + .ip_addr = ip_addr, .nfs_mod = &nfs_v4, .proto = proto, .minorversion = minorversion, .net = net, + .timeparms = timeparms, }; struct nfs_client *clp; int error; @@ -809,7 +811,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); + clp = nfs_get_client(&cl_init, authflavour); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; @@ -842,20 +844,24 @@ error: * low timeout interval so that if a connection is lost, we retry through * the MDS. */ -struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, +struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version, rpc_authflavor_t au_flavor) { + struct rpc_timeout ds_timeout; + struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, + .nodename = mds_clp->cl_rpcclient->cl_nodename, + .ip_addr = mds_clp->cl_ipaddr, .nfs_mod = &nfs_v4, .proto = ds_proto, .minorversion = minor_version, .net = mds_clp->cl_net, + .timeparms = &ds_timeout, }; - struct rpc_timeout ds_timeout; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; @@ -863,14 +869,16 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, return ERR_PTR(-EINVAL); cl_init.hostname = buf; + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) + __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + /* * Set an authflavor equual to the MDS value. Use the MDS nfs_client * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS * (section 13.1 RFC 5661). */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - au_flavor); + clp = nfs_get_client(&cl_init, au_flavor); dprintk("<-- %s %p\n", __func__, clp); return clp; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 014b0e4..d085ad7 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (openflags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; - nfs_sync_inode(inode); + filemap_write_and_wait(inode->i_mapping); } inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); @@ -133,21 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { - struct inode *in_inode = file_inode(file_in); - struct inode *out_inode = file_inode(file_out); - int ret; - - if (in_inode == out_inode) + if (file_inode(file_in) == file_inode(file_out)) return -EINVAL; - /* flush any pending writes */ - ret = nfs_sync_inode(in_inode); - if (ret) - return ret; - ret = nfs_sync_inode(out_inode); - if (ret) - return ret; - return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ff416d0..da5c9e5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -363,6 +363,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; + const nfs4_stateid *stateid = exception->stateid; struct inode *inode = exception->inode; int ret = errorcode; @@ -376,9 +377,18 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - if (inode && nfs_async_inode_return_delegation(inode, - NULL) == 0) - goto wait_on_recovery; + if (inode) { + int err; + + err = nfs_async_inode_return_delegation(inode, + stateid); + if (err == 0) + goto wait_on_recovery; + if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) { + exception->retry = 1; + break; + } + } if (state == NULL) break; ret = nfs4_schedule_stateid_recovery(server, state); @@ -427,6 +437,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: + case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: exception->delay = 1; return 0; @@ -2669,28 +2680,17 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, return res; } -static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, - struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state, struct nfs4_label *ilabel, - struct nfs4_label *olabel) +static int _nfs4_do_setattr(struct inode *inode, + struct nfs_setattrargs *arg, + struct nfs_setattrres *res, + struct rpc_cred *cred, + struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_setattrargs arg = { - .fh = NFS_FH(inode), - .iap = sattr, - .server = server, - .bitmask = server->attr_bitmask, - .label = ilabel, - }; - struct nfs_setattrres res = { - .fattr = fattr, - .label = olabel, - .server = server, - }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &arg, - .rpc_resp = &res, + .rpc_argp = arg, + .rpc_resp = res, .rpc_cred = cred, }; struct rpc_cred *delegation_cred = NULL; @@ -2699,17 +2699,13 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, bool truncate; int status; - arg.bitmask = nfs4_bitmask(server, ilabel); - if (ilabel) - arg.bitmask = nfs4_bitmask(server, olabel); - - nfs_fattr_init(fattr); + nfs_fattr_init(res->fattr); /* Servers should only apply open mode checks for file size changes */ - truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; + truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false; fmode = truncate ? FMODE_WRITE : FMODE_READ; - if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) { + if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) { /* Use that stateid */ } else if (truncate && state != NULL) { struct nfs_lockowner lockowner = { @@ -2719,19 +2715,19 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, if (!nfs4_valid_open_stateid(state)) return -EBADF; if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, - &arg.stateid, &delegation_cred) == -EIO) + &arg->stateid, &delegation_cred) == -EIO) return -EBADF; } else - nfs4_stateid_copy(&arg.stateid, &zero_stateid); + nfs4_stateid_copy(&arg->stateid, &zero_stateid); if (delegation_cred) msg.rpc_cred = delegation_cred; - status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1); put_rpccred(delegation_cred); if (status == 0 && state != NULL) renew_lease(server, timestamp); - trace_nfs4_setattr(inode, &arg.stateid, status); + trace_nfs4_setattr(inode, &arg->stateid, status); return status; } @@ -2741,13 +2737,31 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); + struct nfs_setattrargs arg = { + .fh = NFS_FH(inode), + .iap = sattr, + .server = server, + .bitmask = server->attr_bitmask, + .label = ilabel, + }; + struct nfs_setattrres res = { + .fattr = fattr, + .label = olabel, + .server = server, + }; struct nfs4_exception exception = { .state = state, .inode = inode, + .stateid = &arg.stateid, }; int err; + + arg.bitmask = nfs4_bitmask(server, ilabel); + if (ilabel) + arg.bitmask = nfs4_bitmask(server, olabel); + do { - err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); + err = _nfs4_do_setattr(inode, &arg, &res, cred, state); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & ATTR_SIZE)) { @@ -3267,13 +3281,6 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -static int nfs4_do_find_root_sec(struct nfs_server *server, - struct nfs_fh *fhandle, struct nfs_fsinfo *info) -{ - int mv = server->nfs_client->cl_minorversion; - return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info); -} - /** * nfs4_proc_get_rootfh - get file handle for server's pseudoroot * @server: initialized nfs_server handle @@ -3293,7 +3300,8 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, status = nfs4_lookup_root(server, fhandle, info); if (auth_probe || status == NFS4ERR_WRONGSEC) - status = nfs4_do_find_root_sec(server, fhandle, info); + status = server->nfs_client->cl_mvops->find_root_sec(server, + fhandle, info); if (status == 0) status = nfs4_server_capabilities(server, fhandle); @@ -4392,7 +4400,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, struct rpc_message *msg) { hdr->timestamp = jiffies; - hdr->pgio_done_cb = nfs4_read_done_cb; + if (!hdr->pgio_done_cb) + hdr->pgio_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); } @@ -7869,11 +7878,13 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; - int status = task->tk_status; + int nfs4err = task->tk_status; + int err, status = 0; + LIST_HEAD(head); dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); - switch (status) { + switch (nfs4err) { case 0: goto out; @@ -7905,45 +7916,42 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, status = -EOVERFLOW; goto out; } - /* Fallthrough */ + status = -EBUSY; + break; case -NFS4ERR_RECALLCONFLICT: - nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT, - exception); status = -ERECALLCONFLICT; - goto out; + break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: exception->timeout = 0; spin_lock(&inode->i_lock); - if (nfs4_stateid_match(&lgp->args.stateid, + lo = NFS_I(inode)->layout; + /* If the open stateid was bad, then recover it. */ + if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || + nfs4_stateid_match_other(&lgp->args.stateid, &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); - /* If the open stateid was bad, then recover it. */ exception->state = lgp->args.ctx->state; break; } - lo = NFS_I(inode)->layout; - if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) && - nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { - LIST_HEAD(head); - - /* - * Mark the bad layout state as invalid, then retry - * with the current stateid. - */ - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); - spin_unlock(&inode->i_lock); - pnfs_free_lseg_list(&head); - status = -EAGAIN; - goto out; - } else - spin_unlock(&inode->i_lock); - } - status = nfs4_handle_exception(server, status, exception); - if (exception->retry) + /* + * Mark the bad layout state as invalid, then retry + */ + pnfs_mark_layout_stateid_invalid(lo, &head); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); status = -EAGAIN; + goto out; + } + + err = nfs4_handle_exception(server, nfs4err, exception); + if (!status) { + if (exception->retry) + status = -EAGAIN; + else + status = err; + } out: dprintk("<-- %s\n", __func__); return status; @@ -8129,8 +8137,7 @@ static void nfs4_layoutreturn_release(void *calldata) spin_lock(&lo->plh_inode->i_lock); pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, be32_to_cpu(lrp->args.stateid.seqid)); - pnfs_mark_layout_returned_if_empty(lo); - if (lrp->res.lrs_present) + if (lrp->res.lrs_present && pnfs_layout_is_valid(lo)) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&lo->plh_inode->i_lock); @@ -8835,7 +8842,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { #endif }; -ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) +static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { ssize_t error, error2; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 661e753..7bd3a5c 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr, p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ *p = cpu_to_be32(0); /* reclaim */ encode_nfs4_stateid(xdr, &args->stateid); - p = reserve_space(xdr, 20); - *p++ = cpu_to_be32(1); /* newoffset = TRUE */ - p = xdr_encode_hyper(p, args->lastbytewritten); + if (args->lastbytewritten != U64_MAX) { + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + } else { + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(0); /* newoffset = FALSE */ + } *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 31c7763..2ca9167 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -37,7 +37,6 @@ { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ { 1 << NFS_INO_STALE, "STALE" }, \ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ - { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0fbe734..70806ca 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -259,7 +259,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) * is required. * Note that caller must hold inode->i_lock. */ -static int +int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct list_head *lseg_list) { @@ -334,14 +334,17 @@ pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) } static void -init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) +pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, + const struct pnfs_layout_range *range, + const nfs4_stateid *stateid) { INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); atomic_set(&lseg->pls_refcount, 1); - smp_mb(); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; + lseg->pls_range = *range; + lseg->pls_seq = be32_to_cpu(stateid->seqid); } static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) @@ -486,15 +489,6 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, (end2 == NFS4_MAX_UINT64 || end2 > start1); } -static bool -should_free_lseg(const struct pnfs_layout_range *lseg_range, - const struct pnfs_layout_range *recall_range) -{ - return (recall_range->iomode == IOMODE_ANY || - lseg_range->iomode == recall_range->iomode) && - pnfs_lseg_range_intersecting(lseg_range, recall_range); -} - static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) { @@ -533,6 +527,27 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) return (s32)(s1 - s2) > 0; } +static bool +pnfs_should_free_range(const struct pnfs_layout_range *lseg_range, + const struct pnfs_layout_range *recall_range) +{ + return (recall_range->iomode == IOMODE_ANY || + lseg_range->iomode == recall_range->iomode) && + pnfs_lseg_range_intersecting(lseg_range, recall_range); +} + +static bool +pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg, + const struct pnfs_layout_range *recall_range, + u32 seq) +{ + if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq)) + return false; + if (recall_range == NULL) + return true; + return pnfs_should_free_range(&lseg->pls_range, recall_range); +} + /** * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later * @lo: layout header containing the lsegs @@ -562,10 +577,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, if (list_empty(&lo->plh_segs)) return 0; list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) - if (!recall_range || - should_free_lseg(&lseg->pls_range, recall_range)) { - if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq)) - continue; + if (pnfs_match_lseg_recall(lseg, recall_range, seq)) { dprintk("%s: freeing lseg %p iomode %d seq %u" "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, @@ -761,24 +773,25 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier) { - u32 oldseq, newseq, new_barrier; - int empty = list_empty(&lo->plh_segs); + u32 oldseq, newseq, new_barrier = 0; + bool invalid = !pnfs_layout_is_valid(lo); oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { + if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) { nfs4_stateid_copy(&lo->plh_stateid, new); - if (update_barrier) { - new_barrier = be32_to_cpu(new->seqid); - } else { - /* Because of wraparound, we want to keep the barrier - * "close" to the current seqids. - */ - new_barrier = newseq - atomic_read(&lo->plh_outstanding); - } - if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) - lo->plh_barrier = new_barrier; + /* + * Because of wraparound, we want to keep the barrier + * "close" to the current seqids. + */ + new_barrier = newseq - atomic_read(&lo->plh_outstanding); } + if (update_barrier) + new_barrier = be32_to_cpu(new->seqid); + else if (new_barrier == 0) + return; + if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + lo->plh_barrier = new_barrier; } static bool @@ -873,15 +886,37 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } +static void +pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) +{ + lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); +} + static bool -pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) +pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, + nfs4_stateid *stateid, + enum pnfs_iomode *iomode) { if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; - lo->plh_return_iomode = 0; - lo->plh_return_seq = 0; pnfs_get_layout_hdr(lo); - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { + if (stateid != NULL) { + nfs4_stateid_copy(stateid, &lo->plh_stateid); + if (lo->plh_return_seq != 0) + stateid->seqid = cpu_to_be32(lo->plh_return_seq); + } + if (iomode != NULL) + *iomode = lo->plh_return_iomode; + pnfs_clear_layoutreturn_info(lo); + return true; + } + if (stateid != NULL) + nfs4_stateid_copy(stateid, &lo->plh_stateid); + if (iomode != NULL) + *iomode = IOMODE_ANY; return true; } @@ -949,10 +984,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) enum pnfs_iomode iomode; bool send; - nfs4_stateid_copy(&stateid, &lo->plh_stateid); - stateid.seqid = cpu_to_be32(lo->plh_return_seq); - iomode = lo->plh_return_iomode; - send = pnfs_prepare_layoutreturn(lo); + send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ @@ -989,7 +1021,6 @@ _pnfs_return_layout(struct inode *ino) dprintk("NFS: %s no layout to return\n", __func__); goto out; } - nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid); /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); @@ -1012,8 +1043,7 @@ _pnfs_return_layout(struct inode *ino) goto out_put_layout_hdr; } - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - send = pnfs_prepare_layoutreturn(lo); + send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); if (send) @@ -1080,11 +1110,10 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } - nfs4_stateid_copy(&stateid, &lo->plh_stateid); /* always send layoutreturn if being marked so */ - if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED, - &lo->plh_flags)) - layoutreturn = pnfs_prepare_layoutreturn(lo); + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo, + &stateid, NULL); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) /* If we are sending layoutreturn, invalidate all valid lsegs */ @@ -1132,7 +1161,6 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; - pnfs_mark_layout_returned_if_empty(lo); if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); @@ -1505,7 +1533,7 @@ pnfs_update_layout(struct inode *ino, struct pnfs_layout_segment *lseg = NULL; nfs4_stateid stateid; long timeout = 0; - unsigned long giveup = jiffies + rpc_get_timeout(server->client); + unsigned long giveup = jiffies + (clp->cl_lease_time << 1); bool first; if (!pnfs_enabled_sb(NFS_SERVER(ino))) { @@ -1645,33 +1673,44 @@ lookup_again: lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); + atomic_dec(&lo->plh_outstanding); if (IS_ERR(lseg)) { switch(PTR_ERR(lseg)) { - case -ERECALLCONFLICT: + case -EBUSY: if (time_after(jiffies, giveup)) lseg = NULL; - /* Fallthrough */ - case -EAGAIN: - pnfs_put_layout_hdr(lo); - if (first) - pnfs_clear_first_layoutget(lo); - if (lseg) { - trace_pnfs_update_layout(ino, pos, count, - iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); - goto lookup_again; + break; + case -ERECALLCONFLICT: + /* Huh? We hold no layouts, how is there a recall? */ + if (first) { + lseg = NULL; + break; } + /* Destroy the existing layout and start over */ + if (time_after(jiffies, giveup)) + pnfs_destroy_layout(NFS_I(ino)); /* Fallthrough */ + case -EAGAIN: + break; default: if (!nfs_error_is_fatal(PTR_ERR(lseg))) { pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); lseg = NULL; } + goto out_put_layout_hdr; + } + if (lseg) { + if (first) + pnfs_clear_first_layoutget(lo); + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); + pnfs_put_layout_hdr(lo); + goto lookup_again; } } else { pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); } - atomic_dec(&lo->plh_outstanding); out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); @@ -1735,9 +1774,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) return lseg; } - init_lseg(lo, lseg); - lseg->pls_range = res->range; - lseg->pls_seq = be32_to_cpu(res->stateid.seqid); + pnfs_init_lseg(lo, lseg, &res->range, &res->stateid); spin_lock(&ino->i_lock); if (pnfs_layoutgets_blocked(lo)) { @@ -1758,16 +1795,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) * inode invalid, and don't bother validating the stateid * sequence number. */ - pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0); + pnfs_mark_layout_stateid_invalid(lo, &free_me); nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); lo->plh_barrier = be32_to_cpu(res->stateid.seqid); } - clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_get_lseg(lseg); pnfs_layout_insert_lseg(lo, lseg, &free_me); + if (!pnfs_layout_is_valid(lo)) { + pnfs_clear_layoutreturn_info(lo); + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + } + if (res->return_on_close) set_bit(NFS_LSEG_ROC, &lseg->pls_flags); @@ -1787,14 +1827,14 @@ static void pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, u32 seq) { - if (lo->plh_return_iomode == iomode) - return; - if (lo->plh_return_iomode != 0) + if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); - if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) + if (seq != 0) { + WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); lo->plh_return_seq = seq; + } } /** @@ -1824,7 +1864,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) - if (should_free_lseg(&lseg->pls_range, return_range)) { + if (pnfs_match_lseg_recall(lseg, return_range, seq)) { dprintk("%s: marking lseg %p iomode %d " "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, @@ -1855,19 +1895,17 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, bool return_now = false; spin_lock(&inode->i_lock); - pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq); + pnfs_set_plh_return_info(lo, range.iomode, 0); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - if (!pnfs_mark_matching_lsegs_return(lo, &free_me, - &range, lseg->pls_seq)) { + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) { nfs4_stateid stateid; - enum pnfs_iomode iomode = lo->plh_return_iomode; + enum pnfs_iomode iomode; - nfs4_stateid_copy(&stateid, &lo->plh_stateid); - return_now = pnfs_prepare_layoutreturn(lo); + return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); spin_unlock(&inode->i_lock); if (return_now) pnfs_send_layoutreturn(lo, &stateid, iomode, false); @@ -2382,7 +2420,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) nfs_fattr_init(&data->fattr); data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; data->res.fattr = &data->fattr; - data->args.lastbytewritten = end_pos - 1; + if (end_pos != 0) + data->args.lastbytewritten = end_pos - 1; + else + data->args.lastbytewritten = U64_MAX; data->res.server = NFS_SERVER(inode); if (ld->prepare_layoutcommit) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index b21bd0b..31d99b2 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -268,6 +268,8 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, const struct pnfs_layout_range *recall_range, u32 seq); +int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, + struct list_head *lseg_list); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); @@ -375,6 +377,11 @@ static inline bool nfs_have_layout(struct inode *inode) return NFS_I(inode)->layout != NULL; } +static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0; +} + static inline struct nfs4_deviceid_node * nfs4_get_deviceid(struct nfs4_deviceid_node *d) { @@ -545,19 +552,6 @@ pnfs_calc_offset_length(u64 offset, u64 end) return 1 + end - offset; } -/** - * pnfs_mark_layout_returned_if_empty - marks the layout as returned - * @lo: layout header - * - * Note: Caller must hold inode->i_lock - */ -static inline void -pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo) -{ - if (list_empty(&lo->plh_segs)) - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); -} - static inline void pnfs_copy_range(struct pnfs_layout_range *dst, const struct pnfs_layout_range *src) @@ -629,6 +623,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync) } static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + return false; +} + + +static inline bool pnfs_roc(struct inode *ino) { return false; @@ -716,13 +717,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, return false; } -static inline bool -pnfs_layoutcommit_outstanding(struct inode *inode) -{ - return false; -} - - static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { return NULL; diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index b38e3c0..f3468b5 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -595,7 +595,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) } static struct nfs_client *(*get_v3_ds_connect)( - struct nfs_client *mds_clp, + struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, @@ -654,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_test_and_add_xprt, NULL); } else - clp = get_v3_ds_connect(mds_srv->nfs_client, + clp = get_v3_ds_connect(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, timeo, retrans, au_flavor); @@ -690,7 +690,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); - clp = nfs4_set_ds_client(mds_srv->nfs_client, + clp = nfs4_set_ds_client(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, timeo, retrans, minor_version, @@ -940,6 +940,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); int pnfs_nfs_generic_sync(struct inode *inode, bool datasync) { + int ret; + + if (!pnfs_layoutcommit_outstanding(inode)) + return 0; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + return ret; if (datasync) return 0; return pnfs_layoutcommit_inode(inode, true); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2137e02..18d446e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1684,6 +1684,7 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, { rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; unsigned int i; + int use_auth_null = false; /* * If the sec= mount option is used, the specified flavor or AUTH_NULL @@ -1691,14 +1692,21 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, * * AUTH_NULL has a special meaning when it's in the server list - it * means that the server will ignore the rpc creds, so any flavor - * can be used. + * can be used but still use the sec= that was specified. */ for (i = 0; i < count; i++) { flavor = server_authlist[i]; - if (nfs_auth_info_match(&args->auth_info, flavor) || - flavor == RPC_AUTH_NULL) + if (nfs_auth_info_match(&args->auth_info, flavor)) goto out; + + if (flavor == RPC_AUTH_NULL) + use_auth_null = true; + } + + if (use_auth_null) { + flavor = RPC_AUTH_NULL; + goto out; } dfprintk(MOUNT, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 593fa21..3a6724c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -625,7 +625,7 @@ static int nfs_writepage_locked(struct page *page, int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), + nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio, launder); nfs_pageio_complete(&pgio); @@ -657,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; - /* Stop dirtying of new pages while we sync */ - err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (err) - goto out_err; - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, @@ -674,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); - clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_atomic(); - wake_up_bit(bitlock, NFS_INO_FLUSHING); - if (err < 0) goto out_err; err = pgio.pg_error; @@ -1195,9 +1184,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode) /* * Test if the open context credential key is marked to expire soon. */ -bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) { - return rpcauth_cred_key_to_expire(ctx->cred); + struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth; + + return rpcauth_cred_key_to_expire(auth, ctx->cred); } /* @@ -1289,6 +1280,9 @@ int nfs_updatepage(struct file *file, struct page *page, dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", file, count, (long long)(page_file_offset(page) + offset)); + if (!count) + goto out; + if (nfs_can_extend_write(file, page, inode)) { count = max(count + offset, nfs_page_length(page)); offset = 0; @@ -1299,7 +1293,7 @@ int nfs_updatepage(struct file *file, struct page *page, nfs_set_pageerror(page); else __set_page_dirty_nobuffers(page); - +out: dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; @@ -1800,7 +1794,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* Okay, COMMIT succeeded, apparently. Check the verifier * returned by the server against all stored verfs. */ - if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) { + if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) { /* We have a match */ nfs_inode_remove_request(req); dprintk(" OK\n"); @@ -1924,6 +1918,24 @@ out_mark_dirty: EXPORT_SYMBOL_GPL(nfs_write_inode); /* + * Wrapper for filemap_write_and_wait_range() + * + * Needed for pNFS in order to ensure data becomes visible to the + * client. + */ +int nfs_filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend) +{ + int ret; + + ret = filemap_write_and_wait_range(mapping, lstart, lend); + if (ret == 0) + ret = pnfs_sync_inode(mapping->host, true); + return ret; +} +EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range); + +/* * flush the inode to disk. */ int nfs_wb_all(struct inode *inode) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index d71278c..810124b 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -205,12 +205,12 @@ struct nfs_inode { #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ -#define NFS_INO_FLUSHING (4) /* inode is flushing out data */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ +#define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { @@ -351,7 +351,6 @@ extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *ino extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_revalidate_mapping_rcu(struct inode *inode); -extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping); extern int nfs_setattr(struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index c304a11..82b81a1 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1596,9 +1596,8 @@ struct nfs_rpc_ops { int (*have_delegation)(struct inode *, fmode_t); int (*return_delegation)(struct inode *); struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); - struct nfs_client * - (*init_client) (struct nfs_client *, const struct rpc_timeout *, - const char *); + struct nfs_client *(*init_client) (struct nfs_client *, + const struct nfs_client_initdata *); void (*free_client) (struct nfs_client *); struct nfs_server *(*create_server)(struct nfs_mount_info *, struct nfs_subversion *); struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *, diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 8997915..4ccf184 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -37,7 +37,6 @@ struct rpcsec_gss_info; /* auth_cred ac_flags bits */ enum { - RPC_CRED_NO_CRKEY_TIMEOUT = 0, /* underlying cred has no key timeout */ RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */ RPC_CRED_NOTIFY_TIMEOUT = 2, /* nofity generic cred when underlying key will expire soon */ @@ -82,6 +81,9 @@ struct rpc_cred { #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 +/* rpc_auth au_flags */ +#define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT 0x0001 /* underlying cred has no key timeout */ + /* * Client authentication handle */ @@ -107,6 +109,9 @@ struct rpc_auth { /* per-flavor data */ }; +/* rpc_auth au_flags */ +#define RPCAUTH_AUTH_DATATOUCH 0x00000002 + struct rpc_auth_create_args { rpc_authflavor_t pseudoflavor; const char *target_name; @@ -196,7 +201,7 @@ void rpcauth_destroy_credcache(struct rpc_auth *); void rpcauth_clear_credcache(struct rpc_cred_cache *); int rpcauth_key_timeout_notify(struct rpc_auth *, struct rpc_cred *); -bool rpcauth_cred_key_to_expire(struct rpc_cred *); +bool rpcauth_cred_key_to_expire(struct rpc_auth *, struct rpc_cred *); char * rpcauth_stringify_acceptor(struct rpc_cred *); static inline diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index 1f911cc..68ec78c 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -73,6 +73,7 @@ u32 gss_delete_sec_context( rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop, u32 service); u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor); +bool gss_pseudoflavor_to_datatouch(struct gss_api_mech *, u32 pseudoflavor); char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service); struct pf_desc { @@ -81,6 +82,7 @@ struct pf_desc { u32 service; char *name; char *auth_domain_name; + bool datatouch; }; /* Different mechanisms (e.g., krb5 or spkm3) may implement gss-api, and diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 05a1809..817af0b 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -230,6 +230,10 @@ void rpc_wake_up_queued_task(struct rpc_wait_queue *, struct rpc_task *); void rpc_wake_up(struct rpc_wait_queue *); struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); +struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, + struct rpc_wait_queue *, + bool (*)(struct rpc_task *, void *), + void *); struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *, bool (*)(struct rpc_task *, void *), void *); @@ -247,6 +251,7 @@ void rpc_show_tasks(struct net *); int rpc_init_mempool(void); void rpc_destroy_mempool(void); extern struct workqueue_struct *rpciod_workqueue; +extern struct workqueue_struct *xprtiod_workqueue; void rpc_prepare_task(struct rpc_task *task); static inline int rpc_wait_for_completion_task(struct rpc_task *task) diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 0ece4ba..bef3fb0 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -80,6 +80,7 @@ struct sock_xprt { #define TCP_RPC_REPLY (1UL << 6) #define XPRT_SOCK_CONNECTING 1U +#define XPRT_SOCK_DATA_READY (2) #endif /* __KERNEL__ */ diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 040ff62..a7e42f9 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -51,9 +51,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) ret = kstrtoul(val, 0, &num); if (ret == -EINVAL) goto out_inval; - nbits = fls(num); - if (num > (1U << nbits)) - nbits++; + nbits = fls(num - 1); if (nbits > MAX_HASHTABLE_BITS || nbits < 2) goto out_inval; *(unsigned int *)kp->arg = nbits; @@ -359,8 +357,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred) EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); bool -rpcauth_cred_key_to_expire(struct rpc_cred *cred) +rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred) { + if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) + return false; if (!cred->cr_ops->crkey_to_expire) return false; return cred->cr_ops->crkey_to_expire(cred); diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 54dd3fd..1682195 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) /* Fast track for non crkey_timeout (no key) underlying credentials */ - if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags)) + if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) return 0; /* Fast track for the normal case */ @@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) if (IS_ERR(tcred)) return -EACCES; - if (!tcred->cr_ops->crkey_timeout) { - set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags); - ret = 0; - goto out_put; - } - /* Test for the almost error case */ ret = tcred->cr_ops->crkey_timeout(tcred); if (ret != 0) { @@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); } -out_put: put_rpccred(tcred); return ret; } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index e64ae93..23c8e7c 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1015,8 +1015,11 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; auth->au_rslack = GSS_VERF_SLACK >> 2; + auth->au_flags = 0; auth->au_ops = &authgss_ops; auth->au_flavor = flavor; + if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) + auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; atomic_set(&auth->au_count, 1); kref_init(&gss_auth->kref); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 6542749..6059583 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = { .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_INTEGRITY, .name = "krb5i", + .datatouch = true, }, [2] = { .pseudoflavor = RPC_AUTH_GSS_KRB5P, .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_PRIVACY, .name = "krb5p", + .datatouch = true, }, }; diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 7063d85..5fec3ab 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) } EXPORT_SYMBOL(gss_pseudoflavor_to_service); +bool +gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) + return gm->gm_pfs[i].datatouch; + } + return false; +} + char * gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) { diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 8d9eb4d..4d17376 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -115,6 +115,7 @@ static struct rpc_auth null_auth = { .au_cslack = NUL_CALLSLACK, .au_rslack = NUL_REPLYSLACK, + .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, .au_count = ATOMIC_INIT(0), diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 9f65452..a99278c 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -228,6 +228,7 @@ static struct rpc_auth unix_auth = { .au_cslack = UNX_CALLSLACK, .au_rslack = NUL_REPLYSLACK, + .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, .au_count = ATOMIC_INIT(0), diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2808d55..cb49898 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2577,7 +2577,7 @@ static void rpc_cb_add_xprt_release(void *calldata) kfree(data); } -const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = { +static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { .rpc_call_done = rpc_cb_add_xprt_done, .rpc_release = rpc_cb_add_xprt_release, }; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index fcfd48d..9ae5885 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue; /* * rpciod-related stuff */ -struct workqueue_struct *rpciod_workqueue; +struct workqueue_struct *rpciod_workqueue __read_mostly; +struct workqueue_struct *xprtiod_workqueue __read_mostly; /* * Disable the timer for a given RPC task. Should be called with @@ -329,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); * lockless RPC_IS_QUEUED() test) before we've had a chance to test * the RPC_TASK_RUNNING flag. */ -static void rpc_make_runnable(struct rpc_task *task) +static void rpc_make_runnable(struct workqueue_struct *wq, + struct rpc_task *task) { bool need_wakeup = !rpc_test_and_set_running(task); @@ -338,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task) return; if (RPC_IS_ASYNC(task)) { INIT_WORK(&task->u.tk_work, rpc_async_schedule); - queue_work(rpciod_workqueue, &task->u.tk_work); + queue_work(wq, &task->u.tk_work); } else wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); } @@ -407,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); /** - * __rpc_do_wake_up_task - wake up a single rpc_task + * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task + * @wq: workqueue on which to run task * @queue: wait queue * @task: task to be woken up * * Caller must hold queue->lock, and have cleared the task queued flag. */ -static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task) +static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, + struct rpc_task *task) { dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", task->tk_pid, jiffies); @@ -428,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task __rpc_remove_wait_queue(queue, task); - rpc_make_runnable(task); + rpc_make_runnable(wq, task); dprintk("RPC: __rpc_wake_up_task done\n"); } @@ -436,16 +441,25 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task /* * Wake up a queued task while the queue lock is being held */ -static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, struct rpc_task *task) { if (RPC_IS_QUEUED(task)) { smp_rmb(); if (task->tk_waitqueue == queue) - __rpc_do_wake_up_task(queue, task); + __rpc_do_wake_up_task_on_wq(wq, queue, task); } } /* + * Wake up a queued task while the queue lock is being held + */ +static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task); +} + +/* * Wake up a task on a specific queue */ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) @@ -518,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue) /* * Wake up the first task on the wait queue. */ -struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, +struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, bool (*func)(struct rpc_task *, void *), void *data) { struct rpc_task *task = NULL; @@ -529,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, task = __rpc_find_next_queued(queue); if (task != NULL) { if (func(task, data)) - rpc_wake_up_task_queue_locked(queue, task); + rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); else task = NULL; } @@ -537,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, return task; } + +/* + * Wake up the first task on the wait queue. + */ +struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, + bool (*func)(struct rpc_task *, void *), void *data) +{ + return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data); +} EXPORT_SYMBOL_GPL(rpc_wake_up_first); static bool rpc_wake_up_next_func(struct rpc_task *task, void *data) @@ -814,7 +838,7 @@ void rpc_execute(struct rpc_task *task) bool is_async = RPC_IS_ASYNC(task); rpc_set_active(task); - rpc_make_runnable(task); + rpc_make_runnable(rpciod_workqueue, task); if (!is_async) __rpc_execute(task); } @@ -1071,10 +1095,22 @@ static int rpciod_start(void) * Create the rpciod thread and wait for it to start. */ dprintk("RPC: creating workqueue rpciod\n"); - /* Note: highpri because network receive is latency sensitive */ - wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0); + if (!wq) + goto out_failed; rpciod_workqueue = wq; - return rpciod_workqueue != NULL; + /* Note: highpri because network receive is latency sensitive */ + wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + if (!wq) + goto free_rpciod; + xprtiod_workqueue = wq; + return 1; +free_rpciod: + wq = rpciod_workqueue; + rpciod_workqueue = NULL; + destroy_workqueue(wq); +out_failed: + return 0; } static void rpciod_stop(void) @@ -1088,6 +1124,9 @@ static void rpciod_stop(void) wq = rpciod_workqueue; rpciod_workqueue = NULL; destroy_workqueue(wq); + wq = xprtiod_workqueue; + xprtiod_workqueue = NULL; + destroy_workqueue(wq); } void diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index cc98528..c5b0cb4 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); /* Encode reply */ - if (test_bit(RQ_DROPME, &rqstp->rq_flags)) { + if (*statp == rpc_drop_reply || + test_bit(RQ_DROPME, &rqstp->rq_flags)) { if (procp->pc_release) procp->pc_release(rqstp, NULL, rqstp->rq_resp); goto dropit; } + if (*statp == rpc_autherr_badcred) { + if (procp->pc_release) + procp->pc_release(rqstp, NULL, rqstp->rq_resp); + goto err_bad_auth; + } if (*statp == rpc_success && (xdr = procp->pc_encode) && !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 216a138..8313960 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_atomic(); } else - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); } /* @@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt)) + if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, + __xprt_lock_write_func, xprt)) return; xprt_clear_locked(xprt); } @@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) return; if (RPCXPRT_CONGESTED(xprt)) goto out_unlock; - if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt)) + if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, + __xprt_lock_write_cong_func, xprt)) return; out_unlock: xprt_clear_locked(xprt); @@ -645,7 +647,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) set_bit(XPRT_CLOSE_WAIT, &xprt->state); /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } @@ -672,7 +674,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) set_bit(XPRT_CLOSE_WAIT, &xprt->state); /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); xprt_wake_pending_tasks(xprt, -EAGAIN); out: spin_unlock_bh(&xprt->transport_lock); @@ -689,7 +691,7 @@ xprt_init_autodisconnect(unsigned long data) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); return; out_abort: spin_unlock(&xprt->transport_lock); diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index e7fd769..66c9d63 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -271,14 +271,12 @@ struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi, xprt_switch_find_xprt_t find_next) { struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); - struct list_head *head; if (xps == NULL) return NULL; - head = &xps->xps_xprt_list; - if (xps->xps_nxprts < 2) - return xprt_switch_find_first_entry(head); - return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next); + return xprt_switch_set_next_cursor(&xps->xps_xprt_list, + &xpi->xpi_cursor, + find_next); } static diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index dc9f3b5..ef19fa4 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o \ - fmr_ops.o frwr_ops.o physical_ops.o \ + fmr_ops.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ module.o diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 6326ebe..21cb3b1 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -19,13 +19,6 @@ * verb (fmr_op_unmap). */ -/* Transport recovery - * - * After a transport reconnect, fmr_op_map re-uses the MR already - * allocated for the RPC, but generates a fresh rkey then maps the - * MR again. This process is synchronous. - */ - #include "xprt_rdma.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -35,62 +28,132 @@ /* Maximum scatter/gather per FMR */ #define RPCRDMA_MAX_FMR_SGES (64) -static struct workqueue_struct *fmr_recovery_wq; - -#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND) +/* Access mode of externally registered pages */ +enum { + RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ, +}; -int -fmr_alloc_recovery_wq(void) +bool +fmr_is_supported(struct rpcrdma_ia *ia) { - fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0); - return !fmr_recovery_wq ? -ENOMEM : 0; + if (!ia->ri_device->alloc_fmr) { + pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n", + ia->ri_device->name); + return false; + } + return true; } -void -fmr_destroy_recovery_wq(void) +static int +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw) { - struct workqueue_struct *wq; + static struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; - if (!fmr_recovery_wq) - return; + mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(u64), GFP_KERNEL); + if (!mw->fmr.fm_physaddrs) + goto out_free; - wq = fmr_recovery_wq; - fmr_recovery_wq = NULL; - destroy_workqueue(wq); + mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mw->mw_sg), GFP_KERNEL); + if (!mw->mw_sg) + goto out_free; + + sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES); + + mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + &fmr_attr); + if (IS_ERR(mw->fmr.fm_mr)) + goto out_fmr_err; + + return 0; + +out_fmr_err: + dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, + PTR_ERR(mw->fmr.fm_mr)); + +out_free: + kfree(mw->mw_sg); + kfree(mw->fmr.fm_physaddrs); + return -ENOMEM; } static int __fmr_unmap(struct rpcrdma_mw *mw) { LIST_HEAD(l); + int rc; - list_add(&mw->fmr.fmr->list, &l); - return ib_unmap_fmr(&l); + list_add(&mw->fmr.fm_mr->list, &l); + rc = ib_unmap_fmr(&l); + list_del_init(&mw->fmr.fm_mr->list); + return rc; } -/* Deferred reset of a single FMR. Generate a fresh rkey by - * replacing the MR. There's no recovery if this fails. - */ static void -__fmr_recovery_worker(struct work_struct *work) +fmr_op_release_mr(struct rpcrdma_mw *r) { - struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw, - mw_work); - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + LIST_HEAD(unmap_list); + int rc; - __fmr_unmap(mw); - rpcrdma_put_mw(r_xprt, mw); - return; + /* Ensure MW is not on any rl_registered list */ + if (!list_empty(&r->mw_list)) + list_del(&r->mw_list); + + kfree(r->fmr.fm_physaddrs); + kfree(r->mw_sg); + + /* In case this one was left mapped, try to unmap it + * to prevent dealloc_fmr from failing with EBUSY + */ + rc = __fmr_unmap(r); + if (rc) + pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", + r, rc); + + rc = ib_dealloc_fmr(r->fmr.fm_mr); + if (rc) + pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", + r, rc); + + kfree(r); } -/* A broken MR was discovered in a context that can't sleep. - * Defer recovery to the recovery worker. +/* Reset of a single FMR. */ static void -__fmr_queue_recovery(struct rpcrdma_mw *mw) +fmr_op_recover_mr(struct rpcrdma_mw *mw) { - INIT_WORK(&mw->mw_work, __fmr_recovery_worker); - queue_work(fmr_recovery_wq, &mw->mw_work); + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + int rc; + + /* ORDER: invalidate first */ + rc = __fmr_unmap(mw); + + /* ORDER: then DMA unmap */ + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); + if (rc) + goto out_release; + + rpcrdma_put_mw(r_xprt, mw); + r_xprt->rx_stats.mrs_recovered++; + return; + +out_release: + pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw); + r_xprt->rx_stats.mrs_orphaned++; + + spin_lock(&r_xprt->rx_buf.rb_mwlock); + list_del(&mw->mw_all); + spin_unlock(&r_xprt->rx_buf.rb_mwlock); + + fmr_op_release_mr(mw); } static int @@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); } -static int -fmr_op_init(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; - struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_FMR_SGES, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - struct ib_pd *pd = r_xprt->rx_ia.ri_pd; - struct rpcrdma_mw *r; - int i, rc; - - spin_lock_init(&buf->rb_mwlock); - INIT_LIST_HEAD(&buf->rb_mws); - INIT_LIST_HEAD(&buf->rb_all); - - i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1); - i += 2; /* head + tail */ - i *= buf->rb_max_requests; /* one set for each RPC slot */ - dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); - - rc = -ENOMEM; - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (!r) - goto out; - - r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES * - sizeof(u64), GFP_KERNEL); - if (!r->fmr.physaddrs) - goto out_free; - - r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); - if (IS_ERR(r->fmr.fmr)) - goto out_fmr_err; - - r->mw_xprt = r_xprt; - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - return 0; - -out_fmr_err: - rc = PTR_ERR(r->fmr.fmr); - dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); - kfree(r->fmr.physaddrs); -out_free: - kfree(r); -out: - return rc; -} - /* Use the ib_map_phys_fmr() verb to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ static int fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing) + int nsegs, bool writing, struct rpcrdma_mw **out) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct ib_device *device = ia->ri_device; - enum dma_data_direction direction = rpcrdma_data_dir(writing); struct rpcrdma_mr_seg *seg1 = seg; int len, pageoff, i, rc; struct rpcrdma_mw *mw; + u64 *dma_pages; - mw = seg1->rl_mw; - seg1->rl_mw = NULL; - if (!mw) { - mw = rpcrdma_get_mw(r_xprt); - if (!mw) - return -ENOMEM; - } else { - /* this is a retransmit; generate a fresh rkey */ - rc = __fmr_unmap(mw); - if (rc) - return rc; - } + mw = rpcrdma_get_mw(r_xprt); + if (!mw) + return -ENOBUFS; pageoff = offset_in_page(seg1->mr_offset); seg1->mr_offset -= pageoff; /* start of page */ @@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (nsegs > RPCRDMA_MAX_FMR_SGES) nsegs = RPCRDMA_MAX_FMR_SGES; for (i = 0; i < nsegs;) { - rpcrdma_map_one(device, seg, direction); - mw->fmr.physaddrs[i] = seg->mr_dma; + if (seg->mr_page) + sg_set_page(&mw->mw_sg[i], + seg->mr_page, + seg->mr_len, + offset_in_page(seg->mr_offset)); + else + sg_set_buf(&mw->mw_sg[i], seg->mr_offset, + seg->mr_len); len += seg->mr_len; ++seg; ++i; @@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - - rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs, - i, seg1->mr_dma); + mw->mw_nents = i; + mw->mw_dir = rpcrdma_data_dir(writing); + if (i == 0) + goto out_dmamap_err; + + if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir)) + goto out_dmamap_err; + + for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++) + dma_pages[i] = sg_dma_address(&mw->mw_sg[i]); + rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents, + dma_pages[0]); if (rc) goto out_maperr; - seg1->rl_mw = mw; - seg1->mr_rkey = mw->fmr.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - return i; + mw->mw_handle = mw->fmr.fm_mr->rkey; + mw->mw_length = len; + mw->mw_offset = dma_pages[0] + pageoff; -out_maperr: - dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", - __func__, len, (unsigned long long)seg1->mr_dma, - pageoff, i, rc); - while (i--) - rpcrdma_unmap_one(device, --seg); - return rc; -} + *out = mw; + return mw->mw_nents; -static void -__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) -{ - struct ib_device *device = r_xprt->rx_ia.ri_device; - int nsegs = seg->mr_nsegs; +out_dmamap_err: + pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", + mw->mw_sg, mw->mw_nents); + rpcrdma_defer_mr_recovery(mw); + return -EIO; - while (nsegs--) - rpcrdma_unmap_one(device, seg++); +out_maperr: + pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", + len, (unsigned long long)dma_pages[0], + pageoff, mw->mw_nents, rc); + rpcrdma_defer_mr_recovery(mw); + return -EIO; } /* Invalidate all memory regions that were registered for "req". * * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. + * + * Caller ensures that req->rl_registered is not empty. */ static void fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { - struct rpcrdma_mr_seg *seg; - unsigned int i, nchunks; - struct rpcrdma_mw *mw; + struct rpcrdma_mw *mw, *tmp; LIST_HEAD(unmap_list); int rc; @@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* ORDER: Invalidate all of the req's MRs first * * ib_unmap_fmr() is slow, so use a single call instead - * of one call per mapped MR. + * of one call per mapped FMR. */ - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; - - list_add(&mw->fmr.fmr->list, &unmap_list); - - i += seg->mr_nsegs; - } + list_for_each_entry(mw, &req->rl_registered, mw_list) + list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); rc = ib_unmap_fmr(&unmap_list); if (rc) - pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); + goto out_reset; /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. */ - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; + list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + list_del_init(&mw->mw_list); + list_del_init(&mw->fmr.fm_mr->list); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); + rpcrdma_put_mw(r_xprt, mw); + } - __fmr_dma_unmap(r_xprt, seg); - rpcrdma_put_mw(r_xprt, seg->rl_mw); + return; - i += seg->mr_nsegs; - seg->mr_nsegs = 0; - seg->rl_mw = NULL; - } +out_reset: + pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); - req->rl_nchunks = 0; + list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + list_del_init(&mw->fmr.fm_mr->list); + fmr_op_recover_mr(mw); + } } /* Use a slow, safe mechanism to invalidate all memory regions * that were registered for "req". - * - * In the asynchronous case, DMA unmapping occurs first here - * because the rpcrdma_mr_seg is released immediately after this - * call. It's contents won't be available in __fmr_dma_unmap later. - * FIXME. */ static void fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, bool sync) { - struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; - unsigned int i; - - for (i = 0; req->rl_nchunks; req->rl_nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; - - if (sync) { - /* ORDER */ - __fmr_unmap(mw); - __fmr_dma_unmap(r_xprt, seg); - rpcrdma_put_mw(r_xprt, mw); - } else { - __fmr_dma_unmap(r_xprt, seg); - __fmr_queue_recovery(mw); - } - - i += seg->mr_nsegs; - seg->mr_nsegs = 0; - seg->rl_mw = NULL; - } -} - -static void -fmr_op_destroy(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - kfree(r->fmr.physaddrs); - rc = ib_dealloc_fmr(r->fmr.fmr); - if (rc) - dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", - __func__, rc); + while (!list_empty(&req->rl_registered)) { + mw = list_first_entry(&req->rl_registered, + struct rpcrdma_mw, mw_list); + list_del_init(&mw->mw_list); - kfree(r); + if (sync) + fmr_op_recover_mr(mw); + else + rpcrdma_defer_mr_recovery(mw); } } @@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap_sync = fmr_op_unmap_sync, .ro_unmap_safe = fmr_op_unmap_safe, + .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, - .ro_init = fmr_op_init, - .ro_destroy = fmr_op_destroy, + .ro_init_mr = fmr_op_init_mr, + .ro_release_mr = fmr_op_release_mr, .ro_displayname = "fmr", }; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index c094754..892b5e1 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -73,29 +73,71 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -static struct workqueue_struct *frwr_recovery_wq; - -#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM) +bool +frwr_is_supported(struct rpcrdma_ia *ia) +{ + struct ib_device_attr *attrs = &ia->ri_device->attrs; + + if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) + goto out_not_supported; + if (attrs->max_fast_reg_page_list_len == 0) + goto out_not_supported; + return true; + +out_not_supported: + pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n", + ia->ri_device->name); + return false; +} -int -frwr_alloc_recovery_wq(void) +static int +frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) { - frwr_recovery_wq = alloc_workqueue("frwr_recovery", - FRWR_RECOVERY_WQ_FLAGS, 0); - return !frwr_recovery_wq ? -ENOMEM : 0; + unsigned int depth = ia->ri_max_frmr_depth; + struct rpcrdma_frmr *f = &r->frmr; + int rc; + + f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth); + if (IS_ERR(f->fr_mr)) + goto out_mr_err; + + r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL); + if (!r->mw_sg) + goto out_list_err; + + sg_init_table(r->mw_sg, depth); + init_completion(&f->fr_linv_done); + return 0; + +out_mr_err: + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_mr status %i\n", + __func__, rc); + return rc; + +out_list_err: + rc = -ENOMEM; + dprintk("RPC: %s: sg allocation failure\n", + __func__); + ib_dereg_mr(f->fr_mr); + return rc; } -void -frwr_destroy_recovery_wq(void) +static void +frwr_op_release_mr(struct rpcrdma_mw *r) { - struct workqueue_struct *wq; + int rc; - if (!frwr_recovery_wq) - return; + /* Ensure MW is not on any rl_registered list */ + if (!list_empty(&r->mw_list)) + list_del(&r->mw_list); - wq = frwr_recovery_wq; - frwr_recovery_wq = NULL; - destroy_workqueue(wq); + rc = ib_dereg_mr(r->frmr.fr_mr); + if (rc) + pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", + r, rc); + kfree(r->mw_sg); + kfree(r); } static int @@ -124,93 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) return 0; } -static void -__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_frmr *f = &mw->frmr; - int rc; - - rc = __frwr_reset_mr(ia, mw); - ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir); - if (rc) - return; - - rpcrdma_put_mw(r_xprt, mw); -} - -/* Deferred reset of a single FRMR. Generate a fresh rkey by - * replacing the MR. +/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR. * * There's no recovery if this fails. The FRMR is abandoned, but * remains in rb_all. It will be cleaned up when the transport is * destroyed. */ static void -__frwr_recovery_worker(struct work_struct *work) -{ - struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, - mw_work); - - __frwr_reset_and_unmap(r->mw_xprt, r); - return; -} - -/* A broken MR was discovered in a context that can't sleep. - * Defer recovery to the recovery worker. - */ -static void -__frwr_queue_recovery(struct rpcrdma_mw *r) -{ - INIT_WORK(&r->mw_work, __frwr_recovery_worker); - queue_work(frwr_recovery_wq, &r->mw_work); -} - -static int -__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, - unsigned int depth) +frwr_op_recover_mr(struct rpcrdma_mw *mw) { - struct rpcrdma_frmr *f = &r->frmr; + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc; - f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); - if (IS_ERR(f->fr_mr)) - goto out_mr_err; - - f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL); - if (!f->fr_sg) - goto out_list_err; - - sg_init_table(f->fr_sg, depth); - - init_completion(&f->fr_linv_done); - - return 0; + rc = __frwr_reset_mr(ia, mw); + ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); + if (rc) + goto out_release; -out_mr_err: - rc = PTR_ERR(f->fr_mr); - dprintk("RPC: %s: ib_alloc_mr status %i\n", - __func__, rc); - return rc; + rpcrdma_put_mw(r_xprt, mw); + r_xprt->rx_stats.mrs_recovered++; + return; -out_list_err: - rc = -ENOMEM; - dprintk("RPC: %s: sg allocation failure\n", - __func__); - ib_dereg_mr(f->fr_mr); - return rc; -} +out_release: + pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw); + r_xprt->rx_stats.mrs_orphaned++; -static void -__frwr_release(struct rpcrdma_mw *r) -{ - int rc; + spin_lock(&r_xprt->rx_buf.rb_mwlock); + list_del(&mw->mw_all); + spin_unlock(&r_xprt->rx_buf.rb_mwlock); - rc = ib_dereg_mr(r->frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr status %i\n", - __func__, rc); - kfree(r->frmr.fr_sg); + frwr_op_release_mr(mw); } static int @@ -346,57 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) complete_all(&frmr->fr_linv_done); } -static int -frwr_op_init(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct ib_device *device = r_xprt->rx_ia.ri_device; - unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; - struct ib_pd *pd = r_xprt->rx_ia.ri_pd; - int i; - - spin_lock_init(&buf->rb_mwlock); - INIT_LIST_HEAD(&buf->rb_mws); - INIT_LIST_HEAD(&buf->rb_all); - - i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1); - i += 2; /* head + tail */ - i *= buf->rb_max_requests; /* one set for each RPC slot */ - dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); - - while (i--) { - struct rpcrdma_mw *r; - int rc; - - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (!r) - return -ENOMEM; - - rc = __frwr_init(r, pd, device, depth); - if (rc) { - kfree(r); - return rc; - } - - r->mw_xprt = r_xprt; - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - - return 0; -} - -/* Post a FAST_REG Work Request to register a memory region +/* Post a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ static int frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing) + int nsegs, bool writing, struct rpcrdma_mw **out) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct ib_device *device = ia->ri_device; - enum dma_data_direction direction = rpcrdma_data_dir(writing); - struct rpcrdma_mr_seg *seg1 = seg; struct rpcrdma_mw *mw; struct rpcrdma_frmr *frmr; struct ib_mr *mr; @@ -405,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int rc, i, n, dma_nents; u8 key; - mw = seg1->rl_mw; - seg1->rl_mw = NULL; + mw = NULL; do { if (mw) - __frwr_queue_recovery(mw); + rpcrdma_defer_mr_recovery(mw); mw = rpcrdma_get_mw(r_xprt); if (!mw) - return -ENOMEM; + return -ENOBUFS; } while (mw->frmr.fr_state != FRMR_IS_INVALID); frmr = &mw->frmr; frmr->fr_state = FRMR_IS_VALID; @@ -421,15 +363,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (nsegs > ia->ri_max_frmr_depth) nsegs = ia->ri_max_frmr_depth; - for (i = 0; i < nsegs;) { if (seg->mr_page) - sg_set_page(&frmr->fr_sg[i], + sg_set_page(&mw->mw_sg[i], seg->mr_page, seg->mr_len, offset_in_page(seg->mr_offset)); else - sg_set_buf(&frmr->fr_sg[i], seg->mr_offset, + sg_set_buf(&mw->mw_sg[i], seg->mr_offset, seg->mr_len); ++seg; @@ -440,26 +381,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - frmr->fr_nents = i; - frmr->fr_dir = direction; - - dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction); - if (!dma_nents) { - pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n", - __func__, frmr->fr_sg, frmr->fr_nents); - return -ENOMEM; - } + mw->mw_nents = i; + mw->mw_dir = rpcrdma_data_dir(writing); + if (i == 0) + goto out_dmamap_err; - n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE); - if (unlikely(n != frmr->fr_nents)) { - pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", - __func__, frmr->fr_mr, n, frmr->fr_nents); - rc = n < 0 ? n : -EINVAL; - goto out_senderr; - } + dma_nents = ib_dma_map_sg(ia->ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); + if (!dma_nents) + goto out_dmamap_err; + + n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE); + if (unlikely(n != mw->mw_nents)) + goto out_mapmr_err; dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", - __func__, mw, frmr->fr_nents, mr->length); + __func__, mw, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); @@ -481,24 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (rc) goto out_senderr; - seg1->rl_mw = mw; - seg1->mr_rkey = mr->rkey; - seg1->mr_base = mr->iova; - seg1->mr_nsegs = frmr->fr_nents; - seg1->mr_len = mr->length; + mw->mw_handle = mr->rkey; + mw->mw_length = mr->length; + mw->mw_offset = mr->iova; + + *out = mw; + return mw->mw_nents; - return frmr->fr_nents; +out_dmamap_err: + pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", + mw->mw_sg, mw->mw_nents); + rpcrdma_defer_mr_recovery(mw); + return -EIO; + +out_mapmr_err: + pr_err("rpcrdma: failed to map mr %p (%u/%u)\n", + frmr->fr_mr, n, mw->mw_nents); + rpcrdma_defer_mr_recovery(mw); + return -EIO; out_senderr: - dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); - __frwr_queue_recovery(mw); - return rc; + pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); + rpcrdma_defer_mr_recovery(mw); + return -ENOTCONN; } static struct ib_send_wr * -__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) +__frwr_prepare_linv_wr(struct rpcrdma_mw *mw) { - struct rpcrdma_mw *mw = seg->rl_mw; struct rpcrdma_frmr *f = &mw->frmr; struct ib_send_wr *invalidate_wr; @@ -518,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) * * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. + * + * Caller ensures that req->rl_registered is not empty. */ static void frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mr_seg *seg; - unsigned int i, nchunks; + struct rpcrdma_mw *mw, *tmp; struct rpcrdma_frmr *f; - struct rpcrdma_mw *mw; int rc; dprintk("RPC: %s: req %p\n", __func__, req); @@ -537,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ + f = NULL; invalidate_wrs = pos = prev = NULL; - seg = NULL; - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; - - pos = __frwr_prepare_linv_wr(seg); + list_for_each_entry(mw, &req->rl_registered, mw_list) { + pos = __frwr_prepare_linv_wr(mw); if (!invalidate_wrs) invalidate_wrs = pos; else prev->next = pos; prev = pos; - - i += seg->mr_nsegs; + f = &mw->frmr; } - f = &seg->rl_mw->frmr; /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain @@ -577,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * them to the free MW list. */ unmap: - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; - seg->rl_mw = NULL; - - ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, - f->fr_dir); + list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + list_del_init(&mw->mw_list); + ib_dma_unmap_sg(ia->ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); - - i += seg->mr_nsegs; - seg->mr_nsegs = 0; } - - req->rl_nchunks = 0; return; reset_mrs: - pr_warn("%s: ib_post_send failed %i\n", __func__, rc); + pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc); + rdma_disconnect(ia->ri_id); /* Find and reset the MRs in the LOCAL_INV WRs that did not * get posted. This is synchronous, and slow. */ - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; + list_for_each_entry(mw, &req->rl_registered, mw_list) { f = &mw->frmr; - if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { __frwr_reset_mr(ia, mw); bad_wr = bad_wr->next; } - - i += seg->mr_nsegs; } goto unmap; } @@ -621,38 +552,17 @@ static void frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, bool sync) { - struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; - unsigned int i; - for (i = 0; req->rl_nchunks; req->rl_nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; + while (!list_empty(&req->rl_registered)) { + mw = list_first_entry(&req->rl_registered, + struct rpcrdma_mw, mw_list); + list_del_init(&mw->mw_list); if (sync) - __frwr_reset_and_unmap(r_xprt, mw); + frwr_op_recover_mr(mw); else - __frwr_queue_recovery(mw); - - i += seg->mr_nsegs; - seg->mr_nsegs = 0; - seg->rl_mw = NULL; - } -} - -static void -frwr_op_destroy(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - - /* Ensure stale MWs for "buf" are no longer in flight */ - flush_workqueue(frwr_recovery_wq); - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - __frwr_release(r); - kfree(r); + rpcrdma_defer_mr_recovery(mw); } } @@ -660,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap_sync = frwr_op_unmap_sync, .ro_unmap_safe = frwr_op_unmap_safe, + .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, - .ro_init = frwr_op_init, - .ro_destroy = frwr_op_destroy, + .ro_init_mr = frwr_op_init_mr, + .ro_release_mr = frwr_op_release_mr, .ro_displayname = "frwr", }; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c deleted file mode 100644 index 3750596..0000000 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2015 Oracle. All rights reserved. - * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. - */ - -/* No-op chunk preparation. All client memory is pre-registered. - * Sometimes referred to as ALLPHYSICAL mode. - * - * Physical registration is simple because all client memory is - * pre-registered and never deregistered. This mode is good for - * adapter bring up, but is considered not safe: the server is - * trusted not to abuse its access to client memory not involved - * in RDMA I/O. - */ - -#include "xprt_rdma.h" - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - -static int -physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, - struct rpcrdma_create_data_internal *cdata) -{ - struct ib_mr *mr; - - /* Obtain an rkey to use for RPC data payloads. - */ - mr = ib_get_dma_mr(ia->ri_pd, - IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); - if (IS_ERR(mr)) { - pr_err("%s: ib_get_dma_mr for failed with %lX\n", - __func__, PTR_ERR(mr)); - return -ENOMEM; - } - ia->ri_dma_mr = mr; - - rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int, - RPCRDMA_MAX_DATA_SEGS, - RPCRDMA_MAX_HDR_SEGS)); - return 0; -} - -/* PHYSICAL memory registration conveys one page per chunk segment. - */ -static size_t -physical_op_maxpages(struct rpcrdma_xprt *r_xprt) -{ - return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - RPCRDMA_MAX_HDR_SEGS); -} - -static int -physical_op_init(struct rpcrdma_xprt *r_xprt) -{ - return 0; -} - -/* The client's physical memory is already exposed for - * remote access via RDMA READ or RDMA WRITE. - */ -static int -physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing)); - seg->mr_rkey = ia->ri_dma_mr->rkey; - seg->mr_base = seg->mr_dma; - return 1; -} - -/* DMA unmap all memory regions that were mapped for "req". - */ -static void -physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) -{ - struct ib_device *device = r_xprt->rx_ia.ri_device; - unsigned int i; - - for (i = 0; req->rl_nchunks; --req->rl_nchunks) - rpcrdma_unmap_one(device, &req->rl_segments[i++]); -} - -/* Use a slow, safe mechanism to invalidate all memory regions - * that were registered for "req". - * - * For physical memory registration, there is no good way to - * fence a single MR that has been advertised to the server. The - * client has already handed the server an R_key that cannot be - * invalidated and is shared by all MRs on this connection. - * Tearing down the PD might be the only safe choice, but it's - * not clear that a freshly acquired DMA R_key would be different - * than the one used by the PD that was just destroyed. - * FIXME. - */ -static void -physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - bool sync) -{ - physical_op_unmap_sync(r_xprt, req); -} - -static void -physical_op_destroy(struct rpcrdma_buffer *buf) -{ -} - -const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { - .ro_map = physical_op_map, - .ro_unmap_sync = physical_op_unmap_sync, - .ro_unmap_safe = physical_op_unmap_safe, - .ro_open = physical_op_open, - .ro_maxpages = physical_op_maxpages, - .ro_init = physical_op_init, - .ro_destroy = physical_op_destroy, - .ro_displayname = "physical", -}; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 35a8109..a47f170 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf) * MR when they can. */ static int -rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, - int n, int nsegs) +rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) { size_t page_offset; u32 remaining; @@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, base = vec->iov_base; page_offset = offset_in_page(base); remaining = vec->iov_len; - while (remaining && n < nsegs) { + while (remaining && n < RPCRDMA_MAX_SEGS) { seg[n].mr_page = NULL; seg[n].mr_offset = base; seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); @@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, static int rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, - enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) + enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg) { - int len, n = 0, p; - int page_base; + int len, n, p, page_base; struct page **ppages; + n = 0; if (pos == 0) { - n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs); - if (n == nsegs) - return -EIO; + n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n); + if (n == RPCRDMA_MAX_SEGS) + goto out_overflow; } len = xdrbuf->page_len; ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = xdrbuf->page_base & ~PAGE_MASK; p = 0; - while (len && n < nsegs) { + while (len && n < RPCRDMA_MAX_SEGS) { if (!ppages[p]) { /* alloc the pagelist for receiving buffer */ ppages[p] = alloc_page(GFP_ATOMIC); if (!ppages[p]) - return -ENOMEM; + return -EAGAIN; } seg[n].mr_page = ppages[p]; seg[n].mr_offset = (void *)(unsigned long) page_base; seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); if (seg[n].mr_len > PAGE_SIZE) - return -EIO; + goto out_overflow; len -= seg[n].mr_len; ++n; ++p; @@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, } /* Message overflows the seg array */ - if (len && n == nsegs) - return -EIO; + if (len && n == RPCRDMA_MAX_SEGS) + goto out_overflow; /* When encoding the read list, the tail is always sent inline */ if (type == rpcrdma_readch) @@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, * xdr pad bytes, saving the server an RDMA operation. */ if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) return n; - n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs); - if (n == nsegs) - return -EIO; + n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); + if (n == RPCRDMA_MAX_SEGS) + goto out_overflow; } return n; + +out_overflow: + pr_err("rpcrdma: segment array overflow\n"); + return -EIO; } static inline __be32 * -xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg) +xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) { - *iptr++ = cpu_to_be32(seg->mr_rkey); - *iptr++ = cpu_to_be32(seg->mr_len); - return xdr_encode_hyper(iptr, seg->mr_base); + *iptr++ = cpu_to_be32(mw->mw_handle); + *iptr++ = cpu_to_be32(mw->mw_length); + return xdr_encode_hyper(iptr, mw->mw_offset); } /* XDR-encode the Read list. Supports encoding a list of read @@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpc_rqst *rqst, __be32 *iptr, enum rpcrdma_chunktype rtype) { - struct rpcrdma_mr_seg *seg = req->rl_nextseg; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; unsigned int pos; int n, nsegs; @@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, pos = rqst->rq_snd_buf.head[0].iov_len; if (rtype == rpcrdma_areadch) pos = 0; - nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, - RPCRDMA_MAX_SEGS - req->rl_nchunks); + seg = req->rl_segments; + nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false); - if (n <= 0) + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + false, &mw); + if (n < 0) return ERR_PTR(n); + list_add(&mw->mw_list, &req->rl_registered); *iptr++ = xdr_one; /* item present */ @@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, * have the same "position". */ *iptr++ = cpu_to_be32(pos); - iptr = xdr_encode_rdma_segment(iptr, seg); + iptr = xdr_encode_rdma_segment(iptr, mw); - dprintk("RPC: %5u %s: read segment pos %u " - "%d@0x%016llx:0x%08x (%s)\n", + dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, pos, - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, n < nsegs ? "more" : "last"); + mw->mw_length, (unsigned long long)mw->mw_offset, + mw->mw_handle, n < nsegs ? "more" : "last"); r_xprt->rx_stats.read_chunk_count++; - req->rl_nchunks++; seg += n; nsegs -= n; } while (nsegs); - req->rl_nextseg = seg; /* Finish Read list */ *iptr++ = xdr_zero; /* Next item not present */ @@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpc_rqst *rqst, __be32 *iptr, enum rpcrdma_chunktype wtype) { - struct rpcrdma_mr_seg *seg = req->rl_nextseg; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; int n, nsegs, nchunks; __be32 *segcount; @@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return iptr; } + seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, rqst->rq_rcv_buf.head[0].iov_len, - wtype, seg, - RPCRDMA_MAX_SEGS - req->rl_nchunks); + wtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); - if (n <= 0) + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mw); + if (n < 0) return ERR_PTR(n); + list_add(&mw->mw_list, &req->rl_registered); - iptr = xdr_encode_rdma_segment(iptr, seg); + iptr = xdr_encode_rdma_segment(iptr, mw); - dprintk("RPC: %5u %s: write segment " - "%d@0x016%llx:0x%08x (%s)\n", + dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, n < nsegs ? "more" : "last"); + mw->mw_length, (unsigned long long)mw->mw_offset, + mw->mw_handle, n < nsegs ? "more" : "last"); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; - req->rl_nchunks++; nchunks++; seg += n; nsegs -= n; } while (nsegs); - req->rl_nextseg = seg; /* Update count of segments in this Write chunk */ *segcount = cpu_to_be32(nchunks); @@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpc_rqst *rqst, __be32 *iptr, enum rpcrdma_chunktype wtype) { - struct rpcrdma_mr_seg *seg = req->rl_nextseg; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; int n, nsegs, nchunks; __be32 *segcount; @@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, return iptr; } - nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, - RPCRDMA_MAX_SEGS - req->rl_nchunks); + seg = req->rl_segments; + nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, nchunks = 0; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); - if (n <= 0) + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mw); + if (n < 0) return ERR_PTR(n); + list_add(&mw->mw_list, &req->rl_registered); - iptr = xdr_encode_rdma_segment(iptr, seg); + iptr = xdr_encode_rdma_segment(iptr, mw); - dprintk("RPC: %5u %s: reply segment " - "%d@0x%016llx:0x%08x (%s)\n", + dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, n < nsegs ? "more" : "last"); + mw->mw_length, (unsigned long long)mw->mw_offset, + mw->mw_handle, n < nsegs ? "more" : "last"); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; - req->rl_nchunks++; nchunks++; seg += n; nsegs -= n; } while (nsegs); - req->rl_nextseg = seg; /* Update count of segments in the Reply chunk */ *segcount = cpu_to_be32(nchunks); @@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpcrdma_req *req = rpcr_to_rdmar(rqst); enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; + bool ddp_allowed; ssize_t hdrlen; size_t rpclen; __be32 *iptr; @@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); headerp->rm_type = rdma_msg; + /* When the ULP employs a GSS flavor that guarantees integrity + * or privacy, direct data placement of individual data items + * is not allowed. + */ + ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & + RPCAUTH_AUTH_DATATOUCH); + /* * Chunks needed for results? * @@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) */ if (rpcrdma_results_inline(r_xprt, rqst)) wtype = rpcrdma_noch; - else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) wtype = rpcrdma_writech; else wtype = rpcrdma_replych; @@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) rtype = rpcrdma_noch; rpcrdma_inline_pullup(rqst); rpclen = rqst->rq_svec[0].iov_len; - } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; rpclen = rqst->rq_svec[0].iov_len; rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); @@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * send a Call message with a Position Zero Read chunk and a * regular Read chunk at the same time. */ - req->rl_nchunks = 0; - req->rl_nextseg = req->rl_segments; iptr = headerp->rm_body.rm_chunks; iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); if (IS_ERR(iptr)) @@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) out_overflow: pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); - /* Terminate this RPC. Chunks registered above will be - * released by xprt_release -> xprt_rmda_free . - */ - return -EIO; + iptr = ERR_PTR(-EIO); out_unmap: r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); @@ -705,15 +711,13 @@ out_unmap: * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) */ static int -rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp) +rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp) { unsigned int i, total_len; struct rpcrdma_write_chunk *cur_wchunk; char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); i = be32_to_cpu(**iptrp); - if (i > max) - return -1; cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); total_len = 0; while (i--) { @@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b return total_len; } -/* - * Scatter inline received data back into provided iov's. +/** + * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs + * @rqst: controlling RPC request + * @srcp: points to RPC message payload in receive buffer + * @copy_len: remaining length of receive buffer content + * @pad: Write chunk pad bytes needed (zero for pure inline) + * + * The upper layer has set the maximum number of bytes it can + * receive in each component of rq_rcv_buf. These values are set in + * the head.iov_len, page_len, tail.iov_len, and buflen fields. + * + * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in + * many cases this function simply updates iov_base pointers in + * rq_rcv_buf to point directly to the received reply data, to + * avoid copying reply data. + * + * Returns the count of bytes which had to be memcopied. */ -static void +static unsigned long rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) { - int i, npages, curlen, olen; + unsigned long fixup_copy_count; + int i, npages, curlen; char *destp; struct page **ppages; int page_base; + /* The head iovec is redirected to the RPC reply message + * in the receive buffer, to avoid a memcopy. + */ + rqst->rq_rcv_buf.head[0].iov_base = srcp; + rqst->rq_private_buf.head[0].iov_base = srcp; + + /* The contents of the receive buffer that follow + * head.iov_len bytes are copied into the page list. + */ curlen = rqst->rq_rcv_buf.head[0].iov_len; - if (curlen > copy_len) { /* write chunk header fixup */ + if (curlen > copy_len) curlen = copy_len; - rqst->rq_rcv_buf.head[0].iov_len = curlen; - } - dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", __func__, srcp, copy_len, curlen); - - /* Shift pointer for first receive segment only */ - rqst->rq_rcv_buf.head[0].iov_base = srcp; srcp += curlen; copy_len -= curlen; - olen = copy_len; - i = 0; - rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; page_base = rqst->rq_rcv_buf.page_base; ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); page_base &= ~PAGE_MASK; - + fixup_copy_count = 0; if (copy_len && rqst->rq_rcv_buf.page_len) { - npages = PAGE_ALIGN(page_base + - rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; - for (; i < npages; i++) { + int pagelist_len; + + pagelist_len = rqst->rq_rcv_buf.page_len; + if (pagelist_len > copy_len) + pagelist_len = copy_len; + npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { curlen = PAGE_SIZE - page_base; - if (curlen > copy_len) - curlen = copy_len; + if (curlen > pagelist_len) + curlen = pagelist_len; + dprintk("RPC: %s: page %d" " srcp 0x%p len %d curlen %d\n", __func__, i, srcp, copy_len, curlen); @@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) kunmap_atomic(destp); srcp += curlen; copy_len -= curlen; - if (copy_len == 0) + fixup_copy_count += curlen; + pagelist_len -= curlen; + if (!pagelist_len) break; page_base = 0; } - } - if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { - curlen = copy_len; - if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) - curlen = rqst->rq_rcv_buf.tail[0].iov_len; - if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) - memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); - dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", - __func__, srcp, copy_len, curlen); - rqst->rq_rcv_buf.tail[0].iov_len = curlen; - copy_len -= curlen; ++i; - } else - rqst->rq_rcv_buf.tail[0].iov_len = 0; - - if (pad) { - /* implicit padding on terminal chunk */ - unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; - while (pad--) - p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; + /* Implicit padding for the last segment in a Write + * chunk is inserted inline at the front of the tail + * iovec. The upper layer ignores the content of + * the pad. Simply ensure inline content in the tail + * that follows the Write chunk is properly aligned. + */ + if (pad) + srcp -= pad; } - if (copy_len) - dprintk("RPC: %s: %d bytes in" - " %d extra segments (%d lost)\n", - __func__, olen, i, copy_len); + /* The tail iovec is redirected to the remaining data + * in the receive buffer, to avoid a memcopy. + */ + if (copy_len || pad) { + rqst->rq_rcv_buf.tail[0].iov_base = srcp; + rqst->rq_private_buf.tail[0].iov_base = srcp; + } - /* TBD avoid a warning from call_decode() */ - rqst->rq_private_buf = rqst->rq_rcv_buf; + return fixup_copy_count; } void @@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) (headerp->rm_body.rm_chunks[1] == xdr_zero && headerp->rm_body.rm_chunks[2] != xdr_zero) || (headerp->rm_body.rm_chunks[1] != xdr_zero && - req->rl_nchunks == 0)) + list_empty(&req->rl_registered))) goto badheader; if (headerp->rm_body.rm_chunks[1] != xdr_zero) { /* count any expected write chunks in read reply */ /* start at write chunk array count */ iptr = &headerp->rm_body.rm_chunks[2]; - rdmalen = rpcrdma_count_chunks(rep, - req->rl_nchunks, 1, &iptr); + rdmalen = rpcrdma_count_chunks(rep, 1, &iptr); /* check for validity, and no reply chunk after */ if (rdmalen < 0 || *iptr++ != xdr_zero) goto badheader; @@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) rep->rr_len -= RPCRDMA_HDRLEN_MIN; status = rep->rr_len; } - /* Fix up the rpc results for upper layer */ - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); + + r_xprt->rx_stats.fixup_copy_count += + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, + rdmalen); break; case rdma_nomsg: @@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (headerp->rm_body.rm_chunks[0] != xdr_zero || headerp->rm_body.rm_chunks[1] != xdr_zero || headerp->rm_body.rm_chunks[2] != xdr_one || - req->rl_nchunks == 0) + list_empty(&req->rl_registered)) goto badheader; iptr = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); - rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); + rdmalen = rpcrdma_count_chunks(rep, 0, &iptr); if (rdmalen < 0) goto badheader; r_xprt->rx_stats.total_rdma_reply += rdmalen; @@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) badheader: default: - dprintk("%s: invalid rpcrdma reply header (type %d):" - " chunks[012] == %d %d %d" - " expected chunks <= %d\n", - __func__, be32_to_cpu(headerp->rm_type), - headerp->rm_body.rm_chunks[0], - headerp->rm_body.rm_chunks[1], - headerp->rm_body.rm_chunks[2], - req->rl_nchunks); + dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", + rqst->rq_task->tk_pid, __func__, + be32_to_cpu(headerp->rm_type)); status = -EIO; r_xprt->rx_stats.bad_reply_count++; break; @@ -1035,7 +1049,7 @@ out: * control: waking the next RPC waits until this RPC has * relinquished all its Send Queue entries. */ - if (req->rl_nchunks) + if (!list_empty(&req->rl_registered)) r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); spin_lock_bh(&xprt->transport_lock); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 99d2e5b..81f0e87 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -558,7 +558,6 @@ out_sendbuf: out_fail: rpcrdma_buffer_put(req); - r_xprt->rx_stats.failed_marshal_count++; return NULL; } @@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer) rpcrdma_buffer_put(req); } -/* +/** + * xprt_rdma_send_request - marshal and send an RPC request + * @task: RPC task with an RPC message in rq_snd_buf + * + * Return values: + * 0: The request has been sent + * ENOTCONN: Caller needs to invoke connect logic then call again + * ENOBUFS: Call again later to send the request + * EIO: A permanent error occurred. The request was not sent, + * and don't try it again + * * send_request invokes the meat of RPC RDMA. It must do the following: + * * 1. Marshal the RPC request into an RPC RDMA request, which means * putting a header in front of data, and creating IOVs for RDMA * from those in the request. @@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer) * the request (rpcrdma_ep_post). * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). */ - static int xprt_rdma_send_request(struct rpc_task *task) { @@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; + /* On retransmit, remove any previously registered chunks */ + r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); + rc = rpcrdma_marshal_req(rqst); if (rc < 0) goto failed_marshal; @@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task) return 0; failed_marshal: - r_xprt->rx_stats.failed_marshal_count++; dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", __func__, rc); if (rc == -EIO) - return -EIO; + r_xprt->rx_stats.failed_marshal_count++; + if (rc != -ENOTCONN) + return rc; drop_connection: xprt_disconnect_done(xprt); return -ENOTCONN; /* implies disconnect */ @@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) xprt->stat.bad_xids, xprt->stat.req_u, xprt->stat.bklog_u); - seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", + seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ", r_xprt->rx_stats.read_chunk_count, r_xprt->rx_stats.write_chunk_count, r_xprt->rx_stats.reply_chunk_count, @@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.failed_marshal_count, r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); + seq_printf(seq, "%lu %lu %lu\n", + r_xprt->rx_stats.mrs_recovered, + r_xprt->rx_stats.mrs_orphaned, + r_xprt->rx_stats.mrs_allocated); } static int @@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void) __func__, rc); rpcrdma_destroy_wq(); - frwr_destroy_recovery_wq(); rc = xprt_unregister_transport(&xprt_rdma_bc); if (rc) @@ -753,20 +769,13 @@ int xprt_rdma_init(void) { int rc; - rc = frwr_alloc_recovery_wq(); - if (rc) - return rc; - rc = rpcrdma_alloc_wq(); - if (rc) { - frwr_destroy_recovery_wq(); + if (rc) return rc; - } rc = xprt_register_transport(&xprt_rdma); if (rc) { rpcrdma_destroy_wq(); - frwr_destroy_recovery_wq(); return rc; } @@ -774,7 +783,6 @@ int xprt_rdma_init(void) if (rc) { xprt_unregister_transport(&xprt_rdma); rpcrdma_destroy_wq(); - frwr_destroy_recovery_wq(); return rc; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b044d98a..536d0be 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -379,8 +379,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) struct rpcrdma_ia *ia = &xprt->rx_ia; int rc; - ia->ri_dma_mr = NULL; - ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); @@ -391,47 +389,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) ia->ri_pd = ib_alloc_pd(ia->ri_device); if (IS_ERR(ia->ri_pd)) { rc = PTR_ERR(ia->ri_pd); - dprintk("RPC: %s: ib_alloc_pd() failed %i\n", - __func__, rc); + pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); goto out2; } - if (memreg == RPCRDMA_FRMR) { - if (!(ia->ri_device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS) || - (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) { - dprintk("RPC: %s: FRMR registration " - "not supported by HCA\n", __func__); - memreg = RPCRDMA_MTHCAFMR; - } - } - if (memreg == RPCRDMA_MTHCAFMR) { - if (!ia->ri_device->alloc_fmr) { - dprintk("RPC: %s: MTHCAFMR registration " - "not supported by HCA\n", __func__); - rc = -EINVAL; - goto out3; - } - } - switch (memreg) { case RPCRDMA_FRMR: - ia->ri_ops = &rpcrdma_frwr_memreg_ops; - break; - case RPCRDMA_ALLPHYSICAL: - ia->ri_ops = &rpcrdma_physical_memreg_ops; - break; + if (frwr_is_supported(ia)) { + ia->ri_ops = &rpcrdma_frwr_memreg_ops; + break; + } + /*FALLTHROUGH*/ case RPCRDMA_MTHCAFMR: - ia->ri_ops = &rpcrdma_fmr_memreg_ops; - break; + if (fmr_is_supported(ia)) { + ia->ri_ops = &rpcrdma_fmr_memreg_ops; + break; + } + /*FALLTHROUGH*/ default: - printk(KERN_ERR "RPC: Unsupported memory " - "registration mode: %d\n", memreg); - rc = -ENOMEM; + pr_err("rpcrdma: Unsupported memory registration mode: %d\n", + memreg); + rc = -EINVAL; goto out3; } - dprintk("RPC: %s: memory registration strategy is '%s'\n", - __func__, ia->ri_ops->ro_displayname); return 0; @@ -585,8 +565,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, out2: ib_free_cq(sendcq); out1: - if (ia->ri_dma_mr) - ib_dereg_mr(ia->ri_dma_mr); return rc; } @@ -600,8 +578,6 @@ out1: void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - int rc; - dprintk("RPC: %s: entering, connected is %d\n", __func__, ep->rep_connected); @@ -615,12 +591,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ib_free_cq(ep->rep_attr.recv_cq); ib_free_cq(ep->rep_attr.send_cq); - - if (ia->ri_dma_mr) { - rc = ib_dereg_mr(ia->ri_dma_mr); - dprintk("RPC: %s: ib_dereg_mr returned %i\n", - __func__, rc); - } } /* @@ -777,6 +747,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ib_drain_qp(ia->ri_id->qp); } +static void +rpcrdma_mr_recovery_worker(struct work_struct *work) +{ + struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, + rb_recovery_worker.work); + struct rpcrdma_mw *mw; + + spin_lock(&buf->rb_recovery_lock); + while (!list_empty(&buf->rb_stale_mrs)) { + mw = list_first_entry(&buf->rb_stale_mrs, + struct rpcrdma_mw, mw_list); + list_del_init(&mw->mw_list); + spin_unlock(&buf->rb_recovery_lock); + + dprintk("RPC: %s: recovering MR %p\n", __func__, mw); + mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw); + + spin_lock(&buf->rb_recovery_lock); + } + spin_unlock(&buf->rb_recovery_lock); +} + +void +rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw) +{ + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + + spin_lock(&buf->rb_recovery_lock); + list_add(&mw->mw_list, &buf->rb_stale_mrs); + spin_unlock(&buf->rb_recovery_lock); + + schedule_delayed_work(&buf->rb_recovery_worker, 0); +} + +static void +rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + unsigned int count; + LIST_HEAD(free); + LIST_HEAD(all); + + for (count = 0; count < 32; count++) { + struct rpcrdma_mw *mw; + int rc; + + mw = kzalloc(sizeof(*mw), GFP_KERNEL); + if (!mw) + break; + + rc = ia->ri_ops->ro_init_mr(ia, mw); + if (rc) { + kfree(mw); + break; + } + + mw->mw_xprt = r_xprt; + + list_add(&mw->mw_list, &free); + list_add(&mw->mw_all, &all); + } + + spin_lock(&buf->rb_mwlock); + list_splice(&free, &buf->rb_mws); + list_splice(&all, &buf->rb_all); + r_xprt->rx_stats.mrs_allocated += count; + spin_unlock(&buf->rb_mwlock); + + dprintk("RPC: %s: created %u MRs\n", __func__, count); +} + +static void +rpcrdma_mr_refresh_worker(struct work_struct *work) +{ + struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, + rb_refresh_worker.work); + struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, + rx_buf); + + rpcrdma_create_mrs(r_xprt); +} + struct rpcrdma_req * rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) { @@ -793,6 +847,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) spin_unlock(&buffer->rb_reqslock); req->rl_cqe.done = rpcrdma_wc_send; req->rl_buffer = &r_xprt->rx_buf; + INIT_LIST_HEAD(&req->rl_registered); return req; } @@ -832,17 +887,23 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; int i, rc; buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; - spin_lock_init(&buf->rb_lock); atomic_set(&buf->rb_credits, 1); + spin_lock_init(&buf->rb_mwlock); + spin_lock_init(&buf->rb_lock); + spin_lock_init(&buf->rb_recovery_lock); + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + INIT_LIST_HEAD(&buf->rb_stale_mrs); + INIT_DELAYED_WORK(&buf->rb_refresh_worker, + rpcrdma_mr_refresh_worker); + INIT_DELAYED_WORK(&buf->rb_recovery_worker, + rpcrdma_mr_recovery_worker); - rc = ia->ri_ops->ro_init(r_xprt); - if (rc) - goto out; + rpcrdma_create_mrs(r_xprt); INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); @@ -862,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests + 2; i++) { + for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_rep *rep; rep = rpcrdma_create_rep(r_xprt); @@ -918,11 +979,39 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) kfree(req); } +static void +rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, + rx_buf); + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct rpcrdma_mw *mw; + unsigned int count; + + count = 0; + spin_lock(&buf->rb_mwlock); + while (!list_empty(&buf->rb_all)) { + mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&mw->mw_all); + + spin_unlock(&buf->rb_mwlock); + ia->ri_ops->ro_release_mr(mw); + count++; + spin_lock(&buf->rb_mwlock); + } + spin_unlock(&buf->rb_mwlock); + r_xprt->rx_stats.mrs_allocated = 0; + + dprintk("RPC: %s: released %u MRs\n", __func__, count); +} + void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_ia *ia = rdmab_to_ia(buf); + cancel_delayed_work_sync(&buf->rb_recovery_worker); + while (!list_empty(&buf->rb_recv_bufs)) { struct rpcrdma_rep *rep; @@ -944,7 +1033,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } spin_unlock(&buf->rb_reqslock); - ia->ri_ops->ro_destroy(buf); + rpcrdma_destroy_mrs(buf); } struct rpcrdma_mw * @@ -962,8 +1051,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) spin_unlock(&buf->rb_mwlock); if (!mw) - pr_err("RPC: %s: no MWs available\n", __func__); + goto out_nomws; return mw; + +out_nomws: + dprintk("RPC: %s: no MWs available\n", __func__); + schedule_delayed_work(&buf->rb_refresh_worker, 0); + + /* Allow the reply handler and refresh worker to run */ + cond_resched(); + + return NULL; } void @@ -978,8 +1076,6 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) /* * Get a set of request/reply buffers. - * - * Reply buffer (if available) is attached to send buffer upon return. */ struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) @@ -998,13 +1094,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) out_reqbuf: spin_unlock(&buffers->rb_lock); - pr_warn("RPC: %s: out of request buffers\n", __func__); + pr_warn("rpcrdma: out of request buffers (%p)\n", buffers); return NULL; out_repbuf: + list_add(&req->rl_free, &buffers->rb_send_bufs); spin_unlock(&buffers->rb_lock); - pr_warn("RPC: %s: out of reply buffers\n", __func__); - req->rl_reply = NULL; - return req; + pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers); + return NULL; } /* @@ -1060,14 +1156,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * Wrappers for internal-use kmalloc memory registration, used by buffer code. */ -void -rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) -{ - dprintk("RPC: map_one: offset %p iova %llx len %zu\n", - seg->mr_offset, - (unsigned long long)seg->mr_dma, seg->mr_dmalen); -} - /** * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers * @ia: controlling rpcrdma_ia @@ -1150,7 +1238,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, if (rep) { rc = rpcrdma_ep_post_recv(ia, ep, rep); if (rc) - goto out; + return rc; req->rl_reply = NULL; } @@ -1175,10 +1263,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); if (rc) - dprintk("RPC: %s: ib_post_send returned %i\n", __func__, - rc); -out: - return rc; + goto out_postsend_err; + return 0; + +out_postsend_err: + pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc); + return -ENOTCONN; } /* @@ -1203,11 +1293,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, DMA_BIDIRECTIONAL); rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); - if (rc) - dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, - rc); - return rc; + goto out_postrecv; + return 0; + +out_postrecv: + pr_err("rpcrdma: ib_post_recv returned %i\n", rc); + return -ENOTCONN; } /** diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 95cdc66..670fad5 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -68,7 +68,6 @@ struct rpcrdma_ia { struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; - struct ib_mr *ri_dma_mr; struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; @@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) * o recv buffer (posted to provider) * o ib_sge (also donated to provider) * o status of reply (length, success or not) - * o bookkeeping state to get run by tasklet (list, etc) + * o bookkeeping state to get run by reply handler (list, etc) * - * These are allocated during initialization, per-transport instance; - * however, the tasklet execution list itself is global, as it should - * always be pretty short. + * These are allocated during initialization, per-transport instance. * * N of these are associated with a transport instance, and stored in * struct rpcrdma_buffer. N is the max number of outstanding requests. */ -#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) - -/* data segments + head/tail for Call + head/tail for Reply */ -#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4) - -struct rpcrdma_buffer; - struct rpcrdma_rep { struct ib_cqe rr_cqe; unsigned int rr_len; @@ -221,9 +211,6 @@ enum rpcrdma_frmr_state { }; struct rpcrdma_frmr { - struct scatterlist *fr_sg; - int fr_nents; - enum dma_data_direction fr_dir; struct ib_mr *fr_mr; struct ib_cqe fr_cqe; enum rpcrdma_frmr_state fr_state; @@ -235,18 +222,23 @@ struct rpcrdma_frmr { }; struct rpcrdma_fmr { - struct ib_fmr *fmr; - u64 *physaddrs; + struct ib_fmr *fm_mr; + u64 *fm_physaddrs; }; struct rpcrdma_mw { + struct list_head mw_list; + struct scatterlist *mw_sg; + int mw_nents; + enum dma_data_direction mw_dir; union { struct rpcrdma_fmr fmr; struct rpcrdma_frmr frmr; }; - struct work_struct mw_work; struct rpcrdma_xprt *mw_xprt; - struct list_head mw_list; + u32 mw_handle; + u32 mw_length; + u64 mw_offset; struct list_head mw_all; }; @@ -266,33 +258,30 @@ struct rpcrdma_mw { * of iovs for send operations. The reason is that the iovs passed to * ib_post_{send,recv} must not be modified until the work request * completes. - * - * NOTES: - * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we - * marshal. The number needed varies depending on the iov lists that - * are passed to us, the memory registration mode we are in, and if - * physical addressing is used, the layout. */ +/* Maximum number of page-sized "segments" per chunk list to be + * registered or invalidated. Must handle a Reply chunk: + */ +enum { + RPCRDMA_MAX_IOV_SEGS = 3, + RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1, + RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS + + RPCRDMA_MAX_IOV_SEGS, +}; + struct rpcrdma_mr_seg { /* chunk descriptors */ - struct rpcrdma_mw *rl_mw; /* registered MR */ - u64 mr_base; /* registration result */ - u32 mr_rkey; /* registration result */ u32 mr_len; /* length of chunk or segment */ - int mr_nsegs; /* number of segments in chunk or 0 */ - enum dma_data_direction mr_dir; /* segment mapping direction */ - dma_addr_t mr_dma; /* segment mapping address */ - size_t mr_dmalen; /* segment mapping length */ struct page *mr_page; /* owning page, if any */ char *mr_offset; /* kva if no page, else offset */ }; #define RPCRDMA_MAX_IOVS (2) +struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_free; unsigned int rl_niovs; - unsigned int rl_nchunks; unsigned int rl_connect_cookie; struct rpc_task *rl_task; struct rpcrdma_buffer *rl_buffer; @@ -300,12 +289,13 @@ struct rpcrdma_req { struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; struct rpcrdma_regbuf *rl_rdmabuf; struct rpcrdma_regbuf *rl_sendbuf; - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; - struct rpcrdma_mr_seg *rl_nextseg; struct ib_cqe rl_cqe; struct list_head rl_all; bool rl_backchannel; + + struct list_head rl_registered; /* registered segments */ + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; static inline struct rpcrdma_req * @@ -341,6 +331,11 @@ struct rpcrdma_buffer { struct list_head rb_allreqs; u32 rb_bc_max_requests; + + spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ + struct list_head rb_stale_mrs; + struct delayed_work rb_recovery_worker; + struct delayed_work rb_refresh_worker; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -387,6 +382,9 @@ struct rpcrdma_stats { unsigned long bad_reply_count; unsigned long nomsg_call_count; unsigned long bcall_count; + unsigned long mrs_recovered; + unsigned long mrs_orphaned; + unsigned long mrs_allocated; }; /* @@ -395,23 +393,25 @@ struct rpcrdma_stats { struct rpcrdma_xprt; struct rpcrdma_memreg_ops { int (*ro_map)(struct rpcrdma_xprt *, - struct rpcrdma_mr_seg *, int, bool); + struct rpcrdma_mr_seg *, int, bool, + struct rpcrdma_mw **); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct rpcrdma_req *); void (*ro_unmap_safe)(struct rpcrdma_xprt *, struct rpcrdma_req *, bool); + void (*ro_recover_mr)(struct rpcrdma_mw *); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); size_t (*ro_maxpages)(struct rpcrdma_xprt *); - int (*ro_init)(struct rpcrdma_xprt *); - void (*ro_destroy)(struct rpcrdma_buffer *); + int (*ro_init_mr)(struct rpcrdma_ia *, + struct rpcrdma_mw *); + void (*ro_release_mr)(struct rpcrdma_mw *); const char *ro_displayname; }; extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; -extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops; /* * RPCRDMA transport -- encapsulates the structures above for @@ -446,6 +446,8 @@ extern int xprt_rdma_pad_optimize; */ int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); void rpcrdma_ia_close(struct rpcrdma_ia *); +bool frwr_is_supported(struct rpcrdma_ia *); +bool fmr_is_supported(struct rpcrdma_ia *); /* * Endpoint calls - xprtrdma/verbs.c @@ -477,6 +479,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); +void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); + struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, size_t, gfp_t); void rpcrdma_free_regbuf(struct rpcrdma_ia *, @@ -484,9 +488,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *, int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); -int frwr_alloc_recovery_wq(void); -void frwr_destroy_recovery_wq(void); - int rpcrdma_alloc_wq(void); void rpcrdma_destroy_wq(void); @@ -494,45 +495,12 @@ void rpcrdma_destroy_wq(void); * Wrappers for chunk registration, shared by read/write chunk code. */ -void rpcrdma_mapping_error(struct rpcrdma_mr_seg *); - static inline enum dma_data_direction rpcrdma_data_dir(bool writing) { return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; } -static inline void -rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg, - enum dma_data_direction direction) -{ - seg->mr_dir = direction; - seg->mr_dmalen = seg->mr_len; - - if (seg->mr_page) - seg->mr_dma = ib_dma_map_page(device, - seg->mr_page, offset_in_page(seg->mr_offset), - seg->mr_dmalen, seg->mr_dir); - else - seg->mr_dma = ib_dma_map_single(device, - seg->mr_offset, - seg->mr_dmalen, seg->mr_dir); - - if (ib_dma_mapping_error(device, seg->mr_dma)) - rpcrdma_mapping_error(seg); -} - -static inline void -rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg) -{ - if (seg->mr_page) - ib_dma_unmap_page(device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); - else - ib_dma_unmap_single(device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); -} - /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7e2b2fa..111767a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -124,7 +124,7 @@ static struct ctl_table xs_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &xprt_min_resvport_limit, - .extra2 = &xprt_max_resvport_limit + .extra2 = &xprt_max_resvport }, { .procname = "max_resvport", @@ -132,7 +132,7 @@ static struct ctl_table xs_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &xprt_min_resvport_limit, + .extra1 = &xprt_min_resvport, .extra2 = &xprt_max_resvport_limit }, { @@ -642,6 +642,7 @@ static int xs_tcp_send_request(struct rpc_task *task) struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; bool zerocopy = true; + bool vm_wait = false; int status; int sent; @@ -677,15 +678,33 @@ static int xs_tcp_send_request(struct rpc_task *task) return 0; } + WARN_ON_ONCE(sent == 0 && status == 0); + + if (status == -EAGAIN ) { + /* + * Return EAGAIN if we're sure we're hitting the + * socket send buffer limits. + */ + if (test_bit(SOCK_NOSPACE, &transport->sock->flags)) + break; + /* + * Did we hit a memory allocation failure? + */ + if (sent == 0) { + status = -ENOBUFS; + if (vm_wait) + break; + /* Retry, knowing now that we're below the + * socket send buffer limit + */ + vm_wait = true; + } + continue; + } if (status < 0) break; - if (sent == 0) { - status = -EAGAIN; - break; - } + vm_wait = false; } - if (status == -EAGAIN && sk_stream_is_writeable(transport->inet)) - status = -ENOBUFS; switch (status) { case -ENOTSOCK: @@ -755,11 +774,19 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s sk->sk_error_report = transport->old_error_report; } +static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); +} + static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { smp_mb__before_atomic(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); + xs_sock_reset_state_flags(xprt); smp_mb__after_atomic(); } @@ -962,10 +989,13 @@ static void xs_local_data_receive(struct sock_xprt *transport) goto out; for (;;) { skb = skb_recv_datagram(sk, 0, 1, &err); - if (skb == NULL) + if (skb != NULL) { + xs_local_data_read_skb(&transport->xprt, sk, skb); + skb_free_datagram(sk, skb); + continue; + } + if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) break; - xs_local_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); } out: mutex_unlock(&transport->recv_mutex); @@ -1043,10 +1073,13 @@ static void xs_udp_data_receive(struct sock_xprt *transport) goto out; for (;;) { skb = skb_recv_datagram(sk, 0, 1, &err); - if (skb == NULL) + if (skb != NULL) { + xs_udp_data_read_skb(&transport->xprt, sk, skb); + skb_free_datagram(sk, skb); + continue; + } + if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) break; - xs_udp_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); } out: mutex_unlock(&transport->recv_mutex); @@ -1074,7 +1107,14 @@ static void xs_data_ready(struct sock *sk) if (xprt != NULL) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - queue_work(rpciod_workqueue, &transport->recv_worker); + transport->old_data_ready(sk); + /* Any data means we had a useful conversation, so + * then we don't need to delay the next reconnect + */ + if (xprt->reestablish_timeout) + xprt->reestablish_timeout = 0; + if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + queue_work(xprtiod_workqueue, &transport->recv_worker); } read_unlock_bh(&sk->sk_callback_lock); } @@ -1474,10 +1514,15 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) for (;;) { lock_sock(sk); read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); - release_sock(sk); - if (read <= 0) - break; - total += read; + if (read <= 0) { + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); + release_sock(sk); + if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + break; + } else { + release_sock(sk); + total += read; + } rd_desc.count = 65536; } out: @@ -1493,34 +1538,6 @@ static void xs_tcp_data_receive_workfn(struct work_struct *work) } /** - * xs_tcp_data_ready - "data ready" callback for TCP sockets - * @sk: socket with data to read - * - */ -static void xs_tcp_data_ready(struct sock *sk) -{ - struct sock_xprt *transport; - struct rpc_xprt *xprt; - - dprintk("RPC: xs_tcp_data_ready...\n"); - - read_lock_bh(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk))) - goto out; - transport = container_of(xprt, struct sock_xprt, xprt); - - /* Any data means we had a useful conversation, so - * the we don't need to delay the next reconnect - */ - if (xprt->reestablish_timeout) - xprt->reestablish_timeout = 0; - queue_work(rpciod_workqueue, &transport->recv_worker); - -out: - read_unlock_bh(&sk->sk_callback_lock); -} - -/** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed * @@ -1714,7 +1731,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) static unsigned short xs_get_random_port(void) { - unsigned short range = xprt_max_resvport - xprt_min_resvport; + unsigned short range = xprt_max_resvport - xprt_min_resvport + 1; unsigned short rand = (unsigned short) prandom_u32() % range; return rand + xprt_min_resvport; } @@ -2241,7 +2258,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; - sk->sk_data_ready = xs_tcp_data_ready; + sk->sk_data_ready = xs_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sock_set_flag(sk, SOCK_FASYNC); @@ -2380,7 +2397,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) /* Start by resetting any existing state */ xs_reset_transport(transport); - queue_delayed_work(rpciod_workqueue, + queue_delayed_work(xprtiod_workqueue, &transport->connect_worker, xprt->reestablish_timeout); xprt->reestablish_timeout <<= 1; @@ -2390,7 +2407,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; } else { dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); - queue_delayed_work(rpciod_workqueue, + queue_delayed_work(xprtiod_workqueue, &transport->connect_worker, 0); } } @@ -3153,8 +3170,12 @@ static int param_set_uint_minmax(const char *val, static int param_set_portnr(const char *val, const struct kernel_param *kp) { - return param_set_uint_minmax(val, kp, + if (kp->arg == &xprt_min_resvport) + return param_set_uint_minmax(val, kp, RPC_MIN_RESVPORT, + xprt_max_resvport); + return param_set_uint_minmax(val, kp, + xprt_min_resvport, RPC_MAX_RESVPORT); } |