40 files changed, 2562 insertions, 254 deletions
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt index c49cd7e..682a59f 100644 --- a/Documentation/filesystems/nfs/nfs41-server.txt +++ b/Documentation/filesystems/nfs/nfs41-server.txt @@ -24,11 +24,6 @@ focuses on the mandatory-to-implement NFSv4.1 Sessions, providing "exactly once" semantics and better control and throttling of the resources allocated for each client. -Other NFSv4.1 features, Parallel NFS operations in particular, -are still under development out of tree. -See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design -for more information. - The table below, taken from the NFSv4.1 document, lists the operations that are mandatory to implement (REQ), optional (OPT), and NFSv4.0 operations that are required not to implement (MNI) @@ -43,9 +38,7 @@ The OPTIONAL features identified and their abbreviations are as follows: The following abbreviations indicate the linux server implementation status. I Implemented NFSv4.1 operations. NS Not Supported. - NS* unimplemented optional feature. - P pNFS features implemented out of tree. - PNS pNFS features that are not supported yet (out of tree). + NS* Unimplemented optional feature. Operations @@ -70,13 +63,13 @@ I | DESTROY_SESSION | REQ | | Section 18.37 | I | EXCHANGE_ID | REQ | | Section 18.35 | I | FREE_STATEID | REQ | | Section 18.38 | | GETATTR | REQ | | Section 18.7 | -P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 | -P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 | +I | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 | +NS*| GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 | | GETFH | REQ | | Section 18.8 | NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 | -P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 | -P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 | -P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 | +I | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 | +I | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 | +I | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 | | LINK | OPT | | Section 18.9 | | LOCK | REQ | | Section 18.10 | | LOCKT | REQ | | Section 18.11 | @@ -122,9 +115,9 @@ Callback Operations | | MNI | or OPT) | | +-------------------------+-----------+-------------+---------------+ | CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 | -P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 | +I | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 | NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 | -P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 | +NS*| CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 | NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 | NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 | | CB_RECALL | OPT | FDELG, | Section 20.2 | diff --git a/Documentation/filesystems/nfs/pnfs-block-server.txt b/Documentation/filesystems/nfs/pnfs-block-server.txt new file mode 100644 index 0000000..2143673 --- /dev/null +++ b/Documentation/filesystems/nfs/pnfs-block-server.txt @@ -0,0 +1,37 @@ +pNFS block layout server user guide + +The Linux NFS server now supports the pNFS block layout extension. In this +case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition +to handling all the metadata access to the NFS export also hands out layouts +to the clients to directly access the underlying block devices that are +shared with the client. 
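As a quick orientation, a minimal deployment could look like the following (this example is an illustration added for this guide, not text from the patch; the host name "mds" and the export path are hypothetical): the MDS exports an XFS file system that lives on an iSCSI LUN visible to both the MDS and its clients:

	# /etc/exports on the MDS
	/exports/pnfs	*(rw,no_subtree_check)

	# on a client
	mount -t nfs -o vers=4.1 mds:/exports/pnfs /mnt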
+ +To use pNFS block layouts with the Linux NFS server the exported file system needs to support pNFS block layouts (currently just XFS), and the file system must sit on shared storage (typically iSCSI) that is accessible to the clients in addition to the MDS. As of now the file system needs to sit directly on the exported volume; striping or concatenation of volumes on the MDS and clients is not supported yet. + +On the server, pNFS block volume support is enabled automatically if the file system supports it. On the client, make sure the kernel has the CONFIG_PNFS_BLOCK option enabled, the blkmapd daemon from nfs-utils is running, and the file system is mounted using the NFSv4.1 protocol version (mount -o vers=4.1). + +If the nfsd server needs to fence a non-responding client it calls /sbin/nfsd-recall-failed with the first argument set to the IP address of the client, and the second argument set to the device node without the /dev prefix for the file system to be fenced. Below is an example file that shows how to translate the device into a serial number from SCSI EVPD 0x80: + +cat > /sbin/nfsd-recall-failed << EOF +#!/bin/sh + +CLIENT="$1" +DEV="/dev/$2" +EVPD=`sg_inq --page=0x80 ${DEV} | \ + grep "Unit serial number:" | \ + awk -F ': ' '{print $2}'` + +echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log +EOF diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 5659874..5581e02 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock); static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) { /* - * We can get away with a static buffer because we're only - * called with BKL held. + * We can get away with a static buffer because this is only called + * from lockd, which is single-threaded. */ static char buf[2*NLM_MAXCOOKIELEN+1]; unsigned int i, len = sizeof(buf); diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 9340e7e..5b651da 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -95,14 +95,6 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f) return p + XDR_QUADLEN(NFS2_FHSIZE); } -static inline __be32 * -nlm_encode_fh(__be32 *p, struct nfs_fh *f) -{ - *p++ = htonl(NFS2_FHSIZE); - memcpy(p, f->data, NFS2_FHSIZE); - return p + XDR_QUADLEN(NFS2_FHSIZE); -} - /* * Encode and decode owner handle */ diff --git a/fs/locks.c b/fs/locks.c --- a/fs/locks.c +++ b/fs/locks.c @@ -137,7 +137,7 @@ #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) -#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) +#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) static bool lease_breaking(struct file_lock *fl) @@ -1371,6 +1371,8 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) { + if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) + return false; if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) return false; return locks_conflict(breaker, lease); @@ -1594,11 +1596,14 @@ int fcntl_getlease(struct file *filp) * conflict with the lease we're trying to set.
*/ static int -check_conflicting_open(const struct dentry *dentry, const long arg) +check_conflicting_open(const struct dentry *dentry, const long arg, int flags) { int ret = 0; struct inode *inode = dentry->d_inode; + if (flags & FL_LAYOUT) + return 0; + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) return -EAGAIN; @@ -1647,7 +1652,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); - error = check_conflicting_open(dentry, arg); + error = check_conflicting_open(dentry, arg, lease->fl_flags); if (error) goto out; @@ -1661,7 +1666,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr */ error = -EAGAIN; list_for_each_entry(fl, &ctx->flc_lease, fl_list) { - if (fl->fl_file == filp) { + if (fl->fl_file == filp && + fl->fl_owner == lease->fl_owner) { my_fl = fl; continue; } @@ -1702,7 +1708,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * precedes these checks. */ smp_mb(); - error = check_conflicting_open(dentry, arg); + error = check_conflicting_open(dentry, arg, lease->fl_flags); if (error) { locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt); goto out; @@ -1721,7 +1727,7 @@ out: return error; } -static int generic_delete_lease(struct file *filp) +static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lock *fl, *victim = NULL; @@ -1737,7 +1743,8 @@ static int generic_delete_lease(struct file *filp) spin_lock(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { - if (fl->fl_file == filp) { + if (fl->fl_file == filp && + fl->fl_owner == owner) { victim = fl; break; } @@ -1778,13 +1785,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp, switch (arg) { case F_UNLCK: - return generic_delete_lease(filp); + return generic_delete_lease(filp, *priv); case F_RDLCK: case F_WRLCK: if (!(*flp)->fl_lmops->lm_break) { WARN_ON_ONCE(1); return -ENOLCK; } + return generic_add_lease(filp, arg, flp, priv); default: return -EINVAL; @@ -1857,7 +1865,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) int fcntl_setlease(unsigned int fd, struct file *filp, long arg) { if (arg == F_UNLCK) - return vfs_setlease(filp, F_UNLCK, NULL, NULL); + return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); return do_fcntl_add_lease(fd, filp, arg); } diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 7339515..683bf71 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -82,6 +82,16 @@ config NFSD_V4 If unsure, say N. +config NFSD_PNFS + bool "NFSv4.1 server support for Parallel NFS (pNFS)" + depends on NFSD_V4 + help + This option enables support for the parallel NFS features of the + minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS + server. + + If unsure, say N. 
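For a concrete sketch of how this is switched on (the fragment below is illustrative and not part of the patch; CONFIG_NFSD and CONFIG_NFSD_V4 appear because NFSD_PNFS depends on NFSD_V4, which in turn requires the NFS server itself), a server .config would contain:

	CONFIG_NFSD=m
	CONFIG_NFSD_V4=y
	CONFIG_NFSD_PNFS=y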
+ config NFSD_V4_SECURITY_LABEL bool "Provide Security Label support for NFSv4 server" depends on NFSD_V4 && SECURITY diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index af32ef0..9a6028e 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -2,9 +2,14 @@ # Makefile for the Linux nfs server # +ccflags-y += -I$(src) # needed for trace events + obj-$(CONFIG_NFSD) += nfsd.o -nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ +# this one should be compiled first, as the tracing macros can easily blow up +nfsd-y += trace.o + +nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o @@ -12,3 +17,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ nfs4acl.o nfs4callback.o nfs4recover.o +nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c new file mode 100644 index 0000000..cdbc78c --- /dev/null +++ b/fs/nfsd/blocklayout.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include <linux/exportfs.h> +#include <linux/genhd.h> +#include <linux/slab.h> + +#include <linux/nfsd/debug.h> + +#include "blocklayoutxdr.h" +#include "pnfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + + +static int +nfsd4_block_get_device_info_simple(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_block_deviceaddr *dev; + struct pnfs_block_volume *b; + + dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + + sizeof(struct pnfs_block_volume), GFP_KERNEL); + if (!dev) + return -ENOMEM; + gdp->gd_device = dev; + + dev->nr_volumes = 1; + b = &dev->volumes[0]; + + b->type = PNFS_BLOCK_VOLUME_SIMPLE; + b->simple.sig_len = PNFS_BLOCK_UUID_LEN; + return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len, + &b->simple.offset); +} + +static __be32 +nfsd4_block_proc_getdeviceinfo(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdp) +{ + if (sb->s_bdev != sb->s_bdev->bd_contains) + return nfserr_inval; + return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp)); +} + +static __be32 +nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, + struct nfsd4_layoutget *args) +{ + struct nfsd4_layout_seg *seg = &args->lg_seg; + struct super_block *sb = inode->i_sb; + u32 block_size = (1 << inode->i_blkbits); + struct pnfs_block_extent *bex; + struct iomap iomap; + u32 device_generation = 0; + int error; + + /* + * We do not attempt to support I/O smaller than the fs block size, + * or not aligned to it. + */ + if (args->lg_minlength < block_size) { + dprintk("pnfsd: I/O too small\n"); + goto out_layoutunavailable; + } + if (seg->offset & (block_size - 1)) { + dprintk("pnfsd: I/O misaligned\n"); + goto out_layoutunavailable; + } + + /* + * Some clients barf on non-zero block numbers for NONE or INVALID + * layouts, so make sure to zero the whole structure. 
+ */ + error = -ENOMEM; + bex = kzalloc(sizeof(*bex), GFP_KERNEL); + if (!bex) + goto out_error; + args->lg_content = bex; + + error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length, + &iomap, seg->iomode != IOMODE_READ, + &device_generation); + if (error) { + if (error == -ENXIO) + goto out_layoutunavailable; + goto out_error; + } + + if (iomap.length < args->lg_minlength) { + dprintk("pnfsd: extent smaller than minlength\n"); + goto out_layoutunavailable; + } + + switch (iomap.type) { + case IOMAP_MAPPED: + if (seg->iomode == IOMODE_READ) + bex->es = PNFS_BLOCK_READ_DATA; + else + bex->es = PNFS_BLOCK_READWRITE_DATA; + bex->soff = (iomap.blkno << 9); + break; + case IOMAP_UNWRITTEN: + if (seg->iomode & IOMODE_RW) { + /* + * Crack monkey special case from section 2.3.1. + */ + if (args->lg_minlength == 0) { + dprintk("pnfsd: no soup for you!\n"); + goto out_layoutunavailable; + } + + bex->es = PNFS_BLOCK_INVALID_DATA; + bex->soff = (iomap.blkno << 9); + break; + } + /*FALLTHRU*/ + case IOMAP_HOLE: + if (seg->iomode == IOMODE_READ) { + bex->es = PNFS_BLOCK_NONE_DATA; + break; + } + /*FALLTHRU*/ + case IOMAP_DELALLOC: + default: + WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type); + goto out_layoutunavailable; + } + + error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation); + if (error) + goto out_error; + bex->foff = iomap.offset; + bex->len = iomap.length; + + seg->offset = iomap.offset; + seg->length = iomap.length; + + dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es); + return 0; + +out_error: + seg->length = 0; + return nfserrno(error); +out_layoutunavailable: + seg->length = 0; + return nfserr_layoutunavailable; +} + +static __be32 +nfsd4_block_proc_layoutcommit(struct inode *inode, + struct nfsd4_layoutcommit *lcp) +{ + loff_t new_size = lcp->lc_last_wr + 1; + struct iattr iattr = { .ia_valid = 0 }; + struct iomap *iomaps; + int nr_iomaps; + int error; + + nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, + lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits); + if (nr_iomaps < 0) + return nfserrno(nr_iomaps); + + if (lcp->lc_mtime.tv_nsec == UTIME_NOW || + timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) + lcp->lc_mtime = current_fs_time(inode->i_sb); + iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; + iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; + + if (new_size > i_size_read(inode)) { + iattr.ia_valid |= ATTR_SIZE; + iattr.ia_size = new_size; + } + + error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps, + nr_iomaps, &iattr); + kfree(iomaps); + return nfserrno(error); +} + +const struct nfsd4_layout_ops bl_layout_ops = { + .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo, + .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo, + .proc_layoutget = nfsd4_block_proc_layoutget, + .encode_layoutget = nfsd4_block_encode_layoutget, + .proc_layoutcommit = nfsd4_block_proc_layoutcommit, +}; diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c new file mode 100644 index 0000000..9da89fd --- /dev/null +++ b/fs/nfsd/blocklayoutxdr.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. 
+ */ +#include <linux/sunrpc/svc.h> +#include <linux/exportfs.h> +#include <linux/nfs4.h> + +#include "nfsd.h" +#include "blocklayoutxdr.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + + +__be32 +nfsd4_block_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp) +{ + struct pnfs_block_extent *b = lgp->lg_content; + int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32); + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(__be32) + len); + if (!p) + return nfserr_toosmall; + + *p++ = cpu_to_be32(len); + *p++ = cpu_to_be32(1); /* we always return a single extent */ + + p = xdr_encode_opaque_fixed(p, &b->vol_id, + sizeof(struct nfsd4_deviceid)); + p = xdr_encode_hyper(p, b->foff); + p = xdr_encode_hyper(p, b->len); + p = xdr_encode_hyper(p, b->soff); + *p++ = cpu_to_be32(b->es); + return 0; +} + +static int +nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) +{ + __be32 *p; + int len; + + switch (b->type) { + case PNFS_BLOCK_VOLUME_SIMPLE: + len = 4 + 4 + 8 + 4 + b->simple.sig_len; + p = xdr_reserve_space(xdr, len); + if (!p) + return -ETOOSMALL; + + *p++ = cpu_to_be32(b->type); + *p++ = cpu_to_be32(1); /* single signature */ + p = xdr_encode_hyper(p, b->simple.offset); + p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len); + break; + default: + return -ENOTSUPP; + } + + return len; +} + +__be32 +nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_block_deviceaddr *dev = gdp->gd_device; + int len = sizeof(__be32), ret, i; + __be32 *p; + + p = xdr_reserve_space(xdr, len + sizeof(__be32)); + if (!p) + return nfserr_resource; + + for (i = 0; i < dev->nr_volumes; i++) { + ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]); + if (ret < 0) + return nfserrno(ret); + len += ret; + } + + /* + * Fill in the overall length and number of volumes at the beginning + * of the layout. 
*/ + *p++ = cpu_to_be32(len); + *p++ = cpu_to_be32(dev->nr_volumes); + return 0; +} + +int +nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, + u32 block_size) +{ + struct iomap *iomaps; + u32 nr_iomaps, expected, i; + + if (len < sizeof(u32)) { + dprintk("%s: extent array too small: %u\n", __func__, len); + return -EINVAL; + } + + nr_iomaps = be32_to_cpup(p++); + expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE; + if (len != expected) { + dprintk("%s: extent array size mismatch: %u/%u\n", + __func__, len, expected); + return -EINVAL; + } + + iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); + if (!iomaps) { + dprintk("%s: failed to allocate extent array\n", __func__); + return -ENOMEM; + } + + for (i = 0; i < nr_iomaps; i++) { + struct pnfs_block_extent bex; + + memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); + p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); + + p = xdr_decode_hyper(p, &bex.foff); + if (bex.foff & (block_size - 1)) { + dprintk("%s: unaligned offset %lld\n", + __func__, bex.foff); + goto fail; + } + p = xdr_decode_hyper(p, &bex.len); + if (bex.len & (block_size - 1)) { + dprintk("%s: unaligned length %lld\n", + __func__, bex.len); + goto fail; + } + p = xdr_decode_hyper(p, &bex.soff); + if (bex.soff & (block_size - 1)) { + dprintk("%s: unaligned disk offset %lld\n", + __func__, bex.soff); + goto fail; + } + bex.es = be32_to_cpup(p++); + if (bex.es != PNFS_BLOCK_READWRITE_DATA) { + dprintk("%s: incorrect extent state %d\n", + __func__, bex.es); + goto fail; + } + + iomaps[i].offset = bex.foff; + iomaps[i].length = bex.len; + } + + *iomapp = iomaps; + return nr_iomaps; +fail: + kfree(iomaps); + return -EINVAL; +} diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h new file mode 100644 index 0000000..fdc7903 --- /dev/null +++ b/fs/nfsd/blocklayoutxdr.h @@ -0,0 +1,62 @@ +#ifndef _NFSD_BLOCKLAYOUTXDR_H +#define _NFSD_BLOCKLAYOUTXDR_H 1 + +#include <linux/blkdev.h> +#include "xdr4.h" + +struct iomap; +struct xdr_stream; + +enum pnfs_block_extent_state { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, + PNFS_BLOCK_NONE_DATA = 3, +}; + +struct pnfs_block_extent { + struct nfsd4_deviceid vol_id; + u64 foff; + u64 len; + u64 soff; + enum pnfs_block_extent_state es; +}; +#define NFS4_BLOCK_EXTENT_SIZE 44 + +enum pnfs_block_volume_type { + PNFS_BLOCK_VOLUME_SIMPLE = 0, + PNFS_BLOCK_VOLUME_SLICE = 1, + PNFS_BLOCK_VOLUME_CONCAT = 2, + PNFS_BLOCK_VOLUME_STRIPE = 3, +}; + +/* + * Arbitrary upper cap for the uuid length to avoid unbounded allocation. + * Not actually limited by the protocol.
+ */ +#define PNFS_BLOCK_UUID_LEN 128 + +struct pnfs_block_volume { + enum pnfs_block_volume_type type; + union { + struct { + u64 offset; + u32 sig_len; + u8 sig[PNFS_BLOCK_UUID_LEN]; + } simple; + }; +}; + +struct pnfs_block_deviceaddr { + u32 nr_volumes; + struct pnfs_block_volume volumes[]; +}; + +__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp); +__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp); +int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, + u32 block_size); + +#endif /* _NFSD_BLOCKLAYOUTXDR_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 30a739d..c3e3b6e 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -20,6 +20,7 @@ #include "nfsd.h" #include "nfsfh.h" #include "netns.h" +#include "pnfs.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; exp.cd = cd; + exp.ex_devid_map = NULL; /* expiry */ err = -EINVAL; @@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (!gid_valid(exp.ex_anon_gid)) goto out4; err = 0; + + nfsd4_setup_layout_type(&exp); } expp = svc_export_lookup(&exp); @@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; + new->ex_layout_type = 0; new->ex_uuid = NULL; new->cd = item->cd; } @@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) new->ex_anon_uid = item->ex_anon_uid; new->ex_anon_gid = item->ex_anon_gid; new->ex_fsid = item->ex_fsid; + new->ex_devid_map = item->ex_devid_map; + item->ex_devid_map = NULL; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; @@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) item->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = item->ex_fslocs.migrated; item->ex_fslocs.migrated = 0; + new->ex_layout_type = item->ex_layout_type; new->ex_nflavors = item->ex_nflavors; for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 04dc8c1..1f52bfc 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -56,6 +56,8 @@ struct svc_export { struct nfsd4_fs_locations ex_fslocs; uint32_t ex_nflavors; struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; + enum pnfs_layouttype ex_layout_type; + struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; }; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7cbdf1b..5827785 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -546,6 +546,102 @@ out: return status; } +#ifdef CONFIG_NFSD_PNFS +/* + * CB_LAYOUTRECALL4args + * + * struct layoutrecall_file4 { + * nfs_fh4 lor_fh; + * offset4 lor_offset; + * length4 lor_length; + * stateid4 lor_stateid; + * }; + * + * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) { + * case LAYOUTRECALL4_FILE: + * layoutrecall_file4 lor_layout; + * case LAYOUTRECALL4_FSID: + * fsid4 lor_fsid; + * case LAYOUTRECALL4_ALL: + * void; + * }; + * + * struct CB_LAYOUTRECALL4args { + * layouttype4 clora_type; + * layoutiomode4 clora_iomode; + * bool clora_changed; + * layoutrecall4 clora_recall; + * }; + */ +static void encode_cb_layout4args(struct xdr_stream *xdr, + const 
struct nfs4_layout_stateid *ls, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + BUG_ON(hdr->minorversion == 0); + + p = xdr_reserve_space(xdr, 5 * 4); + *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL); + *p++ = cpu_to_be32(ls->ls_layout_type); + *p++ = cpu_to_be32(IOMODE_ANY); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(RETURN_FILE); + + encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle); + + p = xdr_reserve_space(xdr, 2 * 8); + p = xdr_encode_hyper(p, 0); + xdr_encode_hyper(p, NFS4_MAX_UINT64); + + encode_stateid4(xdr, &ls->ls_recall_sid); + + hdr->nops++; +} + +static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfsd4_callback *cb) +{ + const struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = cb->cb_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_layout4args(xdr, ls, &hdr); + encode_cb_nops(&hdr); +} + +static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfs4_cb_compound_hdr hdr; + enum nfsstat4 nfserr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + goto out; + if (cb) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status)) + goto out; + } + status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr); + if (unlikely(status)) + goto out; + if (unlikely(nfserr != NFS4_OK)) + status = nfs_cb_stat_to_errno(nfserr); +out: + return status; +} +#endif /* CONFIG_NFSD_PNFS */ + /* * RPC procedure tables */ @@ -563,6 +659,9 @@ out: static struct rpc_procinfo nfs4_cb_procedures[] = { PROC(CB_NULL, NULL, cb_null, cb_null), PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), +#ifdef CONFIG_NFSD_PNFS + PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), +#endif }; static struct rpc_version nfs_cb_version4 = { diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c new file mode 100644 index 0000000..3c1bfa1 --- /dev/null +++ b/fs/nfsd/nfs4layouts.c @@ -0,0 +1,721 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. 
+ */ +#include <linux/kmod.h> +#include <linux/file.h> +#include <linux/jhash.h> +#include <linux/sched.h> +#include <linux/sunrpc/addr.h> + +#include "pnfs.h" +#include "netns.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +struct nfs4_layout { + struct list_head lo_perstate; + struct nfs4_layout_stateid *lo_state; + struct nfsd4_layout_seg lo_seg; +}; + +static struct kmem_cache *nfs4_layout_cache; +static struct kmem_cache *nfs4_layout_stateid_cache; + +static struct nfsd4_callback_ops nfsd4_cb_layout_ops; +static const struct lock_manager_operations nfsd4_layouts_lm_ops; + +const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { + [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops, +}; + +/* pNFS device ID to export fsid mapping */ +#define DEVID_HASH_BITS 8 +#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS) +#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1) +static u64 nfsd_devid_seq = 1; +static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE]; +static DEFINE_SPINLOCK(nfsd_devid_lock); + +static inline u32 devid_hashfn(u64 idx) +{ + return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK; +} + +static void +nfsd4_alloc_devid_map(const struct svc_fh *fhp) +{ + const struct knfsd_fh *fh = &fhp->fh_handle; + size_t fsid_len = key_len(fh->fh_fsid_type); + struct nfsd4_deviceid_map *map, *old; + int i; + + map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL); + if (!map) + return; + + map->fsid_type = fh->fh_fsid_type; + memcpy(&map->fsid, fh->fh_fsid, fsid_len); + + spin_lock(&nfsd_devid_lock); + if (fhp->fh_export->ex_devid_map) + goto out_unlock; + + for (i = 0; i < DEVID_HASH_SIZE; i++) { + list_for_each_entry(old, &nfsd_devid_hash[i], hash) { + if (old->fsid_type != fh->fh_fsid_type) + continue; + if (memcmp(old->fsid, fh->fh_fsid, + key_len(old->fsid_type))) + continue; + + fhp->fh_export->ex_devid_map = old; + goto out_unlock; + } + } + + map->idx = nfsd_devid_seq++; + list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]); + fhp->fh_export->ex_devid_map = map; + map = NULL; + +out_unlock: + spin_unlock(&nfsd_devid_lock); + kfree(map); +} + +struct nfsd4_deviceid_map * +nfsd4_find_devid_map(int idx) +{ + struct nfsd4_deviceid_map *map, *ret = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash) + if (map->idx == idx) + ret = map; + rcu_read_unlock(); + + return ret; +} + +int +nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, + u32 device_generation) +{ + if (!fhp->fh_export->ex_devid_map) { + nfsd4_alloc_devid_map(fhp); + if (!fhp->fh_export->ex_devid_map) + return -ENOMEM; + } + + id->fsid_idx = fhp->fh_export->ex_devid_map->idx; + id->generation = device_generation; + id->pad = 0; + return 0; +} + +void nfsd4_setup_layout_type(struct svc_export *exp) +{ + struct super_block *sb = exp->ex_path.mnt->mnt_sb; + + if (exp->ex_flags & NFSEXP_NOPNFS) + return; + + if (sb->s_export_op->get_uuid && + sb->s_export_op->map_blocks && + sb->s_export_op->commit_blocks) + exp->ex_layout_type = LAYOUT_BLOCK_VOLUME; +} + +static void +nfsd4_free_layout_stateid(struct nfs4_stid *stid) +{ + struct nfs4_layout_stateid *ls = layoutstateid(stid); + struct nfs4_client *clp = ls->ls_stid.sc_client; + struct nfs4_file *fp = ls->ls_stid.sc_file; + + trace_layoutstate_free(&ls->ls_stid.sc_stateid); + + spin_lock(&clp->cl_lock); + list_del_init(&ls->ls_perclnt); + spin_unlock(&clp->cl_lock); + + spin_lock(&fp->fi_lock); + list_del_init(&ls->ls_perfile); + spin_unlock(&fp->fi_lock); + + 
vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); + fput(ls->ls_file); + + if (ls->ls_recalled) + atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); + + kmem_cache_free(nfs4_layout_stateid_cache, ls); +} + +static int +nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) +{ + struct file_lock *fl; + int status; + + fl = locks_alloc_lock(); + if (!fl) + return -ENOMEM; + locks_init_lock(fl); + fl->fl_lmops = &nfsd4_layouts_lm_ops; + fl->fl_flags = FL_LAYOUT; + fl->fl_type = F_RDLCK; + fl->fl_end = OFFSET_MAX; + fl->fl_owner = ls; + fl->fl_pid = current->tgid; + fl->fl_file = ls->ls_file; + + status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); + if (status) { + locks_free_lock(fl); + return status; + } + BUG_ON(fl != NULL); + return 0; +} + +static struct nfs4_layout_stateid * +nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, + struct nfs4_stid *parent, u32 layout_type) +{ + struct nfs4_client *clp = cstate->clp; + struct nfs4_file *fp = parent->sc_file; + struct nfs4_layout_stateid *ls; + struct nfs4_stid *stp; + + stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache); + if (!stp) + return NULL; + stp->sc_free = nfsd4_free_layout_stateid; + get_nfs4_file(fp); + stp->sc_file = fp; + + ls = layoutstateid(stp); + INIT_LIST_HEAD(&ls->ls_perclnt); + INIT_LIST_HEAD(&ls->ls_perfile); + spin_lock_init(&ls->ls_lock); + INIT_LIST_HEAD(&ls->ls_layouts); + ls->ls_layout_type = layout_type; + nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, + NFSPROC4_CLNT_CB_LAYOUT); + + if (parent->sc_type == NFS4_DELEG_STID) + ls->ls_file = get_file(fp->fi_deleg_file); + else + ls->ls_file = find_any_file(fp); + BUG_ON(!ls->ls_file); + + if (nfsd4_layout_setlease(ls)) { + put_nfs4_file(fp); + kmem_cache_free(nfs4_layout_stateid_cache, ls); + return NULL; + } + + spin_lock(&clp->cl_lock); + stp->sc_type = NFS4_LAYOUT_STID; + list_add(&ls->ls_perclnt, &clp->cl_lo_states); + spin_unlock(&clp->cl_lock); + + spin_lock(&fp->fi_lock); + list_add(&ls->ls_perfile, &fp->fi_lo_states); + spin_unlock(&fp->fi_lock); + + trace_layoutstate_alloc(&ls->ls_stid.sc_stateid); + return ls; +} + +__be32 +nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp) +{ + struct nfs4_layout_stateid *ls; + struct nfs4_stid *stid; + unsigned char typemask = NFS4_LAYOUT_STID; + __be32 status; + + if (create) + typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); + + status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, + net_generic(SVC_NET(rqstp), nfsd_net_id)); + if (status) + goto out; + + if (!fh_match(&cstate->current_fh.fh_handle, + &stid->sc_file->fi_fhandle)) { + status = nfserr_bad_stateid; + goto out_put_stid; + } + + if (stid->sc_type != NFS4_LAYOUT_STID) { + ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); + nfs4_put_stid(stid); + + status = nfserr_jukebox; + if (!ls) + goto out; + } else { + ls = container_of(stid, struct nfs4_layout_stateid, ls_stid); + + status = nfserr_bad_stateid; + if (stateid->si_generation > stid->sc_stateid.si_generation) + goto out_put_stid; + if (layout_type != ls->ls_layout_type) + goto out_put_stid; + } + + *lsp = ls; + return 0; + +out_put_stid: + nfs4_put_stid(stid); +out: + return status; +} + +static void +nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) +{ + spin_lock(&ls->ls_lock); + if (ls->ls_recalled) + goto out_unlock; + + ls->ls_recalled = true; + 
atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); + if (list_empty(&ls->ls_layouts)) + goto out_unlock; + + trace_layout_recall(&ls->ls_stid.sc_stateid); + + atomic_inc(&ls->ls_stid.sc_count); + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); + nfsd4_run_cb(&ls->ls_recall); + +out_unlock: + spin_unlock(&ls->ls_lock); +} + +static inline u64 +layout_end(struct nfsd4_layout_seg *seg) +{ + u64 end = seg->offset + seg->length; + return end >= seg->offset ? end : NFS4_MAX_UINT64; +} + +static void +layout_update_len(struct nfsd4_layout_seg *lo, u64 end) +{ + if (end == NFS4_MAX_UINT64) + lo->length = NFS4_MAX_UINT64; + else + lo->length = end - lo->offset; +} + +static bool +layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s) +{ + if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode) + return false; + if (layout_end(&lo->lo_seg) <= s->offset) + return false; + if (layout_end(s) <= lo->lo_seg.offset) + return false; + return true; +} + +static bool +layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new) +{ + if (lo->iomode != new->iomode) + return false; + if (layout_end(new) < lo->offset) + return false; + if (layout_end(lo) < new->offset) + return false; + + lo->offset = min(lo->offset, new->offset); + layout_update_len(lo, max(layout_end(lo), layout_end(new))); + return true; +} + +static __be32 +nfsd4_recall_conflict(struct nfs4_layout_stateid *ls) +{ + struct nfs4_file *fp = ls->ls_stid.sc_file; + struct nfs4_layout_stateid *l, *n; + __be32 nfserr = nfs_ok; + + assert_spin_locked(&fp->fi_lock); + + list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) { + if (l != ls) { + nfsd4_recall_file_layout(l); + nfserr = nfserr_recallconflict; + } + } + + return nfserr; +} + +__be32 +nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) +{ + struct nfsd4_layout_seg *seg = &lgp->lg_seg; + struct nfs4_file *fp = ls->ls_stid.sc_file; + struct nfs4_layout *lp, *new = NULL; + __be32 nfserr; + + spin_lock(&fp->fi_lock); + nfserr = nfsd4_recall_conflict(ls); + if (nfserr) + goto out; + spin_lock(&ls->ls_lock); + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { + if (layouts_try_merge(&lp->lo_seg, seg)) + goto done; + } + spin_unlock(&ls->ls_lock); + spin_unlock(&fp->fi_lock); + + new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL); + if (!new) + return nfserr_jukebox; + memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg)); + new->lo_state = ls; + + spin_lock(&fp->fi_lock); + nfserr = nfsd4_recall_conflict(ls); + if (nfserr) + goto out; + spin_lock(&ls->ls_lock); + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { + if (layouts_try_merge(&lp->lo_seg, seg)) + goto done; + } + + atomic_inc(&ls->ls_stid.sc_count); + list_add_tail(&new->lo_perstate, &ls->ls_layouts); + new = NULL; +done: + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); + spin_unlock(&ls->ls_lock); +out: + spin_unlock(&fp->fi_lock); + if (new) + kmem_cache_free(nfs4_layout_cache, new); + return nfserr; +} + +static void +nfsd4_free_layouts(struct list_head *reaplist) +{ + while (!list_empty(reaplist)) { + struct nfs4_layout *lp = list_first_entry(reaplist, + struct nfs4_layout, lo_perstate); + + list_del(&lp->lo_perstate); + nfs4_put_stid(&lp->lo_state->ls_stid); + kmem_cache_free(nfs4_layout_cache, lp); + } +} + +static void +nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg, + struct list_head 
*reaplist) +{ + struct nfsd4_layout_seg *lo = &lp->lo_seg; + u64 end = layout_end(lo); + + if (seg->offset <= lo->offset) { + if (layout_end(seg) >= end) { + list_move_tail(&lp->lo_perstate, reaplist); + return; + } + end = seg->offset; + } else { + /* retain the whole layout segment on a split. */ + if (layout_end(seg) < end) { + dprintk("%s: split not supported\n", __func__); + return; + } + + lo->offset = layout_end(seg); + } + + layout_update_len(lo, end); +} + +__be32 +nfsd4_return_file_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct nfs4_layout_stateid *ls; + struct nfs4_layout *lp, *n; + LIST_HEAD(reaplist); + __be32 nfserr; + int found = 0; + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid, + false, lrp->lr_layout_type, + &ls); + if (nfserr) { + trace_layout_return_lookup_fail(&lrp->lr_sid); + return nfserr; + } + + spin_lock(&ls->ls_lock); + list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) { + if (layouts_overlapping(lp, &lrp->lr_seg)) { + nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist); + found++; + } + } + if (!list_empty(&ls->ls_layouts)) { + if (found) { + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid, + sizeof(stateid_t)); + } + lrp->lrs_present = 1; + } else { + trace_layoutstate_unhash(&ls->ls_stid.sc_stateid); + nfs4_unhash_stid(&ls->ls_stid); + lrp->lrs_present = 0; + } + spin_unlock(&ls->ls_lock); + + nfs4_put_stid(&ls->ls_stid); + nfsd4_free_layouts(&reaplist); + return nfs_ok; +} + +__be32 +nfsd4_return_client_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct nfs4_layout_stateid *ls, *n; + struct nfs4_client *clp = cstate->clp; + struct nfs4_layout *lp, *t; + LIST_HEAD(reaplist); + + lrp->lrs_present = 0; + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { + if (lrp->lr_return_type == RETURN_FSID && + !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle, + &cstate->current_fh.fh_handle)) + continue; + + spin_lock(&ls->ls_lock); + list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) { + if (lrp->lr_seg.iomode == IOMODE_ANY || + lrp->lr_seg.iomode == lp->lo_seg.iomode) + list_move_tail(&lp->lo_perstate, &reaplist); + } + spin_unlock(&ls->ls_lock); + } + spin_unlock(&clp->cl_lock); + + nfsd4_free_layouts(&reaplist); + return 0; +} + +static void +nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls, + struct list_head *reaplist) +{ + spin_lock(&ls->ls_lock); + list_splice_init(&ls->ls_layouts, reaplist); + spin_unlock(&ls->ls_lock); +} + +void +nfsd4_return_all_client_layouts(struct nfs4_client *clp) +{ + struct nfs4_layout_stateid *ls, *n; + LIST_HEAD(reaplist); + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) + nfsd4_return_all_layouts(ls, &reaplist); + spin_unlock(&clp->cl_lock); + + nfsd4_free_layouts(&reaplist); +} + +void +nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) +{ + struct nfs4_layout_stateid *ls, *n; + LIST_HEAD(reaplist); + + spin_lock(&fp->fi_lock); + list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) { + if (ls->ls_stid.sc_client == clp) + nfsd4_return_all_layouts(ls, &reaplist); + } + spin_unlock(&fp->fi_lock); + + nfsd4_free_layouts(&reaplist); +} + +static void +nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) +{ + struct nfs4_client *clp = ls->ls_stid.sc_client; + char 
addr_str[INET6_ADDRSTRLEN]; + static char *envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[8]; + int error; + + rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); + + trace_layout_recall_fail(&ls->ls_stid.sc_stateid); + + printk(KERN_WARNING + "nfsd: client %s failed to respond to layout recall. " + "Fencing...\n", addr_str); + + argv[0] = "/sbin/nfsd-recall-failed"; + argv[1] = addr_str; + argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id; + argv[3] = NULL; + + error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + if (error) { + printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n", + addr_str, error); + } +} + +static int +nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + LIST_HEAD(reaplist); + + switch (task->tk_status) { + case 0: + return 1; + case -NFS4ERR_NOMATCHING_LAYOUT: + trace_layout_recall_done(&ls->ls_stid.sc_stateid); + task->tk_status = 0; + return 1; + case -NFS4ERR_DELAY: + /* Poll the client until it's done with the layout */ + /* FIXME: cap number of retries. + * The pNFS standard states that we need to only expire + * the client after at least "lease time", e.g. lease-time * 2, + * when failing to communicate a recall. + */ + rpc_delay(task, HZ/100); /* 10 milliseconds */ + return 0; + default: + /* + * Unknown error or non-responding client, we'll need to fence. + */ + nfsd4_cb_layout_fail(ls); + return -1; + } +} + +static void +nfsd4_cb_layout_release(struct nfsd4_callback *cb) +{ + struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + LIST_HEAD(reaplist); + + trace_layout_recall_release(&ls->ls_stid.sc_stateid); + + nfsd4_return_all_layouts(ls, &reaplist); + nfsd4_free_layouts(&reaplist); + nfs4_put_stid(&ls->ls_stid); +} + +static struct nfsd4_callback_ops nfsd4_cb_layout_ops = { + .done = nfsd4_cb_layout_done, + .release = nfsd4_cb_layout_release, +}; + +static bool +nfsd4_layout_lm_break(struct file_lock *fl) +{ + /* + * We don't want the locks code to time out the lease for us; + * we'll remove it ourselves if a layout isn't returned + * in time: + */ + fl->fl_break_time = 0; + nfsd4_recall_file_layout(fl->fl_owner); + return false; +} + +static int +nfsd4_layout_lm_change(struct file_lock *onlist, int arg, + struct list_head *dispose) +{ + BUG_ON(!(arg & F_UNLCK)); + return lease_modify(onlist, arg, dispose); +} + +static const struct lock_manager_operations nfsd4_layouts_lm_ops = { + .lm_break = nfsd4_layout_lm_break, + .lm_change = nfsd4_layout_lm_change, +}; + +int +nfsd4_init_pnfs(void) +{ + int i; + + for (i = 0; i < DEVID_HASH_SIZE; i++) + INIT_LIST_HEAD(&nfsd_devid_hash[i]); + + nfs4_layout_cache = kmem_cache_create("nfs4_layout", + sizeof(struct nfs4_layout), 0, 0, NULL); + if (!nfs4_layout_cache) + return -ENOMEM; + + nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", + sizeof(struct nfs4_layout_stateid), 0, 0, NULL); + if (!nfs4_layout_stateid_cache) { + kmem_cache_destroy(nfs4_layout_cache); + return -ENOMEM; + } + return 0; +} + +void +nfsd4_exit_pnfs(void) +{ + int i; + + kmem_cache_destroy(nfs4_layout_cache); + kmem_cache_destroy(nfs4_layout_stateid_cache); + + for (i = 0; i < DEVID_HASH_SIZE; i++) { + struct nfsd4_deviceid_map *map, *n; + + list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash) + kfree(map); + } +} diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ac71d13..d30bea8d 100644 ---
a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -43,6 +43,8 @@ #include "current_stateid.h" #include "netns.h" #include "acl.h" +#include "pnfs.h" +#include "trace.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> @@ -1178,6 +1180,259 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status == nfserr_same ? nfs_ok : status; } +#ifdef CONFIG_NFSD_PNFS +static const struct nfsd4_layout_ops * +nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) +{ + if (!exp->ex_layout_type) { + dprintk("%s: export does not support pNFS\n", __func__); + return NULL; + } + + if (exp->ex_layout_type != layout_type) { + dprintk("%s: layout type %d not supported\n", + __func__, layout_type); + return NULL; + } + + return nfsd4_layout_ops[layout_type]; +} + +static __be32 +nfsd4_getdeviceinfo(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_getdeviceinfo *gdp) +{ + const struct nfsd4_layout_ops *ops; + struct nfsd4_deviceid_map *map; + struct svc_export *exp; + __be32 nfserr; + + dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n", + __func__, + gdp->gd_layout_type, + gdp->gd_devid.fsid_idx, gdp->gd_devid.generation, + gdp->gd_maxcount); + + map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx); + if (!map) { + dprintk("%s: couldn't find device ID to export mapping!\n", + __func__); + return nfserr_noent; + } + + exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid); + if (IS_ERR(exp)) { + dprintk("%s: could not find device id\n", __func__); + return nfserr_noent; + } + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(exp, gdp->gd_layout_type); + if (!ops) + goto out; + + nfserr = nfs_ok; + if (gdp->gd_maxcount != 0) + nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp); + + gdp->gd_notify_types &= ops->notify_types; +out: + exp_put(exp); + return nfserr; +} + +static __be32 +nfsd4_layoutget(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutget *lgp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + const struct nfsd4_layout_ops *ops; + struct nfs4_layout_stateid *ls; + __be32 nfserr; + int accmode; + + switch (lgp->lg_seg.iomode) { + case IOMODE_READ: + accmode = NFSD_MAY_READ; + break; + case IOMODE_RW: + accmode = NFSD_MAY_READ | NFSD_MAY_WRITE; + break; + default: + dprintk("%s: invalid iomode %d\n", + __func__, lgp->lg_seg.iomode); + nfserr = nfserr_badiomode; + goto out; + } + + nfserr = fh_verify(rqstp, current_fh, 0, accmode); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type); + if (!ops) + goto out; + + /* + * Verify minlength and range as per RFC5661: + * o If loga_length is less than loga_minlength, + * the metadata server MUST return NFS4ERR_INVAL. + * o If the sum of loga_offset and loga_minlength exceeds + * NFS4_UINT64_MAX, and loga_minlength is not + * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result. + * o If the sum of loga_offset and loga_length exceeds + * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX, + * the error NFS4ERR_INVAL MUST result.
+ */ + nfserr = nfserr_inval; + if (lgp->lg_seg.length < lgp->lg_minlength || + (lgp->lg_minlength != NFS4_MAX_UINT64 && + lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) || + (lgp->lg_seg.length != NFS4_MAX_UINT64 && + lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset)) + goto out; + if (lgp->lg_seg.length == 0) + goto out; + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid, + true, lgp->lg_layout_type, &ls); + if (nfserr) { + trace_layout_get_lookup_fail(&lgp->lg_sid); + goto out; + } + + nfserr = nfserr_recallconflict; + if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls)) + goto out_put_stid; + + nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode, + current_fh, lgp); + if (nfserr) + goto out_put_stid; + + nfserr = nfsd4_insert_layout(lgp, ls); + +out_put_stid: + nfs4_put_stid(&ls->ls_stid); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutcommit(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutcommit *lcp) +{ + const struct nfsd4_layout_seg *seg = &lcp->lc_seg; + struct svc_fh *current_fh = &cstate->current_fh; + const struct nfsd4_layout_ops *ops; + loff_t new_size = lcp->lc_last_wr + 1; + struct inode *inode; + struct nfs4_layout_stateid *ls; + __be32 nfserr; + + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type); + if (!ops) + goto out; + inode = current_fh->fh_dentry->d_inode; + + nfserr = nfserr_inval; + if (new_size <= seg->offset) { + dprintk("pnfsd: last write before layout segment\n"); + goto out; + } + if (new_size > seg->offset + seg->length) { + dprintk("pnfsd: last write beyond layout segment\n"); + goto out; + } + if (!lcp->lc_newoffset && new_size > i_size_read(inode)) { + dprintk("pnfsd: layoutcommit beyond EOF\n"); + goto out; + } + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid, + false, lcp->lc_layout_type, + &ls); + if (nfserr) { + trace_layout_commit_lookup_fail(&lcp->lc_sid); + /* fixup error code as per RFC5661 */ + if (nfserr == nfserr_bad_stateid) + nfserr = nfserr_badlayout; + goto out; + } + + nfserr = ops->proc_layoutcommit(inode, lcp); + if (nfserr) + goto out_put_stid; + + if (new_size > i_size_read(inode)) { + lcp->lc_size_chg = 1; + lcp->lc_newsize = new_size; + } else { + lcp->lc_size_chg = 0; + } + +out_put_stid: + nfs4_put_stid(&ls->ls_stid); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + __be32 nfserr; + + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type)) + goto out; + + switch (lrp->lr_seg.iomode) { + case IOMODE_READ: + case IOMODE_RW: + case IOMODE_ANY: + break; + default: + dprintk("%s: invalid iomode %d\n", __func__, + lrp->lr_seg.iomode); + nfserr = nfserr_inval; + goto out; + } + + switch (lrp->lr_return_type) { + case RETURN_FILE: + nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp); + break; + case RETURN_FSID: + case RETURN_ALL: + nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp); + break; + default: + dprintk("%s: invalid return_type %d\n", __func__, + lrp->lr_return_type); + nfserr = nfserr_inval; + break; + } +out: + return nfserr; +} +#endif /* CONFIG_NFSD_PNFS */ 
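The range checks quoted from RFC5661 in nfsd4_layoutget() above boil down to a little unsigned-overflow-safe arithmetic. As a standalone sketch (the function name and the use of stdint types are inventions of this illustration, not part of the patch), the same rules read:

#include <stdbool.h>
#include <stdint.h>

#define NFS4_MAX_UINT64 UINT64_MAX

/* Mirrors the loga_* validation in nfsd4_layoutget(): a range is invalid
 * if it is empty, shorter than the requested minimum, or if adding the
 * offset would wrap past 2^64 - 1. */
static bool layoutget_range_valid(uint64_t offset, uint64_t length,
				  uint64_t minlength)
{
	if (length < minlength)
		return false;		/* loga_length < loga_minlength */
	if (minlength != NFS4_MAX_UINT64 &&
	    minlength > NFS4_MAX_UINT64 - offset)
		return false;		/* loga_offset + loga_minlength wraps */
	if (length != NFS4_MAX_UINT64 &&
	    length > NFS4_MAX_UINT64 - offset)
		return false;		/* loga_offset + loga_length wraps */
	return length != 0;		/* zero-length requests are rejected too */
}

For example, offset = NFS4_MAX_UINT64 - 10 with minlength = 20 fails the second check, which is exactly the NFS4ERR_INVAL case above.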
+ /* * NULL call. */ @@ -1679,6 +1934,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd op_encode_channel_attrs_maxsz) * sizeof(__be32); } +#ifdef CONFIG_NFSD_PNFS +/* + * At this stage we don't really know what layout driver will handle the request, + * so we need to define an arbitrary upper bound here. + */ +#define MAX_LAYOUT_SIZE 128 +static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* logr_return_on_close */ + + op_encode_stateid_maxsz + + 1 /* nr of layouts */ + + MAX_LAYOUT_SIZE) * sizeof(__be32); +} + +static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* locr_newsize */ + + 2 /* ns_size */) * sizeof(__be32); +} + +static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* lrs_stateid */ + + op_encode_stateid_maxsz) * sizeof(__be32); +} +#endif /* CONFIG_NFSD_PNFS */ + static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, @@ -1966,6 +2251,31 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = { + .op_func = (nfsd4op_func)nfsd4_getdeviceinfo, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_GETDEVICEINFO", + }, + [OP_LAYOUTGET] = { + .op_func = (nfsd4op_func)nfsd4_layoutget, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTGET", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize, + }, + [OP_LAYOUTCOMMIT] = { + .op_func = (nfsd4op_func)nfsd4_layoutcommit, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTCOMMIT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize, + }, + [OP_LAYOUTRETURN] = { + .op_func = (nfsd4op_func)nfsd4_layoutreturn, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTRETURN", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize, + }, +#endif /* CONFIG_NFSD_PNFS */ /* NFSv4.2 operations */ [OP_ALLOCATE] = { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 532a60c..f6b2a09 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -48,6 +48,7 @@ #include "current_stateid.h" #include "netns.h" +#include "pnfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -150,16 +151,6 @@ renew_client_locked(struct nfs4_client *clp) clp->cl_time = get_seconds(); } -static inline void -renew_client(struct nfs4_client *clp) -{ - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - - spin_lock(&nn->client_lock); - renew_client_locked(clp); - spin_unlock(&nn->client_lock); -} - static void put_client_renew_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); @@ -282,7 +273,7 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu) kmem_cache_free(file_slab, fp); } -static inline void +void put_nfs4_file(struct nfs4_file *fi) { might_lock(&state_lock); @@ -295,12 +286,6 @@ put_nfs4_file(struct nfs4_file *fi) } } -static inline void -get_nfs4_file(struct nfs4_file *fi) -{ - atomic_inc(&fi->fi_ref); -} - static struct file * __nfs4_get_fd(struct nfs4_file *f, int oflag) { @@ -358,7 +343,7 @@ find_readable_file(struct nfs4_file *f) return ret; } -static struct file * +struct file * find_any_file(struct nfs4_file *f) { struct file *ret; @@ -408,14 +393,6 @@ static unsigned int file_hashval(struct knfsd_fh *fh) return 
nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); } -static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) -{ - return fh1->fh_size == fh2->fh_size && - !memcmp(fh1->fh_base.fh_pad, - fh2->fh_base.fh_pad, - fh1->fh_size); -} - static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; static void @@ -494,7 +471,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access) __nfs4_file_put_access(fp, O_RDONLY); } -static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) { struct nfs4_stid *stid; @@ -688,17 +665,17 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) struct file *filp = NULL; spin_lock(&fp->fi_lock); - if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees)) + if (fp->fi_deleg_file && --fp->fi_delegees == 0) swap(filp, fp->fi_deleg_file); spin_unlock(&fp->fi_lock); if (filp) { - vfs_setlease(filp, F_UNLCK, NULL, NULL); + vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp); fput(filp); } } -static void unhash_stid(struct nfs4_stid *s) +void nfs4_unhash_stid(struct nfs4_stid *s) { s->sc_type = 0; } @@ -1006,7 +983,7 @@ static void unhash_lock_stateid(struct nfs4_ol_stateid *stp) list_del_init(&stp->st_locks); unhash_ol_stateid(stp); - unhash_stid(&stp->st_stid); + nfs4_unhash_stid(&stp->st_stid); } static void release_lock_stateid(struct nfs4_ol_stateid *stp) @@ -1518,7 +1495,12 @@ unhash_session(struct nfsd4_session *ses) static int STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) { - if (clid->cl_boot == nn->boot_time) + /* + * We're assuming the clid was not given out from a boot + * precisely 2^32 (about 136 years) before this one. That seems + * a safe assumption: + */ + if (clid->cl_boot == (u32)nn->boot_time) return 0; dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", clid->cl_boot, clid->cl_id, nn->boot_time); @@ -1558,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) INIT_LIST_HEAD(&clp->cl_lru); INIT_LIST_HEAD(&clp->cl_callbacks); INIT_LIST_HEAD(&clp->cl_revoked); +#ifdef CONFIG_NFSD_PNFS + INIT_LIST_HEAD(&clp->cl_lo_states); +#endif spin_lock_init(&clp->cl_lock); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; @@ -1662,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp) nfs4_get_stateowner(&oo->oo_owner); release_openowner(oo); } + nfsd4_return_all_client_layouts(clp); nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); @@ -2145,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, static void nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) { - /* pNFS is not supported */ +#ifdef CONFIG_NFSD_PNFS + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS; +#else new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; +#endif /* Referrals are supported, Migration is not. 
*/ new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; @@ -3074,6 +3063,10 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); +#ifdef CONFIG_NFSD_PNFS + INIT_LIST_HEAD(&fp->fi_lo_states); + atomic_set(&fp->fi_lo_recalls, 0); +#endif hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); } @@ -3300,7 +3293,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval) struct nfs4_file *fp; hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { - if (nfsd_fh_match(&fp->fi_fhandle, fh)) { + if (fh_match(&fp->fi_fhandle, fh)) { if (atomic_inc_not_zero(&fp->fi_ref)) return fp; } @@ -3308,7 +3301,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval) return NULL; } -static struct nfs4_file * +struct nfs4_file * find_file(struct knfsd_fh *fh) { struct nfs4_file *fp; @@ -3856,12 +3849,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp) /* Race breaker */ if (fp->fi_deleg_file) { status = 0; - atomic_inc(&fp->fi_delegees); + ++fp->fi_delegees; hash_delegation_locked(dp, fp); goto out_unlock; } fp->fi_deleg_file = filp; - atomic_set(&fp->fi_delegees, 1); + fp->fi_delegees = 1; hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); @@ -3902,7 +3895,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, status = -EAGAIN; goto out_unlock; } - atomic_inc(&fp->fi_delegees); + ++fp->fi_delegees; hash_delegation_locked(dp, fp); status = 0; out_unlock: @@ -4295,7 +4288,7 @@ laundromat_main(struct work_struct *laundry) static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) { - if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) + if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) return nfserr_bad_stateid; return nfs_ok; } @@ -4446,7 +4439,7 @@ out_unlock: return status; } -static __be32 +__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn) @@ -4860,6 +4853,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfsd4_return_all_file_layouts(stp->st_stateowner->so_client, + stp->st_stid.sc_file); + nfsd4_close_open_stateid(stp); /* put reference from nfs4_preprocess_seqid_op */ diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 15f7b73..df5e66c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -47,6 +47,7 @@ #include "state.h" #include "cache.h" #include "netns.h" +#include "pnfs.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> @@ -234,6 +235,26 @@ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) return ret; } +/* + * We require the high 32 bits of 'seconds' to be 0, and + * we ignore all 32 bits of 'nseconds'. 
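+ * The value on the wire is an nfstime4, i.e.:
+ *
+ *	struct nfstime4 {
+ *		int64_t		seconds;
+ *		uint32_t	nseconds;
+ *	};
+ *
+ * hence the 12-byte READ_BUF below.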
+ */ +static __be32 +nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv) +{ + DECODE_HEAD; + u64 sec; + + READ_BUF(12); + p = xdr_decode_hyper(p, &sec); + tv->tv_sec = sec; + tv->tv_nsec = be32_to_cpup(p++); + if (tv->tv_nsec >= (u32)1000000000) + return nfserr_inval; + + DECODE_TAIL; +} + static __be32 nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) {
@@ -267,7 +288,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, { int expected_len, len = 0; u32 dummy32; - u64 sec; char *buf; DECODE_HEAD;
@@ -358,15 +378,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, dummy32 = be32_to_cpup(p++); switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: - /* We require the high 32 bits of 'seconds' to be 0, and we ignore - all 32 bits of 'nseconds'. */ - READ_BUF(12); len += 12; - p = xdr_decode_hyper(p, &sec); - iattr->ia_atime.tv_sec = (time_t)sec; - iattr->ia_atime.tv_nsec = be32_to_cpup(p++); - if (iattr->ia_atime.tv_nsec >= (u32)1000000000) - return nfserr_inval; + status = nfsd4_decode_time(argp, &iattr->ia_atime); + if (status) + return status; iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); break; case NFS4_SET_TO_SERVER_TIME:
@@ -382,15 +397,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, dummy32 = be32_to_cpup(p++); switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: - /* We require the high 32 bits of 'seconds' to be 0, and we ignore - all 32 bits of 'nseconds'. */ - READ_BUF(12); len += 12; - p = xdr_decode_hyper(p, &sec); - iattr->ia_mtime.tv_sec = sec; - iattr->ia_mtime.tv_nsec = be32_to_cpup(p++); - if (iattr->ia_mtime.tv_nsec >= (u32)1000000000) - return nfserr_inval; + status = nfsd4_decode_time(argp, &iattr->ia_mtime); + if (status) + return status; iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); break; case NFS4_SET_TO_SERVER_TIME:
@@ -1513,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str DECODE_TAIL; } +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, + struct nfsd4_getdeviceinfo *gdev) +{ + DECODE_HEAD; + u32 num, i; + + READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4); + COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid)); + gdev->gd_layout_type = be32_to_cpup(p++); + gdev->gd_maxcount = be32_to_cpup(p++); + num = be32_to_cpup(p++); + if (num) { + READ_BUF(4 * num); + gdev->gd_notify_types = be32_to_cpup(p++); + for (i = 1; i < num; i++) { + if (be32_to_cpup(p++)) { + status = nfserr_inval; + goto out; + } + } + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutget *lgp) +{ + DECODE_HEAD; + + READ_BUF(36); + lgp->lg_signal = be32_to_cpup(p++); + lgp->lg_layout_type = be32_to_cpup(p++); + lgp->lg_seg.iomode = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &lgp->lg_seg.offset); + p = xdr_decode_hyper(p, &lgp->lg_seg.length); + p = xdr_decode_hyper(p, &lgp->lg_minlength); + status = nfsd4_decode_stateid(argp, &lgp->lg_sid); + if (status) + return status; + READ_BUF(4); + lgp->lg_maxcount = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutcommit *lcp) +{ + DECODE_HEAD; + u32 timechange; + + READ_BUF(20); + p = xdr_decode_hyper(p, &lcp->lc_seg.offset); + p = xdr_decode_hyper(p, &lcp->lc_seg.length); + lcp->lc_reclaim = be32_to_cpup(p++); + status = nfsd4_decode_stateid(argp, &lcp->lc_sid); + if (status) + return status; + READ_BUF(4); + lcp->lc_newoffset = be32_to_cpup(p++); + if (lcp->lc_newoffset) { + READ_BUF(8); + p = xdr_decode_hyper(p, &lcp->lc_last_wr); + } else + lcp->lc_last_wr = 0; + READ_BUF(4); + timechange = be32_to_cpup(p++); + if (timechange) { + status = nfsd4_decode_time(argp, &lcp->lc_mtime); + if (status) + return status; + } else { + lcp->lc_mtime.tv_nsec = UTIME_NOW; + } + READ_BUF(8); + lcp->lc_layout_type = be32_to_cpup(p++); + + /* + * Save the layout update in XDR format and let the layout driver deal + * with it later. + */ + lcp->lc_up_len = be32_to_cpup(p++); + if (lcp->lc_up_len > 0) { + READ_BUF(lcp->lc_up_len); + READMEM(lcp->lc_up_layout, lcp->lc_up_len); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutreturn *lrp) +{ + DECODE_HEAD; + + READ_BUF(16); + lrp->lr_reclaim = be32_to_cpup(p++); + lrp->lr_layout_type = be32_to_cpup(p++); + lrp->lr_seg.iomode = be32_to_cpup(p++); + lrp->lr_return_type = be32_to_cpup(p++); + if (lrp->lr_return_type == RETURN_FILE) { + READ_BUF(16); + p = xdr_decode_hyper(p, &lrp->lr_seg.offset); + p = xdr_decode_hyper(p, &lrp->lr_seg.length); + status = nfsd4_decode_stateid(argp, &lrp->lr_sid); + if (status) + return status; + READ_BUF(4); + lrp->lrf_body_len = be32_to_cpup(p++); + if (lrp->lrf_body_len > 0) { + READ_BUF(lrp->lrf_body_len); + READMEM(lrp->lrf_body, lrp->lrf_body_len); + } + } else { + lrp->lr_seg.offset = 0; + lrp->lr_seg.length = NFS4_MAX_UINT64; + } + + DECODE_TAIL; +} +#endif /* CONFIG_NFSD_PNFS */ + static __be32 nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, struct nfsd4_fallocate *fallocate)
@@ -1607,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, +#else [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, +#endif [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2539,6 +2678,30 @@ out_acl: get_parent_attributes(exp, &stat); p = xdr_encode_hyper(p, stat.ino); } +#ifdef CONFIG_NFSD_PNFS + if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) || + (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) { + if (exp->ex_layout_type) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(exp->ex_layout_type); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(0); + } + } + + if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(stat.blksize); + } +#endif /* CONFIG_NFSD_PNFS */ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { status = nfsd4_encode_security_label(xdr, rqstp, context, contextlen);
@@ -2768,16 +2931,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, if (entry_bytes > cd->rd_maxcount) goto fail; cd->rd_maxcount -= entry_bytes; - if (!cd->rd_dircount) - goto fail; /* * RFC 3530 14.2.24 describes rd_dircount as
only a "hint", so * let's always let through the first entry, at least: */ - name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8; + if (!cd->rd_dircount) + goto fail; + name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) goto fail; cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + cd->cookie_offset = cookie_offset; skip_entry: cd->common.err = nfs_ok; @@ -3814,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr; } +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_getdeviceinfo *gdev) +{ + struct xdr_stream *xdr = &resp->xdr; + const struct nfsd4_layout_ops *ops = + nfsd4_layout_ops[gdev->gd_layout_type]; + u32 starting_len = xdr->buf->len, needed_len; + __be32 *p; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + goto out; + + nfserr = nfserr_resource; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + + *p++ = cpu_to_be32(gdev->gd_layout_type); + + /* If maxcount is 0 then just update notifications */ + if (gdev->gd_maxcount != 0) { + nfserr = ops->encode_getdeviceinfo(xdr, gdev); + if (nfserr) { + /* + * We don't bother to burden the layout drivers with + * enforcing gd_maxcount, just tell the client to + * come back with a bigger buffer if it's not enough. + */ + if (xdr->buf->len + 4 > gdev->gd_maxcount) + goto toosmall; + goto out; + } + } + + nfserr = nfserr_resource; + if (gdev->gd_notify_types) { + p = xdr_reserve_space(xdr, 4 + 4); + if (!p) + goto out; + *p++ = cpu_to_be32(1); /* bitmap length */ + *p++ = cpu_to_be32(gdev->gd_notify_types); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + *p++ = 0; + } + + nfserr = 0; +out: + kfree(gdev->gd_device); + dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr)); + return nfserr; + +toosmall: + dprintk("%s: maxcount too small\n", __func__); + needed_len = xdr->buf->len + 4 /* notifications */; + xdr_truncate_encode(xdr, starting_len); + p = xdr_reserve_space(xdr, 4); + if (!p) { + nfserr = nfserr_resource; + } else { + *p++ = cpu_to_be32(needed_len); + nfserr = nfserr_toosmall; + } + goto out; +} + +static __be32 +nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutget *lgp) +{ + struct xdr_stream *xdr = &resp->xdr; + const struct nfsd4_layout_ops *ops = + nfsd4_layout_ops[lgp->lg_layout_type]; + __be32 *p; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + goto out; + + nfserr = nfserr_resource; + p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t)); + if (!p) + goto out; + + *p++ = cpu_to_be32(1); /* we always set return-on-close */ + *p++ = cpu_to_be32(lgp->lg_sid.si_generation); + p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque, + sizeof(stateid_opaque_t)); + + *p++ = cpu_to_be32(1); /* we always return a single layout */ + p = xdr_encode_hyper(p, lgp->lg_seg.offset); + p = xdr_encode_hyper(p, lgp->lg_seg.length); + *p++ = cpu_to_be32(lgp->lg_seg.iomode); + *p++ = cpu_to_be32(lgp->lg_layout_type); + + nfserr = ops->encode_layoutget(xdr, lgp); +out: + kfree(lgp->lg_content); + return nfserr; +} + +static __be32 +nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutcommit *lcp) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(lcp->lc_size_chg); + if (lcp->lc_size_chg) { + p = 
xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + p = xdr_encode_hyper(p, lcp->lc_newsize); + } + + return nfs_ok; +} + +static __be32 +nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutreturn *lrp) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(lrp->lrs_present); + if (lrp->lrs_present) + nfsd4_encode_stateid(xdr, &lrp->lr_sid); + return nfs_ok; +} +#endif /* CONFIG_NFSD_PNFS */ + static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_seek *seek) @@ -3890,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, +#else [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, +#endif [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 19ace74..aa47d75 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -21,6 +21,7 @@ #include "cache.h" #include "state.h" #include "netns.h" +#include "pnfs.h" /* * We have a single directory with several nodes in it. 
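 * (This is the "nfsd" pseudo-filesystem, normally mounted on
 * /proc/fs/nfsd.)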
@@ -1258,9 +1259,12 @@ static int __init init_nfsd(void) retval = nfsd4_init_slabs(); if (retval) goto out_unregister_pernet; - retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; + retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + if (retval) + goto out_exit_pnfs; nfsd_stat_init(); /* Statistics */ retval = nfsd_reply_cache_init(); if (retval) @@ -1282,6 +1286,8 @@ out_free_lockd: out_free_stat: nfsd_stat_shutdown(); nfsd_fault_inject_cleanup(); +out_exit_pnfs: + nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); out_unregister_pernet: @@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void) nfsd_stat_shutdown(); nfsd_lockd_shutdown(); nfsd4_free_slabs(); + nfsd4_exit_pnfs(); nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); unregister_pernet_subsys(&nfsd_net_ops);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 33a46a8..565c4da 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -325,15 +325,27 @@ void nfsd_lockd_shutdown(void); #define NFSD4_SUPPORTED_ATTRS_WORD2 0 +/* 4.1 */ +#ifdef CONFIG_NFSD_PNFS +#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES +#define PNFSD_SUPPORTED_ATTRS_WORD2 \ +(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES) +#else +#define PNFSD_SUPPORTED_ATTRS_WORD1 0 +#define PNFSD_SUPPORTED_ATTRS_WORD2 0 +#endif /* CONFIG_NFSD_PNFS */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ NFSD4_SUPPORTED_ATTRS_WORD0 #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ - NFSD4_SUPPORTED_ATTRS_WORD1 + (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1) #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ - (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) + (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_SUPPATTR_EXCLCREAT) +/* 4.2 */ #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL #else
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 08236d7..84cae20 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -187,6 +187,24 @@ fh_init(struct svc_fh *fhp, int maxsize) return fhp; } +static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +{ + if (fh1->fh_size != fh2->fh_size) + return false; + if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0) + return false; + return true; +} + +static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +{ + if (fh1->fh_fsid_type != fh2->fh_fsid_type) + return false; + if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0) + return false; + return true; +} + #ifdef CONFIG_NFSD_V3 /* * The wcc data stored in current_fh should be cleared
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 314f5c8..9277cc9 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -119,6 +119,7 @@ struct svc_program nfsd_program = { static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = { [0] = 1, [1] = 1, + [2] = 1, }; int nfsd_vers(int vers, enum vers_op change)
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h new file mode 100644 index 0000000..fedb4d6 --- /dev/null +++ b/fs/nfsd/pnfs.h @@ -0,0 +1,81 @@ +#ifndef _FS_NFSD_PNFS_H +#define _FS_NFSD_PNFS_H 1 + +#include <linux/exportfs.h> +#include <linux/nfsd/export.h> + +#include "state.h" +#include "xdr4.h" + +struct xdr_stream; + +struct nfsd4_deviceid_map { + struct list_head hash; + u64 idx; + int fsid_type; + u32 fsid[]; +}; + +struct nfsd4_layout_ops { + u32 notify_types; + + __be32
(*proc_getdeviceinfo)(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdevp); + __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdevp); + + __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp, + struct nfsd4_layoutget *lgp); + __be32 (*encode_layoutget)(struct xdr_stream *, + struct nfsd4_layoutget *lgp); + + __be32 (*proc_layoutcommit)(struct inode *inode, + struct nfsd4_layoutcommit *lcp); +}; + +extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; +extern const struct nfsd4_layout_ops bl_layout_ops; + +__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp); +__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp, + struct nfs4_layout_stateid *ls); +__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp); +__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp); +int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, + u32 device_generation); +struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx); + +#ifdef CONFIG_NFSD_PNFS +void nfsd4_setup_layout_type(struct svc_export *exp); +void nfsd4_return_all_client_layouts(struct nfs4_client *); +void nfsd4_return_all_file_layouts(struct nfs4_client *clp, + struct nfs4_file *fp); +int nfsd4_init_pnfs(void); +void nfsd4_exit_pnfs(void); +#else +static inline void nfsd4_setup_layout_type(struct svc_export *exp) +{ +} + +static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp) +{ +} +static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp, + struct nfs4_file *fp) +{ +} +static inline void nfsd4_exit_pnfs(void) +{ +} +static inline int nfsd4_init_pnfs(void) +{ + return 0; +} +#endif /* CONFIG_NFSD_PNFS */ +#endif /* _FS_NFSD_PNFS_H */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 9d3be37..4f3bfeb 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -92,6 +92,7 @@ struct nfs4_stid { /* For a deleg stateid kept around only to process free_stateid's: */ #define NFS4_REVOKED_DELEG_STID 16 #define NFS4_CLOSED_DELEG_STID 32 +#define NFS4_LAYOUT_STID 64 unsigned char sc_type; stateid_t sc_stateid; struct nfs4_client *sc_client; @@ -297,6 +298,9 @@ struct nfs4_client { struct list_head cl_delegations; struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ struct list_head cl_lru; /* tail queue */ +#ifdef CONFIG_NFSD_PNFS + struct list_head cl_lo_states; /* outstanding layout states */ +#endif struct xdr_netobj cl_name; /* id generated by client */ nfs4_verifier cl_verifier; /* generated by client */ time_t cl_time; /* time of last lease renewal */ @@ -493,9 +497,13 @@ struct nfs4_file { atomic_t fi_access[2]; u32 fi_share_deny; struct file *fi_deleg_file; - atomic_t fi_delegees; + int fi_delegees; struct knfsd_fh fi_fhandle; bool fi_had_conflict; +#ifdef CONFIG_NFSD_PNFS + struct list_head fi_lo_states; + atomic_t fi_lo_recalls; +#endif }; /* @@ -528,6 +536,24 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) return container_of(s, struct nfs4_ol_stateid, st_stid); } +struct nfs4_layout_stateid { + struct nfs4_stid ls_stid; + struct list_head ls_perclnt; + struct list_head ls_perfile; + spinlock_t ls_lock; + struct list_head ls_layouts; + u32 ls_layout_type; + struct file *ls_file; + struct nfsd4_callback 
ls_recall; + stateid_t ls_recall_sid; + bool ls_recalled; +}; + +static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_layout_stateid, ls_stid); +} + /* flags for preprocess_seqid_op() */ #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 @@ -535,6 +561,7 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) enum nfsd4_cb_op { NFSPROC4_CLNT_CB_NULL = 0, NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_LAYOUT, NFSPROC4_CLNT_CB_SEQUENCE, }; @@ -545,6 +572,12 @@ struct nfsd_net; extern __be32 nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filp); +__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, + stateid_t *stateid, unsigned char typemask, + struct nfs4_stid **s, struct nfsd_net *nn); +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, + struct kmem_cache *slab); +void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); extern void nfs4_release_reclaim(struct nfsd_net *); @@ -567,6 +600,14 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); +struct nfs4_file *find_file(struct knfsd_fh *fh); +void put_nfs4_file(struct nfs4_file *fi); +static inline void get_nfs4_file(struct nfs4_file *fi) +{ + atomic_inc(&fi->fi_ref); +} +struct file *find_any_file(struct nfs4_file *f); + /* grace period management */ void nfsd4_end_grace(struct nfsd_net *nn); diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c new file mode 100644 index 0000000..82f8907 --- /dev/null +++ b/fs/nfsd/trace.c @@ -0,0 +1,5 @@ + +#include "state.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h new file mode 100644 index 0000000..c668520 --- /dev/null +++ b/fs/nfsd/trace.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfsd + +#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _NFSD_TRACE_H + +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(nfsd_stateid_class, + TP_PROTO(stateid_t *stp), + TP_ARGS(stp), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + ), + TP_fast_assign( + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + ), + TP_printk("client %08x:%08x stateid %08x:%08x", + __entry->cl_boot, + __entry->cl_id, + __entry->si_id, + __entry->si_generation) +) + +#define DEFINE_STATEID_EVENT(name) \ +DEFINE_EVENT(nfsd_stateid_class, name, \ + TP_PROTO(stateid_t *stp), \ + TP_ARGS(stp)) +DEFINE_STATEID_EVENT(layoutstate_alloc); +DEFINE_STATEID_EVENT(layoutstate_unhash); +DEFINE_STATEID_EVENT(layoutstate_free); +DEFINE_STATEID_EVENT(layout_get_lookup_fail); +DEFINE_STATEID_EVENT(layout_commit_lookup_fail); +DEFINE_STATEID_EVENT(layout_return_lookup_fail); +DEFINE_STATEID_EVENT(layout_recall); +DEFINE_STATEID_EVENT(layout_recall_done); +DEFINE_STATEID_EVENT(layout_recall_fail); +DEFINE_STATEID_EVENT(layout_recall_release); + +#endif /* _NFSD_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 90a5925..0bda93e 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete { u32 rca_one_fs; }; +struct nfsd4_deviceid { + u64 fsid_idx; + u32 generation; + u32 pad; +}; + +struct nfsd4_layout_seg { + u32 iomode; + u64 offset; + u64 length; +}; + +struct nfsd4_getdeviceinfo { + struct nfsd4_deviceid gd_devid; /* request */ + u32 gd_layout_type; /* request */ + u32 gd_maxcount; /* request */ + u32 gd_notify_types;/* request - response */ + void *gd_device; /* response */ +}; + +struct nfsd4_layoutget { + u64 lg_minlength; /* request */ + u32 lg_signal; /* request */ + u32 lg_layout_type; /* request */ + u32 lg_maxcount; /* request */ + stateid_t lg_sid; /* request/response */ + struct nfsd4_layout_seg lg_seg; /* request/response */ + void *lg_content; /* response */ +}; + +struct nfsd4_layoutcommit { + stateid_t lc_sid; /* request */ + struct nfsd4_layout_seg lc_seg; /* request */ + u32 lc_reclaim; /* request */ + u32 lc_newoffset; /* request */ + u64 lc_last_wr; /* request */ + struct timespec lc_mtime; /* request */ + u32 lc_layout_type; /* request */ + u32 lc_up_len; /* layout length */ + void *lc_up_layout; /* decoded by callback */ + u32 lc_size_chg; /* boolean for response */ + u64 lc_newsize; /* response */ +}; + +struct nfsd4_layoutreturn { + u32 lr_return_type; /* request */ + u32 lr_layout_type; /* request */ + struct nfsd4_layout_seg lr_seg; /* request */ + u32 lr_reclaim; /* request */ + u32 lrf_body_len; /* request */ + void *lrf_body; /* request */ + stateid_t lr_sid; /* request/response */ + u32 lrs_present; /* response */ +}; + struct nfsd4_fallocate { /* request */ stateid_t falloc_stateid; @@ -491,6 +546,10 @@ struct nfsd4_op { struct nfsd4_reclaim_complete reclaim_complete; struct nfsd4_test_stateid test_stateid; struct nfsd4_free_stateid free_stateid; + struct nfsd4_getdeviceinfo getdeviceinfo; + struct nfsd4_layoutget layoutget; + struct nfsd4_layoutcommit layoutcommit; + struct nfsd4_layoutreturn layoutreturn; /* NFSv4.2 */ struct nfsd4_fallocate allocate; diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index c5c55df..c47f6fd 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -21,3 +21,10 @@ #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) +#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + 3 + \ + enc_nfs4_fh_sz + 4) +#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 41b223a..fa05e04 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -4,6 +4,7 @@ #include <linux/types.h> struct dentry; +struct iattr; struct inode; struct super_block; struct vfsmount; @@ -180,6 +181,21 @@ struct fid { * get_name is not (which is possibly inconsistent) */ +/* types of block ranges for multipage write mappings. 
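+ * A block-layout export's ->map_blocks() method fills one of these
+ * in for the requested file range; a mapped extent might look like
+ * (field values are illustrative only):
+ *
+ *	iomap->type   = IOMAP_MAPPED;
+ *	iomap->blkno  = extent_start_sector;
+ *	iomap->offset = extent_start_byte;
+ *	iomap->length = extent_byte_count;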
*/ +#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */ +#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */ +#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */ +#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ + +#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */ + +struct iomap { + sector_t blkno; /* first sector of mapping */ + loff_t offset; /* file offset of mapping, bytes */ + u64 length; /* length of mapping, bytes */ + int type; /* type of mapping */ +}; + struct export_operations { int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent); @@ -191,6 +207,13 @@ struct export_operations { struct dentry *child); struct dentry * (*get_parent)(struct dentry *child); int (*commit_metadata)(struct inode *inode); + + int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); + int (*map_blocks)(struct inode *inode, loff_t offset, + u64 len, struct iomap *iomap, + bool write, u32 *device_generation); + int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, + int nr_iomaps, struct iattr *iattr); }; extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, diff --git a/include/linux/fs.h b/include/linux/fs.h index f125b88..cdcb1e9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -873,6 +873,7 @@ static inline struct file *get_file(struct file *f) #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ +#define FL_LAYOUT 2048 /* outstanding pNFS layout */ /* * Special return value from posix_lock_file() and vfs_lock_file() for @@ -2035,6 +2036,16 @@ static inline int break_deleg_wait(struct inode **delegated_inode) return ret; } +static inline int break_layout(struct inode *inode, bool wait) +{ + smp_mb(); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + return __break_lease(inode, + wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, + FL_LAYOUT); + return 0; +} + #else /* !CONFIG_FILE_LOCKING */ static inline int locks_mandatory_locked(struct file *file) { @@ -2090,6 +2101,11 @@ static inline int break_deleg_wait(struct inode **delegated_inode) return 0; } +static inline int break_layout(struct inode *inode, bool wait) +{ + return 0; +} + #endif /* CONFIG_FILE_LOCKING */ /* fs/open.c */ diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index de7c91c..ed43cb7 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -411,6 +411,7 @@ enum lock_type4 { #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) #define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) +#define FATTR4_WORD2_LAYOUT_TYPES (1UL << 0) #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) @@ -517,6 +518,7 @@ enum pnfs_layouttype { LAYOUT_OSD2_OBJECTS = 2, LAYOUT_BLOCK_VOLUME = 3, LAYOUT_FLEX_FILES = 4, + LAYOUT_TYPE_MAX }; /* used for both layout return and recall */ diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 6f22cfe..fae6fb9 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -110,7 +110,7 @@ struct svc_serv { * We use sv_nrthreads as a reference count. svc_destroy() drops * this refcount, so we need to bump it up around operations that * change the number of threads. Horrible, but there it is. - * Should be called with the BKL held. 
+ * Should be called with the "service mutex" held. */ static inline void svc_get(struct svc_serv *serv) { diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index ddfe88f..df8edf8 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -77,6 +77,7 @@ struct svc_rdma_op_ctxt { enum ib_wr_opcode wr_op; enum ib_wc_status wc_status; u32 byte_len; + u32 position; struct svcxprt_rdma *xprt; unsigned long flags; enum dma_data_direction direction; @@ -148,6 +149,10 @@ struct svcxprt_rdma { struct ib_cq *sc_rq_cq; struct ib_cq *sc_sq_cq; struct ib_mr *sc_phys_mr; /* MR for server memory */ + int (*sc_reader)(struct svcxprt_rdma *, + struct svc_rqst *, + struct svc_rdma_op_ctxt *, + int *, u32 *, u32, u32, u64, bool); u32 sc_dev_caps; /* distilled device caps */ u32 sc_dma_lkey; /* local dma key */ unsigned int sc_frmr_pg_list_len; @@ -176,8 +181,6 @@ struct svcxprt_rdma { #define RPCRDMA_MAX_REQ_SIZE 4096 /* svc_rdma_marshal.c */ -extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *, - int *, int *); extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, @@ -195,6 +198,12 @@ extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *); /* svc_rdma_recvfrom.c */ extern int svc_rdma_recvfrom(struct svc_rqst *); +extern int rdma_read_chunk_lcl(struct svcxprt_rdma *, struct svc_rqst *, + struct svc_rdma_op_ctxt *, int *, u32 *, + u32, u32, u64, bool); +extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, + struct svc_rdma_op_ctxt *, int *, u32 *, + u32, u32, u64, bool); /* svc_rdma_sendto.c */ extern int svc_rdma_sendto(struct svc_rqst *); diff --git a/include/uapi/linux/nfsd/debug.h b/include/uapi/linux/nfsd/debug.h index 1fdc95b..0bf130a 100644 --- a/include/uapi/linux/nfsd/debug.h +++ b/include/uapi/linux/nfsd/debug.h @@ -32,6 +32,7 @@ #define NFSDDBG_REPCACHE 0x0080 #define NFSDDBG_XDR 0x0100 #define NFSDDBG_LOCKD 0x0200 +#define NFSDDBG_PNFS 0x0400 #define NFSDDBG_ALL 0x7FFF #define NFSDDBG_NOCHANGE 0xFFFF diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index 584b6ef..4742f2c 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -47,8 +47,10 @@ * exported filesystem. */ #define NFSEXP_V4ROOT 0x10000 +#define NFSEXP_NOPNFS 0x20000 + /* All flags that we claim to support. (Note we don't support NOACL.) */ -#define NFSEXP_ALLFLAGS 0x1FE7F +#define NFSEXP_ALLFLAGS 0x3FE7F /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 91eaef1..78974e4 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -768,8 +768,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) EXPORT_SYMBOL_GPL(svc_set_num_threads); /* - * Called from a server thread as it's exiting. Caller must hold the BKL or - * the "service mutex", whichever is appropriate for the service. + * Called from a server thread as it's exiting. Caller must hold the "service + * mutex" for the service. 
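+ * (For nfsd that mutex is nfsd_mutex; other RPC services use their
+ * own equivalent.)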
*/ void svc_exit_thread(struct svc_rqst *rqstp) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c69358b..163ac45 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -42,7 +42,7 @@ static LIST_HEAD(svc_xprt_class_list); * svc_pool->sp_lock protects most of the fields of that pool. * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. * when both need to be taken (rare), svc_serv->sv_lock is first. - * BKL protects svc_serv->sv_nrthread. + * The "service mutex" protects svc_serv->sv_nrthread. * svc_sock->sk_lock protects the svc_sock->sk_deferred list * and the ->sk_info_authunix cache. * @@ -67,7 +67,6 @@ static LIST_HEAD(svc_xprt_class_list); * that no other thread will be using the transport or will * try to set XPT_DEAD. */ - int svc_reg_xprt_class(struct svc_xprt_class *xcl) { struct svc_xprt_class *cl; diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 65b1462..b681855 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -71,22 +71,6 @@ static u32 *decode_read_list(u32 *va, u32 *vaend) } /* - * Determine number of chunks and total bytes in chunk list. The chunk - * list has already been verified to fit within the RPCRDMA header. - */ -void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch, - int *ch_count, int *byte_count) -{ - /* compute the number of bytes represented by read chunks */ - *byte_count = 0; - *ch_count = 0; - for (; ch->rc_discrim != 0; ch++) { - *byte_count = *byte_count + ntohl(ch->rc_target.rs_length); - *ch_count = *ch_count + 1; - } -} - -/* * Decodes a write chunk list. The expected format is as follows: * descrim : xdr_one * nchunks : <count> diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index e011027..f9f13a3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -43,7 +43,6 @@ #include <linux/sunrpc/debug.h> #include <linux/sunrpc/rpc_rdma.h> #include <linux/spinlock.h> -#include <linux/highmem.h> #include <asm/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> @@ -60,6 +59,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *ctxt, u32 byte_count) { + struct rpcrdma_msg *rmsgp; struct page *page; u32 bc; int sge_no; @@ -82,7 +82,14 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, /* If data remains, store it in the pagelist */ rqstp->rq_arg.page_len = bc; rqstp->rq_arg.page_base = 0; - rqstp->rq_arg.pages = &rqstp->rq_pages[1]; + + /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG) + rqstp->rq_arg.pages = &rqstp->rq_pages[0]; + else + rqstp->rq_arg.pages = &rqstp->rq_pages[1]; + sge_no = 1; while (bc && sge_no < ctxt->count) { page = ctxt->pages[sge_no]; @@ -95,14 +102,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_respages = &rqstp->rq_pages[sge_no]; rqstp->rq_next_page = rqstp->rq_respages + 1; - /* We should never run out of SGE because the limit is defined to - * support the max allowed RPC data length - */ - BUG_ON(bc && (sge_no == ctxt->count)); - BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len) - != byte_count); - BUG_ON(rqstp->rq_arg.len != byte_count); - /* If not all pages were used from the SGL, free the remaining ones */ bc = sge_no; while (sge_no < ctxt->count) { @@ -125,26 +124,16 @@ 
static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) return min_t(int, sge_count, xprt->sc_max_sge); } -typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - int last); - /* Issue an RDMA_READ using the local lkey to map the data sink */ -static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - int last) +int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + bool last) { struct ib_send_wr read_wr; int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; @@ -229,15 +218,15 @@ static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, } /* Issue an RDMA_READ using an FRMR to map the data sink */ -static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - int last) +int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + bool last) { struct ib_send_wr read_wr; struct ib_send_wr inv_wr; @@ -365,24 +354,84 @@ static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, return ret; } +static unsigned int +rdma_rcl_chunk_count(struct rpcrdma_read_chunk *ch) +{ + unsigned int count; + + for (count = 0; ch->rc_discrim != xdr_zero; ch++) + count++; + return count; +} + +/* If there was additional inline content, append it to the end of arg.pages. + * Tail copy has to be done after the reader function has determined how many + * pages are needed for RDMA READ. 
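+ * "position" is the XDR offset within the RPC message at which the
+ * read chunk data belongs; anything left in head[0] beyond that
+ * offset is trailing inline content that must be copied here.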
+ */ +static int +rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, + u32 position, u32 byte_count, u32 page_offset, int page_no) +{ + char *srcp, *destp; + + srcp = head->arg.head[0].iov_base + position; + byte_count = head->arg.head[0].iov_len - position; + if (byte_count > PAGE_SIZE) { + dprintk("svcrdma: large tail unsupported\n"); + return 0; + } + + /* Fit as much of the tail on the current page as possible */ + if (page_offset != PAGE_SIZE) { + destp = page_address(rqstp->rq_arg.pages[page_no]); + destp += page_offset; + while (byte_count--) { + *destp++ = *srcp++; + page_offset++; + if (page_offset == PAGE_SIZE && byte_count) + goto more; + } + goto done; + } + +more: + /* Fit the rest on the next page */ + page_no++; + destp = page_address(rqstp->rq_arg.pages[page_no]); + while (byte_count--) + *destp++ = *srcp++; + + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + +done: + byte_count = head->arg.head[0].iov_len - position; + head->arg.page_len += byte_count; + head->arg.len += byte_count; + head->arg.buflen += byte_count; + return 1; +} + static int rdma_read_chunks(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head) { - int page_no, ch_count, ret; + int page_no, ret; struct rpcrdma_read_chunk *ch; - u32 page_offset, byte_count; + u32 handle, page_offset, byte_count; + u32 position; u64 rs_offset; - rdma_reader_fn reader; + bool last; /* If no read list is present, return 0 */ ch = svc_rdma_get_read_chunk(rmsgp); if (!ch) return 0; - svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); - if (ch_count > RPCSVC_MAXPAGES) + if (rdma_rcl_chunk_count(ch) > RPCSVC_MAXPAGES) return -EINVAL; /* The request is completed when the RDMA_READs complete.
The @@ -391,34 +440,41 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, */ head->arg.head[0] = rqstp->rq_arg.head[0]; head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = &head->pages[head->count]; head->hdr_count = head->count; head->arg.page_base = 0; head->arg.page_len = 0; head->arg.len = rqstp->rq_arg.len; head->arg.buflen = rqstp->rq_arg.buflen; - /* Use FRMR if supported */ - if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) - reader = rdma_read_chunk_frmr; - else - reader = rdma_read_chunk_lcl; + ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + position = be32_to_cpu(ch->rc_position); + + /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ + if (position == 0) { + head->arg.pages = &head->pages[0]; + page_offset = head->byte_len; + } else { + head->arg.pages = &head->pages[head->count]; + page_offset = 0; + } - page_no = 0; page_offset = 0; - for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - ch->rc_discrim != 0; ch++) { + ret = 0; + page_no = 0; + for (; ch->rc_discrim != xdr_zero; ch++) { + if (be32_to_cpu(ch->rc_position) != position) + goto err; + handle = be32_to_cpu(ch->rc_target.rs_handle); + byte_count = be32_to_cpu(ch->rc_target.rs_length); xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, &rs_offset); - byte_count = ntohl(ch->rc_target.rs_length); while (byte_count > 0) { - ret = reader(xprt, rqstp, head, - &page_no, &page_offset, - ntohl(ch->rc_target.rs_handle), - byte_count, rs_offset, - ((ch+1)->rc_discrim == 0) /* last */ - ); + last = (ch + 1)->rc_discrim == xdr_zero; + ret = xprt->sc_reader(xprt, rqstp, head, + &page_no, &page_offset, + handle, byte_count, + rs_offset, last); if (ret < 0) goto err; byte_count -= ret; @@ -426,7 +482,24 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, head->arg.buflen += ret; } } + + /* Read list may need XDR round-up (see RFC 5666, s. 3.7) */ + if (page_offset & 3) { + u32 pad = 4 - (page_offset & 3); + + head->arg.page_len += pad; + head->arg.len += pad; + head->arg.buflen += pad; + page_offset += pad; + } + ret = 1; + if (position && position < head->arg.head[0].iov_len) + ret = rdma_copy_tail(rqstp, head, position, + byte_count, page_offset, page_no); + head->arg.head[0].iov_len = position; + head->position = position; + err: /* Detach arg pages. svc_recv will replenish them */ for (page_no = 0; @@ -436,47 +509,33 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, return ret; } -/* - * To avoid a separate RDMA READ just for a handful of zero bytes, - * RFC 5666 section 3.7 allows the client to omit the XDR zero pad - * in chunk lists.
- */ -static void -rdma_fix_xdr_pad(struct xdr_buf *buf) -{ - unsigned int page_len = buf->page_len; - unsigned int size = (XDR_QUADLEN(page_len) << 2) - page_len; - unsigned int offset, pg_no; - char *p; - - if (size == 0) - return; - - pg_no = page_len >> PAGE_SHIFT; - offset = page_len & ~PAGE_MASK; - p = page_address(buf->pages[pg_no]); - memset(p + offset, 0, size); - - buf->page_len += size; - buf->buflen += size; - buf->len += size; -} - static int rdma_read_complete(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head) { int page_no; int ret; - BUG_ON(!head); - /* Copy RPC pages */ for (page_no = 0; page_no < head->count; page_no++) { put_page(rqstp->rq_pages[page_no]); rqstp->rq_pages[page_no] = head->pages[page_no]; } + + /* Adjustments made for RDMA_NOMSG type requests */ + if (head->position == 0) { + if (head->arg.len <= head->sge[0].length) { + head->arg.head[0].iov_len = head->arg.len - + head->byte_len; + head->arg.page_len = 0; + } else { + head->arg.head[0].iov_len = head->sge[0].length - + head->byte_len; + head->arg.page_len = head->arg.len - + head->sge[0].length; + } + } + /* Point rq_arg.pages past header */ - rdma_fix_xdr_pad(&head->arg); rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; rqstp->rq_arg.page_len = head->arg.page_len; rqstp->rq_arg.page_base = head->arg.page_base; @@ -501,8 +560,8 @@ static int rdma_read_complete(struct svc_rqst *rqstp, ret = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len + rqstp->rq_arg.tail[0].iov_len; - dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, " - "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", + dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, " + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n", ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, rqstp->rq_arg.head[0].iov_len); @@ -558,7 +617,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) } dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", ctxt, rdma_xprt, rqstp, ctxt->wc_status); - BUG_ON(ctxt->wc_status != IB_WC_SUCCESS); atomic_inc(&rdma_stat_recv); /* Build up the XDR from the receive buffers. 
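 * rdma_build_arg_xdr() below maps the received pages into rq_arg;
 * any read chunks are then pulled over separately by
 * rdma_read_chunks().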
*/ @@ -591,8 +649,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) + rqstp->rq_arg.tail[0].iov_len; svc_rdma_put_context(ctxt, 0); out: - dprintk("svcrdma: ret = %d, rq_arg.len =%d, " - "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", + dprintk("svcrdma: ret=%d, rq_arg.len=%u, " + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zd\n", ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, rqstp->rq_arg.head[0].iov_len); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 9f1b506..7de33d1a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -60,8 +60,11 @@ static int map_xdr(struct svcxprt_rdma *xprt, u32 page_off; int page_no; - BUG_ON(xdr->len != - (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); + if (xdr->len != + (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) { + pr_err("svcrdma: map_xdr: XDR buffer length error\n"); + return -EIO; + } /* Skip the first sge, this is for the RPCRDMA header */ sge_no = 1; @@ -150,7 +153,11 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, int bc; struct svc_rdma_op_ctxt *ctxt; - BUG_ON(vec->count > RPCSVC_MAXPAGES); + if (vec->count > RPCSVC_MAXPAGES) { + pr_err("svcrdma: Too many pages (%lu)\n", vec->count); + return -EIO; + } + dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " "write_len=%d, vec->sge=%p, vec->count=%lu\n", rmr, (unsigned long long)to, xdr_off, @@ -190,7 +197,10 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge_off = 0; sge_no++; xdr_sge_no++; - BUG_ON(xdr_sge_no > vec->count); + if (xdr_sge_no > vec->count) { + pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no); + goto err; + } bc -= sge_bytes; if (sge_no == xprt->sc_max_sge) break; @@ -421,7 +431,10 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } - BUG_ON(byte_count != 0); + if (byte_count != 0) { + pr_err("svcrdma: Could not map %d bytes\n", byte_count); + goto err; + } /* Save all respages in the ctxt and remove them from the * respages array. They are our pages until the I/O @@ -442,7 +455,10 @@ static int send_reply(struct svcxprt_rdma *rdma, } rqstp->rq_next_page = rqstp->rq_respages + 1; - BUG_ON(sge_no > rdma->sc_max_sge); + if (sge_no > rdma->sc_max_sge) { + pr_err("svcrdma: Too many sges (%d)\n", sge_no); + goto err; + } memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; send_wr.wr_id = (unsigned long)ctxt; @@ -467,18 +483,6 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) { } -/* - * Return the start of an xdr buffer. - */ -static void *xdr_start(struct xdr_buf *xdr) -{ - return xdr->head[0].iov_base - - (xdr->len - - xdr->page_len - - xdr->tail[0].iov_len - - xdr->head[0].iov_len); -} - int svc_rdma_sendto(struct svc_rqst *rqstp) { struct svc_xprt *xprt = rqstp->rq_xprt; @@ -496,8 +500,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); - /* Get the RDMA request header. */ - rdma_argp = xdr_start(&rqstp->rq_arg); + /* Get the RDMA request header. The receive logic always + * places this at the start of page 0. 
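+ * (rdma_build_arg_xdr() put the header at the start of the first
+ * receive page, so it no longer has to be recomputed from the
+ * xdr_buf as the removed xdr_start() helper did.)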
+ */ + rdma_argp = page_address(rqstp->rq_pages[0]); /* Build an req vec for the XDR */ ctxt = svc_rdma_get_context(rdma); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 4e61880..f609c1c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -139,7 +139,6 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) struct svcxprt_rdma *xprt; int i; - BUG_ON(!ctxt); xprt = ctxt->xprt; if (free_pages) for (i = 0; i < ctxt->count; i++) @@ -339,12 +338,14 @@ static void process_context(struct svcxprt_rdma *xprt, switch (ctxt->wr_op) { case IB_WR_SEND: - BUG_ON(ctxt->frmr); + if (ctxt->frmr) + pr_err("svcrdma: SEND: ctxt->frmr != NULL\n"); svc_rdma_put_context(ctxt, 1); break; case IB_WR_RDMA_WRITE: - BUG_ON(ctxt->frmr); + if (ctxt->frmr) + pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n"); svc_rdma_put_context(ctxt, 0); break; @@ -353,19 +354,21 @@ static void process_context(struct svcxprt_rdma *xprt, svc_rdma_put_frmr(xprt, ctxt->frmr); if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; - BUG_ON(!read_hdr); - spin_lock_bh(&xprt->sc_rq_dto_lock); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - list_add_tail(&read_hdr->dto_q, - &xprt->sc_read_complete_q); - spin_unlock_bh(&xprt->sc_rq_dto_lock); + if (read_hdr) { + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + } else { + pr_err("svcrdma: ctxt->read_hdr == NULL\n"); + } svc_xprt_enqueue(&xprt->sc_xprt); } svc_rdma_put_context(ctxt, 0); break; default: - BUG_ON(1); printk(KERN_ERR "svcrdma: unexpected completion type, " "opcode=%d\n", ctxt->wr_op); @@ -513,7 +516,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) buflen = 0; ctxt->direction = DMA_FROM_DEVICE; for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { - BUG_ON(sge_no >= xprt->sc_max_sge); + if (sge_no >= xprt->sc_max_sge) { + pr_err("svcrdma: Too many sges (%d)\n", sge_no); + goto err_put_ctxt; + } page = svc_rdma_get_page(); ctxt->pages[sge_no] = page; pa = ib_dma_map_page(xprt->sc_cm_id->device, @@ -687,7 +693,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, { struct rdma_cm_id *listen_id; struct svcxprt_rdma *cma_xprt; - struct svc_xprt *xprt; int ret; dprintk("svcrdma: Creating RDMA socket\n"); @@ -698,7 +703,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, cma_xprt = rdma_create_xprt(serv, 1); if (!cma_xprt) return ERR_PTR(-ENOMEM); - xprt = &cma_xprt->sc_xprt; listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP, IB_QPT_RC); @@ -822,7 +826,7 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, if (frmr) { frmr_unmap_dma(rdma, frmr); spin_lock_bh(&rdma->sc_frmr_q_lock); - BUG_ON(!list_empty(&frmr->frmr_list)); + WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); list_add(&frmr->frmr_list, &rdma->sc_frmr_q); spin_unlock_bh(&rdma->sc_frmr_q_lock); } @@ -970,10 +974,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * NB: iWARP requires remote write access for the data sink * of an RDMA_READ. IB does not. 
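 * The default chunk reader uses the local lkey; if the device
 * supports fast memory registration, sc_reader is switched to
 * the FRMR variant just below.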
*/ + newxprt->sc_reader = rdma_read_chunk_lcl; if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { newxprt->sc_frmr_pg_list_len = devattr.max_fast_reg_page_list_len; newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; + newxprt->sc_reader = rdma_read_chunk_frmr; } /* @@ -1125,7 +1131,9 @@ static void __svc_rdma_free(struct work_struct *work) dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); /* We should only be called from kref_put */ - BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0); + if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0) + pr_err("svcrdma: sc_xprt still in use? (%d)\n", + atomic_read(&rdma->sc_xprt.xpt_ref.refcount)); /* * Destroy queued, but not processed read completions. Note @@ -1153,8 +1161,12 @@ static void __svc_rdma_free(struct work_struct *work) } /* Warn if we leaked a resource or under-referenced */ - WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); - WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); + if (atomic_read(&rdma->sc_ctxt_used) != 0) + pr_err("svcrdma: ctxt still in use? (%d)\n", + atomic_read(&rdma->sc_ctxt_used)); + if (atomic_read(&rdma->sc_dma_used) != 0) + pr_err("svcrdma: dma still in use? (%d)\n", + atomic_read(&rdma->sc_dma_used)); /* De-allocate fastreg mr */ rdma_dealloc_frmr_q(rdma); @@ -1254,7 +1266,6 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) return -ENOTCONN; - BUG_ON(wr->send_flags != IB_SEND_SIGNALED); wr_count = 1; for (n_wr = wr->next; n_wr; n_wr = n_wr->next) wr_count++; |