diff options
Diffstat (limited to 'fs')
57 files changed, 2178 insertions, 1115 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e1aa8d6..100b07f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2493,7 +2493,7 @@ int close_ctree(struct btrfs_root *root) * ERROR state on disk. * * 2. when btrfs flips readonly just in btrfs_commit_super, - * and in such case, btrfs cannnot write sb via btrfs_commit_super, + * and in such case, btrfs cannot write sb via btrfs_commit_super, * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, * btrfs will cleanup all FS resources first and write sb then. */ diff --git a/fs/dcache.c b/fs/dcache.c index 1baddc1..ad25c4c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1811,7 +1811,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * - * See Documentation/vfs/dcache-locking.txt for more details. + * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) { struct inode *i; @@ -1931,7 +1931,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * - * See Documentation/vfs/dcache-locking.txt for more details. + * See Documentation/filesystems/path-lookup.txt for more details. */ rcu_read_lock(); diff --git a/fs/direct-io.c b/fs/direct-io.c index b044705..dcb5577 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -645,11 +645,11 @@ static int dio_send_cur_page(struct dio *dio) /* * See whether this new request is contiguous with the old. * - * Btrfs cannot handl having logically non-contiguous requests - * submitted. For exmple if you have + * Btrfs cannot handle having logically non-contiguous requests + * submitted. For example if you have * * Logical: [0-4095][HOLE][8192-12287] - * Phyiscal: [0-4095] [4096-8181] + * Physical: [0-4095] [4096-8191] * * We cannot submit those pages together as one BIO. So if our * current logical offset in the file does not equal what would diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4a09af9..ff12f7a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -62,7 +62,7 @@ * This mutex is acquired by ep_free() during the epoll file * cleanup path and it is also acquired by eventpoll_release_file() * if a file has been pushed inside an epoll set and it is then - * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL). + * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). * It is also acquired when inserting an epoll fd onto another epoll * fd. We do this so that we walk the epoll tree and ensure that this * insertion does not create a cycle of epoll file descriptors, which @@ -152,11 +152,11 @@ struct epitem { /* * This structure is stored inside the "private_data" member of the file - * structure and rapresent the main data sructure for the eventpoll + * structure and represents the main data structure for the eventpoll * interface. */ struct eventpoll { - /* Protect the this structure access */ + /* Protect the access to this structure */ spinlock_t lock; /* @@ -793,7 +793,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) /* * This is the callback that is passed to the wait queue wakeup - * machanism. It is called by the stored file descriptors when they + * mechanism. It is called by the stored file descriptors when they * have events to report. */ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) @@ -824,9 +824,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k goto out_unlock; /* - * If we are trasfering events to userspace, we can hold no locks + * If we are transferring events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() - * semantics). All the events that happens during that period of time are + * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 045995c..15324218 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1991,6 +1991,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, spin_unlock(sb_bgl_lock(sbi, group)); percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); + free_blocks -= next - start; /* Do not issue a TRIM on extents smaller than minblocks */ if ((next - start) < minblocks) goto free_extent; @@ -2040,7 +2041,7 @@ free_extent: cond_resched(); /* No more suitable extents */ - if ((free_blocks - count) < minblocks) + if (free_blocks < minblocks) break; } @@ -2090,7 +2091,8 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); int ret = 0; - start = range->start >> sb->s_blocksize_bits; + start = (range->start >> sb->s_blocksize_bits) + + le32_to_cpu(es->s_first_data_block); len = range->len >> sb->s_blocksize_bits; minlen = range->minlen >> sb->s_blocksize_bits; trimmed = 0; @@ -2099,10 +2101,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) return -EINVAL; if (start >= max_blks) goto out; - if (start < le32_to_cpu(es->s_first_data_block)) { - len -= le32_to_cpu(es->s_first_data_block) - start; - start = le32_to_cpu(es->s_first_data_block); - } if (start + len > max_blks) len = max_blks - start; @@ -2129,10 +2127,15 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) if (free_blocks < minlen) continue; - if (len >= EXT3_BLOCKS_PER_GROUP(sb)) - len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block); - else + /* + * For all the groups except the last one, last block will + * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to + * change it for the last group in which case first_block + + * len < EXT3_BLOCKS_PER_GROUP(sb). + */ + if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb)) last_block = first_block + len; + len -= last_block - first_block; ret = ext3_trim_all_free(sb, group, first_block, last_block, minlen); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 0521a00..32f3b86 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1540,8 +1540,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); - node2->fake.inode = 0; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext3_journal_get_write_access(handle, frame->bh); if (err) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 9cc19a1d..071689f 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1464,6 +1464,13 @@ static void ext3_orphan_cleanup (struct super_block * sb, return; } + /* Check if feature set allows readwrite operations */ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) { + ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { if (es->s_last_orphan) jbd_debug(1, "Errors on filesystem, " diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ccce8a7..7516fb9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -131,7 +131,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, * fragmenting the file system's free space. Maybe we * should have some hueristics or some way to allow * userspace to pass a hint to file system, - * especiially if the latter case turns out to be + * especially if the latter case turns out to be * common. */ ex = path[depth].p_ext; @@ -2844,7 +2844,7 @@ fix_extent_len: * ext4_get_blocks_dio_write() when DIO to write * to an uninitialized extent. * - * Writing to an uninitized extent may result in splitting the uninitialized + * Writing to an uninitialized extent may result in splitting the uninitialized * extent into multiple /initialized uninitialized extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be uninitialized diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 3e87cce..7c39b885 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -458,7 +458,7 @@ static void cuse_fc_release(struct fuse_conn *fc) * @file: file struct being opened * * Userland CUSE server can create a CUSE device by opening /dev/cuse - * and replying to the initilaization request kernel sends. This + * and replying to the initialization request kernel sends. This * function is responsible for handling CUSE device initialization. * Because the fd opened by this function is used during * initialization, this function only creates cuse_conn and sends diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index da1b5e4..eb11601 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -839,7 +839,7 @@ journal_t * journal_init_inode (struct inode *inode) err = journal_bmap(journal, 0, &blocknr); /* If that failed, give up */ if (err) { - printk(KERN_ERR "%s: Cannnot locate journal superblock\n", + printk(KERN_ERR "%s: Cannot locate journal superblock\n", __func__); goto out_err; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 97e7346..90407b8 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -991,7 +991,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) err = jbd2_journal_bmap(journal, 0, &blocknr); /* If that failed, give up */ if (err) { - printk(KERN_ERR "%s: Cannnot locate journal superblock\n", + printk(KERN_ERR "%s: Cannot locate journal superblock\n", __func__); goto out_err; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 8958757..2f41dcce 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, rv = NFS4ERR_DELAY; list_del_init(&lo->plh_bulk_recall); spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); put_layout_hdr(lo); iput(ino); } - pnfs_free_lseg_list(&free_me_list); return rv; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index bd3ca32..139be96 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -82,6 +82,11 @@ retry: #endif /* CONFIG_NFS_V4 */ /* + * Turn off NFSv4 uid/gid mapping when using AUTH_SYS + */ +static int nfs4_disable_idmapping = 0; + +/* * RPC cruft for NFS */ static struct rpc_version *nfs_version[5] = { @@ -481,7 +486,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat * Look up a client by IP address and protocol version * - creates a new record if one doesn't yet exist */ -static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) +static struct nfs_client * +nfs_get_client(const struct nfs_client_initdata *cl_init, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport) { struct nfs_client *clp, *new = NULL; int error; @@ -512,6 +522,13 @@ install_client: clp = new; list_add(&clp->cl_share_link, &nfs_client_list); spin_unlock(&nfs_client_lock); + + error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr, + authflavour, noresvport); + if (error < 0) { + nfs_put_client(clp); + return ERR_PTR(error); + } dprintk("--> nfs_get_client() = %p [new]\n", clp); return clp; @@ -767,9 +784,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, /* * Initialise an NFS2 or NFS3 client */ -static int nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const struct nfs_parsed_mount_data *data) +int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour, + int noresvport) { int error; @@ -784,7 +801,7 @@ static int nfs_init_client(struct nfs_client *clp, * - RFC 2623, sec 2.3.2 */ error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, - 0, data->flags & NFS_MOUNT_NORESVPORT); + 0, noresvport); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); @@ -820,19 +837,17 @@ static int nfs_init_server(struct nfs_server *server, cl_init.rpc_ops = &nfs_v3_clientops; #endif + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init); + clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX, + data->flags & NFS_MOUNT_NORESVPORT); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); } - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, - data->timeo, data->retrans); - error = nfs_init_client(clp, &timeparms, data); - if (error < 0) - goto error; - server->nfs_client = clp; /* Initialise the client representation from the mount data */ @@ -1009,14 +1024,19 @@ static void nfs_server_insert_lists(struct nfs_server *server) spin_lock(&nfs_client_lock); list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); list_add_tail(&server->master_link, &nfs_volume_list); + clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); spin_unlock(&nfs_client_lock); } static void nfs_server_remove_lists(struct nfs_server *server) { + struct nfs_client *clp = server->nfs_client; + spin_lock(&nfs_client_lock); list_del_rcu(&server->client_link); + if (clp && list_empty(&clp->cl_superblocks)) + set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); list_del(&server->master_link); spin_unlock(&nfs_client_lock); @@ -1307,11 +1327,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) /* * Initialise an NFS4 client record */ -static int nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr, - rpc_authflavor_t authflavour, - int flags) +int nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport) { int error; @@ -1325,7 +1345,7 @@ static int nfs4_init_client(struct nfs_client *clp, clp->rpc_ops = &nfs_v4_clientops; error = nfs_create_rpc_client(clp, timeparms, authflavour, - 1, flags & NFS_MOUNT_NORESVPORT); + 1, noresvport); if (error < 0) goto error; strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); @@ -1378,27 +1398,71 @@ static int nfs4_set_client(struct nfs_server *server, dprintk("--> nfs4_set_client()\n"); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init); + clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour, + server->flags & NFS_MOUNT_NORESVPORT); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; } - error = nfs4_init_client(clp, timeparms, ip_addr, authflavour, - server->flags); - if (error < 0) - goto error_put; + + /* + * Query for the lease time on clientid setup or renewal + * + * Note that this will be set on nfs_clients that were created + * only for the DS role and did not set this bit, but now will + * serve a dual role. + */ + set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); server->nfs_client = clp; dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); return 0; - -error_put: - nfs_put_client(clp); error: dprintk("<-- nfs4_set_client() = xerror %d\n", error); return error; } +/* + * Set up a pNFS Data Server client. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto) +{ + struct nfs_client_initdata cl_init = { + .addr = ds_addr, + .addrlen = ds_addrlen, + .rpc_ops = &nfs_v4_clientops, + .proto = ds_proto, + .minorversion = mds_clp->cl_minorversion, + }; + struct rpc_timeout ds_timeout = { + .to_initval = 15 * HZ, + .to_maxval = 15 * HZ, + .to_retries = 1, + .to_exponential = 1, + }; + struct nfs_client *clp; + + /* + * Set an authflavor equual to the MDS value. Use the MDS nfs_client + * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS + * (section 13.1 RFC 5661). + */ + clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, + mds_clp->cl_rpcclient->cl_auth->au_flavor, 0); + + dprintk("<-- %s %p\n", __func__, clp); + return clp; +} +EXPORT_SYMBOL(nfs4_set_ds_client); /* * Session has been established, and the client marked ready. @@ -1435,6 +1499,10 @@ static int nfs4_server_common_setup(struct nfs_server *server, BUG_ON(!server->nfs_client->rpc_ops); BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + /* data servers support only a subset of NFSv4.1 */ + if (is_ds_only_client(server->nfs_client)) + return -EPROTONOSUPPORT; + fattr = nfs_alloc_fattr(); if (fattr == NULL) return -ENOMEM; @@ -1504,6 +1572,13 @@ static int nfs4_init_server(struct nfs_server *server, if (error < 0) goto error; + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. + */ + if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; + if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); if (data->wsize) @@ -1921,3 +1996,7 @@ void nfs_fs_proc_exit(void) } #endif /* CONFIG_PROC_FS */ + +module_param(nfs4_disable_idmapping, bool, 0644); +MODULE_PARM_DESC(nfs4_disable_idmapping, + "Turn off NFSv4 idmapping when using 'sec=sys'"); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 9943a75..8eea253 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -45,6 +45,7 @@ #include <linux/pagemap.h> #include <linux/kref.h> #include <linux/slab.h> +#include <linux/task_io_accounting_ops.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> @@ -649,8 +650,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - if (nfs_writeback_done(task, data) != 0) - return; + nfs_writeback_done(task, data); } /* @@ -938,6 +938,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, if (retval) goto out; + task_io_account_read(count); + retval = nfs_direct_read(iocb, iov, nr_segs, pos); if (retval > 0) iocb->ki_pos = pos + retval; @@ -999,6 +1001,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (retval) goto out; + task_io_account_write(count); + retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); if (retval > 0) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7bf029e..d85a534 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); - pnfs_update_layout(mapping->host, - nfs_file_open_context(file), - IOMODE_RW); - start: /* * Prevent starvation issues if someone is doing a consistency diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 1869688..79664a1 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -33,16 +33,41 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kernel.h> + +static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) +{ + unsigned long val; + char buf[16]; + + if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) + return 0; + memcpy(buf, name, namelen); + buf[namelen] = '\0'; + if (strict_strtoul(buf, 0, &val) != 0) + return 0; + *res = val; + return 1; +} + +static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) +{ + return snprintf(buf, buflen, "%u", id); +} #ifdef CONFIG_NFS_USE_NEW_IDMAPPER #include <linux/slab.h> #include <linux/cred.h> +#include <linux/sunrpc/sched.h> +#include <linux/nfs4.h> +#include <linux/nfs_fs_sb.h> #include <linux/nfs_idmap.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <linux/rcupdate.h> -#include <linux/kernel.h> #include <linux/err.h> #include <keys/user-type.h> @@ -219,23 +244,39 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen, return ret; } -int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) { + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; return nfs_idmap_lookup_id(name, namelen, "uid", uid); } -int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid) { + if (nfs_map_string_to_numeric(name, namelen, gid)) + return 0; return nfs_idmap_lookup_id(name, namelen, "gid", gid); } -int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) { - return nfs_idmap_lookup_name(uid, "user", buf, buflen); + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(uid, "user", buf, buflen); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; } -int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) { - return nfs_idmap_lookup_name(gid, "group", buf, buflen); + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(gid, "group", buf, buflen); + if (ret < 0) + ret = nfs_map_numeric_to_string(gid, buf, buflen); + return ret; } #else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ @@ -243,7 +284,6 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu #include <linux/module.h> #include <linux/mutex.h> #include <linux/init.h> -#include <linux/types.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/in.h> @@ -695,31 +735,45 @@ static unsigned int fnvhash32(const void *buf, size_t buflen) return hash; } -int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) { - struct idmap *idmap = clp->cl_idmap; + struct idmap *idmap = server->nfs_client->cl_idmap; + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); } -int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) { - struct idmap *idmap = clp->cl_idmap; + struct idmap *idmap = server->nfs_client->cl_idmap; + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); } -int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) { - struct idmap *idmap = clp->cl_idmap; + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; - return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; } -int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) { - struct idmap *idmap = clp->cl_idmap; + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; - return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; } #endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e94ad22..72e0bdd 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -148,6 +148,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fattr *); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern int nfs4_check_client_ready(struct nfs_client *clp); +extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); @@ -213,8 +216,14 @@ extern const u32 nfs41_maxwrite_overhead; extern struct rpc_procinfo nfs4_procedures[]; #endif +extern int nfs4_init_ds_session(struct nfs_client *clp); + /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); +extern int nfs_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour, + int noresvport); /* dir.c */ extern int nfs_access_cache_shrinker(struct shrinker *shrink, @@ -262,9 +271,15 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); #endif /* read.c */ +extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ +extern int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, @@ -274,6 +289,13 @@ extern int nfs_migrate_page(struct address_space *, #endif /* nfs4proc.c */ +extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); +extern int nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport); +extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data); extern int _nfs4_call_sync(struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index ce939c0..d0c80d8 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, .close_context = nfs_close_context, + .init_client = nfs_init_client, }; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 1be36cf..c64be1c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -252,6 +252,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser extern int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, struct rpc_task *task); +extern int nfs41_setup_sequence(struct nfs4_session *session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); extern int nfs4_proc_create_session(struct nfs_client *); @@ -259,6 +262,19 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *); extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) == + EXCHGID4_FLAG_USE_PNFS_DS; +} + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS; +} #else /* CONFIG_NFS_v4_1 */ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) { @@ -276,6 +292,18 @@ static inline int nfs4_init_session(struct nfs_server *server) { return 0; } + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return false; +} + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return false; +} #endif /* CONFIG_NFS_V4_1 */ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 23f930c..4285584 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -40,32 +40,309 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); MODULE_DESCRIPTION("The NFSv4 file layout driver"); -static int -filelayout_set_layoutdriver(struct nfs_server *nfss) +#define FILELAYOUT_POLL_RETRY_MAX (15*HZ) + +static loff_t +filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, + loff_t offset) { - int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, - nfs4_fl_free_deviceid_callback); - if (status) { - printk(KERN_WARNING "%s: deviceid cache could not be " - "initialized\n", __func__); - return status; + u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; + u64 tmp; + + offset -= flseg->pattern_offset; + tmp = offset; + do_div(tmp, stripe_width); + + return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit); +} + +/* This function is used by the layout driver to calculate the + * offset of the file on the dserver based on whether the + * layout type is STRIPE_DENSE or STRIPE_SPARSE + */ +static loff_t +filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + switch (flseg->stripe_type) { + case STRIPE_SPARSE: + return offset; + + case STRIPE_DENSE: + return filelayout_get_dense_offset(flseg, offset); } - dprintk("%s: deviceid cache has been initialized successfully\n", - __func__); + + BUG(); +} + +/* For data server errors we don't recover from */ +static void +filelayout_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_RW) { + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + } else { + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + } +} + +static int filelayout_async_handle_error(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + int *reset) +{ + if (task->tk_status >= 0) + return 0; + + *reset = 0; + + switch (task->tk_status) { + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); + nfs4_schedule_session_recovery(clp->cl_session); + break; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + case -EKEYEXPIRED: + rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); + break; + default: + dprintk("%s DS error. Retry through MDS %d\n", __func__, + task->tk_status); + *reset = 1; + break; + } + task->tk_status = 0; + return -EAGAIN; +} + +/* NFS_PROTO call done callback routines */ + +static int filelayout_read_done_cb(struct rpc_task *task, + struct nfs_read_data *data) +{ + struct nfs_client *clp = data->ds_clp; + int reset = 0; + + dprintk("%s DS read\n", __func__); + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + filelayout_set_lo_fail(data->lseg); + nfs4_reset_read(task, data); + clp = NFS_SERVER(data->inode)->nfs_client; + } + nfs_restart_rpc(task, clp); + return -EAGAIN; + } + return 0; } -/* Clear out the layout by destroying its device list */ -static int -filelayout_clear_layoutdriver(struct nfs_server *nfss) +/* + * Call ops for the async read/write cases + * In the case of dense layouts, the offset needs to be reset to its + * original value. + */ +static void filelayout_read_prepare(struct rpc_task *task, void *data) { - dprintk("--> %s\n", __func__); + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + rdata->read_done_cb = filelayout_read_done_cb; + + if (nfs41_setup_sequence(rdata->ds_clp->cl_session, + &rdata->args.seq_args, &rdata->res.seq_res, + 0, task)) + return; + + rpc_call_start(task); +} + +static void filelayout_read_call_done(struct rpc_task *task, void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + + /* Note this may cause RPC to be resent */ + rdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_read_release(void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + rdata->mds_ops->rpc_release(data); +} + +static int filelayout_write_done_cb(struct rpc_task *task, + struct nfs_write_data *data) +{ + int reset = 0; + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + struct nfs_client *clp; + + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + filelayout_set_lo_fail(data->lseg); + nfs4_reset_write(task, data); + clp = NFS_SERVER(data->inode)->nfs_client; + } else + clp = data->ds_clp; + nfs_restart_rpc(task, clp); + return -EAGAIN; + } - if (nfss->nfs_client->cl_devid_cache) - pnfs_put_deviceid_cache(nfss->nfs_client); return 0; } +static void filelayout_write_prepare(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + if (nfs41_setup_sequence(wdata->ds_clp->cl_session, + &wdata->args.seq_args, &wdata->res.seq_res, + 0, task)) + return; + + rpc_call_start(task); +} + +static void filelayout_write_call_done(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + /* Note this may cause RPC to be resent */ + wdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_write_release(void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + wdata->mds_ops->rpc_release(data); +} + +struct rpc_call_ops filelayout_read_call_ops = { + .rpc_call_prepare = filelayout_read_prepare, + .rpc_call_done = filelayout_read_call_done, + .rpc_release = filelayout_read_release, +}; + +struct rpc_call_ops filelayout_write_call_ops = { + .rpc_call_prepare = filelayout_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_release = filelayout_write_release, +}; + +static enum pnfs_try_status +filelayout_read_pagelist(struct nfs_read_data *data) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + loff_t offset = data->args.offset; + u32 j, idx; + struct nfs_fh *fh; + int status; + + dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", + __func__, data->inode->i_ino, + data->args.pgbase, (size_t)data->args.count, offset); + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + /* Either layout fh index faulty, or ds connect failed */ + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + return PNFS_NOT_ATTEMPTED; + } + dprintk("%s USE DS:ip %x %hu\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + + /* No multipath support. Use first DS */ + data->ds_clp = ds->ds_clp; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + data->args.fh = fh; + + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + data->mds_offset = offset; + + /* Perform an asynchronous read to ds */ + status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, + &filelayout_read_call_ops); + BUG_ON(status != 0); + return PNFS_ATTEMPTED; +} + +/* Perform async writes. */ +static enum pnfs_try_status +filelayout_write_pagelist(struct nfs_write_data *data, int sync) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + loff_t offset = data->args.offset; + u32 j, idx; + struct nfs_fh *fh; + int status; + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + return PNFS_NOT_ATTEMPTED; + } + dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, + data->inode->i_ino, sync, (size_t) data->args.count, offset, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + + /* We can't handle commit to ds yet */ + if (!FILELAYOUT_LSEG(lseg)->commit_through_mds) + data->args.stable = NFS_FILE_SYNC; + + data->write_done_cb = filelayout_write_done_cb; + data->ds_clp = ds->ds_clp; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + data->args.fh = fh; + /* + * Get the file offset on the dserver. Set the write offset to + * this offset and save the original offset. + */ + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + data->mds_offset = offset; + + /* Perform an asynchronous write */ + status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, + &filelayout_write_call_ops, sync); + BUG_ON(status != 0); + return PNFS_ATTEMPTED; +} + /* * filelayout_check_layout() * @@ -92,14 +369,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out; } - if (fl->stripe_unit % PAGE_SIZE) { - dprintk("%s Stripe unit (%u) not page aligned\n", + if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) { + dprintk("%s Invalid stripe unit (%u)\n", __func__, fl->stripe_unit); goto out; } /* find and reference the deviceid */ - dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); + dsaddr = nfs4_fl_find_get_deviceid(id); if (dsaddr == NULL) { dsaddr = get_device_info(lo->plh_inode, id); if (dsaddr == NULL) @@ -134,7 +411,7 @@ out: dprintk("--> %s returns %d\n", __func__, status); return status; out_put: - pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); + nfs4_fl_put_deviceid(dsaddr); goto out; } @@ -243,23 +520,47 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, static void filelayout_free_lseg(struct pnfs_layout_segment *lseg) { - struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode); struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); dprintk("--> %s\n", __func__); - pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, - &fl->dsaddr->deviceid); + nfs4_fl_put_deviceid(fl->dsaddr); _filelayout_free_lseg(fl); } +/* + * filelayout_pg_test(). Called by nfs_can_coalesce_requests() + * + * return 1 : coalesce page + * return 0 : don't coalesce page + */ +int +filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + u64 p_stripe, r_stripe; + u32 stripe_unit; + + if (!pgio->pg_lseg) + return 1; + p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; + r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; + stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + return (p_stripe == r_stripe); +} + static struct pnfs_layoutdriver_type filelayout_type = { - .id = LAYOUT_NFSV4_1_FILES, - .name = "LAYOUT_NFSV4_1_FILES", - .owner = THIS_MODULE, - .set_layoutdriver = filelayout_set_layoutdriver, - .clear_layoutdriver = filelayout_clear_layoutdriver, - .alloc_lseg = filelayout_alloc_lseg, - .free_lseg = filelayout_free_lseg, + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .owner = THIS_MODULE, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, + .pg_test = filelayout_pg_test, + .read_pagelist = filelayout_read_pagelist, + .write_pagelist = filelayout_write_pagelist, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index bbf60dd..ee0c907 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -55,8 +55,14 @@ struct nfs4_pnfs_ds { atomic_t ds_count; }; +/* nfs4_file_layout_dsaddr flags */ +#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001 + struct nfs4_file_layout_dsaddr { - struct pnfs_deviceid_node deviceid; + struct hlist_node node; + struct nfs4_deviceid deviceid; + atomic_t ref; + unsigned long flags; u32 stripe_count; u8 *stripe_indices; u32 ds_num; @@ -83,11 +89,18 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) generic_hdr); } -extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); +extern struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); + extern void print_ds(struct nfs4_pnfs_ds *ds); extern void print_deviceid(struct nfs4_deviceid *dev_id); +u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); +u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); +struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, + u32 ds_idx); extern struct nfs4_file_layout_dsaddr * -nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); +nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id); +extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); struct nfs4_file_layout_dsaddr * get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index b73c343..68143c1 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -37,6 +37,30 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD /* + * Device ID RCU cache. A device ID is unique per client ID and layout type. + */ +#define NFS4_FL_DEVICE_ID_HASH_BITS 5 +#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS) +#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1) + +static inline u32 +nfs4_fl_deviceid_hash(struct nfs4_deviceid *id) +{ + unsigned char *cptr = (unsigned char *)id->data; + unsigned int nbytes = NFS4_DEVICEID4_SIZE; + u32 x = 0; + + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x & NFS4_FL_DEVICE_ID_HASH_MASK; +} + +static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE]; +static DEFINE_SPINLOCK(filelayout_deviceid_lock); + +/* * Data server cache * * Data servers can be mapped to different device ids. @@ -104,6 +128,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port) return NULL; } +/* + * Create an rpc connection to the nfs4_pnfs_ds data server + * Currently only support IPv4 + */ +static int +nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) +{ + struct nfs_client *clp; + struct sockaddr_in sin; + int status = 0; + + dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ds->ds_ip_addr; + sin.sin_port = ds->ds_port; + + clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin, + sizeof(sin), IPPROTO_TCP); + if (IS_ERR(clp)) { + status = PTR_ERR(clp); + goto out; + } + + if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) { + if (!is_ds_client(clp)) { + status = -ENODEV; + goto out_put; + } + ds->ds_clp = clp; + dprintk("%s [existing] ip=%x, port=%hu\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + goto out; + } + + /* + * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to + * be equal to the MDS lease. Renewal is scheduled in create_session. + */ + spin_lock(&mds_srv->nfs_client->cl_lock); + clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; + spin_unlock(&mds_srv->nfs_client->cl_lock); + clp->cl_last_renewal = jiffies; + + /* New nfs_client */ + status = nfs4_init_ds_session(clp); + if (status) + goto out_put; + + ds->ds_clp = clp; + dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr), + ntohs(ds->ds_port)); +out: + return status; +out_put: + nfs_put_client(clp); + goto out; +} + static void destroy_ds(struct nfs4_pnfs_ds *ds) { @@ -122,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) struct nfs4_pnfs_ds *ds; int i; - print_deviceid(&dsaddr->deviceid.de_id); + print_deviceid(&dsaddr->deviceid); for (i = 0; i < dsaddr->ds_num; i++) { ds = dsaddr->ds_list[i]; @@ -139,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) kfree(dsaddr); } -void -nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device) -{ - struct nfs4_file_layout_dsaddr *dsaddr = - container_of(device, struct nfs4_file_layout_dsaddr, deviceid); - - nfs4_fl_free_deviceid(dsaddr); -} - static struct nfs4_pnfs_ds * nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) { @@ -300,7 +376,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev) dsaddr->stripe_count = cnt; dsaddr->ds_num = num; - memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); + memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id)); /* Go back an read stripe indices */ p = indicesp; @@ -350,28 +426,37 @@ out_err: } /* - * Decode the opaque device specified in 'dev' - * and add it to the list of available devices. - * If the deviceid is already cached, nfs4_add_deviceid will return - * a pointer to the cached struct and throw away the new. + * Decode the opaque device specified in 'dev' and add it to the cache of + * available devices. */ -static struct nfs4_file_layout_dsaddr* +static struct nfs4_file_layout_dsaddr * decode_and_add_device(struct inode *inode, struct pnfs_device *dev) { - struct nfs4_file_layout_dsaddr *dsaddr; - struct pnfs_deviceid_node *d; + struct nfs4_file_layout_dsaddr *d, *new; + long hash; - dsaddr = decode_device(inode, dev); - if (!dsaddr) { + new = decode_device(inode, dev); + if (!new) { printk(KERN_WARNING "%s: Could not decode or add device\n", __func__); return NULL; } - d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, - &dsaddr->deviceid); + spin_lock(&filelayout_deviceid_lock); + d = nfs4_fl_find_get_deviceid(&new->deviceid); + if (d) { + spin_unlock(&filelayout_deviceid_lock); + nfs4_fl_free_deviceid(new); + return d; + } + + INIT_HLIST_NODE(&new->node); + atomic_set(&new->ref, 1); + hash = nfs4_fl_deviceid_hash(&new->deviceid); + hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]); + spin_unlock(&filelayout_deviceid_lock); - return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); + return new; } /* @@ -446,12 +531,123 @@ out_free: return dsaddr; } +void +nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) { + hlist_del_rcu(&dsaddr->node); + spin_unlock(&filelayout_deviceid_lock); + + synchronize_rcu(); + nfs4_fl_free_deviceid(dsaddr); + } +} + struct nfs4_file_layout_dsaddr * -nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) +nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id) +{ + struct nfs4_file_layout_dsaddr *d; + struct hlist_node *n; + long hash = nfs4_fl_deviceid_hash(id); + + + rcu_read_lock(); + hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) { + if (!memcmp(&d->deviceid, id, sizeof(*id))) { + if (!atomic_inc_not_zero(&d->ref)) + goto fail; + rcu_read_unlock(); + return d; + } + } +fail: + rcu_read_unlock(); + return NULL; +} + +/* + * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit + * Then: ((res + fsi) % dsaddr->stripe_count) + */ +u32 +nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u64 tmp; + + tmp = offset - flseg->pattern_offset; + do_div(tmp, flseg->stripe_unit); + tmp += flseg->first_stripe_index; + return do_div(tmp, flseg->dsaddr->stripe_count); +} + +u32 +nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j) +{ + return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j]; +} + +struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u32 i; + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + else + i = nfs4_fl_calc_ds_index(lseg, j); + } else + i = j; + return flseg->fh_array[i]; +} + +static void +filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, + int err, u32 ds_addr) +{ + u32 *p = (u32 *)&dsaddr->deviceid; + + printk(KERN_ERR "NFS: data server %x connection error %d." + " Deviceid [%x%x%x%x] marked out of use.\n", + ds_addr, err, p[0], p[1], p[2], p[3]); + + spin_lock(&filelayout_deviceid_lock); + dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; + spin_unlock(&filelayout_deviceid_lock); +} + +struct nfs4_pnfs_ds * +nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) { - struct pnfs_deviceid_node *d; + struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; + struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; - d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); - return (d == NULL) ? NULL : - container_of(d, struct nfs4_file_layout_dsaddr, deviceid); + if (ds == NULL) { + printk(KERN_ERR "%s: No data server for offset index %d\n", + __func__, ds_idx); + return NULL; + } + + if (!ds->ds_clp) { + struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); + int err; + + if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) { + /* Already tried to connect, don't try again */ + dprintk("%s Deviceid marked out of use\n", __func__); + return NULL; + } + err = nfs4_ds_connect(s, ds); + if (err) { + filelayout_mark_devid_negative(dsaddr, err, + ntohl(ds->ds_ip_addr)); + return NULL; + } + } + return ds; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0a07e35..1d84e70 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -85,6 +85,9 @@ static int nfs4_map_errors(int err) switch (err) { case -NFS4ERR_RESOURCE: return -EREMOTEIO; + case -NFS4ERR_BADOWNER: + case -NFS4ERR_BADNAME: + return -EINVAL; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -241,7 +244,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) /* This is the error handling routine for processes that are allowed * to sleep. */ -static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) +static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; @@ -293,6 +296,19 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, break; case -NFS4ERR_OLD_STATEID: exception->retry = 1; + break; + case -NFS4ERR_BADOWNER: + /* The following works around a Linux server bug! */ + case -NFS4ERR_BADNAME: + if (server->caps & NFS_CAP_UIDGID_NOMAP) { + server->caps &= ~NFS_CAP_UIDGID_NOMAP; + exception->retry = 1; + printk(KERN_WARNING "NFS: v4 server %s " + "does not accept raw " + "uid/gids. " + "Reenabling the idmapper.\n", + server->nfs_client->cl_hostname); + } } /* We failed to handle the error */ return nfs4_map_errors(ret); @@ -505,7 +521,7 @@ out: return ret_id; } -static int nfs41_setup_sequence(struct nfs4_session *session, +int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, @@ -571,6 +587,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session, res->sr_status = 1; return 0; } +EXPORT_SYMBOL_GPL(nfs41_setup_sequence); int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, @@ -1573,9 +1590,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return 0; } -static int nfs4_recover_expired_lease(struct nfs_server *server) +static int nfs4_client_recover_expired_lease(struct nfs_client *clp) { - struct nfs_client *clp = server->nfs_client; unsigned int loop; int ret; @@ -1592,6 +1608,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server) return ret; } +static int nfs4_recover_expired_lease(struct nfs_server *server) +{ + return nfs4_client_recover_expired_lease(server->nfs_client); +} + /* * OPEN_EXPIRED: * reclaim state on the server after a network partition. @@ -3069,15 +3090,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return err; } -static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { struct nfs_server *server = NFS_SERVER(data->inode); - dprintk("--> %s\n", __func__); - - if (!nfs4_sequence_done(task, &data->res.seq_res)) - return -EAGAIN; - if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { nfs_restart_rpc(task, server->nfs_client); return -EAGAIN; @@ -3089,19 +3105,44 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) return 0; } +static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +{ + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + + return data->read_done_cb(task, data); +} + static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) { data->timestamp = jiffies; + data->read_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; } -static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +/* Reset the the nfs_read_data to send the read to the MDS. */ +void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data) +{ + dprintk("%s Reset task for i/o through\n", __func__); + put_lseg(data->lseg); + data->lseg = NULL; + /* offsets will differ in the dense stripe case */ + data->args.offset = data->mds_offset; + data->ds_clp = NULL; + data->args.fh = NFS_FH(data->inode); + data->read_done_cb = nfs4_read_done_cb; + task->tk_ops = data->mds_ops; + rpc_task_reset_client(task, NFS_CLIENT(data->inode)); +} +EXPORT_SYMBOL_GPL(nfs4_reset_read); + +static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - if (!nfs4_sequence_done(task, &data->res.seq_res)) - return -EAGAIN; - if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); return -EAGAIN; @@ -3113,11 +3154,41 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } +static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + return data->write_done_cb(task, data); +} + +/* Reset the the nfs_write_data to send the write to the MDS. */ +void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data) +{ + dprintk("%s Reset task for i/o through\n", __func__); + put_lseg(data->lseg); + data->lseg = NULL; + data->ds_clp = NULL; + data->write_done_cb = nfs4_write_done_cb; + data->args.fh = NFS_FH(data->inode); + data->args.bitmask = data->res.server->cache_consistency_bitmask; + data->args.offset = data->mds_offset; + data->res.fattr = &data->fattr; + task->tk_ops = data->mds_ops; + rpc_task_reset_client(task, NFS_CLIENT(data->inode)); +} +EXPORT_SYMBOL_GPL(nfs4_reset_write); + static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->cache_consistency_bitmask; + if (data->lseg) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } else + data->args.bitmask = server->cache_consistency_bitmask; + if (!data->write_done_cb) + data->write_done_cb = nfs4_write_done_cb; data->res.server = server; data->timestamp = jiffies; @@ -5118,6 +5189,27 @@ int nfs4_init_session(struct nfs_server *server) return ret; } +int nfs4_init_ds_session(struct nfs_client *clp) +{ + struct nfs4_session *session = clp->cl_session; + int ret; + + if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) + return 0; + + ret = nfs4_client_recover_expired_lease(clp); + if (!ret) + /* Test for the DS role */ + if (!is_ds_client(clp)) + ret = -ENODEV; + if (!ret) + ret = nfs4_check_client_ready(clp); + return ret; + +} +EXPORT_SYMBOL_GPL(nfs4_init_ds_session); + + /* * Renew the cl_session lease. */ @@ -5648,6 +5740,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .clear_acl_cache = nfs4_zap_acl_attr, .close_context = nfs4_close_context, .open_context = nfs4_atomic_open, + .init_client = nfs4_init_client, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 402143d..df8e7f3 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work) ops = clp->cl_mvops->state_renewal_ops; dprintk("%s: start\n", __func__); - rcu_read_lock(); - if (list_empty(&clp->cl_superblocks)) { - rcu_read_unlock(); + if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state)) goto out; - } - rcu_read_unlock(); spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0592288..ab1bf5b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) int status; struct nfs_fsinfo fsinfo; + if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { + nfs4_schedule_state_renewal(clp); + return 0; + } + status = nfs4_proc_get_lease_time(clp, &fsinfo); if (status == 0) { /* Update lease time and schedule renewal */ @@ -1448,6 +1453,7 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session) { nfs4_schedule_lease_recovery(session->clp); } +EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); void nfs41_handle_recall_slot(struct nfs_client *clp) { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 94d50e8..0cf560f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -844,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_MODE) len += 4; if (iap->ia_valid & ATTR_UID) { - owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); + owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", iap->ia_uid); @@ -856,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } if (iap->ia_valid & ATTR_GID) { - owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); + owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", iap->ia_gid); @@ -1384,7 +1384,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) hdr->replen += decode_putrootfh_maxsz; } -static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) +static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid) { nfs4_stateid stateid; __be32 *p; @@ -1392,6 +1392,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); + if (zero_seqid) + stateid.stateid.seqid = 0; xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); } else xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); @@ -1404,7 +1406,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_READ); - encode_stateid(xdr, args->context, args->lock_context); + encode_stateid(xdr, args->context, args->lock_context, + hdr->minorversion); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); @@ -1592,7 +1595,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_WRITE); - encode_stateid(xdr, args->context, args->lock_context); + encode_stateid(xdr, args->context, args->lock_context, + hdr->minorversion); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); @@ -2271,7 +2275,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_write(xdr, args, &hdr); req->rq_snd_buf.flags |= XDRBUF_WRITE; - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -3382,7 +3387,7 @@ out_overflow: } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, - struct nfs_client *clp, uint32_t *uid, int may_sleep) + const struct nfs_server *server, uint32_t *uid, int may_sleep) { uint32_t len; __be32 *p; @@ -3402,7 +3407,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, if (!may_sleep) { /* do nothing */ } else if (len < XDR_MAX_NETOBJ) { - if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) + if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) ret = NFS_ATTR_FATTR_OWNER; else dprintk("%s: nfs_map_name_to_uid failed!\n", @@ -3420,7 +3425,7 @@ out_overflow: } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, - struct nfs_client *clp, uint32_t *gid, int may_sleep) + const struct nfs_server *server, uint32_t *gid, int may_sleep) { uint32_t len; __be32 *p; @@ -3440,7 +3445,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, if (!may_sleep) { /* do nothing */ } else if (len < XDR_MAX_NETOBJ) { - if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) + if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) ret = NFS_ATTR_FATTR_GROUP; else dprintk("%s: nfs_map_group_to_gid failed!\n", @@ -3939,14 +3944,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; - status = decode_attr_owner(xdr, bitmap, server->nfs_client, - &fattr->uid, may_sleep); + status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; - status = decode_attr_group(xdr, bitmap, server->nfs_client, - &fattr->gid, may_sleep); + status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; @@ -5690,8 +5693,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_write(xdr, res); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); if (!status) status = res->count; out: @@ -6167,8 +6171,6 @@ static struct { { NFS4ERR_DQUOT, -EDQUOT }, { NFS4ERR_STALE, -ESTALE }, { NFS4ERR_BADHANDLE, -EBADHANDLE }, - { NFS4ERR_BADOWNER, -EINVAL }, - { NFS4ERR_BADNAME, -EINVAL }, { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, { NFS4ERR_NOTSUPP, -ENOTSUPP }, { NFS4ERR_TOOSMALL, -ETOOSMALL }, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e1164e3..23e7944 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -20,6 +20,7 @@ #include <linux/nfs_mount.h> #include "internal.h" +#include "pnfs.h" static struct kmem_cache *nfs_page_cachep; @@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req) */ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, - int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), + int (*doio)(struct nfs_pageio_descriptor *), size_t bsize, int io_flags) { @@ -226,6 +227,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_doio = doio; desc->pg_ioflags = io_flags; desc->pg_error = 0; + desc->pg_lseg = NULL; } /** @@ -240,7 +242,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, * Return 'true' if this is the case, else return 'false'. */ static int nfs_can_coalesce_requests(struct nfs_page *prev, - struct nfs_page *req) + struct nfs_page *req, + struct nfs_pageio_descriptor *pgio) { if (req->wb_context->cred != prev->wb_context->cred) return 0; @@ -254,6 +257,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev, return 0; if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) return 0; + /* + * Non-whole file layouts need to check that req is inside of + * pgio->pg_lseg. + */ + if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) + return 0; return 1; } @@ -286,7 +295,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, if (newlen > desc->pg_bsize) return 0; prev = nfs_list_entry(desc->pg_list.prev); - if (!nfs_can_coalesce_requests(prev, req)) + if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; } else desc->pg_base = req->wb_pgbase; @@ -302,12 +311,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { if (!list_empty(&desc->pg_list)) { - int error = desc->pg_doio(desc->pg_inode, - &desc->pg_list, - nfs_page_array_len(desc->pg_base, - desc->pg_count), - desc->pg_count, - desc->pg_ioflags); + int error = desc->pg_doio(desc); if (error < 0) desc->pg_error = error; else diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 1b1bc1a..f38813a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -30,6 +30,7 @@ #include <linux/nfs_fs.h> #include "internal.h" #include "pnfs.h" +#include "iostat.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -74,10 +75,8 @@ find_pnfs_driver(u32 id) void unset_pnfs_layoutdriver(struct nfs_server *nfss) { - if (nfss->pnfs_curr_ld) { - nfss->pnfs_curr_ld->clear_layoutdriver(nfss); + if (nfss->pnfs_curr_ld) module_put(nfss->pnfs_curr_ld->owner); - } nfss->pnfs_curr_ld = NULL; } @@ -115,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id) goto out_no_driver; } server->pnfs_curr_ld = ld_type; - if (ld_type->set_layoutdriver(server)) { - printk(KERN_ERR - "%s: Error initializing mount point for layout driver %u.\n", - __func__, id); - module_put(ld_type->owner); - goto out_no_driver; - } + dprintk("%s: pNFS module for %u set\n", __func__, id); return; @@ -230,37 +223,41 @@ static void free_lseg(struct pnfs_layout_segment *lseg) put_layout_hdr(NFS_I(ino)->layout); } -/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg - * could sleep, so must be called outside of the lock. - * Returns 1 if object was removed, otherwise return 0. - */ -static int -put_lseg_locked(struct pnfs_layout_segment *lseg, - struct list_head *tmp_list) +static void +put_lseg_common(struct pnfs_layout_segment *lseg) +{ + struct inode *inode = lseg->pls_layout->plh_inode; + + BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + list_del_init(&lseg->pls_list); + if (list_empty(&lseg->pls_layout->plh_segs)) { + set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); + /* Matched by initial refcount set in alloc_init_layout_hdr */ + put_layout_hdr_locked(lseg->pls_layout); + } + rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); +} + +void +put_lseg(struct pnfs_layout_segment *lseg) { + struct inode *inode; + + if (!lseg) + return; + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - if (atomic_dec_and_test(&lseg->pls_refcount)) { - struct inode *ino = lseg->pls_layout->plh_inode; + inode = lseg->pls_layout->plh_inode; + if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + LIST_HEAD(free_me); - BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - list_del(&lseg->pls_list); - if (list_empty(&lseg->pls_layout->plh_segs)) { - struct nfs_client *clp; - - clp = NFS_SERVER(ino)->nfs_client; - spin_lock(&clp->cl_lock); - /* List does not take a reference, so no need for put here */ - list_del_init(&lseg->pls_layout->plh_layouts); - spin_unlock(&clp->cl_lock); - clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags); - } - rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); - list_add(&lseg->pls_list, tmp_list); - return 1; + put_lseg_common(lseg); + list_add(&lseg->pls_list, &free_me); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&free_me); } - return 0; } static bool @@ -281,7 +278,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, * list. It will now be removed when all * outstanding io is finished. */ - rv = put_lseg_locked(lseg, tmp_list); + dprintk("%s: lseg %p ref %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount)); + if (atomic_dec_and_test(&lseg->pls_refcount)) { + put_lseg_common(lseg); + list_add(&lseg->pls_list, tmp_list); + rv = 1; + } } return rv; } @@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, dprintk("%s:Begin lo %p\n", __func__, lo); + if (list_empty(&lo->plh_segs)) { + if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) + put_layout_hdr_locked(lo); + return 0; + } list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (should_free_lseg(lseg->pls_range.iomode, iomode)) { dprintk("%s: freeing lseg %p iomode %d " @@ -312,11 +320,27 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return invalid - removed; } +/* note free_me must contain lsegs from a single layout_hdr */ void pnfs_free_lseg_list(struct list_head *free_me) { struct pnfs_layout_segment *lseg, *tmp; + struct pnfs_layout_hdr *lo; + + if (list_empty(free_me)) + return; + lo = list_first_entry(free_me, struct pnfs_layout_segment, + pls_list)->pls_layout; + + if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) { + struct nfs_client *clp; + + clp = NFS_SERVER(lo->plh_inode)->nfs_client; + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + } list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { list_del(&lseg->pls_list); free_lseg(lseg); @@ -332,10 +356,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) spin_lock(&nfsi->vfs_inode.i_lock); lo = nfsi->layout; if (lo) { - set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); - /* Matched by refcount set to 1 in alloc_init_layout_hdr */ - put_layout_hdr_locked(lo); } spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); @@ -403,6 +425,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) return true; return lo->plh_block_lgets || + test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || (list_empty(&lo->plh_segs) && (atomic_read(&lo->plh_outstanding) > lget)); @@ -674,7 +697,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && is_matching_lseg(lseg, iomode)) { - ret = lseg; + ret = get_lseg(lseg); break; } if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) @@ -699,6 +722,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg = NULL; + bool first = false; if (!pnfs_enabled_sb(NFS_SERVER(ino))) return NULL; @@ -715,21 +739,25 @@ pnfs_update_layout(struct inode *ino, dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; } - /* Check to see if the layout for the given range already exists */ - lseg = pnfs_find_lseg(lo, iomode); - if (lseg) - goto out_unlock; /* if LAYOUTGET already failed once we don't try again */ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) goto out_unlock; + /* Check to see if the layout for the given range already exists */ + lseg = pnfs_find_lseg(lo, iomode); + if (lseg) + goto out_unlock; + if (pnfs_layoutgets_blocked(lo, NULL, 0)) goto out_unlock; atomic_inc(&lo->plh_outstanding); get_layout_hdr(lo); - if (list_empty(&lo->plh_segs)) { + if (list_empty(&lo->plh_segs)) + first = true; + spin_unlock(&ino->i_lock); + if (first) { /* The lo must be on the clp list if there is any * chance of a CB_LAYOUTRECALL(FILE) coming in. */ @@ -738,24 +766,18 @@ pnfs_update_layout(struct inode *ino, list_add_tail(&lo->plh_layouts, &clp->cl_layouts); spin_unlock(&clp->cl_lock); } - spin_unlock(&ino->i_lock); lseg = send_layoutget(lo, ctx, iomode); - if (!lseg) { - spin_lock(&ino->i_lock); - if (list_empty(&lo->plh_segs)) { - spin_lock(&clp->cl_lock); - list_del_init(&lo->plh_layouts); - spin_unlock(&clp->cl_lock); - clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - } - spin_unlock(&ino->i_lock); + if (!lseg && first) { + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); } atomic_dec(&lo->plh_outstanding); put_layout_hdr(lo); out: dprintk("%s end, state 0x%lx lseg %p\n", __func__, - nfsi->layout->plh_flags, lseg); + nfsi->layout ? nfsi->layout->plh_flags : -1, lseg); return lseg; out_unlock: spin_unlock(&ino->i_lock); @@ -808,7 +830,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } init_lseg(lo, lseg); lseg->pls_range = res->range; - *lgp->lsegpp = lseg; + *lgp->lsegpp = get_lseg(lseg); pnfs_insert_layout(lo, lseg); if (res->return_on_close) { @@ -829,137 +851,97 @@ out_forget_reply: goto out; } -/* - * Device ID cache. Currently supports one layout type per struct nfs_client. - * Add layout type to the lookup key to expand to support multiple types. - */ -int -pnfs_alloc_init_deviceid_cache(struct nfs_client *clp, - void (*free_callback)(struct pnfs_deviceid_node *)) +static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, + struct nfs_page *req) { - struct pnfs_deviceid_cache *c; - - c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); - if (!c) - return -ENOMEM; - spin_lock(&clp->cl_lock); - if (clp->cl_devid_cache != NULL) { - atomic_inc(&clp->cl_devid_cache->dc_ref); - dprintk("%s [kref [%d]]\n", __func__, - atomic_read(&clp->cl_devid_cache->dc_ref)); - kfree(c); - } else { - /* kzalloc initializes hlists */ - spin_lock_init(&c->dc_lock); - atomic_set(&c->dc_ref, 1); - c->dc_free_callback = free_callback; - clp->cl_devid_cache = c; - dprintk("%s [new]\n", __func__); + if (pgio->pg_count == prev->wb_bytes) { + /* This is first coelesce call for a series of nfs_pages */ + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + prev->wb_context, + IOMODE_READ); } - spin_unlock(&clp->cl_lock); - return 0; + return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req); } -EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache); -/* - * Called from pnfs_layoutdriver_type->free_lseg - * last layout segment reference frees deviceid - */ void -pnfs_put_deviceid(struct pnfs_deviceid_cache *c, - struct pnfs_deviceid_node *devid) +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) { - struct nfs4_deviceid *id = &devid->de_id; - struct pnfs_deviceid_node *d; - struct hlist_node *n; - long h = nfs4_deviceid_hash(id); + struct pnfs_layoutdriver_type *ld; - dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); - if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) - return; + ld = NFS_SERVER(inode)->pnfs_curr_ld; + pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL; +} - hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) - if (!memcmp(&d->de_id, id, sizeof(*id))) { - hlist_del_rcu(&d->de_node); - spin_unlock(&c->dc_lock); - synchronize_rcu(); - c->dc_free_callback(devid); - return; - } - spin_unlock(&c->dc_lock); - /* Why wasn't it found in the list? */ - BUG(); -} -EXPORT_SYMBOL_GPL(pnfs_put_deviceid); - -/* Find and reference a deviceid */ -struct pnfs_deviceid_node * -pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id) -{ - struct pnfs_deviceid_node *d; - struct hlist_node *n; - long hash = nfs4_deviceid_hash(id); - - dprintk("--> %s hash %ld\n", __func__, hash); - rcu_read_lock(); - hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { - if (!memcmp(&d->de_id, id, sizeof(*id))) { - if (!atomic_inc_not_zero(&d->de_ref)) { - goto fail; - } else { - rcu_read_unlock(); - return d; - } - } +static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, + struct nfs_page *req) +{ + if (pgio->pg_count == prev->wb_bytes) { + /* This is first coelesce call for a series of nfs_pages */ + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + prev->wb_context, + IOMODE_RW); } -fail: - rcu_read_unlock(); - return NULL; + return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req); +} + +void +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode) +{ + struct pnfs_layoutdriver_type *ld; + + ld = NFS_SERVER(inode)->pnfs_curr_ld; + pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL; +} + +enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_write_data *wdata, + const struct rpc_call_ops *call_ops, int how) +{ + struct inode *inode = wdata->inode; + enum pnfs_try_status trypnfs; + struct nfs_server *nfss = NFS_SERVER(inode); + + wdata->mds_ops = call_ops; + + dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, + inode->i_ino, wdata->args.count, wdata->args.offset, how); + + trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) { + put_lseg(wdata->lseg); + wdata->lseg = NULL; + } else + nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); + + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; } -EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid); /* - * Add a deviceid to the cache. - * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new + * Call the appropriate parallel I/O subsystem read function. */ -struct pnfs_deviceid_node * -pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) -{ - struct pnfs_deviceid_node *d; - long hash = nfs4_deviceid_hash(&new->de_id); - - dprintk("--> %s hash %ld\n", __func__, hash); - spin_lock(&c->dc_lock); - d = pnfs_find_get_deviceid(c, &new->de_id); - if (d) { - spin_unlock(&c->dc_lock); - dprintk("%s [discard]\n", __func__); - c->dc_free_callback(new); - return d; - } - INIT_HLIST_NODE(&new->de_node); - atomic_set(&new->de_ref, 1); - hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); - spin_unlock(&c->dc_lock); - dprintk("%s [new]\n", __func__); - return new; -} -EXPORT_SYMBOL_GPL(pnfs_add_deviceid); - -void -pnfs_put_deviceid_cache(struct nfs_client *clp) +enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_read_data *rdata, + const struct rpc_call_ops *call_ops) { - struct pnfs_deviceid_cache *local = clp->cl_devid_cache; + struct inode *inode = rdata->inode; + struct nfs_server *nfss = NFS_SERVER(inode); + enum pnfs_try_status trypnfs; - dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref)); - if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { - int i; - /* Verify cache is empty */ - for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) - BUG_ON(!hlist_empty(&local->dc_deviceids[i])); - clp->cl_devid_cache = NULL; - spin_unlock(&clp->cl_lock); - kfree(local); + rdata->mds_ops = call_ops; + + dprintk("%s: Reading ino:%lu %u@%llu\n", + __func__, inode->i_ino, rdata->args.count, rdata->args.offset); + + trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); + if (trypnfs == PNFS_NOT_ATTEMPTED) { + put_lseg(rdata->lseg); + rdata->lseg = NULL; + } else { + nfs_inc_stats(inode, NFSIOS_PNFS_READ); } + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; } -EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index e2612ea..6380b94 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -30,6 +30,8 @@ #ifndef FS_NFS_PNFS_H #define FS_NFS_PNFS_H +#include <linux/nfs_page.h> + enum { NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ NFS_LSEG_ROC, /* roc bit received from server */ @@ -43,6 +45,11 @@ struct pnfs_layout_segment { struct pnfs_layout_hdr *pls_layout; }; +enum pnfs_try_status { + PNFS_ATTEMPTED = 0, + PNFS_NOT_ATTEMPTED = 1, +}; + #ifdef CONFIG_NFS_V4_1 #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" @@ -61,10 +68,18 @@ struct pnfs_layoutdriver_type { const u32 id; const char *name; struct module *owner; - int (*set_layoutdriver) (struct nfs_server *); - int (*clear_layoutdriver) (struct nfs_server *); struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); void (*free_lseg) (struct pnfs_layout_segment *lseg); + + /* test for nfs page cache coalescing */ + int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + + /* + * Return PNFS_ATTEMPTED to indicate the layout code has attempted + * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS + */ + enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); + enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); }; struct pnfs_layout_hdr { @@ -90,52 +105,6 @@ struct pnfs_device { unsigned int pglen; }; -/* - * Device ID RCU cache. A device ID is unique per client ID and layout type. - */ -#define NFS4_DEVICE_ID_HASH_BITS 5 -#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) -#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) - -static inline u32 -nfs4_deviceid_hash(struct nfs4_deviceid *id) -{ - unsigned char *cptr = (unsigned char *)id->data; - unsigned int nbytes = NFS4_DEVICEID4_SIZE; - u32 x = 0; - - while (nbytes--) { - x *= 37; - x += *cptr++; - } - return x & NFS4_DEVICE_ID_HASH_MASK; -} - -struct pnfs_deviceid_node { - struct hlist_node de_node; - struct nfs4_deviceid de_id; - atomic_t de_ref; -}; - -struct pnfs_deviceid_cache { - spinlock_t dc_lock; - atomic_t dc_ref; - void (*dc_free_callback)(struct pnfs_deviceid_node *); - struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; -}; - -extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *, - void (*free_callback)(struct pnfs_deviceid_node *)); -extern void pnfs_put_deviceid_cache(struct nfs_client *); -extern struct pnfs_deviceid_node *pnfs_find_get_deviceid( - struct pnfs_deviceid_cache *, - struct nfs4_deviceid *); -extern struct pnfs_deviceid_node *pnfs_add_deviceid( - struct pnfs_deviceid_cache *, - struct pnfs_deviceid_node *); -extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c, - struct pnfs_deviceid_node *devid); - extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); @@ -146,11 +115,18 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); /* pnfs.c */ void get_layout_hdr(struct pnfs_layout_hdr *lo); +void put_lseg(struct pnfs_layout_segment *lseg); struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, enum pnfs_iomode access_type); void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); +enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, + const struct rpc_call_ops *, int); +enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, + const struct rpc_call_ops *); +void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); +void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); @@ -177,6 +153,16 @@ static inline int lo_fail_bit(u32 iomode) NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; } +static inline struct pnfs_layout_segment * +get_lseg(struct pnfs_layout_segment *lseg) +{ + if (lseg) { + atomic_inc(&lseg->pls_refcount); + smp_mb__after_atomic_inc(); + } + return lseg; +} + /* Return true if a layout driver is being used for this mountpoint */ static inline int pnfs_enabled_sb(struct nfs_server *nfss) { @@ -194,12 +180,36 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) } static inline struct pnfs_layout_segment * +get_lseg(struct pnfs_layout_segment *lseg) +{ + return NULL; +} + +static inline void put_lseg(struct pnfs_layout_segment *lseg) +{ +} + +static inline struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, enum pnfs_iomode access_type) { return NULL; } +static inline enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_read_data *data, + const struct rpc_call_ops *call_ops) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, int how) +{ + return PNFS_NOT_ATTEMPTED; +} + static inline bool pnfs_roc(struct inode *ino) { @@ -230,6 +240,18 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } +static inline void +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino) +{ + pgio->pg_test = NULL; +} + +static inline void +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino) +{ + pgio->pg_test = NULL; +} + #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 77d5e21..b8ec170 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, .close_context = nfs_close_context, + .init_client = nfs_init_client, }; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index aedcaa7..7cded2b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -18,19 +18,20 @@ #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> +#include <linux/module.h> #include <asm/system.h> +#include "pnfs.h" #include "nfs4_fs.h" #include "internal.h" #include "iostat.h" #include "fscache.h" -#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int); -static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int); +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc); static const struct rpc_call_ops nfs_read_partial_ops; static const struct rpc_call_ops nfs_read_full_ops; @@ -69,6 +70,7 @@ void nfs_readdata_free(struct nfs_read_data *p) static void nfs_readdata_release(struct nfs_read_data *rdata) { + put_lseg(rdata->lseg); put_nfs_open_context(rdata->args.context); nfs_readdata_free(rdata); } @@ -114,14 +116,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page) { - LIST_HEAD(one_request); struct nfs_page *new; unsigned int len; + struct nfs_pageio_descriptor pgio; len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - pnfs_update_layout(inode, ctx, IOMODE_READ); new = nfs_create_request(ctx, inode, page, 0, len); if (IS_ERR(new)) { unlock_page(page); @@ -130,11 +131,14 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - nfs_list_add_request(new, &one_request); + nfs_pageio_init(&pgio, inode, NULL, 0, 0); + nfs_list_add_request(new, &pgio.pg_list); + pgio.pg_count = len; + if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) - nfs_pagein_multi(inode, &one_request, 1, len, 0); + nfs_pagein_multi(&pgio); else - nfs_pagein_one(inode, &one_request, 1, len, 0); + nfs_pagein_one(&pgio); return 0; } @@ -155,24 +159,20 @@ static void nfs_readpage_release(struct nfs_page *req) nfs_release_request(req); } -/* - * Set up the NFS read request struct - */ -static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset) +int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = data->inode; int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = req->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { .task = &data->task, - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .rpc_message = &msg, .callback_ops = call_ops, .callback_data = data, @@ -180,9 +180,39 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, .flags = RPC_TASK_ASYNC | swap_flags, }; + /* Set up the initial task struct. */ + NFS_PROTO(inode)->read_setup(data, &msg); + + dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ " + "offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_initiate_read); + +/* + * Set up the NFS read request struct + */ +static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, + const struct rpc_call_ops *call_ops, + unsigned int count, unsigned int offset, + struct pnfs_layout_segment *lseg) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + data->req = req; data->inode = inode; - data->cred = msg.rpc_cred; + data->cred = req->wb_context->cred; + data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -197,21 +227,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->res.eof = 0; nfs_fattr_init(&data->fattr); - /* Set up the initial task struct. */ - NFS_PROTO(inode)->read_setup(data, &msg); - - dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - count, - (unsigned long long)data->args.offset); + if (data->lseg && + (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) + return 0; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - rpc_put_task(task); - return 0; + return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); } static void @@ -240,20 +260,21 @@ nfs_async_read_error(struct list_head *head) * won't see the new data until our attribute cache is updated. This is more * or less conventional NFS client behavior. */ -static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) { - struct nfs_page *req = nfs_list_entry(head->next); + struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_read_data *data; - size_t rsize = NFS_SERVER(inode)->rsize, nbytes; + size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; + struct pnfs_layout_segment *lseg; LIST_HEAD(list); nfs_list_remove_request(req); - nbytes = count; + nbytes = desc->pg_count; do { size_t len = min(nbytes,rsize); @@ -266,9 +287,11 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne } while(nbytes != 0); atomic_set(&req->wb_complete, requests); + BUG_ON(desc->pg_lseg != NULL); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ); ClearPageError(page); offset = 0; - nbytes = count; + nbytes = desc->pg_count; do { int ret2; @@ -280,12 +303,14 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne if (nbytes < rsize) rsize = nbytes; ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, - rsize, offset); + rsize, offset, lseg); if (ret == 0) ret = ret2; offset += rsize; nbytes -= rsize; } while (nbytes != 0); + put_lseg(lseg); + desc->pg_lseg = NULL; return ret; @@ -300,16 +325,21 @@ out_bad: return -ENOMEM; } -static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) { struct nfs_page *req; struct page **pages; struct nfs_read_data *data; + struct list_head *head = &desc->pg_list; + struct pnfs_layout_segment *lseg = desc->pg_lseg; int ret = -ENOMEM; - data = nfs_readdata_alloc(npages); - if (!data) - goto out_bad; + data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, + desc->pg_count)); + if (!data) { + nfs_async_read_error(head); + goto out; + } pages = data->pagevec; while (!list_empty(head)) { @@ -320,10 +350,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); + if ((!lseg) && list_is_singular(&data->pages)) + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ); - return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); -out_bad: - nfs_async_read_error(head); + ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, + 0, lseg); +out: + put_lseg(lseg); + desc->pg_lseg = NULL; return ret; } @@ -366,6 +400,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data return; /* Yes, so retry the read at the end of the data */ + data->mds_offset += resp->count; argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; @@ -625,7 +660,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - pnfs_update_layout(inode, desc.ctx, IOMODE_READ); + pnfs_pageio_init_read(&pgio, inode); if (rsize < PAGE_CACHE_SIZE) nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); else diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d328658..2b8e9a5 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1008,6 +1008,27 @@ static int nfs_parse_security_flavors(char *value, return 1; } +static int nfs_get_option_str(substring_t args[], char **option) +{ + kfree(*option); + *option = match_strdup(args); + return !option; +} + +static int nfs_get_option_ul(substring_t args[], unsigned long *option) +{ + int rc; + char *string; + + string = match_strdup(args); + if (string == NULL) + return -ENOMEM; + rc = strict_strtoul(string, 10, option); + kfree(string); + + return rc; +} + /* * Error-check and convert a string of mount options from user space into * a data structure. The whole mount string is processed; bad options are @@ -1156,155 +1177,82 @@ static int nfs_parse_mount_options(char *raw, * options that take numeric values */ case Opt_port: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || option > USHRT_MAX) + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) goto out_invalid_value; mnt->nfs_server.port = option; break; case Opt_rsize: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->rsize = option; break; case Opt_wsize: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->wsize = option; break; case Opt_bsize: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->bsize = option; break; case Opt_timeo: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || option == 0) + if (nfs_get_option_ul(args, &option) || option == 0) goto out_invalid_value; mnt->timeo = option; break; case Opt_retrans: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || option == 0) + if (nfs_get_option_ul(args, &option) || option == 0) goto out_invalid_value; mnt->retrans = option; break; case Opt_acregmin: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acregmin = option; break; case Opt_acregmax: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acregmax = option; break; case Opt_acdirmin: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acdirmin = option; break; case Opt_acdirmax: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acdirmax = option; break; case Opt_actimeo: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acregmin = mnt->acregmax = mnt->acdirmin = mnt->acdirmax = option; break; case Opt_namelen: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->namlen = option; break; case Opt_mountport: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || option > USHRT_MAX) + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) goto out_invalid_value; mnt->mount_server.port = option; break; case Opt_mountvers: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || + if (nfs_get_option_ul(args, &option) || option < NFS_MNT_VERSION || option > NFS_MNT3_VERSION) goto out_invalid_value; mnt->mount_server.version = option; break; case Opt_nfsvers: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; switch (option) { case NFS2_VERSION: @@ -1324,12 +1272,7 @@ static int nfs_parse_mount_options(char *raw, } break; case Opt_minorversion: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; if (option > NFS4_MAX_MINOR_VERSION) goto out_invalid_value; @@ -1365,21 +1308,18 @@ static int nfs_parse_mount_options(char *raw, case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - kfree(string); break; case Opt_xprt_tcp6: protofamily = AF_INET6; case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - kfree(string); break; case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; xprt_load_transport(string); - kfree(string); break; default: dfprintk(MOUNT, "NFS: unrecognized " @@ -1387,6 +1327,7 @@ static int nfs_parse_mount_options(char *raw, kfree(string); return 0; } + kfree(string); break; case Opt_mountproto: string = match_strdup(args); @@ -1429,18 +1370,13 @@ static int nfs_parse_mount_options(char *raw, goto out_invalid_address; break; case Opt_clientaddr: - string = match_strdup(args); - if (string == NULL) + if (nfs_get_option_str(args, &mnt->client_address)) goto out_nomem; - kfree(mnt->client_address); - mnt->client_address = string; break; case Opt_mounthost: - string = match_strdup(args); - if (string == NULL) + if (nfs_get_option_str(args, + &mnt->mount_server.hostname)) goto out_nomem; - kfree(mnt->mount_server.hostname); - mnt->mount_server.hostname = string; break; case Opt_mountaddr: string = match_strdup(args); @@ -1480,11 +1416,8 @@ static int nfs_parse_mount_options(char *raw, }; break; case Opt_fscache_uniq: - string = match_strdup(args); - if (string == NULL) + if (nfs_get_option_str(args, &mnt->fscache_uniq)) goto out_nomem; - kfree(mnt->fscache_uniq); - mnt->fscache_uniq = string; mnt->options |= NFS_OPTION_FSCACHE; break; case Opt_local_lock: @@ -1694,99 +1627,59 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, return nfs_walk_authlist(args, &request); } -static int nfs_parse_simple_hostname(const char *dev_name, - char **hostname, size_t maxnamlen, - char **export_path, size_t maxpathlen) +/* + * Split "dev_name" into "hostname:export_path". + * + * The leftmost colon demarks the split between the server's hostname + * and the export path. If the hostname starts with a left square + * bracket, then it may contain colons. + * + * Note: caller frees hostname and export path, even on error. + */ +static int nfs_parse_devname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) { size_t len; - char *colon, *comma; + char *end; - colon = strchr(dev_name, ':'); - if (colon == NULL) - goto out_bad_devname; - - len = colon - dev_name; - if (len > maxnamlen) - goto out_hostname; - - /* N.B. caller will free nfs_server.hostname in all cases */ - *hostname = kstrndup(dev_name, len, GFP_KERNEL); - if (!*hostname) - goto out_nomem; - - /* kill possible hostname list: not supported */ - comma = strchr(*hostname, ','); - if (comma != NULL) { - if (comma == *hostname) + /* Is the host name protected with square brakcets? */ + if (*dev_name == '[') { + end = strchr(++dev_name, ']'); + if (end == NULL || end[1] != ':') goto out_bad_devname; - *comma = '\0'; - } - - colon++; - len = strlen(colon); - if (len > maxpathlen) - goto out_path; - *export_path = kstrndup(colon, len, GFP_KERNEL); - if (!*export_path) - goto out_nomem; - - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); - return 0; - -out_bad_devname: - dfprintk(MOUNT, "NFS: device name not in host:path format\n"); - return -EINVAL; -out_nomem: - dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); - return -ENOMEM; - -out_hostname: - dfprintk(MOUNT, "NFS: server hostname too long\n"); - return -ENAMETOOLONG; - -out_path: - dfprintk(MOUNT, "NFS: export pathname too long\n"); - return -ENAMETOOLONG; -} - -/* - * Hostname has square brackets around it because it contains one or - * more colons. We look for the first closing square bracket, and a - * colon must follow it. - */ -static int nfs_parse_protected_hostname(const char *dev_name, - char **hostname, size_t maxnamlen, - char **export_path, size_t maxpathlen) -{ - size_t len; - char *start, *end; + len = end - dev_name; + end++; + } else { + char *comma; - start = (char *)(dev_name + 1); + end = strchr(dev_name, ':'); + if (end == NULL) + goto out_bad_devname; + len = end - dev_name; - end = strchr(start, ']'); - if (end == NULL) - goto out_bad_devname; - if (*(end + 1) != ':') - goto out_bad_devname; + /* kill possible hostname list: not supported */ + comma = strchr(dev_name, ','); + if (comma != NULL && comma < end) + *comma = 0; + } - len = end - start; if (len > maxnamlen) goto out_hostname; /* N.B. caller will free nfs_server.hostname in all cases */ - *hostname = kstrndup(start, len, GFP_KERNEL); + *hostname = kstrndup(dev_name, len, GFP_KERNEL); if (*hostname == NULL) goto out_nomem; - - end += 2; - len = strlen(end); + len = strlen(++end); if (len > maxpathlen) goto out_path; *export_path = kstrndup(end, len, GFP_KERNEL); if (!*export_path) goto out_nomem; + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); return 0; out_bad_devname: @@ -1807,29 +1700,6 @@ out_path: } /* - * Split "dev_name" into "hostname:export_path". - * - * The leftmost colon demarks the split between the server's hostname - * and the export path. If the hostname starts with a left square - * bracket, then it may contain colons. - * - * Note: caller frees hostname and export path, even on error. - */ -static int nfs_parse_devname(const char *dev_name, - char **hostname, size_t maxnamlen, - char **export_path, size_t maxpathlen) -{ - if (*dev_name == '[') - return nfs_parse_protected_hostname(dev_name, - hostname, maxnamlen, - export_path, maxpathlen); - - return nfs_parse_simple_hostname(dev_name, - hostname, maxnamlen, - export_path, maxpathlen); -} - -/* * Validate the NFS2/NFS3 mount data * - fills in the mount root filehandle * diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 42b92d7..47a3ad6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -28,6 +28,7 @@ #include "iostat.h" #include "nfs4_fs.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -96,6 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p) static void nfs_writedata_release(struct nfs_write_data *wdata) { + put_lseg(wdata->lseg); put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } @@ -781,25 +783,21 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -/* - * Set up the argument/result storage required for the RPC call. - */ -static int nfs_write_rpcsetup(struct nfs_page *req, - struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset, - int how) +int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = data->inode; int priority = flush_task_priority(how); struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = req->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .task = &data->task, .rpc_message = &msg, .callback_ops = call_ops, @@ -810,12 +808,52 @@ static int nfs_write_rpcsetup(struct nfs_page *req, }; int ret = 0; + /* Set up the initial task struct. */ + NFS_PROTO(inode)->write_setup(data, &msg); + + dprintk("NFS: %5u initiated write call " + "(req %s/%lld, %u bytes @ offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); +out: + return ret; +} +EXPORT_SYMBOL_GPL(nfs_initiate_write); + +/* + * Set up the argument/result storage required for the RPC call. + */ +static int nfs_write_rpcsetup(struct nfs_page *req, + struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, + unsigned int count, unsigned int offset, + struct pnfs_layout_segment *lseg, + int how) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ data->req = req; data->inode = inode = req->wb_context->path.dentry->d_inode; - data->cred = msg.rpc_cred; + data->cred = req->wb_context->cred; + data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -836,30 +874,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); - /* Set up the initial task struct. */ - NFS_PROTO(inode)->write_setup(data, &msg); - - dprintk("NFS: %5u initiated write call " - "(req %s/%lld, %u bytes @ offset %llu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - count, - (unsigned long long)data->args.offset); + if (data->lseg && + (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) + return 0; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - ret = PTR_ERR(task); - goto out; - } - if (how & FLUSH_SYNC) { - ret = rpc_wait_for_completion_task(task); - if (ret == 0) - ret = task->tk_status; - } - rpc_put_task(task); -out: - return ret; + return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); } /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -879,20 +898,21 @@ static void nfs_redirty_request(struct nfs_page *req) * Generate multiple small requests to write out a single * contiguous dirty area on one page. */ -static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) +static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) { - struct nfs_page *req = nfs_list_entry(head->next); + struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_write_data *data; - size_t wsize = NFS_SERVER(inode)->wsize, nbytes; + size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; + struct pnfs_layout_segment *lseg; LIST_HEAD(list); nfs_list_remove_request(req); - nbytes = count; + nbytes = desc->pg_count; do { size_t len = min(nbytes, wsize); @@ -905,9 +925,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned } while (nbytes != 0); atomic_set(&req->wb_complete, requests); + BUG_ON(desc->pg_lseg); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); ClearPageError(page); offset = 0; - nbytes = count; + nbytes = desc->pg_count; do { int ret2; @@ -919,13 +941,15 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned if (nbytes < wsize) wsize = nbytes; ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, - wsize, offset, how); + wsize, offset, lseg, desc->pg_ioflags); if (ret == 0) ret = ret2; offset += wsize; nbytes -= wsize; } while (nbytes != 0); + put_lseg(lseg); + desc->pg_lseg = NULL; return ret; out_bad: @@ -946,16 +970,26 @@ out_bad: * This is the case if nfs_updatepage detects a conflicting request * that has been written but not committed. */ -static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) +static int nfs_flush_one(struct nfs_pageio_descriptor *desc) { struct nfs_page *req; struct page **pages; struct nfs_write_data *data; + struct list_head *head = &desc->pg_list; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + int ret; - data = nfs_writedata_alloc(npages); - if (!data) - goto out_bad; - + data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, + desc->pg_count)); + if (!data) { + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_redirty_request(req); + } + ret = -ENOMEM; + goto out; + } pages = data->pagevec; while (!list_empty(head)) { req = nfs_list_entry(head->next); @@ -965,16 +999,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); + if ((!lseg) && list_is_singular(&data->pages)) + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); /* Set up the argument struct */ - return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); - out_bad: - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_redirty_request(req); - } - return -ENOMEM; + ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); +out: + put_lseg(lseg); /* Cleans any gotten in ->pg_test */ + desc->pg_lseg = NULL; + return ret; } static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, @@ -982,6 +1015,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, { size_t wsize = NFS_SERVER(inode)->wsize; + pnfs_pageio_init_write(pgio, inode); + if (wsize < PAGE_CACHE_SIZE) nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); else @@ -1132,7 +1167,7 @@ static const struct rpc_call_ops nfs_write_full_ops = { /* * This function is called when the WRITE call is complete. */ -int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; @@ -1151,7 +1186,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ status = NFS_PROTO(data->inode)->write_done(task, data); if (status != 0) - return status; + return; nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) @@ -1166,6 +1201,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ static unsigned long complain; + /* Note this will print the MDS for a DS write */ if (time_before(complain, jiffies)) { dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", @@ -1186,6 +1222,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) /* Was this an NFSv2 write or an NFSv3 stable write? */ if (resp->verf->committed != NFS_UNSTABLE) { /* Resend from where the server left off */ + data->mds_offset += resp->count; argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; @@ -1196,7 +1233,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) argp->stable = NFS_FILE_SYNC; } nfs_restart_rpc(task, server->nfs_client); - return -EAGAIN; + return; } if (time_before(complain, jiffies)) { printk(KERN_WARNING @@ -1207,7 +1244,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) /* Can't do anything about it except throw an error. */ task->tk_status = -EIO; } - return 0; + return; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8b61220..6b1305d 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -876,7 +876,7 @@ SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark); #endif /* - * fanotify_user_setup - Our initialization function. Note that we cannnot return + * fanotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here * must result in panic(). */ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 4cd5d5d..bd46e7c 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -841,7 +841,7 @@ out: } /* - * inotify_user_setup - Our initialization function. Note that we cannnot return + * inotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here * must result in panic(). */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index d417b3f..f97b6f1 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -354,7 +354,7 @@ static inline int ocfs2_match(int len, /* * Returns 0 if not found, -1 on failure, and 1 on success */ -static int inline ocfs2_search_dirblock(struct buffer_head *bh, +static inline int ocfs2_search_dirblock(struct buffer_head *bh, struct inode *dir, const char *name, int namelen, unsigned long offset, diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 393f3f6..de4ff29 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -235,33 +235,22 @@ static int omfs_dir_is_empty(struct inode *inode) return *ptr != ~0; } -static int omfs_unlink(struct inode *dir, struct dentry *dentry) +static int omfs_remove(struct inode *dir, struct dentry *dentry) { - int ret; struct inode *inode = dentry->d_inode; + int ret; + + if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode)) + return -ENOTEMPTY; ret = omfs_delete_entry(dentry); if (ret) - goto end_unlink; - - inode_dec_link_count(inode); + return ret; + + clear_nlink(inode); + mark_inode_dirty(inode); mark_inode_dirty(dir); - -end_unlink: - return ret; -} - -static int omfs_rmdir(struct inode *dir, struct dentry *dentry) -{ - int err = -ENOTEMPTY; - struct inode *inode = dentry->d_inode; - - if (omfs_dir_is_empty(inode)) { - err = omfs_unlink(dir, dentry); - if (!err) - inode_dec_link_count(inode); - } - return err; + return 0; } static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) @@ -372,9 +361,10 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, res = filldir(dirent, oi->i_name, strnlen(oi->i_name, OMFS_NAMELEN), filp->f_pos, self, d_type); - if (res == 0) - filp->f_pos++; brelse(bh); + if (res < 0) + break; + filp->f_pos++; } out: return res; @@ -385,44 +375,28 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, { struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; - struct buffer_head *bh; - int is_dir; int err; - is_dir = S_ISDIR(old_inode->i_mode); - if (new_inode) { /* overwriting existing file/dir */ - err = -ENOTEMPTY; - if (is_dir && !omfs_dir_is_empty(new_inode)) - goto out; - - err = -ENOENT; - bh = omfs_find_entry(new_dir, new_dentry->d_name.name, - new_dentry->d_name.len); - if (IS_ERR(bh)) - goto out; - brelse(bh); - - err = omfs_unlink(new_dir, new_dentry); + err = omfs_remove(new_dir, new_dentry); if (err) goto out; } /* since omfs locates files by name, we need to unlink _before_ * adding the new link or we won't find the old one */ - inode_inc_link_count(old_inode); - err = omfs_unlink(old_dir, old_dentry); - if (err) { - inode_dec_link_count(old_inode); + err = omfs_delete_entry(old_dentry); + if (err) goto out; - } + mark_inode_dirty(old_dir); err = omfs_add_link(new_dentry, old_inode); if (err) goto out; old_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(old_inode); out: return err; } @@ -488,8 +462,8 @@ const struct inode_operations omfs_dir_inops = { .mkdir = omfs_mkdir, .rename = omfs_rename, .create = omfs_create, - .unlink = omfs_unlink, - .rmdir = omfs_rmdir, + .unlink = omfs_remove, + .rmdir = omfs_remove, }; const struct file_operations omfs_dir_operations = { diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 65444d2..f1ab360 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -112,7 +112,7 @@ static int v2_read_file_info(struct super_block *sb, int type) if (!info->dqi_priv) { printk(KERN_WARNING "Not enough memory for quota information structure.\n"); - return -1; + return -ENOMEM; } qinfo = info->dqi_priv; if (version == 0) { diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 830e3f7..1d1859d 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -44,23 +44,20 @@ config UBIFS_FS_ZLIB # Debugging-related stuff config UBIFS_FS_DEBUG - bool "Enable debugging" + bool "Enable debugging support" depends on UBIFS_FS select DEBUG_FS select KALLSYMS_ALL help - This option enables UBIFS debugging. - -config UBIFS_FS_DEBUG_MSG_LVL - int "Default message level (0 = no extra messages, 3 = lots)" - depends on UBIFS_FS_DEBUG - default "0" - help - This controls the amount of debugging messages produced by UBIFS. - If reporting bugs, please try to have available a full dump of the - messages at level 1 while the misbehaviour was occurring. Level 2 - may become necessary if level 1 messages were not enough to find the - bug. Generally Level 3 should be avoided. + This option enables UBIFS debugging support. It makes sure various + assertions, self-checks, debugging messages and test modes are compiled + in (this all is compiled out otherwise). Assertions are light-weight + and this option also enables them. Self-checks, debugging messages and + test modes are switched off by default. Thus, it is safe and actually + recommended to have debugging support enabled, and it should not slow + down UBIFS. You can then further enable / disable individual debugging + features using UBIFS module parameters and the corresponding sysfs + interfaces. config UBIFS_FS_DEBUG_CHKS bool "Enable extra checks" diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 02429d8..b148fbc 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -48,6 +48,56 @@ #include <linux/slab.h> #include "ubifs.h" +/* + * nothing_to_commit - check if there is nothing to commit. + * @c: UBIFS file-system description object + * + * This is a helper function which checks if there is anything to commit. It is + * used as an optimization to avoid starting the commit if it is not really + * necessary. Indeed, the commit operation always assumes flash I/O (e.g., + * writing the commit start node to the log), and it is better to avoid doing + * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is + * nothing to commit, it is more optimal to avoid any flash I/O. + * + * This function has to be called with @c->commit_sem locked for writing - + * this function does not take LPT/TNC locks because the @c->commit_sem + * guarantees that we have exclusive access to the TNC and LPT data structures. + * + * This function returns %1 if there is nothing to commit and %0 otherwise. + */ +static int nothing_to_commit(struct ubifs_info *c) +{ + /* + * During mounting or remounting from R/O mode to R/W mode we may + * commit for various recovery-related reasons. + */ + if (c->mounting || c->remounting_rw) + return 0; + + /* + * If the root TNC node is dirty, we definitely have something to + * commit. + */ + if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags)) + return 0; + + /* + * Even though the TNC is clean, the LPT tree may have dirty nodes. For + * example, this may happen if the budgeting subsystem invoked GC to + * make some free space, and the GC found an LEB with only dirty and + * free space. In this case GC would just change the lprops of this + * LEB (by turning all space into free space) and unmap it. + */ + if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + ubifs_assert(c->dirty_pn_cnt == 0); + ubifs_assert(c->dirty_nn_cnt == 0); + + return 1; +} + /** * do_commit - commit the journal. * @c: UBIFS file-system description object @@ -70,6 +120,12 @@ static int do_commit(struct ubifs_info *c) goto out_up; } + if (nothing_to_commit(c)) { + up_write(&c->commit_sem); + err = 0; + goto out_cancel; + } + /* Sync all write buffers (necessary for recovery) */ for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); @@ -162,12 +218,12 @@ static int do_commit(struct ubifs_info *c) if (err) goto out; +out_cancel: spin_lock(&c->cs_lock); c->cmt_state = COMMIT_RESTING; wake_up(&c->cmt_wq); dbg_cmt("commit end"); spin_unlock(&c->cs_lock); - return 0; out_up: diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0bee4db..01c2b02 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -43,8 +43,8 @@ DEFINE_SPINLOCK(dbg_lock); static char dbg_key_buf0[128]; static char dbg_key_buf1[128]; -unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT; -unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT; +unsigned int ubifs_msg_flags; +unsigned int ubifs_chk_flags; unsigned int ubifs_tst_flags; module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); @@ -810,16 +810,24 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; + void *buf; if (dbg_failure_mode) return; printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", current->pid, lnum); - sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); + + buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory for dumping LEB %d", lnum); + return; + } + + sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { ubifs_err("scan error %d", (int)PTR_ERR(sleb)); - return; + goto out; } printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, @@ -835,6 +843,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", current->pid, lnum); ubifs_scan_destroy(sleb); + +out: + vfree(buf); return; } @@ -2690,16 +2701,8 @@ int ubifs_debugging_init(struct ubifs_info *c) if (!c->dbg) return -ENOMEM; - c->dbg->buf = vmalloc(c->leb_size); - if (!c->dbg->buf) - goto out; - failure_mode_init(c); return 0; - -out: - kfree(c->dbg); - return -ENOMEM; } /** @@ -2709,7 +2712,6 @@ out: void ubifs_debugging_exit(struct ubifs_info *c) { failure_mode_exit(c); - vfree(c->dbg->buf); kfree(c->dbg); } @@ -2813,19 +2815,19 @@ int dbg_debugfs_init_fs(struct ubifs_info *c) } fname = "dump_lprops"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_lprops = dent; fname = "dump_budg"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_budg = dent; fname = "dump_tnc"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_tnc = dent; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 69ebe47..919f0de 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -27,7 +27,6 @@ /** * ubifs_debug_info - per-FS debugging information. - * @buf: a buffer of LEB size, used for various purposes * @old_zroot: old index root - used by 'dbg_check_old_index()' * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' @@ -54,7 +53,6 @@ * dfs_dump_tnc: "dump TNC" debugfs knob */ struct ubifs_debug_info { - void *buf; struct ubifs_zbranch old_zroot; int old_zroot_level; unsigned long long old_zroot_sqnum; @@ -173,7 +171,7 @@ const char *dbg_key_str1(const struct ubifs_info *c, #define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) /* - * Debugging message type flags (must match msg_type_names in debug.c). + * Debugging message type flags. * * UBIFS_MSG_GEN: general messages * UBIFS_MSG_JNL: journal messages @@ -205,14 +203,8 @@ enum { UBIFS_MSG_RCVRY = 0x1000, }; -/* Debugging message type flags for each default debug message level */ -#define UBIFS_MSG_LVL_0 0 -#define UBIFS_MSG_LVL_1 0x1 -#define UBIFS_MSG_LVL_2 0x7f -#define UBIFS_MSG_LVL_3 0xffff - /* - * Debugging check flags (must match chk_names in debug.c). + * Debugging check flags. * * UBIFS_CHK_GEN: general checks * UBIFS_CHK_TNC: check TNC @@ -233,7 +225,7 @@ enum { }; /* - * Special testing flags (must match tst_names in debug.c). + * Special testing flags. * * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method * UBIFS_TST_RCVRY: failure mode for recovery testing @@ -243,22 +235,6 @@ enum { UBIFS_TST_RCVRY = 0x4, }; -#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1 -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1 -#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2 -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2 -#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3 -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3 -#else -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0 -#endif - -#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS -#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff -#else -#define UBIFS_CHK_FLAGS_DEFAULT 0 -#endif - extern spinlock_t dbg_lock; extern unsigned int ubifs_msg_flags; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index d821731..dfd168b 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -31,6 +31,26 @@ * buffer is full or when it is not used for some time (by timer). This is * similar to the mechanism is used by JFFS2. * + * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum + * write size (@c->max_write_size). The latter is the maximum amount of bytes + * the underlying flash is able to program at a time, and writing in + * @c->max_write_size units should presumably be faster. Obviously, + * @c->min_io_size <= @c->max_write_size. Write-buffers are of + * @c->max_write_size bytes in size for maximum performance. However, when a + * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size + * boundary) which contains data is written, not the whole write-buffer, + * because this is more space-efficient. + * + * This optimization adds few complications to the code. Indeed, on the one + * hand, we want to write in optimal @c->max_write_size bytes chunks, which + * also means aligning writes at the @c->max_write_size bytes offsets. On the + * other hand, we do not want to waste space when synchronizing the write + * buffer, so during synchronization we writes in smaller chunks. And this makes + * the next write offset to be not aligned to @c->max_write_size bytes. So the + * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned + * to @c->max_write_size bytes again. We do this by temporarily shrinking + * write-buffer size (@wbuf->size). + * * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by * mutexes defined inside these objects. Since sometimes upper-level code * has to lock the write-buffer (e.g. journal space reservation code), many @@ -46,8 +66,8 @@ * UBIFS uses padding when it pads to the next min. I/O unit. In this case it * uses padding nodes or padding bytes, if the padding node does not fit. * - * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes - * every time they are read from the flash media. + * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when + * they are read from the flash media. */ #include <linux/crc32.h> @@ -88,8 +108,12 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) * This function may skip data nodes CRC checking if @c->no_chk_data_crc is * true, which is controlled by corresponding UBIFS mount option. However, if * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is - * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is - * ignored and CRC is checked. + * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are + * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC + * is checked. This is because during mounting or re-mounting from R/O mode to + * R/W mode we may read journal nodes (when replying the journal or doing the + * recovery) and the journal nodes may potentially be corrupted, so checking is + * required. * * This function returns zero in case of success and %-EUCLEAN in case of bad * CRC or magic. @@ -131,8 +155,8 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, node_len > c->ranges[type].max_len) goto out_len; - if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc && - c->no_chk_data_crc) + if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting && + !c->remounting_rw && c->no_chk_data_crc) return 0; crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); @@ -343,11 +367,17 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) * * This function synchronizes write-buffer @buf and returns zero in case of * success or a negative error code in case of failure. + * + * Note, although write-buffers are of @c->max_write_size, this function does + * not necessarily writes all @c->max_write_size bytes to the flash. Instead, + * if the write-buffer is only partially filled with data, only the used part + * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized. + * This way we waste less space. */ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) { struct ubifs_info *c = wbuf->c; - int err, dirt; + int err, dirt, sync_len; cancel_wbuf_timer_nolock(wbuf); if (!wbuf->used || wbuf->lnum == -1) @@ -357,27 +387,53 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) dbg_io("LEB %d:%d, %d bytes, jhead %s", wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); ubifs_assert(!(wbuf->avail & 7)); - ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); + ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); if (c->ro_error) return -EROFS; - ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); + /* + * Do not write whole write buffer but write only the minimum necessary + * amount of min. I/O units. + */ + sync_len = ALIGN(wbuf->used, c->min_io_size); + dirt = sync_len - wbuf->used; + if (dirt) + ubifs_pad(c, wbuf->buf + wbuf->used, dirt); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - c->min_io_size, wbuf->dtype); + sync_len, wbuf->dtype); if (err) { ubifs_err("cannot write %d bytes to LEB %d:%d", - c->min_io_size, wbuf->lnum, wbuf->offs); + sync_len, wbuf->lnum, wbuf->offs); dbg_dump_stack(); return err; } - dirt = wbuf->avail; - spin_lock(&wbuf->lock); - wbuf->offs += c->min_io_size; - wbuf->avail = c->min_io_size; + wbuf->offs += sync_len; + /* + * Now @wbuf->offs is not necessarily aligned to @c->max_write_size. + * But our goal is to optimize writes and make sure we write in + * @c->max_write_size chunks and to @c->max_write_size-aligned offset. + * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make + * sure that @wbuf->offs + @wbuf->size is aligned to + * @c->max_write_size. This way we make sure that after next + * write-buffer flush we are again at the optimal offset (aligned to + * @c->max_write_size). + */ + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; wbuf->used = 0; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -420,7 +476,13 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, spin_lock(&wbuf->lock); wbuf->lnum = lnum; wbuf->offs = offs; - wbuf->avail = c->min_io_size; + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; wbuf->used = 0; spin_unlock(&wbuf->lock); wbuf->dtype = dtype; @@ -500,8 +562,9 @@ out_timers: * * This function writes data to flash via write-buffer @wbuf. This means that * the last piece of the node won't reach the flash media immediately if it - * does not take whole minimal I/O unit. Instead, the node will sit in RAM - * until the write-buffer is synchronized (e.g., by timer). + * does not take whole max. write unit (@c->max_write_size). Instead, the node + * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or + * because more data are appended to the write-buffer). * * This function returns zero in case of success and a negative error code in * case of failure. If the node cannot be written because there is no more @@ -518,9 +581,14 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); - ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); + ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { err = -ENOSPC; @@ -543,14 +611,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_io("flush jhead %s wbuf to LEB %d:%d", dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, - wbuf->offs, c->min_io_size, + wbuf->offs, wbuf->size, wbuf->dtype); if (err) goto out; spin_lock(&wbuf->lock); - wbuf->offs += c->min_io_size; - wbuf->avail = c->min_io_size; + wbuf->offs += wbuf->size; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size; wbuf->used = 0; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -564,33 +636,57 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) goto exit; } - /* - * The node is large enough and does not fit entirely within current - * minimal I/O unit. We have to fill and flush write-buffer and switch - * to the next min. I/O unit. - */ - dbg_io("flush jhead %s wbuf to LEB %d:%d", - dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); - memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - c->min_io_size, wbuf->dtype); - if (err) - goto out; + offs = wbuf->offs; + written = 0; - offs = wbuf->offs + c->min_io_size; - len -= wbuf->avail; - aligned_len -= wbuf->avail; - written = wbuf->avail; + if (wbuf->used) { + /* + * The node is large enough and does not fit entirely within + * current available space. We have to fill and flush + * write-buffer and switch to the next max. write unit. + */ + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); + memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, + wbuf->size, wbuf->dtype); + if (err) + goto out; + + offs += wbuf->size; + len -= wbuf->avail; + aligned_len -= wbuf->avail; + written += wbuf->avail; + } else if (wbuf->offs & (c->max_write_size - 1)) { + /* + * The write-buffer offset is not aligned to + * @c->max_write_size and @wbuf->size is less than + * @c->max_write_size. Write @wbuf->size bytes to make sure the + * following writes are done in optimal @c->max_write_size + * chunks. + */ + dbg_io("write %d bytes to LEB %d:%d", + wbuf->size, wbuf->lnum, wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs, + wbuf->size, wbuf->dtype); + if (err) + goto out; + + offs += wbuf->size; + len -= wbuf->size; + aligned_len -= wbuf->size; + written += wbuf->size; + } /* - * The remaining data may take more whole min. I/O units, so write the - * remains multiple to min. I/O unit size directly to the flash media. + * The remaining data may take more whole max. write units, so write the + * remains multiple to max. write unit size directly to the flash media. * We align node length to 8-byte boundary because we anyway flash wbuf * if the remaining space is less than 8 bytes. */ - n = aligned_len >> c->min_io_shift; + n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->min_io_shift; + n <<= c->max_write_shift; dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, wbuf->dtype); @@ -606,14 +702,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) if (aligned_len) /* * And now we have what's left and what does not take whole - * min. I/O unit, so write it to the write-buffer and we are + * max. write unit, so write it to the write-buffer and we are * done. */ memcpy(wbuf->buf, buf + written, len); wbuf->offs = offs; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size - aligned_len; wbuf->used = aligned_len; - wbuf->avail = c->min_io_size - aligned_len; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -837,11 +937,11 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) { size_t size; - wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); + wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL); if (!wbuf->buf) return -ENOMEM; - size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); + size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); wbuf->inodes = kmalloc(size, GFP_KERNEL); if (!wbuf->inodes) { kfree(wbuf->buf); @@ -851,7 +951,14 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->used = 0; wbuf->lnum = wbuf->offs = -1; - wbuf->avail = c->min_io_size; + /* + * If the LEB starts at the max. write size aligned address, then + * write-buffer size has to be set to @c->max_write_size. Otherwise, + * set it to something smaller so that it ends at the closest max. + * write size boundary. + */ + size = c->max_write_size - (c->leb_start % c->max_write_size); + wbuf->avail = wbuf->size = size; wbuf->dtype = UBI_UNKNOWN; wbuf->sync_callback = NULL; mutex_init(&wbuf->io_mutex); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 914f1bd..aed25e8 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -690,7 +690,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, { struct ubifs_data_node *data; int err, lnum, offs, compr_type, out_len; - int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR; + int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; struct ubifs_inode *ui = ubifs_inode(inode); dbg_jnl("ino %lu, blk %u, len %d, key %s", @@ -698,9 +698,19 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, DBGKEY(key)); ubifs_assert(len <= UBIFS_BLOCK_SIZE); - data = kmalloc(dlen, GFP_NOFS); - if (!data) - return -ENOMEM; + data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); + if (!data) { + /* + * Fall-back to the write reserve buffer. Note, we might be + * currently on the memory reclaim path, when the kernel is + * trying to free some memory by writing out dirty pages. The + * write reserve buffer helps us to guarantee that we are + * always able to write the data. + */ + allocated = 0; + mutex_lock(&c->write_reserve_mutex); + data = c->write_reserve_buf; + } data->ch.node_type = UBIFS_DATA_NODE; key_write(c, key, &data->key); @@ -736,7 +746,10 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, goto out_ro; finish_reservation(c); - kfree(data); + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); return 0; out_release: @@ -745,7 +758,10 @@ out_ro: ubifs_ro_mode(c, err); finish_reservation(c); out_free: - kfree(data); + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); return err; } diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 4d4ca38..c7b25e2 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1035,7 +1035,8 @@ static int scan_check_cb(struct ubifs_info *c, struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; struct ubifs_lp_stats *lst = &data->lst; - int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty; + int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; + void *buf = NULL; cat = lp->flags & LPROPS_CAT_MASK; if (cat != LPROPS_UNCAT) { @@ -1093,7 +1094,13 @@ static int scan_check_cb(struct ubifs_info *c, } } - sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); + buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to scan LEB %d", lnum); + goto out; + } + + sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { /* * After an unclean unmount, empty and freeable LEBs @@ -1105,7 +1112,8 @@ static int scan_check_cb(struct ubifs_info *c, lst->empty_lebs += 1; lst->total_free += c->leb_size; lst->total_dark += ubifs_calc_dark(c, c->leb_size); - return LPT_SCAN_CONTINUE; + ret = LPT_SCAN_CONTINUE; + goto exit; } if (lp->free + lp->dirty == c->leb_size && @@ -1115,10 +1123,12 @@ static int scan_check_cb(struct ubifs_info *c, lst->total_free += lp->free; lst->total_dirty += lp->dirty; lst->total_dark += ubifs_calc_dark(c, c->leb_size); - return LPT_SCAN_CONTINUE; + ret = LPT_SCAN_CONTINUE; + goto exit; } data->err = PTR_ERR(sleb); - return LPT_SCAN_STOP; + ret = LPT_SCAN_STOP; + goto exit; } is_idx = -1; @@ -1236,7 +1246,10 @@ static int scan_check_cb(struct ubifs_info *c, } ubifs_scan_destroy(sleb); - return LPT_SCAN_CONTINUE; + ret = LPT_SCAN_CONTINUE; +exit: + vfree(buf); + return ret; out_print: ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " @@ -1246,6 +1259,7 @@ out_print: out_destroy: ubifs_scan_destroy(sleb); out: + vfree(buf); data->err = -EINVAL; return LPT_SCAN_STOP; } diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 5c90dec..0a3c2c3 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1628,29 +1628,35 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) { int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; int ret; - void *buf = c->dbg->buf; + void *buf, *p; if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) return 0; + buf = p = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory for ltab checking"); + return 0; + } + dbg_lp("LEB %d", lnum); err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); if (err) { dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); - return err; + goto out; } while (1) { - if (!is_a_node(c, buf, len)) { + if (!is_a_node(c, p, len)) { int i, pad_len; - pad_len = get_pad_len(c, buf, len); + pad_len = get_pad_len(c, p, len); if (pad_len) { - buf += pad_len; + p += pad_len; len -= pad_len; dirty += pad_len; continue; } - if (!dbg_is_all_ff(buf, len)) { + if (!dbg_is_all_ff(p, len)) { dbg_msg("invalid empty space in LEB %d at %d", lnum, c->leb_size - len); err = -EINVAL; @@ -1668,16 +1674,21 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) lnum, dirty, c->ltab[i].dirty); err = -EINVAL; } - return err; + goto out; } - node_type = get_lpt_node_type(c, buf, &node_num); + node_type = get_lpt_node_type(c, p, &node_num); node_len = get_lpt_node_len(c, node_type); ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); if (ret == 1) dirty += node_len; - buf += node_len; + p += node_len; len -= node_len; } + + err = 0; +out: + vfree(buf); + return err; } /** @@ -1870,25 +1881,31 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) static void dump_lpt_leb(const struct ubifs_info *c, int lnum) { int err, len = c->leb_size, node_type, node_num, node_len, offs; - void *buf = c->dbg->buf; + void *buf, *p; printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", current->pid, lnum); + buf = p = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to dump LPT"); + return; + } + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); if (err) { ubifs_err("cannot read LEB %d, error %d", lnum, err); - return; + goto out; } while (1) { offs = c->leb_size - len; - if (!is_a_node(c, buf, len)) { + if (!is_a_node(c, p, len)) { int pad_len; - pad_len = get_pad_len(c, buf, len); + pad_len = get_pad_len(c, p, len); if (pad_len) { printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", lnum, offs, pad_len); - buf += pad_len; + p += pad_len; len -= pad_len; continue; } @@ -1898,7 +1915,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) break; } - node_type = get_lpt_node_type(c, buf, &node_num); + node_type = get_lpt_node_type(c, p, &node_num); switch (node_type) { case UBIFS_LPT_PNODE: { @@ -1923,7 +1940,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) else printk(KERN_DEBUG "LEB %d:%d, nnode, ", lnum, offs); - err = ubifs_unpack_nnode(c, buf, &nnode); + err = ubifs_unpack_nnode(c, p, &nnode); for (i = 0; i < UBIFS_LPT_FANOUT; i++) { printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, nnode.nbranch[i].offs); @@ -1944,15 +1961,18 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) break; default: ubifs_err("LPT node type %d not recognized", node_type); - return; + goto out; } - buf += node_len; + p += node_len; len -= node_len; } printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", current->pid, lnum); +out: + vfree(buf); + return; } /** diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 82009c7..2cdbd31 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -892,15 +892,22 @@ static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb) static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) { int lnum, err = 0; + void *buf; /* Check no-orphans flag and skip this if no orphans */ if (c->no_orphs) return 0; + buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to check orphans"); + return 0; + } + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { struct ubifs_scan_leb *sleb; - sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); + sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); break; @@ -912,6 +919,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) break; } + vfree(buf); return err; } diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 77e9b87..936f2cb 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -28,6 +28,23 @@ * UBIFS always cleans away all remnants of an unclean un-mount, so that * errors do not accumulate. However UBIFS defers recovery if it is mounted * read-only, and the flash is not modified in that case. + * + * The general UBIFS approach to the recovery is that it recovers from + * corruptions which could be caused by power cuts, but it refuses to recover + * from corruption caused by other reasons. And UBIFS tries to distinguish + * between these 2 reasons of corruptions and silently recover in the former + * case and loudly complain in the latter case. + * + * UBIFS writes only to erased LEBs, so it writes only to the flash space + * containing only 0xFFs. UBIFS also always writes strictly from the beginning + * of the LEB to the end. And UBIFS assumes that the underlying flash media + * writes in @c->max_write_size bytes at a time. + * + * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min. + * I/O unit corresponding to offset X to contain corrupted data, all the + * following min. I/O units have to contain empty space (all 0xFFs). If this is + * not true, the corruption cannot be the result of a power cut, and UBIFS + * refuses to mount. */ #include <linux/crc32.h> @@ -362,8 +379,9 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) * @offs: offset to check * * This function returns %1 if @offs was in the last write to the LEB whose data - * is in @buf, otherwise %0 is returned. The determination is made by checking - * for subsequent empty space starting from the next @c->min_io_size boundary. + * is in @buf, otherwise %0 is returned. The determination is made by checking + * for subsequent empty space starting from the next @c->max_write_size + * boundary. */ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) { @@ -371,10 +389,10 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) uint8_t *p; /* - * Round up to the next @c->min_io_size boundary i.e. @offs is in the - * last wbuf written. After that should be empty space. + * Round up to the next @c->max_write_size boundary i.e. @offs is in + * the last wbuf written. After that should be empty space. */ - empty_offs = ALIGN(offs + 1, c->min_io_size); + empty_offs = ALIGN(offs + 1, c->max_write_size); check_len = c->leb_size - empty_offs; p = buf + empty_offs - offs; return is_empty(p, check_len); @@ -429,7 +447,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, int skip, dlen = le32_to_cpu(ch->len); /* Check for empty space after the corrupt node's common header */ - skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; + skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs; if (is_empty(buf + skip, len - skip)) return 1; /* @@ -441,7 +459,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, return 0; } /* Now we know the corrupt node's length we can skip over it */ - skip = ALIGN(offs + dlen, c->min_io_size) - offs; + skip = ALIGN(offs + dlen, c->max_write_size) - offs; /* After which there should be empty space */ if (is_empty(buf + skip, len - skip)) return 1; @@ -671,10 +689,14 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, } else { int corruption = first_non_ff(buf, len); + /* + * See header comment for this file for more + * explanations about the reasons we have this check. + */ ubifs_err("corrupt empty space LEB %d:%d, corruption " "starts at %d", lnum, offs, corruption); /* Make sure we dump interesting non-0xFF data */ - offs = corruption; + offs += corruption; buf += corruption; goto corrupted; } @@ -836,12 +858,8 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, static int recover_head(const struct ubifs_info *c, int lnum, int offs, void *sbuf) { - int len, err; + int len = c->max_write_size, err; - if (c->min_io_size > 1) - len = c->min_io_size; - else - len = 512; if (offs + len > c->leb_size) len = c->leb_size - offs; diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 3e1ee57..36216b4 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -328,7 +328,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, if (!quiet) ubifs_err("empty space starts at non-aligned offset %d", offs); - goto corrupted;; + goto corrupted; } ubifs_end_scan(c, sleb, lnum, offs); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 6e11c29..e5dc1e1 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -512,9 +512,12 @@ static int init_constants_early(struct ubifs_info *c) c->leb_cnt = c->vi.size; c->leb_size = c->vi.usable_leb_size; + c->leb_start = c->di.leb_start; c->half_leb_size = c->leb_size / 2; c->min_io_size = c->di.min_io_size; c->min_io_shift = fls(c->min_io_size) - 1; + c->max_write_size = c->di.max_write_size; + c->max_write_shift = fls(c->max_write_size) - 1; if (c->leb_size < UBIFS_MIN_LEB_SZ) { ubifs_err("too small LEBs (%d bytes), min. is %d bytes", @@ -534,6 +537,18 @@ static int init_constants_early(struct ubifs_info *c) } /* + * Maximum write size has to be greater or equivalent to min. I/O + * size, and be multiple of min. I/O size. + */ + if (c->max_write_size < c->min_io_size || + c->max_write_size % c->min_io_size || + !is_power_of_2(c->max_write_size)) { + ubifs_err("bad write buffer size %d for %d min. I/O unit", + c->max_write_size, c->min_io_size); + return -EINVAL; + } + + /* * UBIFS aligns all node to 8-byte boundary, so to make function in * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is * less than 8. @@ -541,6 +556,10 @@ static int init_constants_early(struct ubifs_info *c) if (c->min_io_size < 8) { c->min_io_size = 8; c->min_io_shift = 3; + if (c->max_write_size < c->min_io_size) { + c->max_write_size = c->min_io_size; + c->max_write_shift = c->min_io_shift; + } } c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); @@ -1202,11 +1221,14 @@ static int mount_ubifs(struct ubifs_info *c) if (c->bulk_read == 1) bu_init(c); - /* - * We have to check all CRCs, even for data nodes, when we mount the FS - * (specifically, when we are replaying). - */ - c->always_chk_crc = 1; + if (!c->ro_mount) { + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, + GFP_KERNEL); + if (!c->write_reserve_buf) + goto out_free; + } + + c->mounting = 1; err = ubifs_read_superblock(c); if (err) @@ -1382,7 +1404,7 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_infos; - c->always_chk_crc = 0; + c->mounting = 0; ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", c->vi.ubi_num, c->vi.vol_id, c->vi.name); @@ -1403,6 +1425,7 @@ static int mount_ubifs(struct ubifs_info *c) dbg_msg("compiled on: " __DATE__ " at " __TIME__); dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); + dbg_msg("max. write size: %d bytes", c->max_write_size); dbg_msg("LEB size: %d bytes (%d KiB)", c->leb_size, c->leb_size >> 10); dbg_msg("data journal heads: %d", @@ -1432,9 +1455,9 @@ static int mount_ubifs(struct ubifs_info *c) UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); - dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu", + dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, - UBIFS_MAX_DENT_NODE_SZ); + UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); dbg_msg("dead watermark: %d", c->dead_wm); dbg_msg("dark watermark: %d", c->dark_wm); dbg_msg("LEB overhead: %d", c->leb_overhead); @@ -1474,6 +1497,7 @@ out_wbufs: out_cbuf: kfree(c->cbuf); out_free: + kfree(c->write_reserve_buf); kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); @@ -1512,6 +1536,7 @@ static void ubifs_umount(struct ubifs_info *c) kfree(c->cbuf); kfree(c->rcvrd_mst_node); kfree(c->mst_node); + kfree(c->write_reserve_buf); kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); @@ -1543,7 +1568,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) mutex_lock(&c->umount_mutex); dbg_save_space_info(c); c->remounting_rw = 1; - c->always_chk_crc = 1; err = check_free_space(c); if (err) @@ -1598,6 +1622,10 @@ static int ubifs_remount_rw(struct ubifs_info *c) goto out; } + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL); + if (!c->write_reserve_buf) + goto out; + err = ubifs_lpt_init(c, 0, 1); if (err) goto out; @@ -1650,7 +1678,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) dbg_gen("re-mounted read-write"); c->ro_mount = 0; c->remounting_rw = 0; - c->always_chk_crc = 0; err = dbg_check_space_info(c); mutex_unlock(&c->umount_mutex); return err; @@ -1663,11 +1690,12 @@ out: c->bgt = NULL; } free_wbufs(c); + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); c->remounting_rw = 0; - c->always_chk_crc = 0; mutex_unlock(&c->umount_mutex); return err; } @@ -1707,6 +1735,8 @@ static void ubifs_remount_ro(struct ubifs_info *c) free_wbufs(c); vfree(c->orph_buf); c->orph_buf = NULL; + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); @@ -1937,6 +1967,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&c->mst_mutex); mutex_init(&c->umount_mutex); mutex_init(&c->bu_mutex); + mutex_init(&c->write_reserve_mutex); init_waitqueue_head(&c->cmt_wq); c->buds = RB_ROOT; c->old_idx = RB_ROOT; @@ -1954,6 +1985,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&c->old_buds); INIT_LIST_HEAD(&c->orph_list); INIT_LIST_HEAD(&c->orph_new); + c->no_chk_data_crc = 1; c->vfs_sb = sb; c->highest_inum = UBIFS_FIRST_INO; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index ad9cf01..de48597 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -447,8 +447,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, * * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc * is true (it is controlled by corresponding mount option). However, if - * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always - * checked. + * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to + * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is + * because during mounting or re-mounting from R/O mode to R/W mode we may read + * journal nodes (when replying the journal or doing the recovery) and the + * journal nodes may potentially be corrupted, so checking is required. */ static int try_read_node(const struct ubifs_info *c, void *buf, int type, int len, int lnum, int offs) @@ -476,7 +479,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, if (node_len != len) return 0; - if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc) + if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting && + !c->remounting_rw) return 1; crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 381d6b2..8c40ad3 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -151,6 +151,12 @@ */ #define WORST_COMPR_FACTOR 2 +/* + * How much memory is needed for a buffer where we comress a data node. + */ +#define COMPRESSED_DATA_NODE_BUF_SZ \ + (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR) + /* Maximum expected tree height for use by bottom_up_buf */ #define BOTTOM_UP_HEIGHT 64 @@ -646,6 +652,7 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @offs: write-buffer offset in this logical eraseblock * @avail: number of bytes available in the write-buffer * @used: number of used bytes in the write-buffer + * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range) * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, * %UBI_UNKNOWN) * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep @@ -680,6 +687,7 @@ struct ubifs_wbuf { int offs; int avail; int used; + int size; int dtype; int jhead; int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); @@ -1003,6 +1011,11 @@ struct ubifs_debug_info; * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu * @bu: pre-allocated bulk-read information * + * @write_reserve_mutex: protects @write_reserve_buf + * @write_reserve_buf: on the write path we allocate memory, which might + * sometimes be unavailable, in which case we use this + * write reserve buffer + * * @log_lebs: number of logical eraseblocks in the log * @log_bytes: log size in bytes * @log_last: last LEB of the log @@ -1024,7 +1037,12 @@ struct ubifs_debug_info; * * @min_io_size: minimal input/output unit size * @min_io_shift: number of bits in @min_io_size minus one + * @max_write_size: maximum amount of bytes the underlying flash can write at a + * time (MTD write buffer size) + * @max_write_shift: number of bits in @max_write_size minus one * @leb_size: logical eraseblock size in bytes + * @leb_start: starting offset of logical eraseblocks within physical + * eraseblocks * @half_leb_size: half LEB size * @idx_leb_size: how many bytes of an LEB are effectively available when it is * used to store indexing nodes (@leb_size - @max_idx_node_sz) @@ -1166,22 +1184,21 @@ struct ubifs_debug_info; * @rp_uid: reserved pool user ID * @rp_gid: reserved pool group ID * - * @empty: if the UBI device is empty + * @empty: %1 if the UBI device is empty + * @need_recovery: %1 if the file-system needs recovery + * @replaying: %1 during journal replay + * @mounting: %1 while mounting + * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode * @replay_tree: temporary tree used during journal replay * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay * @cs_sqnum: sequence number of first node in the log (commit start node) * @replay_sqnum: sequence number of node currently being replayed - * @need_recovery: file-system needs recovery - * @replaying: set to %1 during journal replay * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W * mode * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted * FS to R/W mode * @size_tree: inode size information for recovery - * @remounting_rw: set while re-mounting from R/O mode to R/W mode - * @always_chk_crc: always check CRCs (while mounting and remounting to R/W - * mode) * @mount_opts: UBIFS-specific mount options * * @dbg: debugging-related information @@ -1250,6 +1267,9 @@ struct ubifs_info { struct mutex bu_mutex; struct bu_info bu; + struct mutex write_reserve_mutex; + void *write_reserve_buf; + int log_lebs; long long log_bytes; int log_last; @@ -1271,7 +1291,10 @@ struct ubifs_info { int min_io_size; int min_io_shift; + int max_write_size; + int max_write_shift; int leb_size; + int leb_start; int half_leb_size; int idx_leb_size; int leb_cnt; @@ -1402,19 +1425,19 @@ struct ubifs_info { gid_t rp_gid; /* The below fields are used only during mounting and re-mounting */ - int empty; + unsigned int empty:1; + unsigned int need_recovery:1; + unsigned int replaying:1; + unsigned int mounting:1; + unsigned int remounting_rw:1; struct rb_root replay_tree; struct list_head replay_list; struct list_head replay_buds; unsigned long long cs_sqnum; unsigned long long replay_sqnum; - int need_recovery; - int replaying; struct list_head unclean_leb_list; struct ubifs_mst_node *rcvrd_mst_node; struct rb_root size_tree; - int remounting_rw; - int always_chk_crc; struct ubifs_mount_opts mount_opts; #ifdef CONFIG_UBIFS_FS_DEBUG diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 306ee39..8994dd0 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -31,7 +31,7 @@ #define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) #define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) #define udf_find_next_one_bit(addr, size, offset) \ - ext2_find_next_bit(addr, size, offset) + ext2_find_next_bit((unsigned long *)(addr), size, offset) static int read_block_bitmap(struct super_block *sb, struct udf_bitmap *bitmap, unsigned int block, @@ -297,7 +297,7 @@ repeat: break; } } else { - bit = udf_find_next_one_bit((char *)bh->b_data, + bit = udf_find_next_one_bit(bh->b_data, sb->s_blocksize << 3, group_start << 3); if (bit < sb->s_blocksize << 3) diff --git a/fs/udf/file.c b/fs/udf/file.c index 89c7848..f391a2a 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -123,8 +123,8 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) + pos + count)) { - udf_expand_file_adinicb(inode, pos + count, &err); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + err = udf_expand_file_adinicb(inode); + if (err) { udf_debug("udf_expand_adinicb: err=%d\n", err); up_write(&iinfo->i_data_sem); return err; @@ -237,7 +237,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); + error = udf_setsize(inode, attr->ia_size); if (error) return error; } @@ -249,5 +249,4 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr) const struct inode_operations udf_file_inode_operations = { .setattr = udf_setattr, - .truncate = udf_truncate, }; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index c6a2e78..ccc8143 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -73,14 +73,12 @@ void udf_evict_inode(struct inode *inode) struct udf_inode_info *iinfo = UDF_I(inode); int want_delete = 0; - truncate_inode_pages(&inode->i_data, 0); - if (!inode->i_nlink && !is_bad_inode(inode)) { want_delete = 1; - inode->i_size = 0; - udf_truncate(inode); + udf_setsize(inode, 0); udf_update_inode(inode, IS_SYNC(inode)); - } + } else + truncate_inode_pages(&inode->i_data, 0); invalidate_inode_buffers(inode); end_writeback(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && @@ -117,9 +115,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); + struct inode *inode = mapping->host; + struct udf_inode_info *iinfo = UDF_I(inode); + loff_t isize = inode->i_size; + + if (pos + len > isize) { + truncate_pagecache(inode, pos + len, isize); + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); + udf_truncate_extents(inode); + up_write(&iinfo->i_data_sem); + } + } } return ret; @@ -139,30 +146,31 @@ const struct address_space_operations udf_aops = { .bmap = udf_bmap, }; -void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err) +int udf_expand_file_adinicb(struct inode *inode) { struct page *page; char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); + int err; struct writeback_control udf_wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = 1, }; - /* from now on we have normal address_space methods */ - inode->i_data.a_ops = &udf_aops; - if (!iinfo->i_lenAlloc) { if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; + /* from now on we have normal address_space methods */ + inode->i_data.a_ops = &udf_aops; mark_inode_dirty(inode); - return; + return 0; } - page = grab_cache_page(inode->i_mapping, 0); - BUG_ON(!PageLocked(page)); + page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); + if (!page) + return -ENOMEM; if (!PageUptodate(page)) { kaddr = kmap(page); @@ -181,11 +189,24 @@ void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - - inode->i_data.a_ops->writepage(page, &udf_wbc); + /* from now on we have normal address_space methods */ + inode->i_data.a_ops = &udf_aops; + err = inode->i_data.a_ops->writepage(page, &udf_wbc); + if (err) { + /* Restore everything back so that we don't lose data... */ + lock_page(page); + kaddr = kmap(page); + memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, + inode->i_size); + kunmap(page); + unlock_page(page); + iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; + inode->i_data.a_ops = &udf_adinicb_aops; + } page_cache_release(page); - mark_inode_dirty(inode); + + return err; } struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, @@ -348,8 +369,10 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block, } /* Extend the file by 'blocks' blocks, return the number of extents added */ -int udf_extend_file(struct inode *inode, struct extent_position *last_pos, - struct kernel_long_ad *last_ext, sector_t blocks) +static int udf_do_extend_file(struct inode *inode, + struct extent_position *last_pos, + struct kernel_long_ad *last_ext, + sector_t blocks) { sector_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); @@ -357,6 +380,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, struct kernel_lb_addr prealloc_loc = {}; int prealloc_len = 0; struct udf_inode_info *iinfo; + int err; /* The previous extent is fake and we should not extend by anything * - there's nothing to do... */ @@ -422,26 +446,29 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos, /* Create enough extents to cover the whole hole */ while (blocks > add) { blocks -= add; - if (udf_add_aext(inode, last_pos, &last_ext->extLocation, - last_ext->extLength, 1) == -1) - return -1; + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err) + return err; count++; } if (blocks) { last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | (blocks << sb->s_blocksize_bits); - if (udf_add_aext(inode, last_pos, &last_ext->extLocation, - last_ext->extLength, 1) == -1) - return -1; + err = udf_add_aext(inode, last_pos, &last_ext->extLocation, + last_ext->extLength, 1); + if (err) + return err; count++; } out: /* Do we have some preallocated blocks saved? */ if (prealloc_len) { - if (udf_add_aext(inode, last_pos, &prealloc_loc, - prealloc_len, 1) == -1) - return -1; + err = udf_add_aext(inode, last_pos, &prealloc_loc, + prealloc_len, 1); + if (err) + return err; last_ext->extLocation = prealloc_loc; last_ext->extLength = prealloc_len; count++; @@ -453,11 +480,68 @@ out: else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) last_pos->offset -= sizeof(struct long_ad); else - return -1; + return -EIO; return count; } +static int udf_extend_file(struct inode *inode, loff_t newsize) +{ + + struct extent_position epos; + struct kernel_lb_addr eloc; + uint32_t elen; + int8_t etype; + struct super_block *sb = inode->i_sb; + sector_t first_block = newsize >> sb->s_blocksize_bits, offset; + int adsize; + struct udf_inode_info *iinfo = UDF_I(inode); + struct kernel_long_ad extent; + int err; + + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + BUG(); + + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); + + /* File has extent covering the new size (could happen when extending + * inside a block)? */ + if (etype != -1) + return 0; + if (newsize & (sb->s_blocksize - 1)) + offset++; + /* Extended file just to the boundary of the last file block? */ + if (offset == 0) + return 0; + + /* Truncate is extending the file by 'offset' blocks */ + if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || + (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { + /* File has no extents at all or has empty last + * indirect extent! Create a fake extent... */ + extent.extLocation.logicalBlockNum = 0; + extent.extLocation.partitionReferenceNum = 0; + extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; + } else { + epos.offset -= adsize; + etype = udf_next_aext(inode, &epos, &extent.extLocation, + &extent.extLength, 0); + extent.extLength |= etype << 30; + } + err = udf_do_extend_file(inode, &epos, &extent, offset); + if (err < 0) + goto out; + err = 0; + iinfo->i_lenExtents = newsize; +out: + brelse(epos.bh); + return err; +} + static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, int *err, sector_t *phys, int *new) { @@ -540,7 +624,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, elen = EXT_RECORDED_ALLOCATED | ((elen + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); - etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1); + udf_write_aext(inode, &cur_epos, &eloc, elen, 1); } brelse(prev_epos.bh); brelse(cur_epos.bh); @@ -564,19 +648,17 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, memset(&laarr[0].extLocation, 0x00, sizeof(struct kernel_lb_addr)); laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; - /* Will udf_extend_file() create real extent from + /* Will udf_do_extend_file() create real extent from a fake one? */ startnum = (offset > 0); } /* Create extents for the hole between EOF and offset */ - ret = udf_extend_file(inode, &prev_epos, laarr, offset); - if (ret == -1) { + ret = udf_do_extend_file(inode, &prev_epos, laarr, offset); + if (ret < 0) { brelse(prev_epos.bh); brelse(cur_epos.bh); brelse(next_epos.bh); - /* We don't really know the error here so we just make - * something up */ - *err = -ENOSPC; + *err = ret; return NULL; } c = 0; @@ -1005,52 +1087,66 @@ struct buffer_head *udf_bread(struct inode *inode, int block, return NULL; } -void udf_truncate(struct inode *inode) +int udf_setsize(struct inode *inode, loff_t newsize) { - int offset; int err; struct udf_inode_info *iinfo; + int bsize = 1 << inode->i_blkbits; if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) - return; + return -EINVAL; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; + return -EPERM; iinfo = UDF_I(inode); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + if (newsize > inode->i_size) { down_write(&iinfo->i_data_sem); - if (inode->i_sb->s_blocksize < - (udf_file_entry_alloc_offset(inode) + - inode->i_size)) { - udf_expand_file_adinicb(inode, inode->i_size, &err); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - inode->i_size = iinfo->i_lenAlloc; - up_write(&iinfo->i_data_sem); - return; + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + if (bsize < + (udf_file_entry_alloc_offset(inode) + newsize)) { + err = udf_expand_file_adinicb(inode); + if (err) { + up_write(&iinfo->i_data_sem); + return err; + } } else - udf_truncate_extents(inode); - } else { - offset = inode->i_size & (inode->i_sb->s_blocksize - 1); - memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset, - 0x00, inode->i_sb->s_blocksize - - offset - udf_file_entry_alloc_offset(inode)); - iinfo->i_lenAlloc = inode->i_size; + iinfo->i_lenAlloc = newsize; + } + err = udf_extend_file(inode, newsize); + if (err) { + up_write(&iinfo->i_data_sem); + return err; } + truncate_setsize(inode, newsize); up_write(&iinfo->i_data_sem); } else { - block_truncate_page(inode->i_mapping, inode->i_size, - udf_get_block); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + down_write(&iinfo->i_data_sem); + memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize, + 0x00, bsize - newsize - + udf_file_entry_alloc_offset(inode)); + iinfo->i_lenAlloc = newsize; + truncate_setsize(inode, newsize); + up_write(&iinfo->i_data_sem); + goto update_time; + } + err = block_truncate_page(inode->i_mapping, newsize, + udf_get_block); + if (err) + return err; down_write(&iinfo->i_data_sem); + truncate_setsize(inode, newsize); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); } - +update_time: inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); + return 0; } static void __udf_read_inode(struct inode *inode) @@ -1637,14 +1733,13 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) return NULL; } -int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, - struct kernel_lb_addr *eloc, uint32_t elen, int inc) +int udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; struct short_ad *sad = NULL; struct long_ad *lad = NULL; struct allocExtDesc *aed; - int8_t etype; uint8_t *ptr; struct udf_inode_info *iinfo = UDF_I(inode); @@ -1660,7 +1755,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else - return -1; + return -EIO; if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { unsigned char *sptr, *dptr; @@ -1672,12 +1767,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, obloc.partitionReferenceNum, obloc.logicalBlockNum, &err); if (!epos->block.logicalBlockNum) - return -1; + return -ENOSPC; nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, &epos->block, 0)); if (!nbh) - return -1; + return -EIO; lock_buffer(nbh); memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); set_buffer_uptodate(nbh); @@ -1746,7 +1841,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, epos->bh = nbh; } - etype = udf_write_aext(inode, epos, eloc, elen, inc); + udf_write_aext(inode, epos, eloc, elen, inc); if (!epos->bh) { iinfo->i_lenAlloc += adsize; @@ -1764,11 +1859,11 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, mark_buffer_dirty_inode(epos->bh, inode); } - return etype; + return 0; } -int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, - struct kernel_lb_addr *eloc, uint32_t elen, int inc) +void udf_write_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; uint8_t *ptr; @@ -1798,7 +1893,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, adsize = sizeof(struct long_ad); break; default: - return -1; + return; } if (epos->bh) { @@ -1817,8 +1912,6 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, if (inc) epos->offset += adsize; - - return (elen >> 30); } int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 225527c..8424308 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -197,6 +197,11 @@ static void udf_update_alloc_ext_desc(struct inode *inode, mark_buffer_dirty_inode(epos->bh, inode); } +/* + * Truncate extents of inode to inode->i_size. This function can be used only + * for making file shorter. For making file longer, udf_extend_file() has to + * be used. + */ void udf_truncate_extents(struct inode *inode) { struct extent_position epos; @@ -219,96 +224,65 @@ void udf_truncate_extents(struct inode *inode) etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); byte_offset = (offset << sb->s_blocksize_bits) + (inode->i_size & (sb->s_blocksize - 1)); - if (etype != -1) { - epos.offset -= adsize; - extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); - epos.offset += adsize; - if (byte_offset) - lenalloc = epos.offset; - else - lenalloc = epos.offset - adsize; - - if (!epos.bh) - lenalloc -= udf_file_entry_alloc_offset(inode); - else - lenalloc -= sizeof(struct allocExtDesc); - - while ((etype = udf_current_aext(inode, &epos, &eloc, - &elen, 0)) != -1) { - if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { - udf_write_aext(inode, &epos, &neloc, nelen, 0); - if (indirect_ext_len) { - /* We managed to free all extents in the - * indirect extent - free it too */ - BUG_ON(!epos.bh); - udf_free_blocks(sb, inode, &epos.block, - 0, indirect_ext_len); - } else if (!epos.bh) { - iinfo->i_lenAlloc = lenalloc; - mark_inode_dirty(inode); - } else - udf_update_alloc_ext_desc(inode, - &epos, lenalloc); - brelse(epos.bh); - epos.offset = sizeof(struct allocExtDesc); - epos.block = eloc; - epos.bh = udf_tread(sb, - udf_get_lb_pblock(sb, &eloc, 0)); - if (elen) - indirect_ext_len = - (elen + sb->s_blocksize - 1) >> - sb->s_blocksize_bits; - else - indirect_ext_len = 1; - } else { - extent_trunc(inode, &epos, &eloc, etype, - elen, 0); - epos.offset += adsize; - } - } + if (etype == -1) { + /* We should extend the file? */ + WARN_ON(byte_offset); + return; + } + epos.offset -= adsize; + extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); + epos.offset += adsize; + if (byte_offset) + lenalloc = epos.offset; + else + lenalloc = epos.offset - adsize; - if (indirect_ext_len) { - BUG_ON(!epos.bh); - udf_free_blocks(sb, inode, &epos.block, 0, - indirect_ext_len); - } else if (!epos.bh) { - iinfo->i_lenAlloc = lenalloc; - mark_inode_dirty(inode); - } else - udf_update_alloc_ext_desc(inode, &epos, lenalloc); - } else if (inode->i_size) { - if (byte_offset) { - struct kernel_long_ad extent; + if (!epos.bh) + lenalloc -= udf_file_entry_alloc_offset(inode); + else + lenalloc -= sizeof(struct allocExtDesc); - /* - * OK, there is not extent covering inode->i_size and - * no extent above inode->i_size => truncate is - * extending the file by 'offset' blocks. - */ - if ((!epos.bh && - epos.offset == - udf_file_entry_alloc_offset(inode)) || - (epos.bh && epos.offset == - sizeof(struct allocExtDesc))) { - /* File has no extents at all or has empty last - * indirect extent! Create a fake extent... */ - extent.extLocation.logicalBlockNum = 0; - extent.extLocation.partitionReferenceNum = 0; - extent.extLength = - EXT_NOT_RECORDED_NOT_ALLOCATED; - } else { - epos.offset -= adsize; - etype = udf_next_aext(inode, &epos, - &extent.extLocation, - &extent.extLength, 0); - extent.extLength |= etype << 30; - } - udf_extend_file(inode, &epos, &extent, - offset + - ((inode->i_size & - (sb->s_blocksize - 1)) != 0)); + while ((etype = udf_current_aext(inode, &epos, &eloc, + &elen, 0)) != -1) { + if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + udf_write_aext(inode, &epos, &neloc, nelen, 0); + if (indirect_ext_len) { + /* We managed to free all extents in the + * indirect extent - free it too */ + BUG_ON(!epos.bh); + udf_free_blocks(sb, inode, &epos.block, + 0, indirect_ext_len); + } else if (!epos.bh) { + iinfo->i_lenAlloc = lenalloc; + mark_inode_dirty(inode); + } else + udf_update_alloc_ext_desc(inode, + &epos, lenalloc); + brelse(epos.bh); + epos.offset = sizeof(struct allocExtDesc); + epos.block = eloc; + epos.bh = udf_tread(sb, + udf_get_lb_pblock(sb, &eloc, 0)); + if (elen) + indirect_ext_len = + (elen + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + else + indirect_ext_len = 1; + } else { + extent_trunc(inode, &epos, &eloc, etype, elen, 0); + epos.offset += adsize; } } + + if (indirect_ext_len) { + BUG_ON(!epos.bh); + udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len); + } else if (!epos.bh) { + iinfo->i_lenAlloc = lenalloc; + mark_inode_dirty(inode); + } else + udf_update_alloc_ext_desc(inode, &epos, lenalloc); iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index eba4820..dbd52d4b 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -136,22 +136,20 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, extern long udf_ioctl(struct file *, unsigned int, unsigned long); /* inode.c */ extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); -extern void udf_expand_file_adinicb(struct inode *, int, int *); +extern int udf_expand_file_adinicb(struct inode *); extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); extern struct buffer_head *udf_bread(struct inode *, int, int, int *); -extern void udf_truncate(struct inode *); +extern int udf_setsize(struct inode *, loff_t); extern void udf_read_inode(struct inode *); extern void udf_evict_inode(struct inode *); extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); -extern int udf_extend_file(struct inode *, struct extent_position *, - struct kernel_long_ad *, sector_t); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, struct kernel_lb_addr *, uint32_t *, sector_t *); -extern int8_t udf_add_aext(struct inode *, struct extent_position *, +extern int udf_add_aext(struct inode *, struct extent_position *, + struct kernel_lb_addr *, uint32_t, int); +extern void udf_write_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); -extern int8_t udf_write_aext(struct inode *, struct extent_position *, - struct kernel_lb_addr *, uint32_t, int); extern int8_t udf_delete_aext(struct inode *, struct extent_position, struct kernel_lb_addr, uint32_t); extern int8_t udf_next_aext(struct inode *, struct extent_position *, |