diff options
Diffstat (limited to 'fs')
186 files changed, 4929 insertions, 3793 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index c061c3f..24eb010 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -30,8 +30,8 @@ #include <linux/parser.h> #include <linux/idr.h> #include <net/9p/9p.h> -#include <net/9p/transport.h> #include <net/9p/client.h> +#include <net/9p/transport.h> #include "v9fs.h" #include "v9fs_vfs.h" @@ -234,7 +234,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (!v9ses->clnt->dotu) v9ses->flags &= ~V9FS_EXTENDED; - v9ses->maxdata = v9ses->clnt->msize; + v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; /* for legacy mode, fall back to V9FS_ACCESS_ANY */ if (!v9fs_extended(v9ses) && diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 57997fa..c295ba7 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -46,9 +46,11 @@ extern struct dentry_operations v9fs_cached_dentry_operations; struct inode *v9fs_get_inode(struct super_block *sb, int mode); ino_t v9fs_qid2ino(struct p9_qid *qid); -void v9fs_stat2inode(struct p9_stat *, struct inode *, struct super_block *); +void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); -void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat); +void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); void v9fs_dentry_release(struct dentry *); int v9fs_uflags2omode(int uflags, int extended); + +ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 97d3aed..6fcb1e7 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -38,7 +38,6 @@ #include "v9fs.h" #include "v9fs_vfs.h" -#include "fid.h" /** * v9fs_vfs_readpage - read an entire page in from 9P @@ -53,14 +52,12 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page) int retval; loff_t offset; char *buffer; - struct p9_fid *fid; P9_DPRINTK(P9_DEBUG_VFS, "\n"); - fid = 
filp->private_data; buffer = kmap(page); offset = page_offset(page); - retval = p9_client_readn(fid, buffer, offset, PAGE_CACHE_SIZE); + retval = v9fs_file_readn(filp, buffer, NULL, offset, PAGE_CACHE_SIZE); if (retval < 0) goto done; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index e298fe1..873cd31 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -45,7 +45,7 @@ * */ -static inline int dt_type(struct p9_stat *mistat) +static inline int dt_type(struct p9_wstat *mistat) { unsigned long perm = mistat->mode; int rettype = DT_REG; @@ -69,32 +69,58 @@ static inline int dt_type(struct p9_stat *mistat) static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) { int over; + struct p9_wstat st; + int err; struct p9_fid *fid; - struct v9fs_session_info *v9ses; - struct inode *inode; - struct p9_stat *st; + int buflen; + char *statbuf; + int n, i = 0; P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); - inode = filp->f_path.dentry->d_inode; - v9ses = v9fs_inode2v9ses(inode); fid = filp->private_data; - while ((st = p9_client_dirread(fid, filp->f_pos)) != NULL) { - if (IS_ERR(st)) - return PTR_ERR(st); - over = filldir(dirent, st->name.str, st->name.len, filp->f_pos, - v9fs_qid2ino(&st->qid), dt_type(st)); + buflen = fid->clnt->msize - P9_IOHDRSZ; + statbuf = kmalloc(buflen, GFP_KERNEL); + if (!statbuf) + return -ENOMEM; - if (over) + while (1) { + err = v9fs_file_readn(filp, statbuf, NULL, buflen, + fid->rdir_fpos); + if (err <= 0) break; - filp->f_pos += st->size; - kfree(st); - st = NULL; + n = err; + while (i < n) { + err = p9stat_read(statbuf + i, buflen-i, &st, + fid->clnt->dotu); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + err = -EIO; + p9stat_free(&st); + goto free_and_exit; + } + + i += st.size+2; + fid->rdir_fpos += st.size+2; + + over = filldir(dirent, st.name, strlen(st.name), + filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st)); + + filp->f_pos += st.size+2; + + p9stat_free(&st); + + if (over) 
{ + err = 0; + goto free_and_exit; + } + } } - kfree(st); - return 0; +free_and_exit: + kfree(statbuf); + return err; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 52944d2..68bf2af 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -120,23 +120,72 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) } /** - * v9fs_file_read - read from a file + * v9fs_file_readn - read from a file * @filp: file pointer to read * @data: data buffer to read data into + * @udata: user data buffer to read data into * @count: size of buffer * @offset: offset at which to read data * */ + +ssize_t +v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, + u64 offset) +{ + int n, total; + struct p9_fid *fid = filp->private_data; + + P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, + (long long unsigned) offset, count); + + n = 0; + total = 0; + do { + n = p9_client_read(fid, data, udata, offset, count); + if (n <= 0) + break; + + if (data) + data += n; + if (udata) + udata += n; + + offset += n; + count -= n; + total += n; + } while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ)); + + if (n < 0) + total = n; + + return total; +} + +/** + * v9fs_file_read - read from a file + * @filp: file pointer to read + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ + static ssize_t -v9fs_file_read(struct file *filp, char __user * data, size_t count, +v9fs_file_read(struct file *filp, char __user *udata, size_t count, loff_t * offset) { int ret; struct p9_fid *fid; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); fid = filp->private_data; - ret = p9_client_uread(fid, data, *offset, count); + + if (count > (fid->clnt->msize - P9_IOHDRSZ)) + ret = v9fs_file_readn(filp, NULL, udata, count, *offset); + else + ret = p9_client_read(fid, NULL, udata, *offset, count); + if (ret > 0) *offset += 
ret; @@ -156,19 +205,38 @@ static ssize_t v9fs_file_write(struct file *filp, const char __user * data, size_t count, loff_t * offset) { - int ret; + int n, rsize, total = 0; struct p9_fid *fid; + struct p9_client *clnt; struct inode *inode = filp->f_path.dentry->d_inode; + int origin = *offset; P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); fid = filp->private_data; - ret = p9_client_uwrite(fid, data, *offset, count); - if (ret > 0) { - invalidate_inode_pages2_range(inode->i_mapping, *offset, - *offset+ret); - *offset += ret; + clnt = fid->clnt; + + rsize = fid->iounit; + if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) + rsize = clnt->msize - P9_IOHDRSZ; + + do { + if (count < rsize) + rsize = count; + + n = p9_client_write(fid, NULL, data+total, *offset+total, + rsize); + if (n <= 0) + break; + count -= n; + total += n; + } while (count > 0); + + if (total > 0) { + invalidate_inode_pages2_range(inode->i_mapping, origin, + origin+total); + *offset += total; } if (*offset > inode->i_size) { @@ -176,7 +244,10 @@ v9fs_file_write(struct file *filp, const char __user * data, inode->i_blocks = (inode->i_size + 512 - 1) >> 9; } - return ret; + if (n < 0) + return n; + + return total; } static const struct file_operations v9fs_cached_file_operations = { diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index e83aa5e..8314d3f4 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -334,7 +334,7 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, { int err, umode; struct inode *ret; - struct p9_stat *st; + struct p9_wstat *st; ret = NULL; st = p9_client_stat(fid); @@ -417,6 +417,8 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, struct p9_fid *dfid, *ofid, *fid; struct inode *inode; + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + err = 0; ofid = NULL; fid = NULL; @@ -424,6 +426,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, dfid = 
v9fs_fid_clone(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err); dfid = NULL; goto error; } @@ -432,18 +435,22 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, ofid = p9_client_walk(dfid, 0, NULL, 1); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); ofid = NULL; goto error; } err = p9_client_fcreate(ofid, name, perm, mode, extension); - if (err < 0) + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); goto error; + } /* now walk from the parent so we can get unopened fid */ fid = p9_client_walk(dfid, 1, &name, 0); if (IS_ERR(fid)) { err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; } else @@ -453,6 +460,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } @@ -734,7 +742,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, int err; struct v9fs_session_info *v9ses; struct p9_fid *fid; - struct p9_stat *st; + struct p9_wstat *st; P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; @@ -815,10 +823,9 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) */ void -v9fs_stat2inode(struct p9_stat *stat, struct inode *inode, +v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb) { - int n; char ext[32]; struct v9fs_session_info *v9ses = sb->s_fs_info; @@ -842,11 +849,7 @@ v9fs_stat2inode(struct p9_stat *stat, struct inode *inode, int major = -1; int minor = -1; - n = stat->extension.len; - if (n > sizeof(ext)-1) - n = sizeof(ext)-1; - memmove(ext, stat->extension.str, n); - ext[n] = 0; + strncpy(ext, stat->extension, sizeof(ext)); sscanf(ext, "%c %u %u", &type, &major, &minor); switch (type) 
{ case 'c': @@ -857,10 +860,11 @@ v9fs_stat2inode(struct p9_stat *stat, struct inode *inode, break; default: P9_DPRINTK(P9_DEBUG_ERROR, - "Unknown special type %c (%.*s)\n", type, - stat->extension.len, stat->extension.str); + "Unknown special type %c %s\n", type, + stat->extension); }; inode->i_rdev = MKDEV(major, minor); + init_special_inode(inode, inode->i_mode, inode->i_rdev); } else inode->i_rdev = 0; @@ -904,7 +908,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) struct v9fs_session_info *v9ses; struct p9_fid *fid; - struct p9_stat *st; + struct p9_wstat *st; P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); retval = -EPERM; @@ -926,15 +930,10 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) } /* copy extension buffer into buffer */ - if (st->extension.len < buflen) - buflen = st->extension.len + 1; - - memmove(buffer, st->extension.str, buflen - 1); - buffer[buflen-1] = 0; + strncpy(buffer, st->extension, buflen); P9_DPRINTK(P9_DEBUG_VFS, - "%s -> %.*s (%s)\n", dentry->d_name.name, st->extension.len, - st->extension.str, buffer); + "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer); retval = buflen; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index bf59c39..d6cb1a0 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -111,7 +111,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; - struct p9_stat *st = NULL; + struct p9_wstat *st = NULL; int mode = S_IRWXUGO | S_ISVTX; uid_t uid = current->fsuid; gid_t gid = current->fsgid; @@ -161,10 +161,14 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, sb->s_root = root; root->d_inode->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, root->d_inode, sb); + v9fs_fid_add(root, fid); + p9stat_free(st); kfree(st); +P9_DPRINTK(P9_DEBUG_VFS, " return simple set mount\n"); return simple_set_mnt(mnt, 
sb); release_sb: @@ -6,61 +6,9 @@ menu "File systems" if BLOCK -config EXT2_FS - tristate "Second extended fs support" - help - Ext2 is a standard Linux file system for hard disks. - - To compile this file system support as a module, choose M here: the - module will be called ext2. - - If unsure, say Y. - -config EXT2_FS_XATTR - bool "Ext2 extended attributes" - depends on EXT2_FS - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - -config EXT2_FS_POSIX_ACL - bool "Ext2 POSIX Access Control Lists" - depends on EXT2_FS_XATTR - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config EXT2_FS_SECURITY - bool "Ext2 Security Labels" - depends on EXT2_FS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the ext2 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config EXT2_FS_XIP - bool "Ext2 execute in place support" - depends on EXT2_FS && MMU - help - Execute in place can be used on memory-backed block devices. If you - enable this option, you can select to mount block devices which are - capable of this feature without using the page cache. - - If you do not use a block device that is capable of using this, - or if unsure, say N. 
+source "fs/ext2/Kconfig" +source "fs/ext3/Kconfig" +source "fs/ext4/Kconfig" config FS_XIP # execute in place @@ -68,225 +16,16 @@ config FS_XIP depends on EXT2_FS_XIP default y -config EXT3_FS - tristate "Ext3 journalling file system support" - select JBD - help - This is the journalling version of the Second extended file system - (often called ext3), the de facto standard Linux file system - (method to organize files on a storage device) for hard disks. - - The journalling code included in this driver means you do not have - to run e2fsck (file system checker) on your file systems after a - crash. The journal keeps track of any changes that were being made - at the time the system crashed, and can ensure that your file system - is consistent without the need for a lengthy check. - - Other than adding the journal to the file system, the on-disk format - of ext3 is identical to ext2. It is possible to freely switch - between using the ext3 driver and the ext2 driver, as long as the - file system has been cleanly unmounted, or e2fsck is run on the file - system. - - To add a journal on an existing ext2 file system or change the - behavior of ext3 file systems, you can use the tune2fs utility ("man - tune2fs"). To modify attributes of files and directories on ext3 - file systems, use chattr ("man chattr"). You need to be using - e2fsprogs version 1.20 or later in order to create ext3 journals - (available at <http://sourceforge.net/projects/e2fsprogs/>). - - To compile this file system support as a module, choose M here: the - module will be called ext3. - -config EXT3_FS_XATTR - bool "Ext3 extended attributes" - depends on EXT3_FS - default y - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - - You need this for POSIX ACL support on ext3. 
- -config EXT3_FS_POSIX_ACL - bool "Ext3 POSIX Access Control Lists" - depends on EXT3_FS_XATTR - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config EXT3_FS_SECURITY - bool "Ext3 Security Labels" - depends on EXT3_FS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the ext3 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config EXT4_FS - tristate "The Extended 4 (ext4) filesystem" - select JBD2 - select CRC16 - help - This is the next generation of the ext3 filesystem. - - Unlike the change from ext2 filesystem to ext3 filesystem, - the on-disk format of ext4 is not forwards compatible with - ext3; it is based on extent maps and it supports 48-bit - physical block numbers. The ext4 filesystem also supports delayed - allocation, persistent preallocation, high resolution time stamps, - and a number of other features to improve performance and speed - up fsck time. For more information, please see the web pages at - http://ext4.wiki.kernel.org. - - The ext4 filesystem will support mounting an ext3 - filesystem; while there will be some performance gains from - the delayed allocation and inode table readahead, the best - performance gains will require enabling ext4 features in the - filesystem, or formating a new filesystem as an ext4 - filesystem initially. - - To compile this file system support as a module, choose M here. The - module will be called ext4dev. - - If unsure, say N. 
- -config EXT4DEV_COMPAT - bool "Enable ext4dev compatibility" - depends on EXT4_FS - help - Starting with 2.6.28, the name of the ext4 filesystem was - renamed from ext4dev to ext4. Unfortunately there are some - legacy userspace programs (such as klibc's fstype) have - "ext4dev" hardcoded. - - To enable backwards compatibility so that systems that are - still expecting to mount ext4 filesystems using ext4dev, - chose Y here. This feature will go away by 2.6.31, so - please arrange to get your userspace programs fixed! - -config EXT4_FS_XATTR - bool "Ext4 extended attributes" - depends on EXT4_FS - default y - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - - You need this for POSIX ACL support on ext4. - -config EXT4_FS_POSIX_ACL - bool "Ext4 POSIX Access Control Lists" - depends on EXT4_FS_XATTR - select FS_POSIX_ACL - help - POSIX Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config EXT4_FS_SECURITY - bool "Ext4 Security Labels" - depends on EXT4_FS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the ext4 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config JBD - tristate - help - This is a generic journalling layer for block devices. It is - currently used by the ext3 file system, but it could also be - used to add journal support to other file systems or block - devices such as RAID or LVM. 
- - If you are using the ext3 file system, you need to say Y here. - If you are not using ext3 then you will probably want to say N. - - To compile this device as a module, choose M here: the module will be - called jbd. If you are compiling ext3 into the kernel, you - cannot compile this code as a module. - -config JBD_DEBUG - bool "JBD (ext3) debugging support" - depends on JBD && DEBUG_FS - help - If you are using the ext3 journaled file system (or potentially any - other file system/device using JBD), this option allows you to - enable debugging output while the system is running, in order to - help track down any problems you are having. By default the - debugging output will be turned off. - - If you select Y here, then you will be able to turn on debugging - with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a - number between 1 and 5, the higher the number, the more debugging - output is generated. To turn debugging off again, do - "echo 0 > /sys/kernel/debug/jbd/jbd-debug". - -config JBD2 - tristate - select CRC32 - help - This is a generic journaling layer for block devices that support - both 32-bit and 64-bit block numbers. It is currently used by - the ext4 and OCFS2 filesystems, but it could also be used to add - journal support to other file systems or block devices such - as RAID or LVM. - - If you are using ext4 or OCFS2, you need to say Y here. - If you are not using ext4 or OCFS2 then you will - probably want to say N. - - To compile this device as a module, choose M here. The module will be - called jbd2. If you are compiling ext4 or OCFS2 into the kernel, - you cannot compile this code as a module. 
- -config JBD2_DEBUG - bool "JBD2 (ext4) debugging support" - depends on JBD2 && DEBUG_FS - help - If you are using the ext4 journaled file system (or - potentially any other filesystem/device using JBD2), this option - allows you to enable debugging output while the system is running, - in order to help track down any problems you are having. - By default, the debugging output will be turned off. - - If you select Y here, then you will be able to turn on debugging - with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a - number between 1 and 5. The higher the number, the more debugging - output is generated. To turn debugging off again, do - "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". +source "fs/jbd/Kconfig" +source "fs/jbd2/Kconfig" config FS_MBCACHE # Meta block cache for Extended Attributes (ext2/ext3/ext4) tristate - depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR - default y if EXT2_FS=y || EXT3_FS=y || EXT4_FS=y - default m if EXT2_FS=m || EXT3_FS=m || EXT4_FS=m + default y if EXT2_FS=y && EXT2_FS_XATTR + default y if EXT3_FS=y && EXT3_FS_XATTR + default y if EXT4_FS=y && EXT4_FS_XATTR + default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR config REISERFS_FS tristate "Reiserfs support" @@ -665,7 +404,7 @@ config AUTOFS4_FS N here. config FUSE_FS - tristate "Filesystem in Userspace support" + tristate "FUSE (Filesystem in Userspace) support" help With FUSE it is possible to implement a fully functional filesystem in a userspace program. @@ -1168,195 +907,7 @@ config EFS_FS To compile the EFS file system support as a module, choose M here: the module will be called efs. -config JFFS2_FS - tristate "Journalling Flash File System v2 (JFFS2) support" - select CRC32 - depends on MTD - help - JFFS2 is the second generation of the Journalling Flash File System - for use on diskless embedded devices. It provides improved wear - levelling, compression and support for hard links. 
You cannot use - this on normal block devices, only on 'MTD' devices. - - Further information on the design and implementation of JFFS2 is - available at <http://sources.redhat.com/jffs2/>. - -config JFFS2_FS_DEBUG - int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)" - depends on JFFS2_FS - default "0" - help - This controls the amount of debugging messages produced by the JFFS2 - code. Set it to zero for use in production systems. For evaluation, - testing and debugging, it's advisable to set it to one. This will - enable a few assertions and will print debugging messages at the - KERN_DEBUG loglevel, where they won't normally be visible. Level 2 - is unlikely to be useful - it enables extra debugging in certain - areas which at one point needed debugging, but when the bugs were - located and fixed, the detailed messages were relegated to level 2. - - If reporting bugs, please try to have available a full dump of the - messages at debug level 1 while the misbehaviour was occurring. - -config JFFS2_FS_WRITEBUFFER - bool "JFFS2 write-buffering support" - depends on JFFS2_FS - default y - help - This enables the write-buffering support in JFFS2. - - This functionality is required to support JFFS2 on the following - types of flash devices: - - NAND flash - - NOR flash with transparent ECC - - DataFlash - -config JFFS2_FS_WBUF_VERIFY - bool "Verify JFFS2 write-buffer reads" - depends on JFFS2_FS_WRITEBUFFER - default n - help - This causes JFFS2 to read back every page written through the - write-buffer, and check for errors. - -config JFFS2_SUMMARY - bool "JFFS2 summary support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL - default n - help - This feature makes it possible to use summary information - for faster filesystem mount. - - The summary information can be inserted into a filesystem image - by the utility 'sumtool'. - - If unsure, say 'N'. 
- -config JFFS2_FS_XATTR - bool "JFFS2 XATTR support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL - default n - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - -config JFFS2_FS_POSIX_ACL - bool "JFFS2 POSIX Access Control Lists" - depends on JFFS2_FS_XATTR - default y - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config JFFS2_FS_SECURITY - bool "JFFS2 Security Labels" - depends on JFFS2_FS_XATTR - default y - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the jffs2 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config JFFS2_COMPRESSION_OPTIONS - bool "Advanced compression options for JFFS2" - depends on JFFS2_FS - default n - help - Enabling this option allows you to explicitly choose which - compression modules, if any, are enabled in JFFS2. Removing - compressors can mean you cannot read existing file systems, - and enabling experimental compressors can mean that you - write a file system which cannot be read by a standard kernel. - - If unsure, you should _definitely_ say 'N'. 
- -config JFFS2_ZLIB - bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS - select ZLIB_INFLATE - select ZLIB_DEFLATE - depends on JFFS2_FS - default y - help - Zlib is designed to be a free, general-purpose, legally unencumbered, - lossless data-compression library for use on virtually any computer - hardware and operating system. See <http://www.gzip.org/zlib/> for - further information. - - Say 'Y' if unsure. - -config JFFS2_LZO - bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS - select LZO_COMPRESS - select LZO_DECOMPRESS - depends on JFFS2_FS - default n - help - minilzo-based compression. Generally works better than Zlib. - - This feature was added in July, 2007. Say 'N' if you need - compatibility with older bootloaders or kernels. - -config JFFS2_RTIME - bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS - depends on JFFS2_FS - default y - help - Rtime does manage to recompress already-compressed data. Say 'Y' if unsure. - -config JFFS2_RUBIN - bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS - depends on JFFS2_FS - default n - help - RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure. - -choice - prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS - default JFFS2_CMODE_PRIORITY - depends on JFFS2_FS - help - You can set here the default compression mode of JFFS2 from - the available compression modes. Don't touch if unsure. - -config JFFS2_CMODE_NONE - bool "no compression" - help - Uses no compression. - -config JFFS2_CMODE_PRIORITY - bool "priority" - help - Tries the compressors in a predefined order and chooses the first - successful one. - -config JFFS2_CMODE_SIZE - bool "size (EXPERIMENTAL)" - help - Tries all compressors and chooses the one which has the smallest - result. 
- -config JFFS2_CMODE_FAVOURLZO - bool "Favour LZO" - help - Tries all compressors and chooses the one which has the smallest - result but gives some preference to LZO (which has faster - decompression) at the expense of size. - -endchoice - +source "fs/jffs2/Kconfig" # UBIFS File system configuration source "fs/ubifs/Kconfig" @@ -1913,148 +1464,7 @@ config SMB_NLS_REMOTE smbmount from samba 2.2.0 or later supports this. -config CIFS - tristate "CIFS support (advanced network filesystem, SMBFS successor)" - depends on INET - select NLS - help - This is the client VFS module for the Common Internet File System - (CIFS) protocol which is the successor to the Server Message Block - (SMB) protocol, the native file sharing mechanism for most early - PC operating systems. The CIFS protocol is fully supported by - file servers such as Windows 2000 (including Windows 2003, NT 4 - and Windows XP) as well by Samba (which provides excellent CIFS - server support for Linux and many other operating systems). Limited - support for OS/2 and Windows ME and similar servers is provided as - well. - - The cifs module provides an advanced network file system - client for mounting to CIFS compliant servers. It includes - support for DFS (hierarchical name space), secure per-user - session establishment via Kerberos or NTLM or NTLMv2, - safe distributed caching (oplock), optional packet - signing, Unicode and other internationalization improvements. - If you need to mount to Samba or Windows from this machine, say Y. 
- -config CIFS_STATS - bool "CIFS statistics" - depends on CIFS - help - Enabling this option will cause statistics for each server share - mounted by the cifs client to be displayed in /proc/fs/cifs/Stats - -config CIFS_STATS2 - bool "Extended statistics" - depends on CIFS_STATS - help - Enabling this option will allow more detailed statistics on SMB - request timing to be displayed in /proc/fs/cifs/DebugData and also - allow optional logging of slow responses to dmesg (depending on the - value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details). - These additional statistics may have a minor effect on performance - and memory utilization. - - Unless you are a developer or are doing network performance analysis - or tuning, say N. - -config CIFS_WEAK_PW_HASH - bool "Support legacy servers which use weaker LANMAN security" - depends on CIFS - help - Modern CIFS servers including Samba and most Windows versions - (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) - security mechanisms. These hash the password more securely - than the mechanisms used in the older LANMAN version of the - SMB protocol but LANMAN based authentication is needed to - establish sessions with some old SMB servers. - - Enabling this option allows the cifs module to mount to older - LANMAN based servers such as OS/2 and Windows 95, but such - mounts may be less secure than mounts using NTLM or more recent - security mechanisms if you are on a public network. Unless you - have a need to access old SMB servers (and are on a private - network) you probably want to say N. Even if this support - is enabled in the kernel build, LANMAN authentication will not be - used automatically. At runtime LANMAN mounts are disabled but - can be set to required (or optional) either in - /proc/fs/cifs (see fs/cifs/README for more detail) or via an - option on the mount command. This support is disabled by - default in order to reduce the possibility of a downgrade - attack. 
- - If unsure, say N. - -config CIFS_UPCALL - bool "Kerberos/SPNEGO advanced session setup" - depends on CIFS && KEYS - help - Enables an upcall mechanism for CIFS which accesses - userspace helper utilities to provide SPNEGO packaged (RFC 4178) - Kerberos tickets which are needed to mount to certain secure servers - (for which more secure Kerberos authentication is required). If - unsure, say N. - -config CIFS_XATTR - bool "CIFS extended attributes" - depends on CIFS - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). CIFS maps the name of - extended attributes beginning with the user namespace prefix - to SMB/CIFS EAs. EAs are stored on Windows servers without the - user namespace prefix, but their names are seen by Linux cifs clients - prefaced by the user namespace prefix. The system namespace - (used by some filesystems to store ACLs) is not supported at - this time. - - If unsure, say N. - -config CIFS_POSIX - bool "CIFS POSIX Extensions" - depends on CIFS_XATTR - help - Enabling this option will cause the cifs client to attempt to - negotiate a newer dialect with servers, such as Samba 3.0.5 - or later, that optionally can handle more POSIX like (rather - than Windows like) file behavior. It also enables - support for POSIX ACLs (getfacl and setfacl) to servers - (such as Samba 3.10 and later) which can negotiate - CIFS POSIX ACL support. If unsure, say N. - -config CIFS_DEBUG2 - bool "Enable additional CIFS debugging routines" - depends on CIFS - help - Enabling this option adds a few more debugging routines - to the cifs code which slightly increases the size of - the cifs module and can cause additional logging of debug - messages in some error paths, slowing performance. This - option can be turned off unless you are debugging - cifs problems. If unsure, say N. 
- -config CIFS_EXPERIMENTAL - bool "CIFS Experimental Features (EXPERIMENTAL)" - depends on CIFS && EXPERIMENTAL - help - Enables cifs features under testing. These features are - experimental and currently include DFS support and directory - change notification ie fcntl(F_DNOTIFY), as well as the upcall - mechanism which will be used for Kerberos session negotiation - and uid remapping. Some of these features also may depend on - setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental - (which is disabled by default). See the file fs/cifs/README - for more details. If unsure, say N. - -config CIFS_DFS_UPCALL - bool "DFS feature support (EXPERIMENTAL)" - depends on CIFS_EXPERIMENTAL - depends on KEYS - help - Enables an upcall mechanism for CIFS which contacts userspace - helper utilities to provide server name resolution (host names to - IP addresses) which is needed for implicit mounts of DFS junction - points. If unsure, say N. +source "fs/cifs/Kconfig" config NCP_FS tristate "NCP file system support (to mount NetWare volumes)" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 801db13..ce9fb3f 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -40,6 +40,28 @@ config BINFMT_ELF_FDPIC It is also possible to run FDPIC ELF binaries on MMU linux also. +config CORE_DUMP_DEFAULT_ELF_HEADERS + bool "Write ELF core dumps with partial segments" + default n + depends on BINFMT_ELF + help + ELF core dump files describe each memory mapping of the crashed + process, and can contain or omit the memory contents of each one. + The contents of an unmodified text mapping are omitted by default. + + For an unmodified text mapping of an ELF object, including just + the first page of the file in a core dump makes it possible to + identify the build ID bits in the file, without paying the i/o + cost and disk space to dump all the text. However, versions of + GDB before 6.7 are confused by ELF core dump files in this format. 
+ + The core dump behavior can be controlled per process using + the /proc/PID/coredump_filter pseudo-file; this setting is + inherited. See Documentation/filesystems/proc.txt for details. + + This config option changes the default setting of coredump_filter + seen at boot time. If unsure, say N. + config BINFMT_FLAT bool "Kernel support for flat binaries" depends on !MMU && (!FRV || BROKEN) diff --git a/fs/Makefile b/fs/Makefile index d0c69f5..2168c90 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 -obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev +obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4 obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_EXT2_FS) += ext2/ diff --git a/fs/afs/dir.c b/fs/afs/dir.c index dfda03d..99cf390 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -45,6 +45,7 @@ const struct file_operations afs_dir_file_operations = { .release = afs_release, .readdir = afs_readdir, .lock = afs_lock, + .llseek = generic_file_llseek, }; const struct inode_operations afs_dir_inode_operations = { @@ -159,17 +159,17 @@ int notify_change(struct dentry * dentry, struct iattr * attr) if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID))) return 0; + error = security_inode_setattr(dentry, attr); + if (error) + return error; + if (ia_valid & ATTR_SIZE) down_write(&dentry->d_inode->i_alloc_sem); if (inode->i_op && inode->i_op->setattr) { - error = security_inode_setattr(dentry, attr); - if (!error) - error = inode->i_op->setattr(dentry, attr); + error = inode->i_op->setattr(dentry, attr); } else { error = inode_change_ok(inode, attr); - if (!error) - error = security_inode_setattr(dentry, attr); if (!error) { if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != 
inode->i_gid)) diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index ed8feb0..daae463 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -80,6 +80,7 @@ const struct file_operations bfs_dir_operations = { .read = generic_read_dir, .readdir = bfs_readdir, .fsync = file_fsync, + .llseek = generic_file_llseek, }; extern void dump_imap(const char *, struct super_block *); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c76afa2..8fcfa39 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1156,16 +1156,24 @@ static int dump_seek(struct file *file, loff_t off) static unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags) { +#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) + /* The vma can be set up to tell us the answer directly. */ if (vma->vm_flags & VM_ALWAYSDUMP) goto whole; + /* Hugetlb memory check */ + if (vma->vm_flags & VM_HUGETLB) { + if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) + goto whole; + } + /* Do not dump I/O mapped devices or special mappings */ if (vma->vm_flags & (VM_IO | VM_RESERVED)) return 0; -#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - /* By default, dump shared memory if mapped from an anonymous file. */ if (vma->vm_flags & VM_SHARED) { if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ? @@ -1333,20 +1341,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { + struct task_cputime cputime; + /* - * This is the record for the group leader. Add in the - * cumulative times of previous dead threads. This total - * won't include the time of each live thread whose state - * is included in the core dump. 
The final total reported - * to our parent process when it calls wait4 will include - * those sums as well as the little bit more time it takes - * this and each other thread to finish dying after the - * core dump synchronization phase. + * This is the record for the group leader. It shows the + * group-wide total, not its individual thread total. */ - cputime_to_timeval(cputime_add(p->utime, p->signal->utime), - &prstatus->pr_utime); - cputime_to_timeval(cputime_add(p->stime, p->signal->stime), - &prstatus->pr_stime); + thread_group_cputime(p, &cputime); + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { cputime_to_timeval(p->utime, &prstatus->pr_utime); cputime_to_timeval(p->stime, &prstatus->pr_stime); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 0e8367c..5b5424c 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1390,20 +1390,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { + struct task_cputime cputime; + /* - * This is the record for the group leader. Add in the - * cumulative times of previous dead threads. This total - * won't include the time of each live thread whose state - * is included in the core dump. The final total reported - * to our parent process when it calls wait4 will include - * those sums as well as the little bit more time it takes - * this and each other thread to finish dying after the - * core dump synchronization phase. + * This is the record for the group leader. It shows the + * group-wide total, not its individual thread total. 
*/ - cputime_to_timeval(cputime_add(p->utime, p->signal->utime), - &prstatus->pr_utime); - cputime_to_timeval(cputime_add(p->stime, p->signal->stime), - &prstatus->pr_stime); + thread_group_cputime(p, &cputime); + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { cputime_to_timeval(p->utime, &prstatus->pr_utime); cputime_to_timeval(p->stime, &prstatus->pr_stime); diff --git a/fs/block_dev.c b/fs/block_dev.c index d84f0469..88a776f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -840,13 +840,12 @@ EXPORT_SYMBOL_GPL(bd_release_from_disk); * to be used for internal purposes. If you ever need it - reconsider * your API. */ -struct block_device *open_by_devnum(dev_t dev, unsigned mode) +struct block_device *open_by_devnum(dev_t dev, fmode_t mode) { struct block_device *bdev = bdget(dev); int err = -ENOMEM; - int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY; if (bdev) - err = blkdev_get(bdev, mode, flags); + err = blkdev_get(bdev, mode); return err ? 
ERR_PTR(err) : bdev; } @@ -975,9 +974,7 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); -static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, - int for_part); -static int __blkdev_put(struct block_device *bdev, int for_part); +static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); /* * bd_mutex locking: @@ -986,7 +983,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part); * mutex_lock_nested(whole->bd_mutex, 1) */ -static int do_open(struct block_device *bdev, struct file *file, int for_part) +static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk; struct hd_struct *part = NULL; @@ -994,9 +991,9 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) int partno; int perm = 0; - if (file->f_mode & FMODE_READ) + if (mode & FMODE_READ) perm |= MAY_READ; - if (file->f_mode & FMODE_WRITE) + if (mode & FMODE_WRITE) perm |= MAY_WRITE; /* * hooks: /n/, see "layering violations". 
@@ -1008,7 +1005,6 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) } ret = -ENXIO; - file->f_mapping = bdev->bd_inode->i_mapping; lock_kernel(); @@ -1027,7 +1023,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) if (!partno) { struct backing_dev_info *bdi; if (disk->fops->open) { - ret = disk->fops->open(bdev->bd_inode, file); + ret = disk->fops->open(bdev, mode); if (ret) goto out_clear; } @@ -1047,7 +1043,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) if (!whole) goto out_clear; BUG_ON(for_part); - ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1); + ret = __blkdev_get(whole, mode, 1); if (ret) goto out_clear; bdev->bd_contains = whole; @@ -1068,7 +1064,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) disk = NULL; if (bdev->bd_contains == bdev) { if (bdev->bd_disk->fops->open) { - ret = bdev->bd_disk->fops->open(bdev->bd_inode, file); + ret = bdev->bd_disk->fops->open(bdev, mode); if (ret) goto out_unlock_bdev; } @@ -1088,7 +1084,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) bdev->bd_part = NULL; bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) - __blkdev_put(bdev->bd_contains, 1); + __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); @@ -1104,28 +1100,9 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) return ret; } -static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, - int for_part) +int blkdev_get(struct block_device *bdev, fmode_t mode) { - /* - * This crockload is due to bad choice of ->open() type. - * It will go away. - * For now, block device ->open() routine must _not_ - * examine anything in 'inode' argument except ->i_rdev. 
- */ - struct file fake_file = {}; - struct dentry fake_dentry = {}; - fake_file.f_mode = mode; - fake_file.f_flags = flags; - fake_file.f_path.dentry = &fake_dentry; - fake_dentry.d_inode = bdev->bd_inode; - - return do_open(bdev, &fake_file, for_part); -} - -int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags) -{ - return __blkdev_get(bdev, mode, flags, 0); + return __blkdev_get(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_get); @@ -1142,28 +1119,36 @@ static int blkdev_open(struct inode * inode, struct file * filp) */ filp->f_flags |= O_LARGEFILE; + if (filp->f_flags & O_NDELAY) + filp->f_mode |= FMODE_NDELAY; + if (filp->f_flags & O_EXCL) + filp->f_mode |= FMODE_EXCL; + if ((filp->f_flags & O_ACCMODE) == 3) + filp->f_mode |= FMODE_WRITE_IOCTL; + bdev = bd_acquire(inode); if (bdev == NULL) return -ENOMEM; - res = do_open(bdev, filp, 0); + filp->f_mapping = bdev->bd_inode->i_mapping; + + res = blkdev_get(bdev, filp->f_mode); if (res) return res; - if (!(filp->f_flags & O_EXCL) ) + if (!(filp->f_mode & FMODE_EXCL)) return 0; if (!(res = bd_claim(bdev, filp))) return 0; - blkdev_put(bdev); + blkdev_put(bdev, filp->f_mode); return res; } -static int __blkdev_put(struct block_device *bdev, int for_part) +static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) { int ret = 0; - struct inode *bd_inode = bdev->bd_inode; struct gendisk *disk = bdev->bd_disk; struct block_device *victim = NULL; @@ -1178,7 +1163,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part) } if (bdev->bd_contains == bdev) { if (disk->fops->release) - ret = disk->fops->release(bd_inode, NULL); + ret = disk->fops->release(disk, mode); } if (!bdev->bd_openers) { struct module *owner = disk->fops->owner; @@ -1197,13 +1182,13 @@ static int __blkdev_put(struct block_device *bdev, int for_part) mutex_unlock(&bdev->bd_mutex); bdput(bdev); if (victim) - __blkdev_put(victim, 1); + __blkdev_put(victim, mode, 1); return ret; } -int blkdev_put(struct 
block_device *bdev) +int blkdev_put(struct block_device *bdev, fmode_t mode) { - return __blkdev_put(bdev, 0); + return __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); @@ -1212,12 +1197,16 @@ static int blkdev_close(struct inode * inode, struct file * filp) struct block_device *bdev = I_BDEV(filp->f_mapping->host); if (bdev->bd_holder == filp) bd_release(bdev); - return blkdev_put(bdev); + return blkdev_put(bdev, filp->f_mode); } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { - return blkdev_ioctl(file->f_mapping->host, file, cmd, arg); + struct block_device *bdev = I_BDEV(file->f_mapping->host); + fmode_t mode = file->f_mode; + if (file->f_flags & O_NDELAY) + mode |= FMODE_NDELAY_NOW; + return blkdev_ioctl(bdev, mode, cmd, arg); } static const struct address_space_operations def_blk_aops = { @@ -1253,7 +1242,7 @@ int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) int res; mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - res = blkdev_ioctl(bdev->bd_inode, NULL, cmd, arg); + res = blkdev_ioctl(bdev, 0, cmd, arg); set_fs(old_fs); return res; } @@ -1262,39 +1251,39 @@ EXPORT_SYMBOL(ioctl_by_bdev); /** * lookup_bdev - lookup a struct block_device by name - * @pathname: special file representing the block device + * @pathname: special file representing the block device * * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) * otherwise. 
*/ -struct block_device *lookup_bdev(const char *path) +struct block_device *lookup_bdev(const char *pathname) { struct block_device *bdev; struct inode *inode; - struct nameidata nd; + struct path path; int error; - if (!path || !*path) + if (!pathname || !*pathname) return ERR_PTR(-EINVAL); - error = path_lookup(path, LOOKUP_FOLLOW, &nd); + error = kern_path(pathname, LOOKUP_FOLLOW, &path); if (error) return ERR_PTR(error); - inode = nd.path.dentry->d_inode; + inode = path.dentry->d_inode; error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) goto fail; error = -EACCES; - if (nd.path.mnt->mnt_flags & MNT_NODEV) + if (path.mnt->mnt_flags & MNT_NODEV) goto fail; error = -ENOMEM; bdev = bd_acquire(inode); if (!bdev) goto fail; out: - path_put(&nd.path); + path_put(&path); return bdev; fail: bdev = ERR_PTR(error); @@ -1303,32 +1292,29 @@ fail: EXPORT_SYMBOL(lookup_bdev); /** - * open_bdev_excl - open a block device by name and set it up for use + * open_bdev_exclusive - open a block device by name and set it up for use * * @path: special file representing the block device - * @flags: %MS_RDONLY for opening read-only + * @mode: FMODE_... combination to be used * @holder: owner for exclusion * * Open the blockdevice described by the special file at @path, claim it * for the @holder. 
*/ -struct block_device *open_bdev_excl(const char *path, int flags, void *holder) +struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) { struct block_device *bdev; - mode_t mode = FMODE_READ; int error = 0; bdev = lookup_bdev(path); if (IS_ERR(bdev)) return bdev; - if (!(flags & MS_RDONLY)) - mode |= FMODE_WRITE; - error = blkdev_get(bdev, mode, 0); + error = blkdev_get(bdev, mode); if (error) return ERR_PTR(error); error = -EACCES; - if (!(flags & MS_RDONLY) && bdev_read_only(bdev)) + if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) goto blkdev_put; error = bd_claim(bdev, holder); if (error) @@ -1337,26 +1323,27 @@ struct block_device *open_bdev_excl(const char *path, int flags, void *holder) return bdev; blkdev_put: - blkdev_put(bdev); + blkdev_put(bdev, mode); return ERR_PTR(error); } -EXPORT_SYMBOL(open_bdev_excl); +EXPORT_SYMBOL(open_bdev_exclusive); /** - * close_bdev_excl - release a blockdevice openen by open_bdev_excl() + * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() * * @bdev: blockdevice to close + * @mode: mode, must match that used to open. * - * This is the counterpart to open_bdev_excl(). + * This is the counterpart to open_bdev_exclusive(). 
*/ -void close_bdev_excl(struct block_device *bdev) +void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) { bd_release(bdev); - blkdev_put(bdev); + blkdev_put(bdev, mode); } -EXPORT_SYMBOL(close_bdev_excl); +EXPORT_SYMBOL(close_bdev_exclusive); int __invalidate_device(struct block_device *bdev) { diff --git a/fs/buffer.c b/fs/buffer.c index ac78d4c..6569fda 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -76,8 +76,7 @@ EXPORT_SYMBOL(__lock_buffer); void unlock_buffer(struct buffer_head *bh) { - smp_mb__before_clear_bit(); - clear_buffer_locked(bh); + clear_bit_unlock(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); wake_up_bit(&bh->b_state, BH_Lock); } diff --git a/fs/char_dev.c b/fs/char_dev.c index 262fa10..700697a 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -386,15 +386,22 @@ static int chrdev_open(struct inode *inode, struct file *filp) cdev_put(new); if (ret) return ret; + + ret = -ENXIO; filp->f_op = fops_get(p->ops); - if (!filp->f_op) { - cdev_put(p); - return -ENXIO; - } - if (filp->f_op->open) + if (!filp->f_op) + goto out_cdev_put; + + if (filp->f_op->open) { ret = filp->f_op->open(inode,filp); - if (ret) - cdev_put(p); + if (ret) + goto out_cdev_put; + } + + return 0; + + out_cdev_put: + cdev_put(p); return ret; } diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 06e521a..8f528ea 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,11 @@ +Version 1.55 +------------ +Various fixes to make delete of open files behavior more predictable +(when delete of an open file fails we mark the file as "delete-on-close" +in a way that more servers accept, but only if we can first rename the +file to a temporary name). Add experimental support for more safely +handling fcntl(F_SETLEASE). + Version 1.54 ------------ Fix premature write failure on congested networks (we would give up @@ -13,6 +21,7 @@ on dns_upcall (resolving DFS referralls). 
Fix plain text password authentication (requires setting SecurityFlags to 0x30030 to enable lanman and plain text though). Fix writes to be at correct offset when file is open with O_APPEND and file is on a directio (forcediretio) mount. +Fix bug in rewinding readdir directory searches. Add nodfs mount option. Version 1.53 ------------ diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig new file mode 100644 index 0000000..341a989 --- /dev/null +++ b/fs/cifs/Kconfig @@ -0,0 +1,142 @@ +config CIFS + tristate "CIFS support (advanced network filesystem, SMBFS successor)" + depends on INET + select NLS + help + This is the client VFS module for the Common Internet File System + (CIFS) protocol which is the successor to the Server Message Block + (SMB) protocol, the native file sharing mechanism for most early + PC operating systems. The CIFS protocol is fully supported by + file servers such as Windows 2000 (including Windows 2003, NT 4 + and Windows XP) as well as by Samba (which provides excellent CIFS + server support for Linux and many other operating systems). Limited + support for OS/2 and Windows ME and similar servers is provided as + well. + + The cifs module provides an advanced network file system + client for mounting to CIFS compliant servers. It includes + support for DFS (hierarchical name space), secure per-user + session establishment via Kerberos or NTLM or NTLMv2, + safe distributed caching (oplock), optional packet + signing, Unicode and other internationalization improvements. + If you need to mount to Samba or Windows from this machine, say Y. 
+ +config CIFS_STATS + bool "CIFS statistics" + depends on CIFS + help + Enabling this option will cause statistics for each server share + mounted by the cifs client to be displayed in /proc/fs/cifs/Stats + +config CIFS_STATS2 + bool "Extended statistics" + depends on CIFS_STATS + help + Enabling this option will allow more detailed statistics on SMB + request timing to be displayed in /proc/fs/cifs/DebugData and also + allow optional logging of slow responses to dmesg (depending on the + value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details). + These additional statistics may have a minor effect on performance + and memory utilization. + + Unless you are a developer or are doing network performance analysis + or tuning, say N. + +config CIFS_WEAK_PW_HASH + bool "Support legacy servers which use weaker LANMAN security" + depends on CIFS + help + Modern CIFS servers including Samba and most Windows versions + (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) + security mechanisms. These hash the password more securely + than the mechanisms used in the older LANMAN version of the + SMB protocol but LANMAN based authentication is needed to + establish sessions with some old SMB servers. + + Enabling this option allows the cifs module to mount to older + LANMAN based servers such as OS/2 and Windows 95, but such + mounts may be less secure than mounts using NTLM or more recent + security mechanisms if you are on a public network. Unless you + have a need to access old SMB servers (and are on a private + network) you probably want to say N. Even if this support + is enabled in the kernel build, LANMAN authentication will not be + used automatically. At runtime LANMAN mounts are disabled but + can be set to required (or optional) either in + /proc/fs/cifs (see fs/cifs/README for more detail) or via an + option on the mount command. This support is disabled by + default in order to reduce the possibility of a downgrade + attack. 
+ + If unsure, say N. + +config CIFS_UPCALL + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + help + Enables an upcall mechanism for CIFS which accesses + userspace helper utilities to provide SPNEGO packaged (RFC 4178) + Kerberos tickets which are needed to mount to certain secure servers + (for which more secure Kerberos authentication is required). If + unsure, say N. + +config CIFS_XATTR + bool "CIFS extended attributes" + depends on CIFS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). CIFS maps the name of + extended attributes beginning with the user namespace prefix + to SMB/CIFS EAs. EAs are stored on Windows servers without the + user namespace prefix, but their names are seen by Linux cifs clients + prefaced by the user namespace prefix. The system namespace + (used by some filesystems to store ACLs) is not supported at + this time. + + If unsure, say N. + +config CIFS_POSIX + bool "CIFS POSIX Extensions" + depends on CIFS_XATTR + help + Enabling this option will cause the cifs client to attempt to + negotiate a newer dialect with servers, such as Samba 3.0.5 + or later, that optionally can handle more POSIX like (rather + than Windows like) file behavior. It also enables + support for POSIX ACLs (getfacl and setfacl) to servers + (such as Samba 3.10 and later) which can negotiate + CIFS POSIX ACL support. If unsure, say N. + +config CIFS_DEBUG2 + bool "Enable additional CIFS debugging routines" + depends on CIFS + help + Enabling this option adds a few more debugging routines + to the cifs code which slightly increases the size of + the cifs module and can cause additional logging of debug + messages in some error paths, slowing performance. This + option can be turned off unless you are debugging + cifs problems. If unsure, say N. 
+ +config CIFS_EXPERIMENTAL + bool "CIFS Experimental Features (EXPERIMENTAL)" + depends on CIFS && EXPERIMENTAL + help + Enables cifs features under testing. These features are + experimental and currently include DFS support and directory + change notification ie fcntl(F_DNOTIFY), as well as the upcall + mechanism which will be used for Kerberos session negotiation + and uid remapping. Some of these features also may depend on + setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental + (which is disabled by default). See the file fs/cifs/README + for more details. If unsure, say N. + +config CIFS_DFS_UPCALL + bool "DFS feature support (EXPERIMENTAL)" + depends on CIFS_EXPERIMENTAL + depends on KEYS + help + Enables an upcall mechanism for CIFS which contacts userspace + helper utilities to provide server name resolution (host names to + IP addresses) which is needed for implicit mounts of DFS junction + points. If unsure, say N. diff --git a/fs/cifs/README b/fs/cifs/README index bd2343d..a439dc1 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -463,6 +463,9 @@ A partial list of the supported mount options follows: with cifs style mandatory byte range locks (and most cifs servers do not yet support requesting advisory byte range locks). + nodfs Disable DFS (global name space support) even if the + server claims to support it. This can help work around + a problem with parsing of DFS paths with Samba 3.0.24 server. remount remount the share (often used to change from ro to rw mounts or vice versa) cifsacl Report mode bits (e.g. on stat) based on the Windows ACL for @@ -488,6 +491,19 @@ A partial list of the supported mount options follows: Note that this differs from the sign mount option in that it causes encryption of data sent over this mounted share but other shares mounted to the same server are unaffected. + locallease This option is rarely needed. 
Fcntl F_SETLEASE is + used by some applications such as Samba and NFSv4 server to + check to see whether a file is cacheable. CIFS has no way + to explicitly request a lease, but can check whether a file + is cacheable (oplocked). Unfortunately, even if a file + is not oplocked, it could still be cacheable (ie cifs client + could grant fcntl leases if no other local processes are using + the file) for cases for example such as when the server does not + support oplocks and the user is sure that the only updates to + the file will be from this client. Specifying this mount option + will allow the cifs client to check for leases (only) locally + for files which are not oplocked instead of denying leases + in that case. (EXPERIMENTAL) sec Security mode. Allowed values are: none attempt to connection as a null user (no name) krb5 Use Kerberos version 5 authentication @@ -638,6 +654,9 @@ requires enabling CONFIG_CIFS_EXPERIMENTAL cifsacl support needed to retrieve approximated mode bits based on the contents on the CIFS ACL. + lease support: cifs will check the oplock state before calling into + the vfs to see if we can grant a lease on a file. 
+ DNOTIFY fcntl: needed for support of directory change notification and perhaps later for file leases) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 25ecbd5..ac5915d 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -275,9 +275,12 @@ static int cifs_permission(struct inode *inode, int mask) cifs_sb = CIFS_SB(inode->i_sb); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) - return 0; - else /* file mode might have been restricted at mount time + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { + if ((mask & MAY_EXEC) && !execute_ok(inode)) + return -EACCES; + else + return 0; + } else /* file mode might have been restricted at mount time on the client (above and beyond ACL on servers) for servers which do not support setting and viewing mode bits, so allowing client to check permissions is useful */ @@ -309,6 +312,7 @@ cifs_alloc_inode(struct super_block *sb) file data or metadata */ cifs_inode->clientCanCacheRead = false; cifs_inode->clientCanCacheAll = false; + cifs_inode->delete_pending = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ /* Can not set i_flags here - they get immediately overwritten @@ -617,6 +621,37 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) return generic_file_llseek_unlocked(file, offset, origin); } +#ifdef CONFIG_CIFS_EXPERIMENTAL +static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) +{ + /* note that this is called by vfs setlease with the BKL held + although I doubt that BKL is needed here in cifs */ + struct inode *inode = file->f_path.dentry->d_inode; + + if (!(S_ISREG(inode->i_mode))) + return -EINVAL; + + /* check if file is oplocked */ + if (((arg == F_RDLCK) && + (CIFS_I(inode)->clientCanCacheRead)) || + ((arg == F_WRLCK) && + (CIFS_I(inode)->clientCanCacheAll))) + return generic_setlease(file, arg, lease); + else if (CIFS_SB(inode->i_sb)->tcon->local_lease && + !CIFS_I(inode)->clientCanCacheRead) + /* If the server claims to 
support oplock on this + file, then we still need to check oplock even + if the local_lease mount option is set, but there + are servers which do not support oplock for which + this mount option may be useful if the user + knows that the file won't be changed on the server + by anyone else */ + return generic_setlease(file, arg, lease); + else + return -EAGAIN; +} +#endif + struct file_system_type cifs_fs_type = { .owner = THIS_MODULE, .name = "cifs", @@ -695,6 +730,7 @@ const struct file_operations cifs_file_ops = { #ifdef CONFIG_CIFS_EXPERIMENTAL .dir_notify = cifs_dir_notify, + .setlease = cifs_setlease, #endif /* CONFIG_CIFS_EXPERIMENTAL */ }; @@ -715,6 +751,7 @@ const struct file_operations cifs_file_direct_ops = { .llseek = cifs_llseek, #ifdef CONFIG_CIFS_EXPERIMENTAL .dir_notify = cifs_dir_notify, + .setlease = cifs_setlease, #endif /* CONFIG_CIFS_EXPERIMENTAL */ }; const struct file_operations cifs_file_nobrl_ops = { @@ -735,6 +772,7 @@ const struct file_operations cifs_file_nobrl_ops = { #ifdef CONFIG_CIFS_EXPERIMENTAL .dir_notify = cifs_dir_notify, + .setlease = cifs_setlease, #endif /* CONFIG_CIFS_EXPERIMENTAL */ }; @@ -754,6 +792,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .llseek = cifs_llseek, #ifdef CONFIG_CIFS_EXPERIMENTAL .dir_notify = cifs_dir_notify, + .setlease = cifs_setlease, #endif /* CONFIG_CIFS_EXPERIMENTAL */ }; @@ -765,6 +804,7 @@ const struct file_operations cifs_dir_ops = { .dir_notify = cifs_dir_notify, #endif /* CONFIG_CIFS_EXPERIMENTAL */ .unlocked_ioctl = cifs_ioctl, + .llseek = generic_file_llseek, }; static void @@ -945,6 +985,12 @@ static int cifs_oplock_thread(void *dummyarg) the call */ /* mutex_lock(&inode->i_mutex);*/ if (S_ISREG(inode->i_mode)) { +#ifdef CONFIG_CIFS_EXPERIMENTAL + if (CIFS_I(inode)->clientCanCacheAll == 0) + break_lease(inode, FMODE_READ); + else if (CIFS_I(inode)->clientCanCacheRead == 0) + break_lease(inode, FMODE_WRITE); +#endif rc = filemap_fdatawrite(inode->i_mapping); if 
(CIFS_I(inode)->clientCanCacheRead == 0) { waitrc = filemap_fdatawait( diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index f7b4a5c..074de0b 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -101,5 +101,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.54" +#define CIFS_VERSION "1.55" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 0d22479..c791e5b 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -285,6 +285,7 @@ struct cifsTconInfo { bool seal:1; /* transport encryption for this mounted share */ bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol for this mount even if server would support */ + bool local_lease:1; /* check leases (only) on local system not remote */ /* BB add field for back pointer to sb struct(s)? */ }; @@ -353,6 +354,7 @@ struct cifsInodeInfo { bool clientCanCacheRead:1; /* read oplock */ bool clientCanCacheAll:1; /* read and writebehind oplock */ bool oplockPending:1; + bool delete_pending:1; /* DELETE_ON_CLOSE is set */ struct inode vfs_inode; }; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6f4ffe1..843a85f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1309,6 +1309,7 @@ OldOpenRetry: cpu_to_le64(le32_to_cpu(pSMBr->EndOfFile)); pfile_info->EndOfFile = pfile_info->AllocationSize; pfile_info->NumberOfLinks = cpu_to_le32(1); + pfile_info->DeletePending = 0; } } @@ -1410,6 +1411,7 @@ openRetry: pfile_info->AllocationSize = pSMBr->AllocationSize; pfile_info->EndOfFile = pSMBr->EndOfFile; pfile_info->NumberOfLinks = cpu_to_le32(1); + pfile_info->DeletePending = 0; } } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4c13bcd..71b7661 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -90,6 +90,8 @@ struct smb_vol { bool nocase:1; /* request case insensitive filenames */ bool nobrl:1; /* disable sending 
byte range locks to srv */ bool seal:1; /* request transport encryption on share */ + bool nodfs:1; /* Do not request DFS, even if available */ + bool local_lease:1; /* check leases only on local system, not remote */ unsigned int rsize; unsigned int wsize; unsigned int sockopt; @@ -124,7 +126,7 @@ cifs_reconnect(struct TCP_Server_Info *server) struct mid_q_entry *mid_entry; spin_lock(&GlobalMid_Lock); - if (kthread_should_stop()) { + if (server->tcpStatus == CifsExiting) { /* the demux thread will exit normally next time through the loop */ spin_unlock(&GlobalMid_Lock); @@ -184,7 +186,8 @@ cifs_reconnect(struct TCP_Server_Info *server) spin_unlock(&GlobalMid_Lock); up(&server->tcpSem); - while ((!kthread_should_stop()) && (server->tcpStatus != CifsGood)) { + while ((server->tcpStatus != CifsExiting) && + (server->tcpStatus != CifsGood)) { try_to_freeze(); if (server->protocolType == IPV6) { rc = ipv6_connect(&server->addr.sockAddr6, @@ -201,7 +204,7 @@ cifs_reconnect(struct TCP_Server_Info *server) } else { atomic_inc(&tcpSesReconnectCount); spin_lock(&GlobalMid_Lock); - if (!kthread_should_stop()) + if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsGood; server->sequence_number = 0; spin_unlock(&GlobalMid_Lock); @@ -356,7 +359,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) GFP_KERNEL); set_freezable(); - while (!kthread_should_stop()) { + while (server->tcpStatus != CifsExiting) { if (try_to_freeze()) continue; if (bigbuf == NULL) { @@ -397,7 +400,7 @@ incomplete_rcv: kernel_recvmsg(csocket, &smb_msg, &iov, 1, pdu_length, 0 /* BB other flags? 
*/); - if (kthread_should_stop()) { + if (server->tcpStatus == CifsExiting) { break; } else if (server->tcpStatus == CifsNeedReconnect) { cFYI(1, ("Reconnect after server stopped responding")); @@ -522,7 +525,7 @@ incomplete_rcv: total_read += length) { length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, pdu_length - total_read, 0); - if (kthread_should_stop() || + if ((server->tcpStatus == CifsExiting) || (length == -EINTR)) { /* then will exit */ reconnect = 2; @@ -651,14 +654,6 @@ multi_t2_fnd: spin_unlock(&GlobalMid_Lock); wake_up_all(&server->response_q); - /* don't exit until kthread_stop is called */ - set_current_state(TASK_UNINTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } - set_current_state(TASK_RUNNING); - /* check if we have blocked requests that need to free */ /* Note that cifs_max_pending is normally 50, but can be set at module install time to as little as two */ @@ -755,6 +750,7 @@ multi_t2_fnd: write_unlock(&GlobalSMBSeslock); kfree(server->hostname); + task_to_wake = xchg(&server->tsk, NULL); kfree(server); length = atomic_dec_return(&tcpSesAllocCount); @@ -762,6 +758,16 @@ multi_t2_fnd: mempool_resize(cifs_req_poolp, length + cifs_min_rcv, GFP_KERNEL); + /* if server->tsk was NULL then wait for a signal before exiting */ + if (!task_to_wake) { + set_current_state(TASK_INTERRUPTIBLE); + while (!signal_pending(current)) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + set_current_state(TASK_RUNNING); + } + return 0; } @@ -1218,6 +1224,8 @@ cifs_parse_mount_options(char *options, const char *devname, vol->sfu_emul = 1; } else if (strnicmp(data, "nosfu", 5) == 0) { vol->sfu_emul = 0; + } else if (strnicmp(data, "nodfs", 5) == 0) { + vol->nodfs = 1; } else if (strnicmp(data, "posixpaths", 10) == 0) { vol->posix_paths = 1; } else if (strnicmp(data, "noposixpaths", 12) == 0) { @@ -1268,6 +1276,10 @@ cifs_parse_mount_options(char *options, const char *devname, vol->no_psx_acl 
= 0; } else if (strnicmp(data, "noacl", 5) == 0) { vol->no_psx_acl = 1; +#ifdef CONFIG_CIFS_EXPERIMENTAL + } else if (strnicmp(data, "locallease", 6) == 0) { + vol->local_lease = 1; +#endif } else if (strnicmp(data, "sign", 4) == 0) { vol->secFlg |= CIFSSEC_MUST_SIGN; } else if (strnicmp(data, "seal", 4) == 0) { @@ -1845,6 +1857,16 @@ convert_delimiter(char *path, char delim) } } +static void +kill_cifsd(struct TCP_Server_Info *server) +{ + struct task_struct *task; + + task = xchg(&server->tsk, NULL); + if (task) + force_sig(SIGKILL, task); +} + int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, char *mount_data, const char *devname) @@ -2166,6 +2188,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, for the retry flag is used */ tcon->retry = volume_info.retry; tcon->nocase = volume_info.nocase; + tcon->local_lease = volume_info.local_lease; if (tcon->seal != volume_info.seal) cERROR(1, ("transport encryption setting " "conflicts with existing tid")); @@ -2197,6 +2220,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, volume_info.UNC, tcon, cifs_sb->local_nls); cFYI(1, ("CIFS Tcon rc = %d", rc)); + if (volume_info.nodfs) { + tcon->Flags &= + ~SMB_SHARE_IS_IN_DFS; + cFYI(1, ("DFS disabled (%d)", + tcon->Flags)); + } } if (!rc) { atomic_inc(&pSesInfo->inUse); @@ -2225,14 +2254,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, spin_lock(&GlobalMid_Lock); srvTcp->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); - if (srvTcp->tsk) { - /* If we could verify that kthread_stop would - always wake up processes blocked in - tcp in recv_mesg then we could remove the - send_sig call */ - force_sig(SIGKILL, srvTcp->tsk); - kthread_stop(srvTcp->tsk); - } + kill_cifsd(srvTcp); } /* If find_unc succeeded then rc == 0 so we can not end */ if (tcon) /* up accidently freeing someone elses tcon struct */ @@ -2245,19 +2267,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, temp_rc = 
CIFSSMBLogoff(xid, pSesInfo); /* if the socketUseCount is now zero */ if ((temp_rc == -ESHUTDOWN) && - (pSesInfo->server) && - (pSesInfo->server->tsk)) { - force_sig(SIGKILL, - pSesInfo->server->tsk); - kthread_stop(pSesInfo->server->tsk); - } + (pSesInfo->server)) + kill_cifsd(pSesInfo->server); } else { cFYI(1, ("No session or bad tcon")); - if ((pSesInfo->server) && - (pSesInfo->server->tsk)) { - force_sig(SIGKILL, - pSesInfo->server->tsk); - kthread_stop(pSesInfo->server->tsk); + if (pSesInfo->server) { + spin_lock(&GlobalMid_Lock); + srvTcp->tcpStatus = CifsExiting; + spin_unlock(&GlobalMid_Lock); + kill_cifsd(pSesInfo->server); } } sesInfoFree(pSesInfo); @@ -3544,7 +3562,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) int rc = 0; int xid; struct cifsSesInfo *ses = NULL; - struct task_struct *cifsd_task; char *tmp; xid = GetXid(); @@ -3560,7 +3577,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) tconInfoFree(cifs_sb->tcon); if ((ses) && (ses->server)) { /* save off task so we do not refer to ses later */ - cifsd_task = ses->server->tsk; cFYI(1, ("About to do SMBLogoff ")); rc = CIFSSMBLogoff(xid, ses); if (rc == -EBUSY) { @@ -3568,10 +3584,8 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) return 0; } else if (rc == -ESHUTDOWN) { cFYI(1, ("Waking up socket by sending signal")); - if (cifsd_task) { - force_sig(SIGKILL, cifsd_task); - kthread_stop(cifsd_task); - } + if (ses->server) + kill_cifsd(ses->server); rc = 0; } /* else - we have an smb session left on this socket do not kill cifsd */ @@ -3701,7 +3715,9 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, cERROR(1, ("Send error in SessSetup = %d", rc)); } else { cFYI(1, ("CIFS Session Established successfully")); + spin_lock(&GlobalMid_Lock); pSesInfo->status = CifsGood; + spin_unlock(&GlobalMid_Lock); } ss_err_exit: diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c4a8a06..62d8bd8 100644 --- a/fs/cifs/file.c +++ 
b/fs/cifs/file.c @@ -1791,7 +1791,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping, SetPageUptodate(page); unlock_page(page); if (!pagevec_add(plru_pvec, page)) - __pagevec_lru_add(plru_pvec); + __pagevec_lru_add_file(plru_pvec); data += PAGE_CACHE_SIZE; } return; @@ -1925,7 +1925,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, bytes_read = 0; } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); /* need to free smb_read_data buf before exit */ if (smb_read_data) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a8c8333..d54fa8a 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -506,6 +506,7 @@ int cifs_get_inode_info(struct inode **pinode, inode = *pinode; cifsInfo = CIFS_I(inode); cifsInfo->cifsAttrs = attr; + cifsInfo->delete_pending = pfindData->DeletePending ? true : false; cFYI(1, ("Old time %ld", cifsInfo->time)); cifsInfo->time = jiffies; cFYI(1, ("New time %ld", cifsInfo->time)); @@ -772,63 +773,106 @@ out: * anything else. 
*/ static int -cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid) +cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid) { int oplock = 0; int rc; __u16 netfid; + struct inode *inode = dentry->d_inode; struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsTconInfo *tcon = cifs_sb->tcon; - __u32 dosattr; - FILE_BASIC_INFO *info_buf; + __u32 dosattr, origattr; + FILE_BASIC_INFO *info_buf = NULL; rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, - DELETE|FILE_WRITE_ATTRIBUTES, - CREATE_NOT_DIR|CREATE_DELETE_ON_CLOSE, + DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, &netfid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc != 0) goto out; - /* set ATTR_HIDDEN and clear ATTR_READONLY */ - cifsInode = CIFS_I(inode); - dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY; + origattr = cifsInode->cifsAttrs; + if (origattr == 0) + origattr |= ATTR_NORMAL; + + dosattr = origattr & ~ATTR_READONLY; if (dosattr == 0) dosattr |= ATTR_NORMAL; dosattr |= ATTR_HIDDEN; - info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL); - if (info_buf == NULL) { - rc = -ENOMEM; - goto out_close; + /* set ATTR_HIDDEN and clear ATTR_READONLY, but only if needed */ + if (dosattr != origattr) { + info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL); + if (info_buf == NULL) { + rc = -ENOMEM; + goto out_close; + } + info_buf->Attributes = cpu_to_le32(dosattr); + rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, + current->tgid); + /* although we would like to mark the file hidden + if that fails we will still try to rename it */ + if (rc != 0) + cifsInode->cifsAttrs = dosattr; + else + dosattr = origattr; /* since not able to change them */ } - info_buf->Attributes = cpu_to_le32(dosattr); - rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, current->tgid); - kfree(info_buf); - if (rc != 0) - goto out_close; - cifsInode->cifsAttrs = dosattr; - /* silly-rename 
the file */ - CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls, + /* rename the file */ + rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc != 0) { + rc = -ETXTBSY; + goto undo_setattr; + } - /* set DELETE_ON_CLOSE */ - rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid); - - /* - * some samba versions return -ENOENT when we try to set the file - * disposition here. Likely a samba bug, but work around it for now - */ - if (rc == -ENOENT) - rc = 0; + /* try to set DELETE_ON_CLOSE */ + if (!cifsInode->delete_pending) { + rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, + current->tgid); + /* + * some samba versions return -ENOENT when we try to set the + * file disposition here. Likely a samba bug, but work around + * it for now. This means that some cifsXXX files may hang + * around after they shouldn't. + * + * BB: remove this hack after more servers have the fix + */ + if (rc == -ENOENT) + rc = 0; + else if (rc != 0) { + rc = -ETXTBSY; + goto undo_rename; + } + cifsInode->delete_pending = true; + } out_close: CIFSSMBClose(xid, tcon, netfid); out: + kfree(info_buf); return rc; + + /* + * reset everything back to the original state. Don't bother + * dealing with errors here since we can't do anything about + * them anyway. 
+ */ +undo_rename: + CIFSSMBRenameOpenFile(xid, tcon, netfid, dentry->d_name.name, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); +undo_setattr: + if (dosattr != origattr) { + info_buf->Attributes = cpu_to_le32(origattr); + if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, + current->tgid)) + cifsInode->cifsAttrs = origattr; + } + + goto out_close; } int cifs_unlink(struct inode *dir, struct dentry *dentry) @@ -878,7 +922,7 @@ psx_del_no_retry: } else if (rc == -ENOENT) { d_drop(dentry); } else if (rc == -ETXTBSY) { - rc = cifs_rename_pending_delete(full_path, inode, xid); + rc = cifs_rename_pending_delete(full_path, dentry, xid); if (rc == 0) drop_nlink(inode); } else if (rc == -EACCES && dosattr == 0) { @@ -1241,22 +1285,21 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath, return rc; } -int cifs_rename(struct inode *source_inode, struct dentry *source_direntry, - struct inode *target_inode, struct dentry *target_direntry) +int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, + struct inode *target_dir, struct dentry *target_dentry) { char *fromName = NULL; char *toName = NULL; struct cifs_sb_info *cifs_sb_source; struct cifs_sb_info *cifs_sb_target; - struct cifsTconInfo *pTcon; + struct cifsTconInfo *tcon; FILE_UNIX_BASIC_INFO *info_buf_source = NULL; FILE_UNIX_BASIC_INFO *info_buf_target; - int xid; - int rc; + int xid, rc, tmprc; - cifs_sb_target = CIFS_SB(target_inode->i_sb); - cifs_sb_source = CIFS_SB(source_inode->i_sb); - pTcon = cifs_sb_source->tcon; + cifs_sb_target = CIFS_SB(target_dir->i_sb); + cifs_sb_source = CIFS_SB(source_dir->i_sb); + tcon = cifs_sb_source->tcon; xid = GetXid(); @@ -1264,7 +1307,7 @@ int cifs_rename(struct inode *source_inode, struct dentry *source_direntry, * BB: this might be allowed if same server, but different share. 
* Consider adding support for this */ - if (pTcon != cifs_sb_target->tcon) { + if (tcon != cifs_sb_target->tcon) { rc = -EXDEV; goto cifs_rename_exit; } @@ -1273,65 +1316,65 @@ int cifs_rename(struct inode *source_inode, struct dentry *source_direntry, * we already have the rename sem so we do not need to * grab it again here to protect the path integrity */ - fromName = build_path_from_dentry(source_direntry); + fromName = build_path_from_dentry(source_dentry); if (fromName == NULL) { rc = -ENOMEM; goto cifs_rename_exit; } - toName = build_path_from_dentry(target_direntry); + toName = build_path_from_dentry(target_dentry); if (toName == NULL) { rc = -ENOMEM; goto cifs_rename_exit; } - rc = cifs_do_rename(xid, source_direntry, fromName, - target_direntry, toName); + rc = cifs_do_rename(xid, source_dentry, fromName, + target_dentry, toName); - if (rc == -EEXIST) { - if (pTcon->unix_ext) { - /* - * Are src and dst hardlinks of same inode? We can - * only tell with unix extensions enabled - */ - info_buf_source = - kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), - GFP_KERNEL); - if (info_buf_source == NULL) - goto unlink_target; - - info_buf_target = info_buf_source + 1; - rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName, - info_buf_source, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != 0) - goto unlink_target; - - rc = CIFSSMBUnixQPathInfo(xid, pTcon, - toName, info_buf_target, - cifs_sb_target->local_nls, - /* remap based on source sb */ - cifs_sb_source->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == -EEXIST && tcon->unix_ext) { + /* + * Are src and dst hardlinks of same inode? 
We can + * only tell with unix extensions enabled + */ + info_buf_source = + kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), + GFP_KERNEL); + if (info_buf_source == NULL) { + rc = -ENOMEM; + goto cifs_rename_exit; + } - if (rc == 0 && (info_buf_source->UniqueId == - info_buf_target->UniqueId)) - /* same file, POSIX says that this is a noop */ - goto cifs_rename_exit; - } /* else ... BB we could add the same check for Windows by + info_buf_target = info_buf_source + 1; + tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName, + info_buf_source, + cifs_sb_source->local_nls, + cifs_sb_source->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (tmprc != 0) + goto unlink_target; + + tmprc = CIFSSMBUnixQPathInfo(xid, tcon, + toName, info_buf_target, + cifs_sb_target->local_nls, + /* remap based on source sb */ + cifs_sb_source->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (tmprc == 0 && (info_buf_source->UniqueId == + info_buf_target->UniqueId)) + /* same file, POSIX says that this is a noop */ + goto cifs_rename_exit; + } /* else ... BB we could add the same check for Windows by checking the UniqueId via FILE_INTERNAL_INFO */ + unlink_target: - /* - * we either can not tell the files are hardlinked (as with - * Windows servers) or files are not hardlinked. 
Delete the - * target manually before renaming to follow POSIX rather than - * Windows semantics - */ - cifs_unlink(target_inode, target_direntry); - rc = cifs_do_rename(xid, source_direntry, fromName, - target_direntry, toName); + if ((rc == -EACCES) || (rc == -EEXIST)) { + tmprc = cifs_unlink(target_dir, target_dentry); + if (tmprc) + goto cifs_rename_exit; + + rc = cifs_do_rename(xid, source_dentry, fromName, + target_dentry, toName); } cifs_rename_exit: diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 765adf1..58d5729 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -762,14 +762,15 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, rc)); return rc; } + cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); } while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && (rc == 0) && !cifsFile->srch_inf.endOfSearch) { cFYI(1, ("calling findnext2")); - cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, &cifsFile->srch_inf); + cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); if (rc) return -ENOENT; } diff --git a/fs/coda/dir.c b/fs/coda/dir.c index c591622..75b1fa9 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -146,6 +146,9 @@ int coda_permission(struct inode *inode, int mask) if (!mask) return 0; + if ((mask & MAY_EXEC) && !execute_ok(inode)) + return -EACCES; + lock_kernel(); if (coda_cache_check(inode, mask)) diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index c513654..773f2ce 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -43,7 +43,7 @@ const struct file_operations coda_ioctl_operations = { /* the coda pioctl inode ops */ static int coda_ioctl_permission(struct inode *inode, int mask) { - return 0; + return (mask & MAY_EXEC) ? 
-EACCES : 0; } static int coda_pioctl(struct inode * inode, struct file * filp, diff --git a/fs/compat.c b/fs/compat.c index 5f9ec44..fe3c9bf 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -869,7 +869,7 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd, buf.dirent = dirent; error = vfs_readdir(file, compat_fillonedir, &buf); - if (error >= 0) + if (buf.result) error = buf.result; fput(file); @@ -956,9 +956,8 @@ asmlinkage long compat_sys_getdents(unsigned int fd, buf.error = 0; error = vfs_readdir(file, compat_filldir, &buf); - if (error < 0) - goto out_putf; - error = buf.error; + if (error >= 0) + error = buf.error; lastdirent = buf.previous; if (lastdirent) { if (put_user(file->f_pos, &lastdirent->d_off)) @@ -966,8 +965,6 @@ asmlinkage long compat_sys_getdents(unsigned int fd, else error = count - buf.count; } - -out_putf: fput(file); out: return error; @@ -1047,19 +1044,16 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, buf.error = 0; error = vfs_readdir(file, compat_filldir64, &buf); - if (error < 0) - goto out_putf; - error = buf.error; + if (error >= 0) + error = buf.error; lastdirent = buf.previous; if (lastdirent) { typeof(lastdirent->d_off) d_off = file->f_pos; - error = -EFAULT; if (__put_user_unaligned(d_off, &lastdirent->d_off)) - goto out_putf; - error = count - buf.count; + error = -EFAULT; + else + error = count - buf.count; } - -out_putf: fput(file); out: return error; @@ -1475,6 +1469,57 @@ out_ret: #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) +static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec ts; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&ts); + ts = timespec_sub(*end_time, ts); + if (ts.tv_sec < 0) + ts.tv_sec = ts.tv_nsec = 0; + + if (timeval) { + struct compat_timeval rtv; + + 
rtv.tv_sec = ts.tv_sec; + rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + } else { + struct compat_timespec rts; + + rts.tv_sec = ts.tv_sec; + rts.tv_nsec = ts.tv_nsec; + + if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + } + /* + * If an application puts its timeval in read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. + */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to * 64-bit unsigned longs. @@ -1556,7 +1601,8 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) int compat_core_sys_select(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, s64 *timeout) + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct timespec *end_time) { fd_set_bits fds; void *bits; @@ -1603,7 +1649,7 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp, zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); - ret = do_select(n, &fds, timeout); + ret = do_select(n, &fds, end_time); if (ret < 0) goto out; @@ -1629,7 +1675,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct compat_timeval __user *tvp) { - s64 timeout = -1; + struct timespec end_time, *to = NULL; struct compat_timeval tv; int ret; @@ -1637,43 +1683,14 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; - if (tv.tv_sec < 0 || tv.tv_usec < 0) + to = &end_time; + if (poll_select_set_timeout(to, tv.tv_sec, + tv.tv_usec * NSEC_PER_USEC)) return -EINVAL; - - /* Cast to u64 to make GCC stop complaining 
*/ - if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS) - timeout = -1; /* infinite */ - else { - timeout = DIV_ROUND_UP(tv.tv_usec, 1000000/HZ); - timeout += tv.tv_sec * HZ; - } } - ret = compat_core_sys_select(n, inp, outp, exp, &timeout); - - if (tvp) { - struct compat_timeval rtv; - - if (current->personality & STICKY_TIMEOUTS) - goto sticky; - rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); - rtv.tv_sec = timeout; - if (compat_timeval_compare(&rtv, &tv) >= 0) - rtv = tv; - if (copy_to_user(tvp, &rtv, sizeof(rtv))) { -sticky: - /* - * If an application puts its timeval in read-only - * memory, we don't want the Linux-specific update to - * the timeval to cause a fault after the select has - * completed successfully. However, because we're not - * updating the timeval, we can't restart the system - * call. - */ - if (ret == -ERESTARTNOHAND) - ret = -EINTR; - } - } + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); return ret; } @@ -1686,15 +1703,16 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, { compat_sigset_t ss32; sigset_t ksigmask, sigsaved; - s64 timeout = MAX_SCHEDULE_TIMEOUT; struct compat_timespec ts; + struct timespec end_time, *to = NULL; int ret; if (tsp) { if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; - if (ts.tv_sec < 0 || ts.tv_nsec < 0) + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } @@ -1709,51 +1727,8 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); } - do { - if (tsp) { - if ((unsigned long)ts.tv_sec < MAX_SELECT_SECONDS) { - timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ); - timeout += ts.tv_sec * (unsigned long)HZ; - ts.tv_sec = 0; - ts.tv_nsec = 0; - } else { - ts.tv_sec -= MAX_SELECT_SECONDS; - timeout = MAX_SELECT_SECONDS * HZ; - } - } - - ret = compat_core_sys_select(n, inp, outp, exp, &timeout); - - } while 
(!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec)); - - if (tsp) { - struct compat_timespec rts; - - if (current->personality & STICKY_TIMEOUTS) - goto sticky; - - rts.tv_sec = timeout / HZ; - rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ); - if (rts.tv_nsec >= NSEC_PER_SEC) { - rts.tv_sec++; - rts.tv_nsec -= NSEC_PER_SEC; - } - if (compat_timespec_compare(&rts, &ts) >= 0) - rts = ts; - if (copy_to_user(tsp, &rts, sizeof(rts))) { -sticky: - /* - * If an application puts its timeval in read-only - * memory, we don't want the Linux-specific update to - * the timeval to cause a fault after the select has - * completed successfully. However, because we're not - * updating the timeval, we can't restart the system - * call. - */ - if (ret == -ERESTARTNOHAND) - ret = -EINTR; - } - } + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); if (ret == -ERESTARTNOHAND) { /* @@ -1798,18 +1773,16 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, compat_sigset_t ss32; sigset_t ksigmask, sigsaved; struct compat_timespec ts; - s64 timeout = -1; + struct timespec end_time, *to = NULL; int ret; if (tsp) { if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; - /* We assume that ts.tv_sec is always lower than - the number of seconds that can be expressed in - an s64. 
Otherwise the compiler bitches at us */ - timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ); - timeout += ts.tv_sec * HZ; + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; } if (sigmask) { @@ -1823,7 +1796,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); } - ret = do_sys_poll(ufds, nfds, &timeout); + ret = do_sys_poll(ufds, nfds, to); /* We can restart this syscall, usually */ if (ret == -EINTR) { @@ -1841,31 +1814,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, } else if (sigmask) sigprocmask(SIG_SETMASK, &sigsaved, NULL); - if (tsp && timeout >= 0) { - struct compat_timespec rts; - - if (current->personality & STICKY_TIMEOUTS) - goto sticky; - /* Yes, we know it's actually an s64, but it's also positive. */ - rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * - 1000; - rts.tv_sec = timeout; - if (compat_timespec_compare(&rts, &ts) >= 0) - rts = ts; - if (copy_to_user(tsp, &rts, sizeof(rts))) { -sticky: - /* - * If an application puts its timeval in read-only - * memory, we don't want the Linux-specific update to - * the timeval to cause a fault after the select has - * completed successfully. However, because we're not - * updating the timeval, we can't restart the system - * call. 
- */ - if (ret == -ERESTARTNOHAND && timeout >= 0) - ret = -EINTR; - } - } + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); return ret; } diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index bf74973..932a92b 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -108,18 +108,18 @@ out: } -static int get_target(const char *symname, struct nameidata *nd, +static int get_target(const char *symname, struct path *path, struct config_item **target) { int ret; - ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd); + ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path); if (!ret) { - if (nd->path.dentry->d_sb == configfs_sb) { - *target = configfs_get_config_item(nd->path.dentry); + if (path->dentry->d_sb == configfs_sb) { + *target = configfs_get_config_item(path->dentry); if (!*target) { ret = -ENOENT; - path_put(&nd->path); + path_put(path); } } else ret = -EPERM; @@ -132,7 +132,7 @@ static int get_target(const char *symname, struct nameidata *nd, int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { int ret; - struct nameidata nd; + struct path path; struct configfs_dirent *sd; struct config_item *parent_item; struct config_item *target_item; @@ -159,7 +159,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna !type->ct_item_ops->allow_link) goto out_put; - ret = get_target(symname, &nd, &target_item); + ret = get_target(symname, &path, &target_item); if (ret) goto out_put; @@ -174,7 +174,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna } config_item_put(target_item); - path_put(&nd.path); + path_put(&path); out_put: config_item_put(parent_item); diff --git a/fs/dcache.c b/fs/dcache.c index e7a1a99..a1d86c7 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -69,6 +69,7 @@ struct dentry_stat_t dentry_stat = { static void __d_free(struct dentry *dentry) { + WARN_ON(!list_empty(&dentry->d_alias)); if 
(dname_external(dentry)) kfree(dentry->d_name.name); kmem_cache_free(dentry_cache, dentry); @@ -174,9 +175,12 @@ static struct dentry *d_kill(struct dentry *dentry) dentry_stat.nr_dentry--; /* For d_free, below */ /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); - parent = dentry->d_parent; + if (IS_ROOT(dentry)) + parent = NULL; + else + parent = dentry->d_parent; d_free(dentry); - return dentry == parent ? NULL : parent; + return parent; } /* @@ -666,11 +670,12 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) BUG(); } - parent = dentry->d_parent; - if (parent == dentry) + if (IS_ROOT(dentry)) parent = NULL; - else + else { + parent = dentry->d_parent; atomic_dec(&parent->d_count); + } list_del(&dentry->d_u.d_child); detached++; @@ -977,6 +982,15 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name) return d_alloc(parent, &q); } +/* the caller must hold dcache_lock */ +static void __d_instantiate(struct dentry *dentry, struct inode *inode) +{ + if (inode) + list_add(&dentry->d_alias, &inode->i_dentry); + dentry->d_inode = inode; + fsnotify_d_instantiate(dentry, inode); +} + /** * d_instantiate - fill in inode information for a dentry * @entry: dentry to complete @@ -996,10 +1010,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!list_empty(&entry->d_alias)); spin_lock(&dcache_lock); - if (inode) - list_add(&entry->d_alias, &inode->i_dentry); - entry->d_inode = inode; - fsnotify_d_instantiate(entry, inode); + __d_instantiate(entry, inode); spin_unlock(&dcache_lock); security_d_instantiate(entry, inode); } @@ -1029,7 +1040,7 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry, unsigned int hash = entry->d_name.hash; if (!inode) { - entry->d_inode = NULL; + __d_instantiate(entry, NULL); return NULL; } @@ -1048,9 +1059,7 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry, return alias; } - list_add(&entry->d_alias, 
&inode->i_dentry); - entry->d_inode = inode; - fsnotify_d_instantiate(entry, inode); + __d_instantiate(entry, inode); return NULL; } @@ -1111,69 +1120,71 @@ static inline struct hlist_head *d_hash(struct dentry *parent, } /** - * d_alloc_anon - allocate an anonymous dentry + * d_obtain_alias - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for * - * This is similar to d_alloc_root. It is used by filesystems when - * creating a dentry for a given inode, often in the process of - * mapping a filehandle to a dentry. The returned dentry may be - * anonymous, or may have a full name (if the inode was already - * in the cache). The file system may need to make further - * efforts to connect this dentry into the dcache properly. + * Obtain a dentry for an inode resulting from NFS filehandle conversion or + * similar open by handle operations. The returned dentry may be anonymous, + * or may have a full name (if the inode was already in the cache). * - * When called on a directory inode, we must ensure that - * the inode only ever has one dentry. If a dentry is - * found, that is returned instead of allocating a new one. + * When called on a directory inode, we must ensure that the inode only ever + * has one dentry. If a dentry is found, that is returned instead of + * allocating a new one. * * On successful return, the reference to the inode has been transferred - * to the dentry. If %NULL is returned (indicating kmalloc failure), - * the reference on the inode has not been released. + * to the dentry. In case of an error the reference on the inode is released. + * To make it easier to use in export operations a %NULL or IS_ERR inode may + * be passed in and the error will be propagated to the return value, + * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
*/ - -struct dentry * d_alloc_anon(struct inode *inode) +struct dentry *d_obtain_alias(struct inode *inode) { static const struct qstr anonstring = { .name = "" }; struct dentry *tmp; struct dentry *res; - if ((res = d_find_alias(inode))) { - iput(inode); - return res; - } + if (!inode) + return ERR_PTR(-ESTALE); + if (IS_ERR(inode)) + return ERR_CAST(inode); - tmp = d_alloc(NULL, &anonstring); - if (!tmp) - return NULL; + res = d_find_alias(inode); + if (res) + goto out_iput; + tmp = d_alloc(NULL, &anonstring); + if (!tmp) { + res = ERR_PTR(-ENOMEM); + goto out_iput; + } tmp->d_parent = tmp; /* make sure dput doesn't croak */ - + spin_lock(&dcache_lock); res = __d_find_alias(inode, 0); - if (!res) { - /* attach a disconnected dentry */ - res = tmp; - tmp = NULL; - spin_lock(&res->d_lock); - res->d_sb = inode->i_sb; - res->d_parent = res; - res->d_inode = inode; - res->d_flags |= DCACHE_DISCONNECTED; - res->d_flags &= ~DCACHE_UNHASHED; - list_add(&res->d_alias, &inode->i_dentry); - hlist_add_head(&res->d_hash, &inode->i_sb->s_anon); - spin_unlock(&res->d_lock); - - inode = NULL; /* don't drop reference */ + if (res) { + spin_unlock(&dcache_lock); + dput(tmp); + goto out_iput; } + + /* attach a disconnected dentry */ + spin_lock(&tmp->d_lock); + tmp->d_sb = inode->i_sb; + tmp->d_inode = inode; + tmp->d_flags |= DCACHE_DISCONNECTED; + tmp->d_flags &= ~DCACHE_UNHASHED; + list_add(&tmp->d_alias, &inode->i_dentry); + hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon); + spin_unlock(&tmp->d_lock); + spin_unlock(&dcache_lock); + return tmp; - if (inode) - iput(inode); - if (tmp) - dput(tmp); + out_iput: + iput(inode); return res; } - +EXPORT_SYMBOL_GPL(d_obtain_alias); /** * d_splice_alias - splice a disconnected dentry into the tree if one exists @@ -1200,17 +1211,14 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) new = __d_find_alias(inode, 1); if (new) { BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); - fsnotify_d_instantiate(new, inode); 
spin_unlock(&dcache_lock); security_d_instantiate(new, inode); d_rehash(dentry); d_move(new, dentry); iput(inode); } else { - /* d_instantiate takes dcache_lock, so we do it by hand */ - list_add(&dentry->d_alias, &inode->i_dentry); - dentry->d_inode = inode; - fsnotify_d_instantiate(dentry, inode); + /* already taking dcache_lock, so d_add() by hand */ + __d_instantiate(dentry, inode); spin_unlock(&dcache_lock); security_d_instantiate(dentry, inode); d_rehash(dentry); @@ -1293,8 +1301,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, * d_instantiate() by hand because it takes dcache_lock which * we already hold. */ - list_add(&found->d_alias, &inode->i_dentry); - found->d_inode = inode; + __d_instantiate(found, inode); spin_unlock(&dcache_lock); security_d_instantiate(found, inode); return found; @@ -1456,8 +1463,6 @@ out: * d_validate - verify dentry provided from insecure source * @dentry: The dentry alleged to be valid child of @dparent * @dparent: The parent dentry (known to be valid) - * @hash: Hash of the dentry - * @len: Length of the name * * An insecure source has sent us a dentry, here we verify it and dget() it. * This is used by ncpfs in its readdir implementation. @@ -1714,18 +1719,23 @@ void d_move(struct dentry * dentry, struct dentry * target) spin_unlock(&dcache_lock); } -/* - * Helper that returns 1 if p1 is a parent of p2, else 0 +/** + * d_ancestor - search for an ancestor + * @p1: ancestor dentry + * @p2: child dentry + * + * Returns the ancestor dentry of p2 which is a child of p1, if p1 is + * an ancestor of p2, else NULL. 
*/ -static int d_isparent(struct dentry *p1, struct dentry *p2) +struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) { struct dentry *p; - for (p = p2; p->d_parent != p; p = p->d_parent) { + for (p = p2; !IS_ROOT(p); p = p->d_parent) { if (p->d_parent == p1) - return 1; + return p; } - return 0; + return NULL; } /* @@ -1749,7 +1759,7 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) /* Check for loops */ ret = ERR_PTR(-ELOOP); - if (d_isparent(alias, dentry)) + if (d_ancestor(alias, dentry)) goto out_err; /* See lock_rename() */ @@ -1822,7 +1832,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) if (!inode) { actual = dentry; - dentry->d_inode = NULL; + __d_instantiate(dentry, NULL); goto found_lock; } @@ -2149,32 +2159,27 @@ out: * Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ -int is_subdir(struct dentry * new_dentry, struct dentry * old_dentry) +int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { int result; - struct dentry * saved = new_dentry; unsigned long seq; - /* need rcu_readlock to protect against the d_parent trashing due to - * d_move + /* FIXME: This is old behavior, needed? Please check callers. 
*/ + if (new_dentry == old_dentry) + return 1; + + /* + * Need rcu_readlock to protect against the d_parent trashing + * due to d_move */ rcu_read_lock(); - do { + do { /* for restarting inner loop in case of seq retry */ - new_dentry = saved; - result = 0; seq = read_seqbegin(&rename_lock); - for (;;) { - if (new_dentry != old_dentry) { - struct dentry * parent = new_dentry->d_parent; - if (parent == new_dentry) - break; - new_dentry = parent; - continue; - } + if (d_ancestor(old_dentry, new_dentry)) result = 1; - break; - } + else + result = 0; } while (read_seqretry(&rename_lock, seq)); rcu_read_unlock(); @@ -2344,7 +2349,6 @@ void __init vfs_caches_init(unsigned long mempages) } EXPORT_SYMBOL(d_alloc); -EXPORT_SYMBOL(d_alloc_anon); EXPORT_SYMBOL(d_alloc_root); EXPORT_SYMBOL(d_delete); EXPORT_SYMBOL(d_find_alias); @@ -1805,19 +1805,19 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id, } /* Actual function called from quotactl() */ -int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path, +int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, int remount) { - struct nameidata nd; + struct path path; int error; if (remount) return vfs_quota_on_remount(sb, type); - error = path_lookup(path, LOOKUP_FOLLOW, &nd); + error = kern_path(name, LOOKUP_FOLLOW, &path); if (!error) { - error = vfs_quota_on_path(sb, type, format_id, &nd.path); - path_put(&nd.path); + error = vfs_quota_on_path(sb, type, format_id, &path); + path_put(&path); } return error; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 046e027..64d2ba9 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -471,31 +471,26 @@ out: */ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name) { + struct path path; int rc; - struct nameidata nd; - struct dentry *lower_root; - struct vfsmount *lower_mnt; - memset(&nd, 0, sizeof(struct nameidata)); - rc = path_lookup(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, 
&nd); + rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); if (rc) { ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); goto out; } - lower_root = nd.path.dentry; - lower_mnt = nd.path.mnt; - ecryptfs_set_superblock_lower(sb, lower_root->d_sb); - sb->s_maxbytes = lower_root->d_sb->s_maxbytes; - sb->s_blocksize = lower_root->d_sb->s_blocksize; - ecryptfs_set_dentry_lower(sb->s_root, lower_root); - ecryptfs_set_dentry_lower_mnt(sb->s_root, lower_mnt); - rc = ecryptfs_interpose(lower_root, sb->s_root, sb, 0); + ecryptfs_set_superblock_lower(sb, path.dentry->d_sb); + sb->s_maxbytes = path.dentry->d_sb->s_maxbytes; + sb->s_blocksize = path.dentry->d_sb->s_blocksize; + ecryptfs_set_dentry_lower(sb->s_root, path.dentry); + ecryptfs_set_dentry_lower_mnt(sb->s_root, path.mnt); + rc = ecryptfs_interpose(path.dentry, sb->s_root, sb, 0); if (rc) goto out_free; rc = 0; goto out; out_free: - path_put(&nd.path); + path_put(&path); out: return rc; } diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 291abb1..c3fb5f9 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -112,35 +112,14 @@ struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid, struct dentry *efs_get_parent(struct dentry *child) { - struct dentry *parent; - struct inode *inode; + struct dentry *parent = ERR_PTR(-ENOENT); efs_ino_t ino; - long error; lock_kernel(); - - error = -ENOENT; ino = efs_find_entry(child->d_inode, "..", 2); - if (!ino) - goto fail; - - inode = efs_iget(child->d_inode->i_sb, ino); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - goto fail; - } - - error = -ENOMEM; - parent = d_alloc_anon(inode); - if (!parent) - goto fail_iput; - + if (ino) + parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino)); unlock_kernel(); - return parent; - fail_iput: - iput(inode); - fail: - unlock_kernel(); - return ERR_PTR(error); + return parent; } @@ -1386,7 +1386,7 @@ EXPORT_SYMBOL(set_binfmt); * name into corename, which must have space for at least * 
CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(char *corename, int nr_threads, long signr) +static int format_corename(char *corename, long signr) { const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); @@ -1493,8 +1493,7 @@ static int format_corename(char *corename, int nr_threads, long signr) * If core_pattern does not include a %p (as is the default) * and core_uses_pid is set, then .%pid will be appended to * the filename. Do not do this for piped commands. */ - if (!ispipe && !pid_in_pattern - && (core_uses_pid || nr_threads)) { + if (!ispipe && !pid_in_pattern && core_uses_pid) { rc = snprintf(out_ptr, out_end - out_ptr, ".%d", task_tgid_vnr(current)); if (rc > out_end - out_ptr) @@ -1757,7 +1756,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) * uses lock_kernel() */ lock_kernel(); - ispipe = format_corename(corename, retval, signr); + ispipe = format_corename(corename, signr); unlock_kernel(); /* * Don't bother to check the RLIMIT_CORE value if core_pattern points diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index cc91227..80246ba 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -94,9 +94,8 @@ find_disconnected_root(struct dentry *dentry) * It may already be, as the flag isn't always updated when connection happens. 
*/ static int -reconnect_path(struct vfsmount *mnt, struct dentry *target_dir) +reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf) { - char nbuf[NAME_MAX+1]; int noprogress = 0; int err = -ESTALE; @@ -281,13 +280,14 @@ static int get_name(struct vfsmount *mnt, struct dentry *dentry, int old_seq = buffer.sequence; error = vfs_readdir(file, filldir_one, &buffer); + if (buffer.found) { + error = 0; + break; + } if (error < 0) break; - error = 0; - if (buffer.found) - break; error = -ENOENT; if (old_seq == buffer.sequence) break; @@ -360,14 +360,13 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, { const struct export_operations *nop = mnt->mnt_sb->s_export_op; struct dentry *result, *alias; + char nbuf[NAME_MAX+1]; int err; /* * Try to get any dentry for the given file handle from the filesystem. */ result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); - if (!result) - result = ERR_PTR(-ESTALE); if (IS_ERR(result)) return result; @@ -381,7 +380,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, * filesystem root. */ if (result->d_flags & DCACHE_DISCONNECTED) { - err = reconnect_path(mnt, result); + err = reconnect_path(mnt, result, nbuf); if (err) goto err_result; } @@ -397,7 +396,6 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, * It's not a directory. Life is a little more complicated. */ struct dentry *target_dir, *nresult; - char nbuf[NAME_MAX+1]; /* * See if either the dentry we just got from the filesystem @@ -422,8 +420,6 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, target_dir = nop->fh_to_parent(mnt->mnt_sb, fid, fh_len, fileid_type); - if (!target_dir) - goto err_result; err = PTR_ERR(target_dir); if (IS_ERR(target_dir)) goto err_result; @@ -433,7 +429,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, * connected to the filesystem root. 
The VFS really doesn't * like disconnected directories.. */ - err = reconnect_path(mnt, target_dir); + err = reconnect_path(mnt, target_dir, nbuf); if (err) { dput(target_dir); goto err_result; diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig new file mode 100644 index 0000000..14a6780 --- /dev/null +++ b/fs/ext2/Kconfig @@ -0,0 +1,55 @@ +config EXT2_FS + tristate "Second extended fs support" + help + Ext2 is a standard Linux file system for hard disks. + + To compile this file system support as a module, choose M here: the + module will be called ext2. + + If unsure, say Y. + +config EXT2_FS_XATTR + bool "Ext2 extended attributes" + depends on EXT2_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + +config EXT2_FS_POSIX_ACL + bool "Ext2 POSIX Access Control Lists" + depends on EXT2_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config EXT2_FS_SECURITY + bool "Ext2 Security Labels" + depends on EXT2_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext2 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config EXT2_FS_XIP + bool "Ext2 execute in place support" + depends on EXT2_FS && MMU + help + Execute in place can be used on memory-backed block devices. If you + enable this option, you can select to mount block devices which are + capable of this feature without using the page cache. 
+ + If you do not use a block device that is capable of using this, + or if unsure, say N. diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 11a49ce..9a0fc40 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -354,11 +354,11 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) * (as a parameter - res_dir). Page is returned mapped and unlocked. * Entry is guaranteed to be valid. */ -struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, - struct dentry *dentry, struct page ** res_page) +struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir, + struct qstr *child, struct page ** res_page) { - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + const char *name = child->name; + int namelen = child->len; unsigned reclen = EXT2_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); @@ -431,13 +431,13 @@ struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p) return de; } -ino_t ext2_inode_by_name(struct inode * dir, struct dentry *dentry) +ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child) { ino_t res = 0; - struct ext2_dir_entry_2 * de; + struct ext2_dir_entry_2 *de; struct page *page; - de = ext2_find_entry (dir, dentry, &page); + de = ext2_find_entry (dir, child, &page); if (de) { res = le32_to_cpu(de->inode); ext2_put_page(page); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index bae998c..3203042 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -105,9 +105,9 @@ extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_wind /* dir.c */ extern int ext2_add_link (struct dentry *, struct inode *); -extern ino_t ext2_inode_by_name(struct inode *, struct dentry *); +extern ino_t ext2_inode_by_name(struct inode *, struct qstr *); extern int ext2_make_empty(struct inode *, struct inode *); -extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct dentry *, struct page **); +extern struct ext2_dir_entry_2 * ext2_find_entry 
(struct inode *,struct qstr *, struct page **); extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *); extern int ext2_empty_dir (struct inode *); extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 80c97fd..2a74725 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -60,7 +60,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str if (dentry->d_name.len > EXT2_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ino = ext2_inode_by_name(dir, dentry); + ino = ext2_inode_by_name(dir, &dentry->d_name); inode = NULL; if (ino) { inode = ext2_iget(dir->i_sb, ino); @@ -72,27 +72,11 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str struct dentry *ext2_get_parent(struct dentry *child) { - unsigned long ino; - struct dentry *parent; - struct inode *inode; - struct dentry dotdot; - - dotdot.d_name.name = ".."; - dotdot.d_name.len = 2; - - ino = ext2_inode_by_name(child->d_inode, &dotdot); + struct qstr dotdot = {.name = "..", .len = 2}; + unsigned long ino = ext2_inode_by_name(child->d_inode, &dotdot); if (!ino) return ERR_PTR(-ENOENT); - inode = ext2_iget(child->d_inode->i_sb, ino); - - if (IS_ERR(inode)) - return ERR_CAST(inode); - parent = d_alloc_anon(inode); - if (!parent) { - iput(inode); - parent = ERR_PTR(-ENOMEM); - } - return parent; + return d_obtain_alias(ext2_iget(child->d_inode->i_sb, ino)); } /* @@ -257,7 +241,7 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry) struct page * page; int err = -ENOENT; - de = ext2_find_entry (dir, dentry, &page); + de = ext2_find_entry (dir, &dentry->d_name, &page); if (!de) goto out; @@ -299,7 +283,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct ext2_dir_entry_2 * old_de; int err = -ENOENT; - old_de = ext2_find_entry (old_dir, old_dentry, &old_page); + old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); 
if (!old_de) goto out; @@ -319,7 +303,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, goto out_dir; err = -ENOENT; - new_de = ext2_find_entry (new_dir, new_dentry, &new_page); + new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; inode_inc_link_count(old_inode); diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index 4fb94c2..b72b858 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -11,6 +11,7 @@ #include <linux/buffer_head.h> #include <linux/ext2_fs_sb.h> #include <linux/ext2_fs.h> +#include <linux/blkdev.h> #include "ext2.h" #include "xip.h" diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig new file mode 100644 index 0000000..8e0cfe4 --- /dev/null +++ b/fs/ext3/Kconfig @@ -0,0 +1,67 @@ +config EXT3_FS + tristate "Ext3 journalling file system support" + select JBD + help + This is the journalling version of the Second extended file system + (often called ext3), the de facto standard Linux file system + (method to organize files on a storage device) for hard disks. + + The journalling code included in this driver means you do not have + to run e2fsck (file system checker) on your file systems after a + crash. The journal keeps track of any changes that were being made + at the time the system crashed, and can ensure that your file system + is consistent without the need for a lengthy check. + + Other than adding the journal to the file system, the on-disk format + of ext3 is identical to ext2. It is possible to freely switch + between using the ext3 driver and the ext2 driver, as long as the + file system has been cleanly unmounted, or e2fsck is run on the file + system. + + To add a journal on an existing ext2 file system or change the + behavior of ext3 file systems, you can use the tune2fs utility ("man + tune2fs"). To modify attributes of files and directories on ext3 + file systems, use chattr ("man chattr"). 
You need to be using + e2fsprogs version 1.20 or later in order to create ext3 journals + (available at <http://sourceforge.net/projects/e2fsprogs/>). + + To compile this file system support as a module, choose M here: the + module will be called ext3. + +config EXT3_FS_XATTR + bool "Ext3 extended attributes" + depends on EXT3_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext3. + +config EXT3_FS_POSIX_ACL + bool "Ext3 POSIX Access Control Lists" + depends on EXT3_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config EXT3_FS_SECURITY + bool "Ext3 Security Labels" + depends on EXT3_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext3 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 92fd033..f5b57a2 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1547,6 +1547,7 @@ retry_alloc: * turn off reservation for this allocation */ if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) && (rsv_is_empty(&my_rsv->rsv_window))) my_rsv = NULL; @@ -1585,7 +1586,7 @@ retry_alloc: * free blocks is less than half of the reservation * window size. 
*/ - if (free_blocks <= (windowsz/2)) + if (my_rsv && (free_blocks <= (windowsz/2))) continue; brelse(bitmap_bh); diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 2eea96e..4c82531 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -102,6 +102,7 @@ static int ext3_readdir(struct file * filp, int err; struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; + int dir_has_error = 0; sb = inode->i_sb; @@ -148,9 +149,12 @@ static int ext3_readdir(struct file * filp, * of recovering data when there's a bad sector */ if (!bh) { - ext3_error (sb, "ext3_readdir", - "directory #%lu contains a hole at offset %lu", - inode->i_ino, (unsigned long)filp->f_pos); + if (!dir_has_error) { + ext3_error(sb, __func__, "directory #%lu " + "contains a hole at offset %lld", + inode->i_ino, filp->f_pos); + dir_has_error = 1; + } /* corrupt size? Maybe no more blocks to read */ if (filp->f_pos > inode->i_blocks << 9) break; @@ -410,7 +414,7 @@ static int call_filldir(struct file * filp, void * dirent, get_dtype(sb, fname->file_type)); if (error) { filp->f_pos = curr_pos; - info->extra_fname = fname->next; + info->extra_fname = fname; return error; } fname = fname->next; @@ -449,11 +453,21 @@ static int ext3_dx_readdir(struct file * filp, * If there are any leftover names on the hash collision * chain, return them first. 
*/ - if (info->extra_fname && - call_filldir(filp, dirent, filldir, info->extra_fname)) - goto finished; + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; - if (!info->curr_node) + info->extra_fname = NULL; + info->curr_node = rb_next(info->curr_node); + if (!info->curr_node) { + if (info->next_hash == ~0) { + filp->f_pos = EXT3_HTREE_EOF; + goto finished; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index ebfec4d..f8424ad 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1186,6 +1186,13 @@ write_begin_failed: ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 0d0c701..b7394d0 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -239,7 +239,7 @@ setrsvsz_out: case EXT3_IOC_GROUP_EXTEND: { ext3_fsblk_t n_blocks_count; struct super_block *sb = inode->i_sb; - int err; + int err, err2; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -254,8 +254,10 @@ setrsvsz_out: } err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); journal_lock_updates(EXT3_SB(sb)->s_journal); - journal_flush(EXT3_SB(sb)->s_journal); + err2 = journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err == 0) + err = err2; group_extend_out: mnt_drop_write(filp->f_path.mnt); return err; @@ -263,7 +265,7 @@ group_extend_out: case EXT3_IOC_GROUP_ADD: { struct ext3_new_group_data input; struct super_block *sb = inode->i_sb; - int err; + int err, err2; if 
(!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -280,8 +282,10 @@ group_extend_out: err = ext3_group_add(sb, &input); journal_lock_updates(EXT3_SB(sb)->s_journal); - journal_flush(EXT3_SB(sb)->s_journal); + err2 = journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err == 0) + err = err2; group_add_out: mnt_drop_write(filp->f_path.mnt); return err; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index de13e91..3e5edc9 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -159,7 +159,7 @@ static void dx_set_count (struct dx_entry *entries, unsigned value); static void dx_set_limit (struct dx_entry *entries, unsigned value); static unsigned dx_root_limit (struct inode *dir, unsigned infosize); static unsigned dx_node_limit (struct inode *dir); -static struct dx_frame *dx_probe(struct dentry *dentry, +static struct dx_frame *dx_probe(struct qstr *entry, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame, @@ -176,8 +176,9 @@ static int ext3_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash); -static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, - struct ext3_dir_entry_2 **res_dir, int *err); +static struct buffer_head * ext3_dx_find_entry(struct inode *dir, + struct qstr *entry, struct ext3_dir_entry_2 **res_dir, + int *err); static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); @@ -342,7 +343,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, * back to userspace. 
*/ static struct dx_frame * -dx_probe(struct dentry *dentry, struct inode *dir, +dx_probe(struct qstr *entry, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) { unsigned count, indirect; @@ -353,8 +354,6 @@ dx_probe(struct dentry *dentry, struct inode *dir, u32 hash; frame->bh = NULL; - if (dentry) - dir = dentry->d_parent->d_inode; if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) goto fail; root = (struct dx_root *) bh->b_data; @@ -370,8 +369,8 @@ dx_probe(struct dentry *dentry, struct inode *dir, } hinfo->hash_version = root->info.hash_version; hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; - if (dentry) - ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); + if (entry) + ext3fs_dirhash(entry->name, entry->len, hinfo); hash = hinfo->hash; if (root->info.unused_flags & 1) { @@ -803,15 +802,15 @@ static inline int ext3_match (int len, const char * const name, */ static inline int search_dirblock(struct buffer_head * bh, struct inode *dir, - struct dentry *dentry, + struct qstr *child, unsigned long offset, struct ext3_dir_entry_2 ** res_dir) { struct ext3_dir_entry_2 * de; char * dlimit; int de_len; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + const char *name = child->name; + int namelen = child->len; de = (struct ext3_dir_entry_2 *) bh->b_data; dlimit = bh->b_data + dir->i_sb->s_blocksize; @@ -850,8 +849,9 @@ static inline int search_dirblock(struct buffer_head * bh, * The returned buffer_head has ->b_count elevated. The caller is expected * to brelse() it when appropriate. 
*/ -static struct buffer_head * ext3_find_entry (struct dentry *dentry, - struct ext3_dir_entry_2 ** res_dir) +static struct buffer_head *ext3_find_entry(struct inode *dir, + struct qstr *entry, + struct ext3_dir_entry_2 **res_dir) { struct super_block * sb; struct buffer_head * bh_use[NAMEI_RA_SIZE]; @@ -863,16 +863,15 @@ static struct buffer_head * ext3_find_entry (struct dentry *dentry, buffer */ int num = 0; int nblocks, i, err; - struct inode *dir = dentry->d_parent->d_inode; int namelen; *res_dir = NULL; sb = dir->i_sb; - namelen = dentry->d_name.len; + namelen = entry->len; if (namelen > EXT3_NAME_LEN) return NULL; if (is_dx(dir)) { - bh = ext3_dx_find_entry(dentry, res_dir, &err); + bh = ext3_dx_find_entry(dir, entry, res_dir, &err); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the @@ -923,7 +922,7 @@ restart: brelse(bh); goto next; } - i = search_dirblock(bh, dir, dentry, + i = search_dirblock(bh, dir, entry, block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT3_I(dir)->i_dir_start_lookup = block; @@ -957,8 +956,9 @@ cleanup_and_exit: return ret; } -static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, - struct ext3_dir_entry_2 **res_dir, int *err) +static struct buffer_head * ext3_dx_find_entry(struct inode *dir, + struct qstr *entry, struct ext3_dir_entry_2 **res_dir, + int *err) { struct super_block * sb; struct dx_hash_info hinfo; @@ -968,14 +968,13 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, struct buffer_head *bh; unsigned long block; int retval; - int namelen = dentry->d_name.len; - const u8 *name = dentry->d_name.name; - struct inode *dir = dentry->d_parent->d_inode; + int namelen = entry->len; + const u8 *name = entry->name; sb = dir->i_sb; /* NFS may look up ".." - look at dx_root directory block */ - if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ - if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) + if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) { + if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) return NULL; } else { frame = frames; @@ -1036,7 +1035,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - bh = ext3_find_entry(dentry, &de); + bh = ext3_find_entry(dir, &dentry->d_name, &de); inode = NULL; if (bh) { unsigned long ino = le32_to_cpu(de->inode); @@ -1057,18 +1056,11 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str struct dentry *ext3_get_parent(struct dentry *child) { unsigned long ino; - struct dentry *parent; - struct inode *inode; - struct dentry dotdot; + struct qstr dotdot = {.name = "..", .len = 2}; struct ext3_dir_entry_2 * de; struct buffer_head *bh; - dotdot.d_name.name = ".."; - dotdot.d_name.len = 2; - dotdot.d_parent = child; /* confusing, isn't it! 
*/ - - bh = ext3_find_entry(&dotdot, &de); - inode = NULL; + bh = ext3_find_entry(child->d_inode, &dotdot, &de); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -1080,16 +1072,7 @@ struct dentry *ext3_get_parent(struct dentry *child) return ERR_PTR(-EIO); } - inode = ext3_iget(child->d_inode->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - parent = d_alloc_anon(inode); - if (!parent) { - iput(inode); - parent = ERR_PTR(-ENOMEM); - } - return parent; + return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino)); } #define S_SHIFT 12 @@ -1503,7 +1486,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, struct ext3_dir_entry_2 *de; int err; - frame = dx_probe(dentry, NULL, &hinfo, frames, &err); + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); if (!frame) return err; entries = frame->entries; @@ -2056,7 +2039,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) return PTR_ERR(handle); retval = -ENOENT; - bh = ext3_find_entry (dentry, &de); + bh = ext3_find_entry(dir, &dentry->d_name, &de); if (!bh) goto end_rmdir; @@ -2118,7 +2101,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) handle->h_sync = 1; retval = -ENOENT; - bh = ext3_find_entry (dentry, &de); + bh = ext3_find_entry(dir, &dentry->d_name, &de); if (!bh) goto end_unlink; @@ -2276,7 +2259,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) handle->h_sync = 1; - old_bh = ext3_find_entry (old_dentry, &old_de); + old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de); /* * Check for inode number is _not_ due to possible IO errors. 
* We might rmdir the source, keep it as pwd of some process @@ -2289,7 +2272,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, goto end_rename; new_inode = new_dentry->d_inode; - new_bh = ext3_find_entry (new_dentry, &new_de); + new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh) { if (!new_inode) { brelse (new_bh); @@ -2355,7 +2338,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, struct buffer_head *old_bh2; struct ext3_dir_entry_2 *old_de2; - old_bh2 = ext3_find_entry(old_dentry, &old_de2); + old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name, + &old_de2); if (old_bh2) { retval = ext3_delete_entry(handle, old_dir, old_de2, old_bh2); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 77278e9..78fdf38 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -790,7 +790,8 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (reserved_gdb || gdb_off == 0) { if (!EXT3_HAS_COMPAT_FEATURE(sb, - EXT3_FEATURE_COMPAT_RESIZE_INODE)){ + EXT3_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext3_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 399a96a..18eaa78 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -347,7 +347,7 @@ fail: static int ext3_blkdev_put(struct block_device *bdev) { bd_release(bdev); - return blkdev_put(bdev); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); } static int ext3_blkdev_remove(struct ext3_sb_info *sbi) @@ -393,7 +393,8 @@ static void ext3_put_super (struct super_block * sb) int i; ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); + if (journal_destroy(sbi->s_journal) < 0) + ext3_abort(sb, __func__, "Couldn't clean up the journal"); if (!(sb->s_flags & MS_RDONLY)) { EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); @@ -625,6 +626,9 @@ 
static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) seq_puts(seq, ",data=writeback"); + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + ext3_show_quota_options(seq, sb); return 0; @@ -754,6 +758,7 @@ enum { Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, @@ -796,6 +801,8 @@ static const match_table_t tokens = { {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1011,6 +1018,12 @@ static int parse_options (char *options, struct super_block *sb, sbi->s_mount_opt |= data_opt; } break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -1986,6 +1999,10 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JFS_BARRIER; else journal->j_flags &= ~JFS_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR; spin_unlock(&journal->j_state_lock); } @@ -2050,7 +2067,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, if (bd_claim(bdev, sb)) { printk(KERN_ERR "EXT3: failed to claim external journal device.\n"); - 
blkdev_put(bdev); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE); return NULL; } @@ -2280,7 +2297,9 @@ static void ext3_mark_recovery_complete(struct super_block * sb, journal_t *journal = EXT3_SB(sb)->s_journal; journal_lock_updates(journal); - journal_flush(journal); + if (journal_flush(journal) < 0) + goto out; + lock_super(sb); if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { @@ -2289,6 +2308,8 @@ static void ext3_mark_recovery_complete(struct super_block * sb, ext3_commit_super(sb, es, 1); } unlock_super(sb); + +out: journal_unlock_updates(journal); } @@ -2388,7 +2409,13 @@ static void ext3_write_super_lockfs(struct super_block *sb) /* Now we set up the journal barrier. */ journal_lock_updates(journal); - journal_flush(journal); + + /* + * We don't want to clear needs_recovery flag when we failed + * to flush the journal. + */ + if (journal_flush(journal) < 0) + return; /* Journal blocked and flushed, clear needs_recovery flag. */ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); @@ -2767,30 +2794,30 @@ static int ext3_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *path, int remount) + char *name, int remount) { int err; - struct nameidata nd; + struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - /* When remounting, no checks are needed and in fact, path is NULL */ + /* When remounting, no checks are needed and in fact, name is NULL */ if (remount) - return vfs_quota_on(sb, type, format_id, path, remount); + return vfs_quota_on(sb, type, format_id, name, remount); - err = path_lookup(path, LOOKUP_FOLLOW, &nd); + err = kern_path(name, LOOKUP_FOLLOW, &path); if (err) return err; /* Quotafile not on the same filesystem? */ - if (nd.path.mnt->mnt_sb != sb) { - path_put(&nd.path); + if (path.mnt->mnt_sb != sb) { + path_put(&path); return -EXDEV; } /* Journaling quota? 
*/ if (EXT3_SB(sb)->s_qf_names[type]) { /* Quotafile not of fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) + if (path.dentry->d_parent != sb->s_root) printk(KERN_WARNING "EXT3-fs: Quota file not on filesystem root. " "Journaled quota will not work.\n"); @@ -2800,18 +2827,22 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, * When we journal data on quota file, we have to flush journal to see * all updates to the file when we bypass pagecache... */ - if (ext3_should_journal_data(nd.path.dentry->d_inode)) { + if (ext3_should_journal_data(path.dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... */ journal_lock_updates(EXT3_SB(sb)->s_journal); - journal_flush(EXT3_SB(sb)->s_journal); + err = journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); + if (err) { + path_put(&path); + return err; + } } - err = vfs_quota_on_path(sb, type, format_id, &nd.path); - path_put(&nd.path); + err = vfs_quota_on_path(sb, type, format_id, &path); + path_put(&path); return err; } diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig new file mode 100644 index 0000000..7505482 --- /dev/null +++ b/fs/ext4/Kconfig @@ -0,0 +1,79 @@ +config EXT4_FS + tristate "The Extended 4 (ext4) filesystem" + select JBD2 + select CRC16 + help + This is the next generation of the ext3 filesystem. + + Unlike the change from ext2 filesystem to ext3 filesystem, + the on-disk format of ext4 is not forwards compatible with + ext3; it is based on extent maps and it supports 48-bit + physical block numbers. The ext4 filesystem also supports delayed + allocation, persistent preallocation, high resolution time stamps, + and a number of other features to improve performance and speed + up fsck time. For more information, please see the web pages at + http://ext4.wiki.kernel.org. 
+ + The ext4 filesystem will support mounting an ext3 + filesystem; while there will be some performance gains from + the delayed allocation and inode table readahead, the best + performance gains will require enabling ext4 features in the + filesystem, or formatting a new filesystem as an ext4 + filesystem initially. + + To compile this file system support as a module, choose M here. The + module will be called ext4. + + If unsure, say N. + +config EXT4DEV_COMPAT + bool "Enable ext4dev compatibility" + depends on EXT4_FS + help + Starting with 2.6.28, the name of the ext4 filesystem was + renamed from ext4dev to ext4. Unfortunately there are some + legacy userspace programs (such as klibc's fstype) that have + "ext4dev" hardcoded. + + To enable backwards compatibility for systems that are + still expecting to mount ext4 filesystems using ext4dev, + choose Y here. This feature will go away by 2.6.31, so + please arrange to get your userspace programs fixed! + +config EXT4_FS_XATTR + bool "Ext4 extended attributes" + depends on EXT4_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext4. + +config EXT4_FS_POSIX_ACL + bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS_XATTR + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N. + +config EXT4_FS_SECURITY + bool "Ext4 Security Labels" + depends on EXT4_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux.
This option + enables an extended attribute handler for file security + labels in the ext4 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index bd2ece2..b9821be 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, /* this isn't the right place to decide whether block is metadata * inode.c/extents.c knows better, but for safety ... */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || - ext4_should_journal_data(inode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + metadata = 1; + + /* We need to make sure we don't reuse + * blocks released until the transaction commits. + * writeback mode has weak data consistency so + * don't force data as metadata when freeing block + * for writeback mode. + */ + if (metadata == 0 && !ext4_should_writeback_data(inode)) metadata = 1; sb = inode->i_sb; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6690a41..4880cc3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -511,7 +511,6 @@ do { \ /* * Mount flags */ -#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */ #define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 6a0b40d..445fde6 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -99,9 +99,6 @@ struct ext4_sb_info { struct inode *s_buddy_cache; long s_blocks_reserved; spinlock_t s_reserve_lock; - struct list_head s_active_transaction; - struct list_head s_closed_transaction; - struct list_head s_committed_transaction; spinlock_t s_md_lock; tid_t s_last_transaction; unsigned short *s_mb_offsets, *s_mb_maxs; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index
9b4ec9d..8dbf695 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) int ret = 0, err, nr_pages, i; unsigned long index, end; struct pagevec pvec; + long pages_skipped; BUG_ON(mpd->next_page <= mpd->first_page); pagevec_init(&pvec, 0); @@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) end = mpd->next_page - 1; while (index <= end) { - /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + /* + * We can use PAGECACHE_TAG_DIRTY lookup here because + * even though we have cleared the dirty flag on the page + * We still keep the page in the radix tree with tag + * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. + * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback + * which is called via the below writepage callback. + */ + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) - break; - index++; - + pages_skipped = mpd->wbc->pages_skipped; err = mapping->a_ops->writepage(page, mpd->wbc); - if (!err) + if (!err && (pages_skipped == mpd->wbc->pages_skipped)) + /* + * have successfully written the page + * without skipping the same + */ mpd->pages_written++; /* * In error case, we have to continue because @@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd) { - long to_write; int ret; if (!mpd->get_block) @@ -2119,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping, mpd->pages_written = 0; mpd->retval = 0; - to_write = wbc->nr_to_write; - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); - /* * Handle last extent of pages */ if (!mpd->io_done && mpd->next_page != mpd->first_page) { if 
(mpage_da_map_blocks(mpd) == 0) mpage_da_submit_io(mpd); - } - wbc->nr_to_write = to_write - mpd->pages_written; + mpd->io_done = 1; + ret = MPAGE_DA_EXTENT_TAIL; + } + wbc->nr_to_write -= mpd->pages_written; return ret; } @@ -2360,12 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { + pgoff_t index; + int range_whole = 0; handle_t *handle = NULL; - loff_t range_start = 0; struct mpage_da_data mpd; struct inode *inode = mapping->host; + int no_nrwrite_index_update; + long pages_written = 0, pages_skipped; int needed_blocks, ret = 0, nr_to_writebump = 0; - long to_write, pages_skipped = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); /* @@ -2385,23 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping, nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; wbc->nr_to_write = sbi->s_mb_stream_request; } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; - if (!wbc->range_cyclic) - /* - * If range_cyclic is not set force range_cont - * and save the old writeback_index - */ - wbc->range_cont = 1; - - range_start = wbc->range_start; - pages_skipped = wbc->pages_skipped; + if (wbc->range_cyclic) + index = mapping->writeback_index; + else + index = wbc->range_start >> PAGE_CACHE_SHIFT; mpd.wbc = wbc; mpd.inode = mapping->host; -restart_loop: - to_write = wbc->nr_to_write; - while (!ret && to_write > 0) { + /* + * we don't want write_cache_pages to update + * nr_to_write and writeback_index + */ + no_nrwrite_index_update = wbc->no_nrwrite_index_update; + wbc->no_nrwrite_index_update = 1; + pages_skipped = wbc->pages_skipped; + + while (!ret && wbc->nr_to_write > 0) { /* * we insert one extent at a time. 
So we need @@ -2422,48 +2436,53 @@ restart_loop: dump_stack(); goto out_writepages; } - to_write -= wbc->nr_to_write; - mpd.get_block = ext4_da_get_block_write; ret = mpage_da_writepages(mapping, wbc, &mpd); ext4_journal_stop(handle); - if (mpd.retval == -ENOSPC) + if (mpd.retval == -ENOSPC) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ jbd2_journal_force_commit_nested(sbi->s_journal); - - /* reset the retry count */ - if (ret == MPAGE_DA_EXTENT_TAIL) { + wbc->pages_skipped = pages_skipped; + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* * got one extent now try with * rest of the pages */ - to_write += wbc->nr_to_write; + pages_written += mpd.pages_written; + wbc->pages_skipped = pages_skipped; ret = 0; - } else if (wbc->nr_to_write) { + } else if (wbc->nr_to_write) /* * There is no more writeout needed * or we requested for a noblocking writeout * and we found the device congested */ - to_write += wbc->nr_to_write; break; - } - wbc->nr_to_write = to_write; - } - - if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { - /* We skipped pages in this loop */ - wbc->range_start = range_start; - wbc->nr_to_write = to_write + - wbc->pages_skipped - pages_skipped; - wbc->pages_skipped = pages_skipped; - goto restart_loop; } + if (pages_skipped != wbc->pages_skipped) + printk(KERN_EMERG "This should not happen leaving %s " + "with nr_to_write = %ld ret = %d\n", + __func__, wbc->nr_to_write, ret); + + /* Update index */ + index += pages_written; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = index; out_writepages: - wbc->nr_to_write = to_write - nr_to_writebump; - wbc->range_start = range_start; + if (!no_nrwrite_index_update) + wbc->no_nrwrite_index_update = 0; + wbc->nr_to_write -= nr_to_writebump; return ret; } @@ -4175,7 +4194,6 @@ static int 
ext4_inode_blocks_set(handle_t *handle, struct inode *inode = &(ei->vfs_inode); u64 i_blocks = inode->i_blocks; struct super_block *sb = inode->i_sb; - int err = 0; if (i_blocks <= ~0U) { /* @@ -4185,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle, raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ei->i_flags &= ~EXT4_HUGE_FILE_FL; - } else if (i_blocks <= 0xffffffffffffULL) { + return 0; + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + return -EFBIG; + + if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; - /* i_block is stored in the split 48 bit fields */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ei->i_flags &= ~EXT4_HUGE_FILE_FL; } else { - /* - * i_blocks should be represented in a 48 bit variable - * as multiple of file system block size - */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; ei->i_flags |= EXT4_HUGE_FILE_FL; /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } -err_out: - return err; + return 0; } /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b580714..dfe17a1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + meta_group_info[i]->bb_free_root.rb_node = NULL;; #ifdef DOUBLE_CHECK { @@ -2522,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) } spin_lock_init(&sbi->s_md_lock); - INIT_LIST_HEAD(&sbi->s_active_transaction); - 
INIT_LIST_HEAD(&sbi->s_closed_transaction); - INIT_LIST_HEAD(&sbi->s_committed_transaction); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; @@ -2553,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); + sbi->s_journal->j_commit_callback = release_blocks_on_commit; + printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; } @@ -2568,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; - kfree(pa); + kmem_cache_free(ext4_pspace_cachep, pa); } if (count) mb_debug("mballoc: %u PAs left\n", count); @@ -2582,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb) struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); - /* release freed, non-committed blocks */ - spin_lock(&sbi->s_md_lock); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_committed_transaction); - spin_unlock(&sbi->s_md_lock); - ext4_mb_free_committed_blocks(sb); - if (sbi->s_group_info) { for (i = 0; i < sbi->s_groups_count; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2644,61 +2635,57 @@ int ext4_mb_release(struct super_block *sb) return 0; } -static noinline_for_stack void -ext4_mb_free_committed_blocks(struct super_block *sb) +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. 
+ */ +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; - int i; - int count = 0; - int count2 = 0; - struct ext4_free_metadata *md; + struct super_block *sb = journal->j_private; struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + struct ext4_free_data *entry; + ext4_fsblk_t discard_block; + struct list_head *l, *ltmp; - if (list_empty(&sbi->s_committed_transaction)) - return; - - /* there is committed blocks to be freed yet */ - do { - /* get next array of blocks */ - md = NULL; - spin_lock(&sbi->s_md_lock); - if (!list_empty(&sbi->s_committed_transaction)) { - md = list_entry(sbi->s_committed_transaction.next, - struct ext4_free_metadata, list); - list_del(&md->list); - } - spin_unlock(&sbi->s_md_lock); - - if (md == NULL) - break; + list_for_each_safe(l, ltmp, &txn->t_private_list) { + entry = list_entry(l, struct ext4_free_data, list); mb_debug("gonna free %u blocks in group %lu (0x%p):", - md->num, md->group, md); + entry->count, entry->group, entry); - err = ext4_mb_load_buddy(sb, md->group, &e4b); + err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); + db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ - count += md->num; + count += entry->count; count2++; - ext4_lock_group(sb, md->group); - for (i = 0; i < md->num; i++) { - mb_debug(" %u", md->blocks[i]); - mb_free_blocks(NULL, &e4b, md->blocks[i], 1); + ext4_lock_group(sb, entry->group); + /* Take it out of per group rb tree */ + rb_erase(&entry->node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); } - mb_debug("\n"); - 
ext4_unlock_group(sb, md->group); - - /* balance refcounts from ext4_mb_free_metadata() */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - - kfree(md); + ext4_unlock_group(sb, entry->group); + discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + + entry->start_blk + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, + (unsigned long long) discard_block, entry->count); + sb_issue_discard(sb, discard_block, entry->count); + + kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); - - } while (md); + } mb_debug("freed %u blocks in %u structures\n", count, count2); } @@ -2712,6 +2699,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) static int ext4_mb_init_per_dev_proc(struct super_block *sb) { +#ifdef CONFIG_PROC_FS mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; struct ext4_sb_info *sbi = EXT4_SB(sb); struct proc_dir_entry *proc; @@ -2735,10 +2723,14 @@ err_out: remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); return -ENOMEM; +#else + return 0; +#endif } static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) { +#ifdef CONFIG_PROC_FS struct ext4_sb_info *sbi = EXT4_SB(sb); if (sbi->s_proc == NULL) @@ -2750,7 +2742,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); - +#endif return 0; } @@ -2771,6 +2763,16 @@ int __init init_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } + + ext4_free_ext_cachep = + kmem_cache_create("ext4_free_block_extents", + sizeof(struct ext4_free_data), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (ext4_free_ext_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return 
-ENOMEM; + } return 0; } @@ -2779,6 +2781,7 @@ void exit_ext4_mballoc(void) /* XXX: synchronize_rcu(); */ kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); + kmem_cache_destroy(ext4_free_ext_cachep); } @@ -4324,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, goto out1; } - ext4_mb_poll_new_transaction(sb, handle); - *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; @@ -4384,35 +4385,20 @@ out1: return block; } -static void ext4_mb_poll_new_transaction(struct super_block *sb, - handle_t *handle) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_last_transaction == handle->h_transaction->t_tid) - return; - - /* new transaction! time to close last one and free blocks for - * committed transaction. we know that only transaction can be - * active, so previos transaction can be being logged and we - * know that transaction before previous is known to be already - * logged. this means that now we may free blocks freed in all - * transactions before previous one. hope I'm clear enough ... */ - spin_lock(&sbi->s_md_lock); - if (sbi->s_last_transaction != handle->h_transaction->t_tid) { - mb_debug("new transaction %lu, old %lu\n", - (unsigned long) handle->h_transaction->t_tid, - (unsigned long) sbi->s_last_transaction); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_closed_transaction); - sbi->s_last_transaction = handle->h_transaction->t_tid; - } - spin_unlock(&sbi->s_md_lock); - - ext4_mb_free_committed_blocks(sb); +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. 
+ */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->t_tid == entry2->t_tid) && + (entry1->group == entry2->group) && + ((entry1->start_blk + entry1->count) == entry2->start_blk)) + return 1; + return 0; } static noinline_for_stack int @@ -4422,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_free_metadata *md; - int i; + struct ext4_free_data *entry, *new_entry; + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry->start_blk = block; + new_entry->group = group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + new_node = &new_entry->node; + ext4_lock_group(sb, group); - for (i = 0; i < count; i++) { - md = db->bb_md_cur; - if (md && db->bb_tid != handle->h_transaction->t_tid) { - db->bb_md_cur = NULL; - md = NULL; + if (!*n) { + /* first free block exent. 
We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, node); + if (block < entry->start_blk) + n = &(*n)->rb_left; + else if (block >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + ext4_error(sb, __func__, + "Double free of blocks %d (%d %d)\n", + block, entry->start_blk, entry->count); + return 0; } + } - if (md == NULL) { - ext4_unlock_group(sb, group); - md = kmalloc(sizeof(*md), GFP_NOFS); - if (md == NULL) - return -ENOMEM; - md->num = 0; - md->group = group; - - ext4_lock_group(sb, group); - if (db->bb_md_cur == NULL) { - spin_lock(&sbi->s_md_lock); - list_add(&md->list, &sbi->s_active_transaction); - spin_unlock(&sbi->s_md_lock); - /* protect buddy cache from being freed, - * otherwise we'll refresh it from - * on-disk bitmap and lose not-yet-available - * blocks */ - page_cache_get(e4b->bd_buddy_page); - page_cache_get(e4b->bd_bitmap_page); - db->bb_md_cur = md; - db->bb_tid = handle->h_transaction->t_tid; - mb_debug("new md 0x%p for group %lu\n", - md, md->group); - } else { - kfree(md); - md = db->bb_md_cur; - } + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } + } - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); - md->blocks[md->num] = block + i; - md->num++; - if (md->num == EXT4_BB_MAX_BLOCKS) { - /* no 
more space, put full container on a sb's list */ - db->bb_md_cur = NULL; + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } } + /* Add the extent to transaction's private list */ + spin_lock(&sbi->s_md_lock); + list_add(&new_entry->list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); ext4_unlock_group(sb, group); return 0; } @@ -4500,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, *freed = 0; - ext4_mb_poll_new_transaction(sb, handle); - sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; if (block < le32_to_cpu(es->s_first_data_block) || diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b3b4828..b5dff1f 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/seq_file.h> #include <linux/version.h> +#include <linux/blkdev.h> +#include <linux/marker.h> #include "ext4_jbd2.h" #include "ext4.h" #include "group.h" @@ -98,23 +100,29 @@ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; -#ifdef EXT4_BB_MAX_BLOCKS -#undef EXT4_BB_MAX_BLOCKS -#endif -#define EXT4_BB_MAX_BLOCKS 30 +struct ext4_free_data { + /* this links the free block information from group_info */ + struct rb_node node; -struct ext4_free_metadata { - ext4_group_t group; - unsigned short num; - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; + /* this links the free block information from ext4_sb_info */ struct list_head list; + + /* group which free block extent belongs */ + ext4_group_t group; + + /* free block extent */ + ext4_grpblk_t start_blk; + ext4_grpblk_t count; + + /* transaction which freed this extent */ + tid_t 
t_tid; }; struct ext4_group_info { unsigned long bb_state; - unsigned long bb_tid; - struct ext4_free_metadata *bb_md_cur; + struct rb_root bb_free_root; unsigned short bb_first_free; unsigned short bb_free; unsigned short bb_fragments; @@ -261,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); -static void ext4_mb_free_committed_blocks(struct super_block *); static void ext4_mb_return_to_preallocation(struct inode *inode, struct ext4_buddy *e4b, sector_t block, int count); @@ -270,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *, struct super_block *, struct ext4_prealloc_space *pa); static int ext4_mb_init_per_dev_proc(struct super_block *sb); static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 92db9e9..63adcb7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1061,7 +1061,6 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru struct dentry *ext4_get_parent(struct dentry *child) { unsigned long ino; - struct dentry *parent; struct inode *inode; static const struct qstr dotdot = { .name = "..", @@ -1083,16 +1082,7 @@ struct dentry *ext4_get_parent(struct dentry *child) return ERR_PTR(-EIO); } - inode = ext4_iget(child->d_inode->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - parent = d_alloc_anon(inode); - if (!parent) { - iput(inode); - parent = ERR_PTR(-ENOMEM); - } - return parent; + return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); } #define S_SHIFT 12 diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dea8f13..bdddea1 100644 --- a/fs/ext4/super.c 
+++ b/fs/ext4/super.c @@ -374,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } -int ext4_update_compat_feature(handle_t *handle, - struct super_block *sb, __u32 compat) -{ - int err = 0; - if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_COMPAT_FEATURE(sb, compat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_rocompat_feature(handle_t *handle, - struct super_block *sb, __u32 rocompat) -{ - int err = 0; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_incompat_feature(handle_t *handle, - struct super_block *sb, __u32 incompat) -{ - int err = 0; - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_INCOMPAT_FEATURE(sb, incompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - /* * Open the external journal device */ @@ -459,7 +399,7 @@ fail: static int ext4_blkdev_put(struct block_device *bdev) { bd_release(bdev); - return blkdev_put(bdev); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); } static int ext4_blkdev_remove(struct ext4_sb_info *sbi) @@ -904,7 +844,7 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, 
Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, @@ -915,7 +855,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_inode_readahead_blks }; @@ -933,8 +873,6 @@ static const match_table_t tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_nouid32, "nouid32"}, - {Opt_nocheck, "nocheck"}, - {Opt_nocheck, "check=none"}, {Opt_debug, "debug"}, {Opt_oldalloc, "oldalloc"}, {Opt_orlov, "orlov"}, @@ -973,8 +911,6 @@ static const match_table_t tokens = { {Opt_extents, "extents"}, {Opt_noextents, "noextents"}, {Opt_i_version, "i_version"}, - {Opt_mballoc, "mballoc"}, - {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, @@ -1073,9 +1009,6 @@ static int parse_options(char *options, struct super_block *sb, case Opt_nouid32: set_opt(sbi->s_mount_opt, NO_UID32); break; - case Opt_nocheck: - clear_opt(sbi->s_mount_opt, CHECK); - break; case Opt_debug: set_opt(sbi->s_mount_opt, DEBUG); break; @@ -1618,14 +1551,14 @@ static int ext4_check_descriptors(struct super_block *sb) if (block_bitmap < first_block || block_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Block bitmap for group %lu not in group " - "(block %llu)!", i, block_bitmap); + "(block %llu)!\n", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || 
inode_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode bitmap for group %lu not in group " - "(block %llu)!", i, inode_bitmap); + "(block %llu)!\n", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); @@ -1633,7 +1566,7 @@ static int ext4_check_descriptors(struct super_block *sb) inode_table + sbi->s_itb_per_group - 1 > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode table for group %lu not in group " - "(block %llu)!", i, inode_table); + "(block %llu)!\n", i, inode_table); return 0; } spin_lock(sb_bgl_lock(sbi, i)); @@ -1778,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb, * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ -static loff_t ext4_max_size(int blkbits) +static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; /* small i_blocks in vfs inode? */ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* * CONFIG_LSF is not enabled implies the inode * i_block represent total blocks in 512 bytes @@ -1814,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits) * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. 
*/ -static loff_t ext4_max_bitmap_size(int bits) +static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t res = EXT4_NDIR_BLOCKS; int meta_blocks; @@ -1827,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits) * total number of 512 bytes blocks of the file */ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * CONFIG_LSF is not enabled implies the inode - * i_block represent total blocks in 512 bytes - * 32 == size of vfs inode i_blocks * 8 + * !has_huge_files or CONFIG_LSF is not enabled + * implies the inode i_block represent total blocks in + * 512 bytes 32 == size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; @@ -1940,7 +1873,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int blocksize; int db_count; int i; - int needs_recovery; + int needs_recovery, has_huge_files; __le32 features; __u64 blocks_count; int err; @@ -2081,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_id, le32_to_cpu(features)); goto failed_mount; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + if (has_huge_files) { /* * Large file size enabled file system can only be * mount if kernel is build with CONFIG_LSF @@ -2131,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits); - sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; @@ -2456,6 +2392,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "available.\n"); } + if 
(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " + "requested data journaling mode\n"); + clear_opt(sbi->s_mount_opt, DELALLOC); + } else if (test_opt(sb, DELALLOC)) + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + + ext4_ext_init(sb); + err = ext4_mb_init(sb, needs_recovery); + if (err) { + printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", + err); + goto failed_mount4; + } + /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock @@ -2475,21 +2426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": "writeback"); - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " - "requested data journaling mode\n"); - clear_opt(sbi->s_mount_opt, DELALLOC); - } else if (test_opt(sb, DELALLOC)) - printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); - - ext4_ext_init(sb); - err = ext4_mb_init(sb, needs_recovery); - if (err) { - printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", - err); - goto failed_mount4; - } - lock_kernel(); return 0; @@ -2617,7 +2553,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if (bd_claim(bdev, sb)) { printk(KERN_ERR "EXT4: failed to claim external journal device.\n"); - blkdev_put(bdev); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE); return NULL; } @@ -3392,30 +3328,30 @@ static int ext4_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *path, int remount) + char *name, int remount) { int err; - struct nameidata nd; + struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - /* When remounting, no checks are needed and in fact, path is NULL */ + /* When 
remounting, no checks are needed and in fact, name is NULL */ if (remount) - return vfs_quota_on(sb, type, format_id, path, remount); + return vfs_quota_on(sb, type, format_id, name, remount); - err = path_lookup(path, LOOKUP_FOLLOW, &nd); + err = kern_path(name, LOOKUP_FOLLOW, &path); if (err) return err; /* Quotafile not on the same filesystem? */ - if (nd.path.mnt->mnt_sb != sb) { - path_put(&nd.path); + if (path.mnt->mnt_sb != sb) { + path_put(&path); return -EXDEV; } /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) + if (path.dentry->d_parent != sb->s_root) printk(KERN_WARNING "EXT4-fs: Quota file not on filesystem root. " "Journaled quota will not work.\n"); @@ -3425,7 +3361,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * When we journal data on quota file, we have to flush journal to see * all updates to the file when we bypass pagecache... */ - if (ext4_should_journal_data(nd.path.dentry->d_inode)) { + if (ext4_should_journal_data(path.dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... 
@@ -3434,13 +3370,13 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); if (err) { - path_put(&nd.path); + path_put(&path); return err; } } - err = vfs_quota_on_path(sb, type, format_id, &nd.path); - path_put(&nd.path); + err = vfs_quota_on_path(sb, type, format_id, &path); + path_put(&path); return err; } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index cd4a016..bae1c32 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -839,6 +839,7 @@ const struct file_operations fat_dir_operations = { .compat_ioctl = fat_compat_dir_ioctl, #endif .fsync = file_fsync, + .llseek = generic_file_llseek, }; static int fat_get_short_entry(struct inode *dir, loff_t *pos, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index d12cdf2..19eafbe 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -681,33 +681,24 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb, inode = NULL; } } - if (!inode) { - /* For now, do nothing - * What we could do is: - * follow the file starting at fh[4], and record - * the ".." entry, and the name of the fh[2] entry. - * The follow the ".." file finding the next step up. - * This way we build a path to the root of - * the tree. If this works, we lookup the path and so - * get this inode into the cache. - * Finally try the fat_iget lookup again - * If that fails, then weare totally out of luck - * But all that is for another day - */ - } - if (!inode) - return ERR_PTR(-ESTALE); - - /* now to find a dentry. - * If possible, get a well-connected one + /* + * For now, do nothing if the inode is not found. + * + * What we could do is: + * + * - follow the file starting at fh[4], and record the ".." entry, + * and the name of the fh[2] entry. + * - then follow the ".." file finding the next step up. + * + * This way we build a path to the root of the tree. If this works, we + * lookup the path and so get this inode into the cache. 
Finally try + * the fat_iget lookup again. If that fails, then we are totally out + * of luck. But all that is for another day */ - result = d_alloc_anon(inode); - if (result == NULL) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - result->d_op = sb->s_root->d_op; + result = d_obtain_alias(inode); + if (!IS_ERR(result)) + result->d_op = sb->s_root->d_op; return result; } @@ -754,15 +745,8 @@ static struct dentry *fat_get_parent(struct dentry *child) } inode = fat_build_inode(sb, de, i_pos); brelse(bh); - if (IS_ERR(inode)) { - parent = ERR_CAST(inode); - goto out; - } - parent = d_alloc_anon(inode); - if (!parent) { - iput(inode); - parent = ERR_PTR(-ENOMEM); - } + + parent = d_obtain_alias(inode); out: unlock_super(sb); @@ -51,7 +51,7 @@ static int fifo_open(struct inode *inode, struct file *filp) filp->f_mode &= (FMODE_READ | FMODE_WRITE); switch (filp->f_mode) { - case 1: + case FMODE_READ: /* * O_RDONLY * POSIX.1 says that O_NONBLOCK means return with the FIFO @@ -76,7 +76,7 @@ static int fifo_open(struct inode *inode, struct file *filp) } break; - case 2: + case FMODE_WRITE: /* * O_WRONLY * POSIX.1 says that O_NONBLOCK means return -1 with @@ -98,7 +98,7 @@ static int fifo_open(struct inode *inode, struct file *filp) } break; - case 3: + case FMODE_READ | FMODE_WRITE: /* * O_RDWR * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. diff --git a/fs/file_table.c b/fs/file_table.c index f45a449..efc06fa 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -161,7 +161,7 @@ EXPORT_SYMBOL(get_empty_filp); * code should be moved into this function. */ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, - mode_t mode, const struct file_operations *fop) + fmode_t mode, const struct file_operations *fop) { struct file *file; struct path; @@ -193,7 +193,7 @@ EXPORT_SYMBOL(alloc_file); * of this should be moving to alloc_file(). 
*/ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, - mode_t mode, const struct file_operations *fop) + fmode_t mode, const struct file_operations *fop) { int error = 0; file->f_path.dentry = dentry; diff --git a/fs/filesystems.c b/fs/filesystems.c index f37f872..d0e20ce 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -8,6 +8,8 @@ #include <linux/syscalls.h> #include <linux/fs.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/init.h> @@ -214,6 +216,43 @@ int get_filesystem_list(char * buf) return len; } +#ifdef CONFIG_PROC_FS +static int filesystems_proc_show(struct seq_file *m, void *v) +{ + struct file_system_type * tmp; + + read_lock(&file_systems_lock); + tmp = file_systems; + while (tmp) { + seq_printf(m, "%s\t%s\n", + (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + tmp->name); + tmp = tmp->next; + } + read_unlock(&file_systems_lock); + return 0; +} + +static int filesystems_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, filesystems_proc_show, NULL); +} + +static const struct file_operations filesystems_proc_fops = { + .open = filesystems_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_filesystems_init(void) +{ + proc_create("filesystems", 0, NULL, &filesystems_proc_fops); + return 0; +} +module_init(proc_filesystems_init); +#endif + struct file_system_type *get_fs_type(const char *name) { struct file_system_type *fs; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2bada6b..34930a9 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -101,6 +101,8 @@ void fuse_finish_open(struct inode *inode, struct file *file, file->f_op = &fuse_direct_io_file_operations; if (!(outarg->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); + if (outarg->open_flags & FOPEN_NONSEEKABLE) + nonseekable_open(inode, file); ff->fh = outarg->fh; 
file->private_data = fuse_file_get(ff); } @@ -1448,6 +1450,9 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) mutex_lock(&inode->i_mutex); switch (origin) { case SEEK_END: + retval = fuse_update_attributes(inode, NULL, file, NULL); + if (retval) + return retval; offset += i_size_read(inode); break; case SEEK_CUR: diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 3a87607..35accfd 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -6,6 +6,9 @@ See the file COPYING. */ +#ifndef _FS_FUSE_I_H +#define _FS_FUSE_I_H + #include <linux/fuse.h> #include <linux/fs.h> #include <linux/mount.h> @@ -655,3 +658,5 @@ void fuse_set_nowrite(struct inode *inode); void fuse_release_nowrite(struct inode *inode); u64 fuse_get_attr_version(struct fuse_conn *fc); + +#endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6a84388..2e99f34 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -596,12 +596,8 @@ static struct dentry *fuse_get_dentry(struct super_block *sb, if (inode->i_generation != handle->generation) goto out_iput; - entry = d_alloc_anon(inode); - err = -ENOMEM; - if (!entry) - goto out_iput; - - if (get_node_id(inode) != FUSE_ROOT_ID) { + entry = d_obtain_alias(inode); + if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) { entry->d_op = &fuse_dentry_operations; fuse_invalidate_entry_cache(entry); } @@ -696,17 +692,14 @@ static struct dentry *fuse_get_parent(struct dentry *child) name.name = ".."; err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), &name, &outarg, &inode); - if (err && err != -ENOENT) + if (err) { + if (err == -ENOENT) + return ERR_PTR(-ESTALE); return ERR_PTR(err); - if (err || !inode) - return ERR_PTR(-ESTALE); - - parent = d_alloc_anon(inode); - if (!parent) { - iput(inode); - return ERR_PTR(-ENOMEM); } - if (get_node_id(inode) != FUSE_ROOT_ID) { + + parent = d_obtain_alias(inode); + if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) { parent->d_op = 
&fuse_dentry_operations; fuse_invalidate_entry_cache(parent); } @@ -865,7 +858,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (is_bdev) { fc->destroy_req = fuse_request_alloc(); if (!fc->destroy_req) - goto err_put_root; + goto err_free_init_req; } mutex_lock(&fuse_mutex); @@ -895,6 +888,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err_unlock: mutex_unlock(&fuse_mutex); + err_free_init_req: fuse_request_free(init_req); err_put_root: dput(root_dentry); diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index 9cda853..bbb8c36 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -130,28 +130,17 @@ static int gfs2_get_name(struct dentry *parent, char *name, static struct dentry *gfs2_get_parent(struct dentry *child) { struct qstr dotdot; - struct inode *inode; struct dentry *dentry; - gfs2_str2qstr(&dotdot, ".."); - inode = gfs2_lookupi(child->d_inode, &dotdot, 1); - - if (!inode) - return ERR_PTR(-ENOENT); /* - * In case of an error, @inode carries the error value, and we - * have to return that as a(n invalid) pointer to dentry. + * XXX(hch): it would be a good idea to keep this around as a + * static variable. 
*/ - if (IS_ERR(inode)) - return ERR_CAST(inode); - - dentry = d_alloc_anon(inode); - if (!dentry) { - iput(inode); - return ERR_PTR(-ENOMEM); - } + gfs2_str2qstr(&dotdot, ".."); - dentry->d_op = &gfs2_dops; + dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1)); + if (!IS_ERR(dentry)) + dentry->d_op = &gfs2_dops; return dentry; } @@ -233,13 +222,9 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, gfs2_glock_dq_uninit(&i_gh); out_inode: - dentry = d_alloc_anon(inode); - if (!dentry) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - - dentry->d_op = &gfs2_dops; + dentry = d_obtain_alias(inode); + if (!IS_ERR(dentry)) + dentry->d_op = &gfs2_dops; return dentry; fail_rgd: diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 534e1e2..d232991 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -69,7 +69,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry, mark_inode_dirty(inode); break; } else if (PTR_ERR(inode) != -EEXIST || - (nd && (nd->intent.open.flags & O_EXCL))) { + (nd && nd->flags & LOOKUP_EXCL)) { gfs2_holder_uninit(ghs); return PTR_ERR(inode); } diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 7e19835..c69b7ac 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -511,13 +511,6 @@ void hfs_clear_inode(struct inode *inode) } } -static int hfs_permission(struct inode *inode, int mask) -{ - if (S_ISREG(inode->i_mode) && mask & MAY_EXEC) - return 0; - return generic_permission(inode, mask, NULL); -} - static int hfs_file_open(struct inode *inode, struct file *file) { if (HFS_IS_RSRC(inode)) @@ -616,7 +609,6 @@ static const struct inode_operations hfs_file_inode_operations = { .lookup = hfs_file_lookup, .truncate = hfs_file_truncate, .setattr = hfs_inode_setattr, - .permission = hfs_permission, .setxattr = hfs_setxattr, .getxattr = hfs_getxattr, .listxattr = hfs_listxattr, diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index fec8f61..0022eec 100644 --- a/fs/hfsplus/extents.c +++ 
b/fs/hfsplus/extents.c @@ -199,6 +199,9 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, goto done; } + if (inode->i_ino == HFSPLUS_EXT_CNID) + return -EIO; + mutex_lock(&HFSPLUS_I(inode).extents_lock); res = hfsplus_ext_read_extent(inode, ablock); if (!res) { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index b085d64..b207f0e 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -238,22 +238,12 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev); } -static int hfsplus_permission(struct inode *inode, int mask) -{ - /* MAY_EXEC is also used for lookup, if no x bit is set allow lookup, - * open_exec has the same test, so it's still not executable, if a x bit - * is set fall back to standard permission check. - */ - if (S_ISREG(inode->i_mode) && mask & MAY_EXEC && !(inode->i_mode & 0111)) - return 0; - return generic_permission(inode, mask, NULL); -} - - static int hfsplus_file_open(struct inode *inode, struct file *file) { if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode).rsrc_inode; + if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EOVERFLOW; atomic_inc(&HFSPLUS_I(inode).opencnt); return 0; } @@ -279,7 +269,6 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) static const struct inode_operations hfsplus_file_inode_operations = { .lookup = hfsplus_file_lookup, .truncate = hfsplus_file_truncate, - .permission = hfsplus_permission, .setxattr = hfsplus_setxattr, .getxattr = hfsplus_getxattr, .listxattr = hfsplus_listxattr, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index d6ecabf..7f34f43 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -20,7 +20,7 @@ struct hostfs_inode_info { char *host_filename; int fd; - int mode; + fmode_t mode; struct inode vfs_inode; }; @@ -373,7 +373,8 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) int 
hostfs_file_open(struct inode *ino, struct file *file) { char *name; - int mode = 0, r = 0, w = 0, fd; + fmode_t mode = 0; + int r = 0, w = 0, fd; mode = file->f_mode & (FMODE_READ | FMODE_WRITE); if ((mode & HOSTFS_I(ino)->mode) == mode) diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index be8be50..64ab522 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -143,5 +143,5 @@ const struct file_operations hpfs_file_ops = const struct inode_operations hpfs_file_iops = { .truncate = hpfs_truncate, - .setattr = hpfs_notify_change, + .setattr = hpfs_setattr, }; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 42ff60c..c2ea31b 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -275,7 +275,7 @@ void hpfs_init_inode(struct inode *); void hpfs_read_inode(struct inode *); void hpfs_write_inode(struct inode *); void hpfs_write_inode_nolock(struct inode *); -int hpfs_notify_change(struct dentry *, struct iattr *); +int hpfs_setattr(struct dentry *, struct iattr *); void hpfs_write_if_changed(struct inode *); void hpfs_delete_inode(struct inode *); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 85d3e1d..39a1bfb 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -260,19 +260,28 @@ void hpfs_write_inode_nolock(struct inode *i) brelse(bh); } -int hpfs_notify_change(struct dentry *dentry, struct iattr *attr) +int hpfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; - int error=0; + int error = -EINVAL; + lock_kernel(); - if ( ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) || - (hpfs_sb(inode->i_sb)->sb_root == inode->i_ino) ) { - error = -EINVAL; - } else if ((error = inode_change_ok(inode, attr))) { - } else if ((error = inode_setattr(inode, attr))) { - } else { - hpfs_write_inode(inode); - } + if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) + goto out_unlock; + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) + goto out_unlock; + + error = inode_change_ok(inode, attr); + if 
(error) + goto out_unlock; + + error = inode_setattr(inode, attr); + if (error) + goto out_unlock; + + hpfs_write_inode(inode); + + out_unlock: unlock_kernel(); return error; } diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index d9c59a7..10783f3 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -669,5 +669,5 @@ const struct inode_operations hpfs_dir_iops = .rmdir = hpfs_rmdir, .mknod = hpfs_mknod, .rename = hpfs_rename, - .setattr = hpfs_notify_change, + .setattr = hpfs_setattr, }; diff --git a/fs/isofs/export.c b/fs/isofs/export.c index bb21913..e81a305 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -22,7 +22,7 @@ isofs_export_iget(struct super_block *sb, __u32 generation) { struct inode *inode; - struct dentry *result; + if (block == 0) return ERR_PTR(-ESTALE); inode = isofs_iget(sb, block, offset); @@ -32,12 +32,7 @@ isofs_export_iget(struct super_block *sb, iput(inode); return ERR_PTR(-ESTALE); } - result = d_alloc_anon(inode); - if (!result) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - return result; + return d_obtain_alias(inode); } /* This function is surprisingly simple. The trick is understanding @@ -51,7 +46,6 @@ static struct dentry *isofs_export_get_parent(struct dentry *child) unsigned long parent_offset = 0; struct inode *child_inode = child->d_inode; struct iso_inode_info *e_child_inode = ISOFS_I(child_inode); - struct inode *parent_inode = NULL; struct iso_directory_record *de = NULL; struct buffer_head * bh = NULL; struct dentry *rv = NULL; @@ -104,28 +98,11 @@ static struct dentry *isofs_export_get_parent(struct dentry *child) /* Normalize */ isofs_normalize_block_and_offset(de, &parent_block, &parent_offset); - /* Get the inode. */ - parent_inode = isofs_iget(child_inode->i_sb, - parent_block, - parent_offset); - if (IS_ERR(parent_inode)) { - rv = ERR_CAST(parent_inode); - if (rv != ERR_PTR(-ENOMEM)) - rv = ERR_PTR(-EACCES); - goto out; - } - - /* Allocate the dentry. 
*/ - rv = d_alloc_anon(parent_inode); - if (rv == NULL) { - rv = ERR_PTR(-ENOMEM); - goto out; - } - + rv = d_obtain_alias(isofs_iget(child_inode->i_sb, parent_block, + parent_offset)); out: - if (bh) { + if (bh) brelse(bh); - } return rv; } diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig new file mode 100644 index 0000000..4e28bee --- /dev/null +++ b/fs/jbd/Kconfig @@ -0,0 +1,30 @@ +config JBD + tristate + help + This is a generic journalling layer for block devices. It is + currently used by the ext3 file system, but it could also be + used to add journal support to other file systems or block + devices such as RAID or LVM. + + If you are using the ext3 file system, you need to say Y here. + If you are not using ext3 then you will probably want to say N. + + To compile this device as a module, choose M here: the module will be + called jbd. If you are compiling ext3 into the kernel, you + cannot compile this code as a module. + +config JBD_DEBUG + bool "JBD (ext3) debugging support" + depends on JBD && DEBUG_FS + help + If you are using the ext3 journaled file system (or potentially any + other file system/device using JBD), this option allows you to + enable debugging output while the system is running, in order to + help track down any problems you are having. By default the + debugging output will be turned off. + + If you select Y here, then you will be able to turn on debugging + with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a + number between 1 and 5, the higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/kernel/debug/jbd/jbd-debug". 
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index a5432bb..1bd8d4a 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -93,7 +93,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) int ret = 0; struct buffer_head *bh = jh2bh(jh); - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && + !buffer_dirty(bh) && !buffer_write_io_error(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); @@ -126,14 +127,29 @@ void __log_wait_for_space(journal_t *journal) /* * Test again, another process may have checkpointed while we - * were waiting for the checkpoint lock + * were waiting for the checkpoint lock. If there are no + * outstanding transactions there is nothing to checkpoint and + * we can't make progress. Abort the journal in this case. */ spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); if (__log_space_left(journal) < nblocks) { + int chkpt = journal->j_checkpoint_transactions != NULL; + + spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_state_lock); - log_do_checkpoint(journal); + if (chkpt) { + log_do_checkpoint(journal); + } else { + printk(KERN_ERR "%s: no transactions\n", + __func__); + journal_abort(journal, 0); + } + spin_lock(&journal->j_state_lock); + } else { + spin_unlock(&journal->j_list_lock); } mutex_unlock(&journal->j_checkpoint_mutex); } @@ -160,21 +176,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) * buffers. Note that we take the buffers in the opposite ordering * from the one in which they were submitted for IO. * + * Return 0 on success, and return <0 if some buffers have failed + * to be written out. + * * Called with j_list_lock held. 
*/ -static void __wait_cp_io(journal_t *journal, transaction_t *transaction) +static int __wait_cp_io(journal_t *journal, transaction_t *transaction) { struct journal_head *jh; struct buffer_head *bh; tid_t this_tid; int released = 0; + int ret = 0; this_tid = transaction->t_tid; restart: /* Did somebody clean up the transaction in the meanwhile? */ if (journal->j_checkpoint_transactions != transaction || transaction->t_tid != this_tid) - return; + return ret; while (!released && transaction->t_checkpoint_io_list) { jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); @@ -194,6 +214,9 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + /* * Now in whatever state the buffer currently is, we know that * it has been written out and so we can drop it from the list @@ -203,6 +226,8 @@ restart: journal_remove_journal_head(bh); __brelse(bh); } + + return ret; } #define NR_BATCH 64 @@ -226,7 +251,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Try to flush one buffer from the checkpoint list to disk. * * Return 1 if something happened which requires us to abort the current - * scan of the checkpoint list. + * scan of the checkpoint list. Return <0 if the buffer has failed to + * be written out. 
* * Called with j_list_lock held and drops it if 1 is returned * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it @@ -256,6 +282,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, log_wait_commit(journal, tid); ret = 1; } else if (!buffer_dirty(bh)) { + ret = 1; + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __journal_remove_checkpoint(jh); @@ -263,7 +292,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); __brelse(bh); - ret = 1; } else { /* * Important: we are about to write the buffer, and @@ -295,6 +323,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, * to disk. We submit larger chunks of data at once. * * The journal should be locked before calling this function. + * Called with j_checkpoint_mutex held. */ int log_do_checkpoint(journal_t *journal) { @@ -318,6 +347,7 @@ int log_do_checkpoint(journal_t *journal) * OK, we need to start writing disk blocks. Take one transaction * and write it. */ + result = 0; spin_lock(&journal->j_list_lock); if (!journal->j_checkpoint_transactions) goto out; @@ -334,7 +364,7 @@ restart: int batch_count = 0; struct buffer_head *bhs[NR_BATCH]; struct journal_head *jh; - int retry = 0; + int retry = 0, err; while (!retry && transaction->t_checkpoint_list) { struct buffer_head *bh; @@ -347,6 +377,8 @@ restart: break; } retry = __process_buffer(journal, jh, bhs,&batch_count); + if (retry < 0 && !result) + result = retry; if (!retry && (need_resched() || spin_needbreak(&journal->j_list_lock))) { spin_unlock(&journal->j_list_lock); @@ -371,14 +403,18 @@ restart: * Now we have cleaned up the first transaction's checkpoint * list. 
Let's clean up the second one */ - __wait_cp_io(journal, transaction); + err = __wait_cp_io(journal, transaction); + if (!result) + result = err; } out: spin_unlock(&journal->j_list_lock); - result = cleanup_journal_tail(journal); if (result < 0) - return result; - return 0; + journal_abort(journal, result); + else + result = cleanup_journal_tail(journal); + + return (result < 0) ? result : 0; } /* @@ -394,8 +430,9 @@ out: * This is the only part of the journaling code which really needs to be * aware of transaction aborts. Checkpointing involves writing to the * main filesystem area rather than to the journal, so it can proceed - * even in abort state, but we must not update the journal superblock if - * we have an abort error outstanding. + * even in abort state, but we must not update the super block if + * checkpointing may have failed. Otherwise, we would lose some metadata + * buffers which should be written-back to the filesystem. */ int cleanup_journal_tail(journal_t *journal) @@ -404,6 +441,9 @@ int cleanup_journal_tail(journal_t *journal) tid_t first_tid; unsigned long blocknr, freed; + if (is_journal_aborted(journal)) + return 1; + /* OK, work out the oldest transaction remaining in the log, and * the log block it starts at. * diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index ae08c05..25719d9 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -482,6 +482,8 @@ void journal_commit_transaction(journal_t *journal) printk(KERN_WARNING "JBD: Detected IO errors while flushing file data " "on %s\n", bdevname(journal->j_fs_dev, b)); + if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) + journal_abort(journal, err); err = 0; } @@ -518,9 +520,10 @@ void journal_commit_transaction(journal_t *journal) jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and - release it for background writing. */ + release it. 
*/ if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up @@ -762,6 +765,9 @@ wait_for_iobuf: /* AKPM: bforget here */ } + if (err) + journal_abort(journal, err); + jbd_debug(3, "JBD: commit phase 6\n"); if (journal_write_commit_record(journal, commit_transaction)) @@ -852,6 +858,8 @@ restart_loop: if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); JBUFFER_TRACE(jh, "refile for checkpoint writeback"); __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index aa7143a..9e4fa52 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1121,9 +1121,12 @@ recovery_error: * * Release a journal_t structure once it is no longer in use by the * journaled object. + * Return <0 if we couldn't clean up the journal. */ -void journal_destroy(journal_t *journal) +int journal_destroy(journal_t *journal) { + int err = 0; + /* Wait for the commit thread to wake up and die. */ journal_kill_thread(journal); @@ -1146,11 +1149,16 @@ void journal_destroy(journal_t *journal) J_ASSERT(journal->j_checkpoint_transactions == NULL); spin_unlock(&journal->j_list_lock); - /* We can now mark the journal as empty. */ - journal->j_tail = 0; - journal->j_tail_sequence = ++journal->j_transaction_sequence; if (journal->j_sb_buffer) { - journal_update_superblock(journal, 1); + if (!is_journal_aborted(journal)) { + /* We can now mark the journal as empty. 
*/ + journal->j_tail = 0; + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + journal_update_superblock(journal, 1); + } else { + err = -EIO; + } brelse(journal->j_sb_buffer); } @@ -1160,6 +1168,8 @@ void journal_destroy(journal_t *journal) journal_destroy_revoke(journal); kfree(journal->j_wbuf); kfree(journal); + + return err; } @@ -1359,10 +1369,16 @@ int journal_flush(journal_t *journal) spin_lock(&journal->j_list_lock); while (!err && journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); err = log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); + + if (is_journal_aborted(journal)) + return -EIO; + cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1384,7 +1400,7 @@ int journal_flush(journal_t *journal) J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); spin_unlock(&journal->j_state_lock); - return err; + return 0; } /** diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 43bc5e5..db5e982 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -223,7 +223,7 @@ do { \ */ int journal_recover(journal_t *journal) { - int err; + int err, err2; journal_superblock_t * sb; struct recovery_info info; @@ -261,7 +261,10 @@ int journal_recover(journal_t *journal) journal->j_transaction_sequence = ++info.end_transaction; journal_clear_revoke(journal); - sync_blockdev(journal->j_fs_dev); + err2 = sync_blockdev(journal->j_fs_dev); + if (!err) + err = err2; + return err; } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 0540ca2..d15cd6e 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -954,9 +954,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) journal_t *journal = handle->h_transaction->t_journal; int need_brelse = 0; struct 
journal_head *jh; + int ret = 0; if (is_handle_aborted(handle)) - return 0; + return ret; jh = journal_add_journal_head(bh); JBUFFER_TRACE(jh, "entry"); @@ -1067,7 +1068,16 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) time if it is redirtied */ } - /* journal_clean_data_list() may have got there first */ + /* + * We cannot remove the buffer with io error from the + * committing transaction, because otherwise it would + * miss the error and the commit would not abort. + */ + if (unlikely(!buffer_uptodate(bh))) { + ret = -EIO; + goto no_journal; + } + if (jh->b_transaction != NULL) { JBUFFER_TRACE(jh, "unfile from commit"); __journal_temp_unlink_buffer(jh); @@ -1108,7 +1118,7 @@ no_journal: } JBUFFER_TRACE(jh, "exit"); journal_put_journal_head(jh); - return 0; + return ret; } /** diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig new file mode 100644 index 0000000..f32f346 --- /dev/null +++ b/fs/jbd2/Kconfig @@ -0,0 +1,33 @@ +config JBD2 + tristate + select CRC32 + help + This is a generic journaling layer for block devices that support + both 32-bit and 64-bit block numbers. It is currently used by + the ext4 and OCFS2 filesystems, but it could also be used to add + journal support to other file systems or block devices such + as RAID or LVM. + + If you are using ext4 or OCFS2, you need to say Y here. + If you are not using ext4 or OCFS2 then you will + probably want to say N. + + To compile this device as a module, choose M here. The module will be + called jbd2. If you are compiling ext4 or OCFS2 into the kernel, + you cannot compile this code as a module. + +config JBD2_DEBUG + bool "JBD2 (ext4) debugging support" + depends on JBD2 && DEBUG_FS + help + If you are using the ext4 journaled file system (or + potentially any other filesystem/device using JBD2), this option + allows you to enable debugging output while the system is running, + in order to help track down any problems you are having. 
+ By default, the debugging output will be turned off. + + If you select Y here, then you will be able to turn on debugging + with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a + number between 1 and 5. The higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0abe02c..8b119e1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -995,6 +995,9 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", journal->j_devname, commit_transaction->t_tid, journal->j_tail_sequence); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e5d5405..39b7805 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); INIT_LIST_HEAD(&transaction->t_inode_list); + INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig new file mode 100644 index 0000000..6ae169c --- /dev/null +++ b/fs/jffs2/Kconfig @@ -0,0 +1,188 @@ +config JFFS2_FS + tristate "Journalling Flash File System v2 (JFFS2) support" + select CRC32 + depends on MTD + help + JFFS2 is the second generation of the Journalling Flash File System + for use on diskless embedded devices. It provides improved wear + levelling, compression and support for hard links. You cannot use + this on normal block devices, only on 'MTD' devices. + + Further information on the design and implementation of JFFS2 is + available at <http://sources.redhat.com/jffs2/>. 
+ +config JFFS2_FS_DEBUG + int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)" + depends on JFFS2_FS + default "0" + help + This controls the amount of debugging messages produced by the JFFS2 + code. Set it to zero for use in production systems. For evaluation, + testing and debugging, it's advisable to set it to one. This will + enable a few assertions and will print debugging messages at the + KERN_DEBUG loglevel, where they won't normally be visible. Level 2 + is unlikely to be useful - it enables extra debugging in certain + areas which at one point needed debugging, but when the bugs were + located and fixed, the detailed messages were relegated to level 2. + + If reporting bugs, please try to have available a full dump of the + messages at debug level 1 while the misbehaviour was occurring. + +config JFFS2_FS_WRITEBUFFER + bool "JFFS2 write-buffering support" + depends on JFFS2_FS + default y + help + This enables the write-buffering support in JFFS2. + + This functionality is required to support JFFS2 on the following + types of flash devices: + - NAND flash + - NOR flash with transparent ECC + - DataFlash + +config JFFS2_FS_WBUF_VERIFY + bool "Verify JFFS2 write-buffer reads" + depends on JFFS2_FS_WRITEBUFFER + default n + help + This causes JFFS2 to read back every page written through the + write-buffer, and check for errors. + +config JFFS2_SUMMARY + bool "JFFS2 summary support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + This feature makes it possible to use summary information + for faster filesystem mount. + + The summary information can be inserted into a filesystem image + by the utility 'sumtool'. + + If unsure, say 'N'. 
+ +config JFFS2_FS_XATTR + bool "JFFS2 XATTR support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + +config JFFS2_FS_POSIX_ACL + bool "JFFS2 POSIX Access Control Lists" + depends on JFFS2_FS_XATTR + default y + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config JFFS2_FS_SECURITY + bool "JFFS2 Security Labels" + depends on JFFS2_FS_XATTR + default y + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the jffs2 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JFFS2_COMPRESSION_OPTIONS + bool "Advanced compression options for JFFS2" + depends on JFFS2_FS + default n + help + Enabling this option allows you to explicitly choose which + compression modules, if any, are enabled in JFFS2. Removing + compressors can mean you cannot read existing file systems, + and enabling experimental compressors can mean that you + write a file system which cannot be read by a standard kernel. + + If unsure, you should _definitely_ say 'N'. 
+ +config JFFS2_ZLIB + bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS + select ZLIB_INFLATE + select ZLIB_DEFLATE + depends on JFFS2_FS + default y + help + Zlib is designed to be a free, general-purpose, legally unencumbered, + lossless data-compression library for use on virtually any computer + hardware and operating system. See <http://www.gzip.org/zlib/> for + further information. + + Say 'Y' if unsure. + +config JFFS2_LZO + bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS + select LZO_COMPRESS + select LZO_DECOMPRESS + depends on JFFS2_FS + default n + help + minilzo-based compression. Generally works better than Zlib. + + This feature was added in July, 2007. Say 'N' if you need + compatibility with older bootloaders or kernels. + +config JFFS2_RTIME + bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default y + help + Rtime does manage to recompress already-compressed data. Say 'Y' if unsure. + +config JFFS2_RUBIN + bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default n + help + RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure. + +choice + prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS + default JFFS2_CMODE_PRIORITY + depends on JFFS2_FS + help + You can set here the default compression mode of JFFS2 from + the available compression modes. Don't touch if unsure. + +config JFFS2_CMODE_NONE + bool "no compression" + help + Uses no compression. + +config JFFS2_CMODE_PRIORITY + bool "priority" + help + Tries the compressors in a predefined order and chooses the first + successful one. + +config JFFS2_CMODE_SIZE + bool "size (EXPERIMENTAL)" + help + Tries all compressors and chooses the one which has the smallest + result. 
+ +config JFFS2_CMODE_FAVOURLZO + bool "Favour LZO" + help + Tries all compressors and chooses the one which has the smallest + result but gives some preference to LZO (which has faster + decompression) at the expense of size. + +endchoice diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index 86739ee..f25e70c 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -53,8 +53,8 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this, } /* jffs2_compress: - * @data: Pointer to uncompressed data - * @cdata: Pointer to returned pointer to buffer for compressed data + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data * @datalen: On entry, holds the amount of data available for compression. * On exit, expected to hold the amount of data actually compressed. * @cdatalen: On entry, holds the amount of space available for compressed diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index cd219ef..6f60cc9 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -39,7 +39,8 @@ const struct file_operations jffs2_dir_operations = .read = generic_read_dir, .readdir = jffs2_readdir, .unlocked_ioctl=jffs2_ioctl, - .fsync = jffs2_fsync + .fsync = jffs2_fsync, + .llseek = generic_file_llseek, }; @@ -108,9 +109,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, } } - d_add(target, inode); - - return NULL; + return d_splice_alias(inode, target); } /***********************************************************************/ @@ -311,7 +310,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char /* FIXME: If you care. 
We'd need to use frags for the target if it grows much more than this */ if (targetlen > 254) - return -EINVAL; + return -ENAMETOOLONG; ri = jffs2_alloc_raw_inode(); diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index dddb2a6..259461b 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -68,7 +68,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, instr->len = c->sector_size; instr->callback = jffs2_erase_callback; instr->priv = (unsigned long)(&instr[1]); - instr->fail_addr = 0xffffffff; + instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; ((struct erase_priv_struct *)instr->priv)->jeb = jeb; ((struct erase_priv_struct *)instr->priv)->c = c; @@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock { /* For NAND, if the failure did not occur at the device level for a specific physical page, don't bother updating the bad block table. */ - if (jffs2_cleanmarker_oob(c) && (bad_offset != 0xffffffff)) { + if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { /* We had a device-level failure to erase. Let's see if we've failed too many times. 
*/ if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 086c438..249305d 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -207,6 +207,8 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = JFFS2_MAX_NAME_LEN; + buf->f_fsid.val[0] = JFFS2_SUPER_MAGIC; + buf->f_fsid.val[1] = c->mtd->index; spin_lock(&c->erase_completion_lock); avail = c->dirty_size + c->free_size; @@ -440,14 +442,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i memset(ri, 0, sizeof(*ri)); /* Set OS-specific defaults for new inodes */ - ri->uid = cpu_to_je16(current->fsuid); + ri->uid = cpu_to_je16(current_fsuid()); if (dir_i->i_mode & S_ISGID) { ri->gid = cpu_to_je16(dir_i->i_gid); if (S_ISDIR(mode)) mode |= S_ISGID; } else { - ri->gid = cpu_to_je16(current->fsgid); + ri->gid = cpu_to_je16(current_fsgid()); } /* POSIX ACLs have to be processed now, at least partly. diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index a9bf960..0875b60 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -261,6 +261,10 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) jffs2_sum_reset_collected(c->summary); /* reset collected summary */ + /* adjust write buffer offset, else we get a non contiguous write bug */ + if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len) + c->wbuf_ofs = 0xffffffff; + D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset)); return 0; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index efd4012..4c4e18c 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -22,6 +22,7 @@ #include <linux/mtd/super.h> #include <linux/ctype.h> #include <linux/namei.h> +#include <linux/exportfs.h> #include "compr.h" #include "nodelist.h" @@ -62,6 +63,52 @@ static int jffs2_sync_fs(struct super_block *sb, int wait) return 0; } +static struct inode *jffs2_nfs_get_inode(struct super_block *sb, 
uint64_t ino, + uint32_t generation) +{ + /* We don't care about i_generation. We'll destroy the flash + before we start re-using inode numbers anyway. And even + if that wasn't true, we'd have other problems...*/ + return jffs2_iget(sb, ino); +} + +static struct dentry *jffs2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + jffs2_nfs_get_inode); +} + +static struct dentry *jffs2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + jffs2_nfs_get_inode); +} + +static struct dentry *jffs2_get_parent(struct dentry *child) +{ + struct jffs2_inode_info *f; + uint32_t pino; + + BUG_ON(!S_ISDIR(child->d_inode->i_mode)); + + f = JFFS2_INODE_INFO(child->d_inode); + + pino = f->inocache->pino_nlink; + + JFFS2_DEBUG("Parent of directory ino #%u is #%u\n", + f->inocache->ino, pino); + + return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino)); +} + +static struct export_operations jffs2_export_ops = { + .get_parent = jffs2_get_parent, + .fh_to_dentry = jffs2_fh_to_dentry, + .fh_to_parent = jffs2_fh_to_parent, +}; + static const struct super_operations jffs2_super_operations = { .alloc_inode = jffs2_alloc_inode, @@ -104,6 +151,7 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) spin_lock_init(&c->inocache_lock); sb->s_op = &jffs2_super_operations; + sb->s_export_op = &jffs2_export_ops; sb->s_flags = sb->s_flags | MS_NOATIME; sb->s_xattr = jffs2_xattr_handlers; #ifdef CONFIG_JFFS2_FS_POSIX_ACL diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 0e78b00..d9a721e 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -679,10 +679,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) memset(c->wbuf,0xff,c->wbuf_pagesize); /* adjust write buffer offset, else we get a non contiguous write bug */ - if (SECTOR_ADDR(c->wbuf_ofs) == 
SECTOR_ADDR(c->wbuf_ofs+c->wbuf_pagesize)) - c->wbuf_ofs += c->wbuf_pagesize; - else - c->wbuf_ofs = 0xffffffff; + c->wbuf_ofs += c->wbuf_pagesize; c->wbuf_len = 0; return 0; } diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index cd2ec29..335c4de 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1168,7 +1168,7 @@ journal_found: bd_release(bdev); close: /* close external log device */ - blkdev_put(bdev); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1514,7 +1514,7 @@ int lmLogClose(struct super_block *sb) rc = lmLogShutdown(log); bd_release(bdev); - blkdev_put(bdev); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE); kfree(log); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 2aba823..cc3cedff 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1511,25 +1511,12 @@ struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, struct dentry *jfs_get_parent(struct dentry *dentry) { - struct super_block *sb = dentry->d_inode->i_sb; - struct dentry *parent = ERR_PTR(-ENOENT); - struct inode *inode; unsigned long parent_ino; parent_ino = le32_to_cpu(JFS_IP(dentry->d_inode)->i_dtroot.header.idotdot); - inode = jfs_iget(sb, parent_ino); - if (IS_ERR(inode)) { - parent = ERR_CAST(inode); - } else { - parent = d_alloc_anon(inode); - if (!parent) { - parent = ERR_PTR(-ENOMEM); - iput(inode); - } - } - return parent; + return d_obtain_alias(jfs_iget(dentry->d_inode->i_sb, parent_ino)); } const struct inode_operations jfs_dir_inode_operations = { @@ -1560,6 +1547,7 @@ const struct file_operations jfs_dir_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = jfs_compat_ioctl, #endif + .llseek = generic_file_llseek, }; static int jfs_ci_hash(struct dentry *dir, struct qstr *this) @@ -732,28 +732,6 @@ out: return ret; } -/* - * This is what d_alloc_anon should have been. 
Once the exportfs - * argument transition has been finished I will update d_alloc_anon - * to this prototype and this wrapper will go away. --hch - */ -static struct dentry *exportfs_d_alloc(struct inode *inode) -{ - struct dentry *dentry; - - if (!inode) - return NULL; - if (IS_ERR(inode)) - return ERR_PTR(PTR_ERR(inode)); - - dentry = d_alloc_anon(inode); - if (!dentry) { - iput(inode); - dentry = ERR_PTR(-ENOMEM); - } - return dentry; -} - /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on @@ -782,7 +760,7 @@ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, break; } - return exportfs_d_alloc(inode); + return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_dentry); @@ -815,7 +793,7 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, break; } - return exportfs_d_alloc(inode); + return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_parent); @@ -1580,7 +1580,8 @@ asmlinkage long sys_flock(unsigned int fd, unsigned int cmd) cmd &= ~LOCK_NB; unlock = (cmd == LOCK_UN); - if (!unlock && !(cmd & LOCK_MAND) && !(filp->f_mode & 3)) + if (!unlock && !(cmd & LOCK_MAND) && + !(filp->f_mode & (FMODE_READ|FMODE_WRITE))) goto out_putf; error = flock_make_lock(filp, &lock, cmd); @@ -2078,6 +2079,7 @@ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) EXPORT_SYMBOL_GPL(vfs_cancel_lock); #ifdef CONFIG_PROC_FS +#include <linux/proc_fs.h> #include <linux/seq_file.h> static void lock_get_status(struct seq_file *f, struct file_lock *fl, @@ -2183,12 +2185,31 @@ static void locks_stop(struct seq_file *f, void *v) unlock_kernel(); } -struct seq_operations locks_seq_operations = { +static const struct seq_operations locks_seq_operations = { .start = locks_start, .next = locks_next, .stop = locks_stop, .show = locks_show, }; + +static int locks_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, 
&locks_seq_operations); +} + +static const struct file_operations proc_locks_operations = { + .open = locks_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_locks_init(void) +{ + proc_create("locks", 0, NULL, &proc_locks_operations); + return 0; +} +module_init(proc_locks_init); #endif /** @@ -212,8 +212,7 @@ int generic_permission(struct inode *inode, int mask, * Read/write DACs are always overridable. * Executable DACs are overridable if at least one exec bit is set. */ - if (!(mask & MAY_EXEC) || - (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) + if (!(mask & MAY_EXEC) || execute_ok(inode)) if (capable(CAP_DAC_OVERRIDE)) return 0; @@ -249,23 +248,11 @@ int inode_permission(struct inode *inode, int mask) } /* Ordinary permission routines do not understand MAY_APPEND. */ - if (inode->i_op && inode->i_op->permission) { + if (inode->i_op && inode->i_op->permission) retval = inode->i_op->permission(inode, mask); - if (!retval) { - /* - * Exec permission on a regular file is denied if none - * of the execute bits are set. - * - * This check should be done by the ->permission() - * method. 
- */ - if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode) && - !(inode->i_mode & S_IXUGO)) - return -EACCES; - } - } else { + else retval = generic_permission(inode, mask, NULL); - } + if (retval) return retval; @@ -1106,6 +1093,15 @@ int path_lookup(const char *name, unsigned int flags, return do_path_lookup(AT_FDCWD, name, flags, nd); } +int kern_path(const char *name, unsigned int flags, struct path *path) +{ + struct nameidata nd; + int res = do_path_lookup(AT_FDCWD, name, flags, &nd); + if (!res) + *path = nd.path; + return res; +} + /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory @@ -1138,9 +1134,16 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } -static int __path_lookup_intent_open(int dfd, const char *name, - unsigned int lookup_flags, struct nameidata *nd, - int open_flags, int create_mode) +/** + * path_lookup_open - lookup a file path with open intent + * @dfd: the directory to use as base, or AT_FDCWD + * @name: pointer to file name + * @lookup_flags: lookup intent flags + * @nd: pointer to nameidata + * @open_flags: open intent flags + */ +int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags, + struct nameidata *nd, int open_flags) { struct file *filp = get_empty_filp(); int err; @@ -1149,7 +1152,7 @@ static int __path_lookup_intent_open(int dfd, const char *name, return -ENFILE; nd->intent.open.file = filp; nd->intent.open.flags = open_flags; - nd->intent.open.create_mode = create_mode; + nd->intent.open.create_mode = 0; err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd); if (IS_ERR(nd->intent.open.file)) { if (err == 0) { @@ -1161,38 +1164,6 @@ static int __path_lookup_intent_open(int dfd, const char *name, return err; } -/** - * path_lookup_open - lookup a file path with open intent - * @dfd: the directory to use as base, or AT_FDCWD - * @name: pointer to file name - * @lookup_flags: lookup intent flags - * @nd: 
pointer to nameidata - * @open_flags: open intent flags - */ -int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags, - struct nameidata *nd, int open_flags) -{ - return __path_lookup_intent_open(dfd, name, lookup_flags, nd, - open_flags, 0); -} - -/** - * path_lookup_create - lookup a file path with open + create intent - * @dfd: the directory to use as base, or AT_FDCWD - * @name: pointer to file name - * @lookup_flags: lookup intent flags - * @nd: pointer to nameidata - * @open_flags: open intent flags - * @create_mode: create intent flags - */ -static int path_lookup_create(int dfd, const char *name, - unsigned int lookup_flags, struct nameidata *nd, - int open_flags, int create_mode) -{ - return __path_lookup_intent_open(dfd, name, lookup_flags|LOOKUP_CREATE, - nd, open_flags, create_mode); -} - static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd) { @@ -1470,20 +1441,18 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex); - for (p = p1; p->d_parent != p; p = p->d_parent) { - if (p->d_parent == p2) { - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); - return p; - } + p = d_ancestor(p2, p1); + if (p) { + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); + return p; } - for (p = p2; p->d_parent != p; p = p->d_parent) { - if (p->d_parent == p1) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); - return p; - } + p = d_ancestor(p1, p2); + if (p) { + mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + return p; } mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); @@ -1702,8 +1671,7 @@ struct file *do_filp_open(int dfd, const char *pathname, /* * Create - we need 
to know the parent. */ - error = path_lookup_create(dfd, pathname, LOOKUP_PARENT, - &nd, flag, mode); + error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); if (error) return ERR_PTR(error); @@ -1714,10 +1682,20 @@ struct file *do_filp_open(int dfd, const char *pathname, */ error = -EISDIR; if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]) - goto exit; + goto exit_parent; + error = -ENFILE; + filp = get_empty_filp(); + if (filp == NULL) + goto exit_parent; + nd.intent.open.file = filp; + nd.intent.open.flags = flag; + nd.intent.open.create_mode = mode; dir = nd.path.dentry; nd.flags &= ~LOOKUP_PARENT; + nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN; + if (flag & O_EXCL) + nd.flags |= LOOKUP_EXCL; mutex_lock(&dir->d_inode->i_mutex); path.dentry = lookup_hash(&nd); path.mnt = nd.path.mnt; @@ -1822,6 +1800,7 @@ exit_dput: exit: if (!IS_ERR(nd.intent.open.file)) release_open_intent(&nd); +exit_parent: path_put(&nd.path); return ERR_PTR(error); @@ -1914,7 +1893,7 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir) if (nd->last_type != LAST_NORM) goto fail; nd->flags &= ~LOOKUP_PARENT; - nd->flags |= LOOKUP_CREATE; + nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL; nd->intent.open.flags = O_EXCL; /* @@ -2178,16 +2157,19 @@ static long do_rmdir(int dfd, const char __user *pathname) return error; switch(nd.last_type) { - case LAST_DOTDOT: - error = -ENOTEMPTY; - goto exit1; - case LAST_DOT: - error = -EINVAL; - goto exit1; - case LAST_ROOT: - error = -EBUSY; - goto exit1; + case LAST_DOTDOT: + error = -ENOTEMPTY; + goto exit1; + case LAST_DOT: + error = -EINVAL; + goto exit1; + case LAST_ROOT: + error = -EBUSY; + goto exit1; } + + nd.flags &= ~LOOKUP_PARENT; + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); error = PTR_ERR(dentry); @@ -2265,6 +2247,9 @@ static long do_unlinkat(int dfd, const char __user *pathname) error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; + + nd.flags &= ~LOOKUP_PARENT; + 
mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); error = PTR_ERR(dentry); @@ -2654,6 +2639,10 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname, if (newnd.last_type != LAST_NORM) goto exit2; + oldnd.flags &= ~LOOKUP_PARENT; + newnd.flags &= ~LOOKUP_PARENT; + newnd.flags |= LOOKUP_RENAME_TARGET; + trap = lock_rename(new_dir, old_dir); old_dentry = lookup_hash(&oldnd); @@ -2855,6 +2844,7 @@ EXPORT_SYMBOL(__page_symlink); EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); EXPORT_SYMBOL(path_lookup); +EXPORT_SYMBOL(kern_path); EXPORT_SYMBOL(vfs_path_lookup); EXPORT_SYMBOL(inode_permission); EXPORT_SYMBOL(vfs_permission); diff --git a/fs/namespace.c b/fs/namespace.c index 6e283c9..cce4670 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1167,19 +1167,19 @@ asmlinkage long sys_oldumount(char __user * name) #endif -static int mount_is_safe(struct nameidata *nd) +static int mount_is_safe(struct path *path) { if (capable(CAP_SYS_ADMIN)) return 0; return -EPERM; #ifdef notyet - if (S_ISLNK(nd->path.dentry->d_inode->i_mode)) + if (S_ISLNK(path->dentry->d_inode->i_mode)) return -EPERM; - if (nd->path.dentry->d_inode->i_mode & S_ISVTX) { - if (current->uid != nd->path.dentry->d_inode->i_uid) + if (path->dentry->d_inode->i_mode & S_ISVTX) { + if (current->uid != path->dentry->d_inode->i_uid) return -EPERM; } - if (vfs_permission(nd, MAY_WRITE)) + if (inode_permission(path->dentry->d_inode, MAY_WRITE)) return -EPERM; return 0; #endif @@ -1425,11 +1425,10 @@ out_unlock: /* * recursively change the type of the mountpoint. - * noinline this do_mount helper to save do_mount stack space. 
*/ -static noinline int do_change_type(struct nameidata *nd, int flag) +static int do_change_type(struct path *path, int flag) { - struct vfsmount *m, *mnt = nd->path.mnt; + struct vfsmount *m, *mnt = path->mnt; int recurse = flag & MS_REC; int type = flag & ~MS_REC; int err = 0; @@ -1437,7 +1436,7 @@ static noinline int do_change_type(struct nameidata *nd, int flag) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (nd->path.dentry != nd->path.mnt->mnt_root) + if (path->dentry != path->mnt->mnt_root) return -EINVAL; down_write(&namespace_sem); @@ -1459,40 +1458,39 @@ static noinline int do_change_type(struct nameidata *nd, int flag) /* * do loopback mount. - * noinline this do_mount helper to save do_mount stack space. */ -static noinline int do_loopback(struct nameidata *nd, char *old_name, +static int do_loopback(struct path *path, char *old_name, int recurse) { - struct nameidata old_nd; + struct path old_path; struct vfsmount *mnt = NULL; - int err = mount_is_safe(nd); + int err = mount_is_safe(path); if (err) return err; if (!old_name || !*old_name) return -EINVAL; - err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err) return err; down_write(&namespace_sem); err = -EINVAL; - if (IS_MNT_UNBINDABLE(old_nd.path.mnt)) + if (IS_MNT_UNBINDABLE(old_path.mnt)) goto out; - if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt)) + if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) goto out; err = -ENOMEM; if (recurse) - mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, 0); + mnt = copy_tree(old_path.mnt, old_path.dentry, 0); else - mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, 0); + mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); if (!mnt) goto out; - err = graft_tree(mnt, &nd->path); + err = graft_tree(mnt, path); if (err) { LIST_HEAD(umount_list); spin_lock(&vfsmount_lock); @@ -1503,7 +1501,7 @@ static noinline int do_loopback(struct nameidata *nd, char *old_name, out: 
up_write(&namespace_sem); - path_put(&old_nd.path); + path_put(&old_path); return err; } @@ -1528,33 +1526,37 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. - * noinline this do_mount helper to save do_mount stack space. */ -static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, +static int do_remount(struct path *path, int flags, int mnt_flags, void *data) { int err; - struct super_block *sb = nd->path.mnt->mnt_sb; + struct super_block *sb = path->mnt->mnt_sb; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!check_mnt(nd->path.mnt)) + if (!check_mnt(path->mnt)) return -EINVAL; - if (nd->path.dentry != nd->path.mnt->mnt_root) + if (path->dentry != path->mnt->mnt_root) return -EINVAL; down_write(&sb->s_umount); if (flags & MS_BIND) - err = change_mount_flags(nd->path.mnt, flags); + err = change_mount_flags(path->mnt, flags); else err = do_remount_sb(sb, flags, data, 0); if (!err) - nd->path.mnt->mnt_flags = mnt_flags; + path->mnt->mnt_flags = mnt_flags; up_write(&sb->s_umount); - if (!err) - security_sb_post_remount(nd->path.mnt, flags, data); + if (!err) { + security_sb_post_remount(path->mnt, flags, data); + + spin_lock(&vfsmount_lock); + touch_mnt_namespace(path->mnt->mnt_ns); + spin_unlock(&vfsmount_lock); + } return err; } @@ -1568,90 +1570,85 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt) return 0; } -/* - * noinline this do_mount helper to save do_mount stack space. 
- */ -static noinline int do_move_mount(struct nameidata *nd, char *old_name) +static int do_move_mount(struct path *path, char *old_name) { - struct nameidata old_nd; - struct path parent_path; + struct path old_path, parent_path; struct vfsmount *p; int err = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!old_name || !*old_name) return -EINVAL; - err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err) return err; down_write(&namespace_sem); - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) + while (d_mountpoint(path->dentry) && + follow_down(&path->mnt, &path->dentry)) ; err = -EINVAL; - if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt)) + if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) goto out; err = -ENOENT; - mutex_lock(&nd->path.dentry->d_inode->i_mutex); - if (IS_DEADDIR(nd->path.dentry->d_inode)) + mutex_lock(&path->dentry->d_inode->i_mutex); + if (IS_DEADDIR(path->dentry->d_inode)) goto out1; - if (!IS_ROOT(nd->path.dentry) && d_unhashed(nd->path.dentry)) + if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry)) goto out1; err = -EINVAL; - if (old_nd.path.dentry != old_nd.path.mnt->mnt_root) + if (old_path.dentry != old_path.mnt->mnt_root) goto out1; - if (old_nd.path.mnt == old_nd.path.mnt->mnt_parent) + if (old_path.mnt == old_path.mnt->mnt_parent) goto out1; - if (S_ISDIR(nd->path.dentry->d_inode->i_mode) != - S_ISDIR(old_nd.path.dentry->d_inode->i_mode)) + if (S_ISDIR(path->dentry->d_inode->i_mode) != + S_ISDIR(old_path.dentry->d_inode->i_mode)) goto out1; /* * Don't move a mount residing in a shared parent. */ - if (old_nd.path.mnt->mnt_parent && - IS_MNT_SHARED(old_nd.path.mnt->mnt_parent)) + if (old_path.mnt->mnt_parent && + IS_MNT_SHARED(old_path.mnt->mnt_parent)) goto out1; /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. 
*/ - if (IS_MNT_SHARED(nd->path.mnt) && - tree_contains_unbindable(old_nd.path.mnt)) + if (IS_MNT_SHARED(path->mnt) && + tree_contains_unbindable(old_path.mnt)) goto out1; err = -ELOOP; - for (p = nd->path.mnt; p->mnt_parent != p; p = p->mnt_parent) - if (p == old_nd.path.mnt) + for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent) + if (p == old_path.mnt) goto out1; - err = attach_recursive_mnt(old_nd.path.mnt, &nd->path, &parent_path); + err = attach_recursive_mnt(old_path.mnt, path, &parent_path); if (err) goto out1; /* if the mount is moved, it should no longer be expire * automatically */ - list_del_init(&old_nd.path.mnt->mnt_expire); + list_del_init(&old_path.mnt->mnt_expire); out1: - mutex_unlock(&nd->path.dentry->d_inode->i_mutex); + mutex_unlock(&path->dentry->d_inode->i_mutex); out: up_write(&namespace_sem); if (!err) path_put(&parent_path); - path_put(&old_nd.path); + path_put(&old_path); return err; } /* * create a new mount for userspace and request it to be added into the * namespace's tree - * noinline this do_mount helper to save do_mount stack space. */ -static noinline int do_new_mount(struct nameidata *nd, char *type, int flags, +static int do_new_mount(struct path *path, char *type, int flags, int mnt_flags, char *name, void *data) { struct vfsmount *mnt; @@ -1667,7 +1664,7 @@ static noinline int do_new_mount(struct nameidata *nd, char *type, int flags, if (IS_ERR(mnt)) return PTR_ERR(mnt); - return do_add_mount(mnt, &nd->path, mnt_flags, NULL); + return do_add_mount(mnt, path, mnt_flags, NULL); } /* @@ -1902,7 +1899,7 @@ int copy_mount_options(const void __user * data, unsigned long *where) long do_mount(char *dev_name, char *dir_name, char *type_page, unsigned long flags, void *data_page) { - struct nameidata nd; + struct path path; int retval = 0; int mnt_flags = 0; @@ -1940,29 +1937,29 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); /* ... 
and get the mountpoint */ - retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd); + retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); if (retval) return retval; - retval = security_sb_mount(dev_name, &nd.path, + retval = security_sb_mount(dev_name, &path, type_page, flags, data_page); if (retval) goto dput_out; if (flags & MS_REMOUNT) - retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, + retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags & MS_REC); + retval = do_loopback(&path, dev_name, flags & MS_REC); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) - retval = do_change_type(&nd, flags); + retval = do_change_type(&path, flags); else if (flags & MS_MOVE) - retval = do_move_mount(&nd, dev_name); + retval = do_move_mount(&path, dev_name); else - retval = do_new_mount(&nd, type_page, flags, mnt_flags, + retval = do_new_mount(&path, type_page, flags, mnt_flags, dev_name, data_page); dput_out: - path_put(&nd.path); + path_put(&path); return retval; } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 6a09760..c2e9cfd 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -40,6 +40,16 @@ unsigned short nfs_callback_tcpport; static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; +/* + * If the kernel has IPv6 support available, always listen for + * both AF_INET and AF_INET6 requests. 
+ */ +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const sa_family_t nfs_callback_family = AF_INET6; +#else +static const sa_family_t nfs_callback_family = AF_INET; +#endif + static int param_set_port(const char *val, struct kernel_param *kp) { char *endp; @@ -106,7 +116,7 @@ int nfs_callback_up(void) if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, - AF_INET, NULL); + nfs_callback_family, NULL); ret = -ENOMEM; if (!serv) goto out_err; @@ -116,7 +126,8 @@ int nfs_callback_up(void) if (ret <= 0) goto out_err; nfs_callback_tcpport = ret; - dprintk("Callback port = 0x%x\n", nfs_callback_tcpport); + dprintk("NFS: Callback listener port = %u (af %u)\n", + nfs_callback_tcpport, nfs_callback_family); nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); if (IS_ERR(nfs_callback_info.rqst)) { @@ -149,8 +160,8 @@ out: mutex_unlock(&nfs_callback_mutex); return ret; out_err: - dprintk("Couldn't create callback socket or server thread; err = %d\n", - ret); + dprintk("NFS: Couldn't create callback socket or server thread; " + "err = %d\n", ret); nfs_callback_info.users--; goto out; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2ab70d4..3e64b98 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -707,9 +707,7 @@ static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) { if (NFS_PROTO(dir)->version == 2) return 0; - if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0) - return 0; - return (nd->intent.open.flags & O_EXCL) != 0; + return nd && nfs_lookup_check_intent(nd, LOOKUP_EXCL); } /* @@ -1009,7 +1007,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash * the dentry. 
*/ - if (nd->intent.open.flags & O_EXCL) { + if (nd->flags & LOOKUP_EXCL) { d_instantiate(dentry, NULL); goto out; } @@ -1517,7 +1515,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, GFP_KERNEL)) { pagevec_add(&lru_pvec, page); - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); SetPageUptodate(page); unlock_page(page); } else @@ -1959,6 +1957,9 @@ force_lookup: } else res = PTR_ERR(cred); out: + if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) + res = -EACCES; + dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); return res; diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index fae9719..b7c9b2d 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -107,11 +107,10 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root */ - mntroot = d_alloc_anon(inode); - if (!mntroot) { - iput(inode); + mntroot = d_obtain_alias(inode); + if (IS_ERR(mntroot)) { dprintk("nfs_get_root: get root dentry failed\n"); - return ERR_PTR(-ENOMEM); + return mntroot; } security_d_instantiate(mntroot, inode); @@ -277,11 +276,10 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root */ - mntroot = d_alloc_anon(inode); - if (!mntroot) { - iput(inode); + mntroot = d_obtain_alias(inode); + if (IS_ERR(mntroot)) { dprintk("nfs_get_root: get root dentry failed\n"); - return ERR_PTR(-ENOMEM); + return mntroot; } security_d_instantiate(mntroot, inode); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c910413..83e700a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1659,8 +1659,10 @@ nfs4_proc_setattr(struct dentry 
*dentry, struct nfs_fattr *fattr, struct nfs_open_context *ctx; ctx = nfs_file_open_context(sattr->ia_file); - cred = ctx->cred; - state = ctx->state; + if (ctx) { + cred = ctx->cred; + state = ctx->state; + } } status = nfs4_do_setattr(inode, cred, fattr, sattr, state); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 8b28b95c..a3b0061 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2459,7 +2459,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, compare_super = NULL; /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); + s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); if (IS_ERR(s)) { error = PTR_ERR(s); goto out_err_nosb; @@ -2544,7 +2544,7 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags, compare_super = NULL; /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); + s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); if (IS_ERR(s)) { error = PTR_ERR(s); goto out_err_nosb; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 9dc036f..5839b22 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -99,7 +99,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) int fsidtype; char *ep; struct svc_expkey key; - struct svc_expkey *ek; + struct svc_expkey *ek = NULL; if (mesg[mlen-1] != '\n') return -EINVAL; @@ -107,7 +107,8 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) buf = kmalloc(PAGE_SIZE, GFP_KERNEL); err = -ENOMEM; - if (!buf) goto out; + if (!buf) + goto out; err = -EINVAL; if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) @@ -151,34 +152,32 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) /* now we want a pathname, or empty meaning NEGATIVE */ err = -EINVAL; - if ((len=qword_get(&mesg, buf, 
PAGE_SIZE)) < 0) + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len < 0) goto out; dprintk("Path seems to be <%s>\n", buf); err = 0; if (len == 0) { set_bit(CACHE_NEGATIVE, &key.h.flags); ek = svc_expkey_update(&key, ek); - if (ek) - cache_put(&ek->h, &svc_expkey_cache); - else err = -ENOMEM; + if (!ek) + err = -ENOMEM; } else { - struct nameidata nd; - err = path_lookup(buf, 0, &nd); + err = kern_path(buf, 0, &key.ek_path); if (err) goto out; dprintk("Found the path %s\n", buf); - key.ek_path = nd.path; ek = svc_expkey_update(&key, ek); - if (ek) - cache_put(&ek->h, &svc_expkey_cache); - else + if (!ek) err = -ENOMEM; - path_put(&nd.path); + path_put(&key.ek_path); } cache_flush(); out: + if (ek) + cache_put(&ek->h, &svc_expkey_cache); if (dom) auth_domain_put(dom); kfree(buf); @@ -500,35 +499,22 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) int len; int err; struct auth_domain *dom = NULL; - struct nameidata nd; - struct svc_export exp, *expp; + struct svc_export exp = {}, *expp; int an_int; - nd.path.dentry = NULL; - exp.ex_pathname = NULL; - - /* fs locations */ - exp.ex_fslocs.locations = NULL; - exp.ex_fslocs.locations_count = 0; - exp.ex_fslocs.migrated = 0; - - exp.ex_uuid = NULL; - - /* secinfo */ - exp.ex_nflavors = 0; - if (mesg[mlen-1] != '\n') return -EINVAL; mesg[mlen-1] = 0; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - err = -ENOMEM; - if (!buf) goto out; + if (!buf) + return -ENOMEM; /* client */ - len = qword_get(&mesg, buf, PAGE_SIZE); err = -EINVAL; - if (len <= 0) goto out; + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out; err = -ENOENT; dom = auth_domain_find(buf); @@ -537,25 +523,25 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) /* path */ err = -EINVAL; - if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) - goto out; - err = path_lookup(buf, 0, &nd); - if (err) goto out_no_path; + if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out1; + + err = 
kern_path(buf, 0, &exp.ex_path); + if (err) + goto out1; - exp.h.flags = 0; exp.ex_client = dom; - exp.ex_path.mnt = nd.path.mnt; - exp.ex_path.dentry = nd.path.dentry; - exp.ex_pathname = kstrdup(buf, GFP_KERNEL); + err = -ENOMEM; + exp.ex_pathname = kstrdup(buf, GFP_KERNEL); if (!exp.ex_pathname) - goto out; + goto out2; /* expiry */ err = -EINVAL; exp.h.expiry_time = get_expiry(&mesg); if (exp.h.expiry_time == 0) - goto out; + goto out3; /* flags */ err = get_int(&mesg, &an_int); @@ -563,22 +549,26 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = 0; set_bit(CACHE_NEGATIVE, &exp.h.flags); } else { - if (err || an_int < 0) goto out; + if (err || an_int < 0) + goto out3; exp.ex_flags= an_int; /* anon uid */ err = get_int(&mesg, &an_int); - if (err) goto out; + if (err) + goto out3; exp.ex_anon_uid= an_int; /* anon gid */ err = get_int(&mesg, &an_int); - if (err) goto out; + if (err) + goto out3; exp.ex_anon_gid= an_int; /* fsid */ err = get_int(&mesg, &an_int); - if (err) goto out; + if (err) + goto out3; exp.ex_fsid = an_int; while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { @@ -604,12 +594,13 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) */ break; if (err) - goto out; + goto out4; } - err = check_export(nd.path.dentry->d_inode, exp.ex_flags, + err = check_export(exp.ex_path.dentry->d_inode, exp.ex_flags, exp.ex_uuid); - if (err) goto out; + if (err) + goto out4; } expp = svc_export_lookup(&exp); @@ -622,15 +613,16 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = -ENOMEM; else exp_put(expp); - out: +out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); +out3: kfree(exp.ex_pathname); - if (nd.path.dentry) - path_put(&nd.path); - out_no_path: - if (dom) - auth_domain_put(dom); +out2: + path_put(&exp.ex_path); +out1: + auth_domain_put(dom); +out: kfree(buf); return err; } @@ -998,7 +990,7 @@ exp_export(struct nfsctl_export *nxp) struct svc_export *exp = 
NULL; struct svc_export new; struct svc_expkey *fsid_key = NULL; - struct nameidata nd; + struct path path; int err; /* Consistency check */ @@ -1021,12 +1013,12 @@ exp_export(struct nfsctl_export *nxp) /* Look up the dentry */ - err = path_lookup(nxp->ex_path, 0, &nd); + err = kern_path(nxp->ex_path, 0, &path); if (err) goto out_put_clp; err = -EINVAL; - exp = exp_get_by_name(clp, nd.path.mnt, nd.path.dentry, NULL); + exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL); memset(&new, 0, sizeof(new)); @@ -1034,8 +1026,8 @@ exp_export(struct nfsctl_export *nxp) if ((nxp->ex_flags & NFSEXP_FSID) && (!IS_ERR(fsid_key = exp_get_fsid_key(clp, nxp->ex_dev))) && fsid_key->ek_path.mnt && - (fsid_key->ek_path.mnt != nd.path.mnt || - fsid_key->ek_path.dentry != nd.path.dentry)) + (fsid_key->ek_path.mnt != path.mnt || + fsid_key->ek_path.dentry != path.dentry)) goto finish; if (!IS_ERR(exp)) { @@ -1051,7 +1043,7 @@ exp_export(struct nfsctl_export *nxp) goto finish; } - err = check_export(nd.path.dentry->d_inode, nxp->ex_flags, NULL); + err = check_export(path.dentry->d_inode, nxp->ex_flags, NULL); if (err) goto finish; err = -ENOMEM; @@ -1064,7 +1056,7 @@ exp_export(struct nfsctl_export *nxp) if (!new.ex_pathname) goto finish; new.ex_client = clp; - new.ex_path = nd.path; + new.ex_path = path; new.ex_flags = nxp->ex_flags; new.ex_anon_uid = nxp->ex_anon_uid; new.ex_anon_gid = nxp->ex_anon_gid; @@ -1090,7 +1082,7 @@ finish: exp_put(exp); if (fsid_key && !IS_ERR(fsid_key)) cache_put(&fsid_key->h, &svc_expkey_cache); - path_put(&nd.path); + path_put(&path); out_put_clp: auth_domain_put(clp); out_unlock: @@ -1121,7 +1113,7 @@ exp_unexport(struct nfsctl_export *nxp) { struct auth_domain *dom; svc_export *exp; - struct nameidata nd; + struct path path; int err; /* Consistency check */ @@ -1138,13 +1130,13 @@ exp_unexport(struct nfsctl_export *nxp) goto out_unlock; } - err = path_lookup(nxp->ex_path, 0, &nd); + err = kern_path(nxp->ex_path, 0, &path); if (err) goto out_domain; 
err = -EINVAL; - exp = exp_get_by_name(dom, nd.path.mnt, nd.path.dentry, NULL); - path_put(&nd.path); + exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL); + path_put(&path); if (IS_ERR(exp)) goto out_domain; @@ -1166,26 +1158,26 @@ out_unlock: * since its harder to fool a kernel module than a user space program. */ int -exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize) +exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize) { struct svc_export *exp; - struct nameidata nd; + struct path path; struct inode *inode; struct svc_fh fh; int err; err = -EPERM; /* NB: we probably ought to check that it's NUL-terminated */ - if (path_lookup(path, 0, &nd)) { - printk("nfsd: exp_rootfh path not found %s", path); + if (kern_path(name, 0, &path)) { + printk("nfsd: exp_rootfh path not found %s", name); return err; } - inode = nd.path.dentry->d_inode; + inode = path.dentry->d_inode; dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", - path, nd.path.dentry, clp->name, + name, path.dentry, clp->name, inode->i_sb->s_id, inode->i_ino); - exp = exp_parent(clp, nd.path.mnt, nd.path.dentry, NULL); + exp = exp_parent(clp, path.mnt, path.dentry, NULL); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto out; @@ -1195,7 +1187,7 @@ exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize) * fh must be initialized before calling fh_compose */ fh_init(&fh, maxsize); - if (fh_compose(&fh, exp, nd.path.dentry, NULL)) + if (fh_compose(&fh, exp, path.dentry, NULL)) err = -EINVAL; else err = 0; @@ -1203,7 +1195,7 @@ exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize) fh_put(&fh); exp_put(exp); out: - path_put(&nd.path); + path_put(&path); return err; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 145b3c8..bb93946 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -51,7 +51,7 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC /* Globals */ -static struct nameidata rec_dir; +static struct path rec_dir; 
static int rec_dir_init = 0; static void @@ -121,9 +121,9 @@ out_no_tfm: static void nfsd4_sync_rec_dir(void) { - mutex_lock(&rec_dir.path.dentry->d_inode->i_mutex); - nfsd_sync_dir(rec_dir.path.dentry); - mutex_unlock(&rec_dir.path.dentry->d_inode->i_mutex); + mutex_lock(&rec_dir.dentry->d_inode->i_mutex); + nfsd_sync_dir(rec_dir.dentry); + mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); } int @@ -143,9 +143,9 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) nfs4_save_user(&uid, &gid); /* lock the parent */ - mutex_lock(&rec_dir.path.dentry->d_inode->i_mutex); + mutex_lock(&rec_dir.dentry->d_inode->i_mutex); - dentry = lookup_one_len(dname, rec_dir.path.dentry, HEXDIR_LEN-1); + dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; @@ -155,15 +155,15 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); goto out_put; } - status = mnt_want_write(rec_dir.path.mnt); + status = mnt_want_write(rec_dir.mnt); if (status) goto out_put; - status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU); - mnt_drop_write(rec_dir.path.mnt); + status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU); + mnt_drop_write(rec_dir.mnt); out_put: dput(dentry); out_unlock: - mutex_unlock(&rec_dir.path.dentry->d_inode->i_mutex); + mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); if (status == 0) { clp->cl_firststate = 1; nfsd4_sync_rec_dir(); @@ -226,7 +226,7 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) nfs4_save_user(&uid, &gid); - filp = dentry_open(dget(dir), mntget(rec_dir.path.mnt), O_RDONLY); + filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY); status = PTR_ERR(filp); if (IS_ERR(filp)) goto out; @@ -291,9 +291,9 @@ nfsd4_unlink_clid_dir(char *name, int namlen) dprintk("NFSD: nfsd4_unlink_clid_dir. 
name %.*s\n", namlen, name); - mutex_lock(&rec_dir.path.dentry->d_inode->i_mutex); - dentry = lookup_one_len(name, rec_dir.path.dentry, namlen); - mutex_unlock(&rec_dir.path.dentry->d_inode->i_mutex); + mutex_lock(&rec_dir.dentry->d_inode->i_mutex); + dentry = lookup_one_len(name, rec_dir.dentry, namlen); + mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); return status; @@ -302,7 +302,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen) if (!dentry->d_inode) goto out; - status = nfsd4_clear_clid_dir(rec_dir.path.dentry, dentry); + status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry); out: dput(dentry); return status; @@ -318,7 +318,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (!rec_dir_init || !clp->cl_firststate) return; - status = mnt_want_write(rec_dir.path.mnt); + status = mnt_want_write(rec_dir.mnt); if (status) goto out; clp->cl_firststate = 0; @@ -327,7 +327,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) nfs4_reset_user(uid, gid); if (status == 0) nfsd4_sync_rec_dir(); - mnt_drop_write(rec_dir.path.mnt); + mnt_drop_write(rec_dir.mnt); out: if (status) printk("NFSD: Failed to remove expired client state directory" @@ -357,17 +357,17 @@ nfsd4_recdir_purge_old(void) { if (!rec_dir_init) return; - status = mnt_want_write(rec_dir.path.mnt); + status = mnt_want_write(rec_dir.mnt); if (status) goto out; - status = nfsd4_list_rec_dir(rec_dir.path.dentry, purge_old); + status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old); if (status == 0) nfsd4_sync_rec_dir(); - mnt_drop_write(rec_dir.path.mnt); + mnt_drop_write(rec_dir.mnt); out: if (status) printk("nfsd4: failed to purge old clients from recovery" - " directory %s\n", rec_dir.path.dentry->d_name.name); + " directory %s\n", rec_dir.dentry->d_name.name); } static int @@ -387,10 +387,10 @@ int nfsd4_recdir_load(void) { int status; - status = nfsd4_list_rec_dir(rec_dir.path.dentry, load_recdir); + status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir); 
if (status) printk("nfsd4: failed loading clients from recovery" - " directory %s\n", rec_dir.path.dentry->d_name.name); + " directory %s\n", rec_dir.dentry->d_name.name); return status; } @@ -412,7 +412,7 @@ nfsd4_init_recdir(char *rec_dirname) nfs4_save_user(&uid, &gid); - status = path_lookup(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, + status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &rec_dir); if (status) printk("NFSD: unable to find recovery directory %s\n", @@ -429,5 +429,5 @@ nfsd4_shutdown_recdir(void) if (!rec_dir_init) return; rec_dir_init = 0; - path_put(&rec_dir.path); + path_put(&rec_dir); } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0cc7ff5..b0bebc5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3284,17 +3284,17 @@ int nfs4_reset_recoverydir(char *recdir) { int status; - struct nameidata nd; + struct path path; - status = path_lookup(recdir, LOOKUP_FOLLOW, &nd); + status = kern_path(recdir, LOOKUP_FOLLOW, &path); if (status) return status; status = -ENOTDIR; - if (S_ISDIR(nd.path.dentry->d_inode->i_mode)) { + if (S_ISDIR(path.dentry->d_inode->i_mode)) { nfs4_set_recdir(recdir); status = 0; } - path_put(&nd.path); + path_put(&path); return status; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 97543df..e3f9783 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -341,7 +341,7 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) { - struct nameidata nd; + struct path path; char *fo_path; int error; @@ -356,13 +356,13 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; - error = path_lookup(fo_path, 0, &nd); + error = kern_path(fo_path, 0, &path); if (error) return error; - error = nlmsvc_unlock_all_by_sb(nd.path.mnt->mnt_sb); + error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); - path_put(&nd.path); + 
path_put(&path); return error; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 59eeb46..07e4f5d 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -249,6 +249,10 @@ static int nfsd_init_socks(int port) if (error < 0) return error; + error = lockd_up(); + if (error < 0) + return error; + error = svc_create_xprt(nfsd_serv, "tcp", port, SVC_SOCK_DEFAULTS); if (error < 0) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index aa1d0d6..0bc56f6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -410,6 +410,7 @@ out_nfserr: static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) { ssize_t buflen; + ssize_t ret; buflen = vfs_getxattr(dentry, key, NULL, 0); if (buflen <= 0) @@ -419,7 +420,10 @@ static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) if (!*buf) return -ENOMEM; - return vfs_getxattr(dentry, key, *buf, buflen); + ret = vfs_getxattr(dentry, key, *buf, buflen); + if (ret < 0) + kfree(*buf); + return ret; } #endif @@ -1814,6 +1818,115 @@ out: } /* + * We do this buffering because we must not call back into the file + * system's ->lookup() method from the filldir callback. That may well + * deadlock a number of file systems. + * + * This is based heavily on the implementation of same in XFS. 
+ */ +struct buffered_dirent { + u64 ino; + loff_t offset; + int namlen; + unsigned int d_type; + char name[]; +}; + +struct readdir_data { + char *dirent; + size_t used; + int full; +}; + +static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_data *buf = __buf; + struct buffered_dirent *de = (void *)(buf->dirent + buf->used); + unsigned int reclen; + + reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64)); + if (buf->used + reclen > PAGE_SIZE) { + buf->full = 1; + return -EINVAL; + } + + de->namlen = namlen; + de->offset = offset; + de->ino = ino; + de->d_type = d_type; + memcpy(de->name, name, namlen); + buf->used += reclen; + + return 0; +} + +static int nfsd_buffered_readdir(struct file *file, filldir_t func, + struct readdir_cd *cdp, loff_t *offsetp) +{ + struct readdir_data buf; + struct buffered_dirent *de; + int host_err; + int size; + loff_t offset; + + buf.dirent = (void *)__get_free_page(GFP_KERNEL); + if (!buf.dirent) + return -ENOMEM; + + offset = *offsetp; + cdp->err = nfserr_eof; /* will be cleared on successful read */ + + while (1) { + unsigned int reclen; + + buf.used = 0; + buf.full = 0; + + host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf); + if (buf.full) + host_err = 0; + + if (host_err < 0) + break; + + size = buf.used; + + if (!size) + break; + + de = (struct buffered_dirent *)buf.dirent; + while (size > 0) { + offset = de->offset; + + if (func(cdp, de->name, de->namlen, de->offset, + de->ino, de->d_type)) + goto done; + + if (cdp->err != nfs_ok) + goto done; + + reclen = ALIGN(sizeof(*de) + de->namlen, + sizeof(u64)); + size -= reclen; + de = (struct buffered_dirent *)((char *)de + reclen); + } + offset = vfs_llseek(file, 0, SEEK_CUR); + if (!buf.full) + break; + } + + done: + free_page((unsigned long)(buf.dirent)); + + if (host_err) + return nfserrno(host_err); + + *offsetp = offset; + return cdp->err; +} + +/* * Read entries 
from a directory. * The NFSv3/4 verifier we ignore for now. */ @@ -1822,7 +1935,6 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, struct readdir_cd *cdp, filldir_t func) { __be32 err; - int host_err; struct file *file; loff_t offset = *offsetp; @@ -1836,21 +1948,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, goto out_close; } - /* - * Read the directory entries. This silly loop is necessary because - * readdir() is not guaranteed to fill up the entire buffer, but - * may choose to do less. - */ - - do { - cdp->err = nfserr_eof; /* will be cleared on successful read */ - host_err = vfs_readdir(file, func, cdp); - } while (host_err >=0 && cdp->err == nfs_ok); - if (host_err) - err = nfserrno(host_err); - else - err = cdp->err; - *offsetp = vfs_llseek(file, 0, 1); + err = nfsd_buffered_readdir(file, func, cdp, offsetp); if (err == nfserr_eof || err == nfserr_toosmall) err = nfs_ok; /* can still be found in ->err */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index d020866..3140a44 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -439,7 +439,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, pages[nr] = *cached_page; page_cache_get(*cached_page); if (unlikely(!pagevec_add(lru_pvec, *cached_page))) - __pagevec_lru_add(lru_pvec); + __pagevec_lru_add_file(lru_pvec); *cached_page = NULL; } index++; @@ -2084,7 +2084,7 @@ err_out: OSYNC_METADATA|OSYNC_DATA); } } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? 
"written" : "status", (unsigned long)written, (long)status); diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 9e8a95b..2ca0015 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -304,8 +304,6 @@ static struct dentry *ntfs_get_parent(struct dentry *child_dent) ntfs_attr_search_ctx *ctx; ATTR_RECORD *attr; FILE_NAME_ATTR *fn; - struct inode *parent_vi; - struct dentry *parent_dent; unsigned long parent_ino; int err; @@ -345,24 +343,8 @@ try_next: /* Release the search context and the mft record of the child. */ ntfs_attr_put_search_ctx(ctx); unmap_mft_record(ni); - /* Get the inode of the parent directory. */ - parent_vi = ntfs_iget(vi->i_sb, parent_ino); - if (IS_ERR(parent_vi) || unlikely(is_bad_inode(parent_vi))) { - if (!IS_ERR(parent_vi)) - iput(parent_vi); - ntfs_error(vi->i_sb, "Failed to get parent directory inode " - "0x%lx of child inode 0x%lx.", parent_ino, - vi->i_ino); - return ERR_PTR(-EACCES); - } - /* Finally get a dentry for the parent directory and return it. */ - parent_dent = d_alloc_anon(parent_vi); - if (unlikely(!parent_dent)) { - iput(parent_vi); - return ERR_PTR(-ENOMEM); - } - ntfs_debug("Done for inode 0x%lx.", vi->i_ino); - return parent_dent; + + return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino)); } static struct inode *ntfs_nfs_get_inode(struct super_block *sb, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 7dce161..6ebaa58 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -976,7 +976,7 @@ static void o2hb_region_release(struct config_item *item) } if (reg->hr_bdev) - blkdev_put(reg->hr_bdev); + blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); if (reg->hr_slots) kfree(reg->hr_slots); @@ -1268,7 +1268,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out; reg->hr_bdev = I_BDEV(filp->f_mapping->host); - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0); + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ); if (ret) { reg->hr_bdev 
= NULL; goto out; @@ -1358,7 +1358,7 @@ out: iput(inode); if (ret < 0) { if (reg->hr_bdev) { - blkdev_put(reg->hr_bdev); + blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); reg->hr_bdev = NULL; } } diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 67527ce..2f27b33 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -68,14 +68,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, return ERR_PTR(-ESTALE); } - result = d_alloc_anon(inode); - - if (!result) { - iput(inode); - mlog_errno(-ENOMEM); - return ERR_PTR(-ENOMEM); - } - result->d_op = &ocfs2_dentry_ops; + result = d_obtain_alias(inode); + if (!IS_ERR(result)) + result->d_op = &ocfs2_dentry_ops; mlog_exit_ptr(result); return result; @@ -86,7 +81,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) int status; u64 blkno; struct dentry *parent; - struct inode *inode; struct inode *dir = child->d_inode; mlog_entry("(0x%p, '%.*s')\n", child, @@ -109,21 +103,9 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) goto bail_unlock; } - inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0); - if (IS_ERR(inode)) { - mlog(ML_ERROR, "Unable to create inode %llu\n", - (unsigned long long)blkno); - parent = ERR_PTR(-EACCES); - goto bail_unlock; - } - - parent = d_alloc_anon(inode); - if (!parent) { - iput(inode); - parent = ERR_PTR(-ENOMEM); - } - - parent->d_op = &ocfs2_dentry_ops; + parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); + if (!IS_ERR(parent)) + parent->d_op = &ocfs2_dentry_ops; bail_unlock: ocfs2_inode_unlock(dir, 0); diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index c0757e9..c7275cf 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -501,4 +501,5 @@ struct inode_operations omfs_dir_inops = { struct file_operations omfs_dir_operations = { .read = generic_read_dir, .readdir = omfs_readdir, + .llseek = generic_file_llseek, }; @@ -798,7 +798,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, int error; f->f_flags 
= flags; - f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK | + f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 9f5b054..d41bdc7 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -167,6 +167,7 @@ static int openpromfs_readdir(struct file *, void *, filldir_t); static const struct file_operations openprom_operations = { .read = generic_read_dir, .readdir = openpromfs_readdir, + .llseek = generic_file_llseek, }; static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index fbeb2f3..633f7a0 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -195,6 +195,14 @@ check_partition(struct gendisk *hd, struct block_device *bdev) return ERR_PTR(res); } +static ssize_t part_partition_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->partno); +} + static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -260,6 +268,7 @@ ssize_t part_fail_store(struct device *dev, } #endif +static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); @@ -269,6 +278,7 @@ static struct device_attribute dev_attr_fail = #endif static struct attribute *part_attrs[] = { + &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_stat.attr, @@ -475,10 +485,10 @@ void register_disk(struct gendisk *disk) goto exit; bdev->bd_invalidated = 1; - err = blkdev_get(bdev, FMODE_READ, 0); + err = blkdev_get(bdev, FMODE_READ); if (err < 0) goto exit; - blkdev_put(bdev); + blkdev_put(bdev, 
FMODE_READ); exit: /* announce disk after possible partitions are created */ diff --git a/fs/proc/Makefile b/fs/proc/Makefile index ebaba02..63d9651 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -8,11 +8,20 @@ proc-y := nommu.o task_nommu.o proc-$(CONFIG_MMU) := mmu.o task_mmu.o proc-y += inode.o root.o base.o generic.o array.o \ - proc_tty.o proc_misc.o - + proc_tty.o +proc-y += cmdline.o +proc-y += cpuinfo.o +proc-y += devices.o +proc-y += interrupts.o +proc-y += loadavg.o +proc-y += meminfo.o +proc-y += stat.o +proc-y += uptime.o +proc-y += version.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o proc-$(CONFIG_PROC_VMCORE) += vmcore.o proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o proc-$(CONFIG_PRINTK) += kmsg.o +proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o diff --git a/fs/proc/array.c b/fs/proc/array.c index f4bc0e7..bb9f4b0 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -388,20 +388,20 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* add up live thread stats at the group level */ if (whole) { + struct task_cputime cputime; struct task_struct *t = task; do { min_flt += t->min_flt; maj_flt += t->maj_flt; - utime = cputime_add(utime, task_utime(t)); - stime = cputime_add(stime, task_stime(t)); gtime = cputime_add(gtime, task_gtime(t)); t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; - utime = cputime_add(utime, sig->utime); - stime = cputime_add(stime, sig->stime); + thread_group_cputime(task, &cputime); + utime = cputime.utime; + stime = cputime.stime; gtime = cputime_add(gtime, sig->gtime); } diff --git a/fs/proc/base.c b/fs/proc/base.c index b5918ae..486cf3f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1712,9 +1712,9 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, file = fcheck_files(files, fd); if (!file) goto out_unlock; - if (file->f_mode & 1) + if (file->f_mode & FMODE_READ) 
inode->i_mode |= S_IRUSR | S_IXUSR; - if (file->f_mode & 2) + if (file->f_mode & FMODE_WRITE) inode->i_mode |= S_IWUSR | S_IXUSR; spin_unlock(&files->file_lock); put_files_struct(files); diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c new file mode 100644 index 0000000..82676e3 --- /dev/null +++ b/fs/proc/cmdline.c @@ -0,0 +1,29 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +static int cmdline_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", saved_command_line); + return 0; +} + +static int cmdline_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cmdline_proc_show, NULL); +} + +static const struct file_operations cmdline_proc_fops = { + .open = cmdline_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_cmdline_init(void) +{ + proc_create("cmdline", 0, NULL, &cmdline_proc_fops); + return 0; +} +module_init(proc_cmdline_init); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c new file mode 100644 index 0000000..5a1e539 --- /dev/null +++ b/fs/proc/cpuinfo.c @@ -0,0 +1,24 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +extern const struct seq_operations cpuinfo_op; +static int cpuinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cpuinfo_op); +} + +static const struct file_operations proc_cpuinfo_operations = { + .open = cpuinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_cpuinfo_init(void) +{ + proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); + return 0; +} +module_init(proc_cpuinfo_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c new file mode 100644 index 0000000..59ee7da --- /dev/null +++ b/fs/proc/devices.c @@ -0,0 +1,70 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include 
<linux/seq_file.h> + +static int devinfo_show(struct seq_file *f, void *v) +{ + int i = *(loff_t *) v; + + if (i < CHRDEV_MAJOR_HASH_SIZE) { + if (i == 0) + seq_printf(f, "Character devices:\n"); + chrdev_show(f, i); + } +#ifdef CONFIG_BLOCK + else { + i -= CHRDEV_MAJOR_HASH_SIZE; + if (i == 0) + seq_printf(f, "\nBlock devices:\n"); + blkdev_show(f, i); + } +#endif + return 0; +} + +static void *devinfo_start(struct seq_file *f, loff_t *pos) +{ + if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return pos; + return NULL; +} + +static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return NULL; + return pos; +} + +static void devinfo_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations devinfo_ops = { + .start = devinfo_start, + .next = devinfo_next, + .stop = devinfo_stop, + .show = devinfo_show +}; + +static int devinfo_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &devinfo_ops); +} + +static const struct file_operations proc_devinfo_operations = { + .open = devinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_devices_init(void) +{ + proc_create("devices", 0, NULL, &proc_devinfo_operations); + return 0; +} +module_init(proc_devices_init); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 7821589..60a359b 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -547,9 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp for (tmp = dir->subdir; tmp; tmp = tmp->next) if (strcmp(tmp->name, dp->name) == 0) { - printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", + WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); - dump_stack(); break; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index c6b4fa7..2543fd0 100644 --- 
a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -106,14 +106,13 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -int __init proc_init_inodecache(void) +void __init proc_init_inodecache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_PANIC), init_once); - return 0; } static const struct super_operations proc_sops = { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3bfb7b8..3e8aeb8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -61,12 +61,11 @@ extern const struct file_operations proc_smaps_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern const struct file_operations proc_net_operations; -extern const struct file_operations proc_kmsg_operations; extern const struct inode_operations proc_net_inode_operations; void free_proc_entry(struct proc_dir_entry *de); -int proc_init_inodecache(void); +void proc_init_inodecache(void); static inline struct pid *proc_pid(struct inode *inode) { diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c new file mode 100644 index 0000000..05029c0 --- /dev/null +++ b/fs/proc/interrupts.c @@ -0,0 +1,53 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/irqnr.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +/* + * /proc/interrupts + */ +static void *int_seq_start(struct seq_file *f, loff_t *pos) +{ + return (*pos <= nr_irqs) ? 
pos : NULL; +} + +static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos > nr_irqs) + return NULL; + return pos; +} + +static void int_seq_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations int_seq_ops = { + .start = int_seq_start, + .next = int_seq_next, + .stop = int_seq_stop, + .show = show_interrupts +}; + +static int interrupts_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &int_seq_ops); +} + +static const struct file_operations proc_interrupts_operations = { + .open = interrupts_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_interrupts_init(void) +{ + proc_create("interrupts", 0, NULL, &proc_interrupts_operations); + return 0; +} +module_init(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index c2370c7..59b43a0 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -27,6 +27,8 @@ #define ELF_CORE_EFLAGS 0 #endif +static struct proc_dir_entry *proc_root_kcore; + static int open_kcore(struct inode * inode, struct file * filp) { return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; @@ -34,7 +36,7 @@ static int open_kcore(struct inode * inode, struct file * filp) static ssize_t read_kcore(struct file *, char __user *, size_t, loff_t *); -const struct file_operations proc_kcore_operations = { +static const struct file_operations proc_kcore_operations = { .read = read_kcore, .open = open_kcore, }; @@ -399,3 +401,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) return acc; } + +static int __init proc_kcore_init(void) +{ + proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations); + if (proc_root_kcore) + proc_root_kcore->size = + (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; + return 0; +} +module_init(proc_kcore_init); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 9fd5df3..7ca7834 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -10,13 +10,12 @@ #include <linux/time.h> #include <linux/kernel.h> #include <linux/poll.h> +#include <linux/proc_fs.h> #include <linux/fs.h> #include <asm/uaccess.h> #include <asm/io.h> -#include "internal.h" - extern wait_queue_head_t log_wait; extern int do_syslog(int type, char __user *bug, int count); @@ -49,9 +48,16 @@ static unsigned int kmsg_poll(struct file *file, poll_table *wait) } -const struct file_operations proc_kmsg_operations = { +static const struct file_operations proc_kmsg_operations = { .read = kmsg_read, .poll = kmsg_poll, .open = kmsg_open, .release = kmsg_release, }; + +static int __init proc_kmsg_init(void) +{ + proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); + return 0; +} +module_init(proc_kmsg_init); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c new file mode 100644 index 0000000..9bca39c --- /dev/null +++ b/fs/proc/loadavg.c @@ -0,0 +1,51 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/pid_namespace.h> +#include <linux/proc_fs.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/seqlock.h> +#include <linux/time.h> + +#define LOAD_INT(x) ((x) >> FSHIFT) 
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + +static int loadavg_proc_show(struct seq_file *m, void *v) +{ + int a, b, c; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + a = avenrun[0] + (FIXED_1/200); + b = avenrun[1] + (FIXED_1/200); + c = avenrun[2] + (FIXED_1/200); + } while (read_seqretry(&xtime_lock, seq)); + + seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), + nr_running(), nr_threads, + task_active_pid_ns(current)->last_pid); + return 0; +} + +static int loadavg_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, loadavg_proc_show, NULL); +} + +static const struct file_operations loadavg_proc_fops = { + .open = loadavg_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_loadavg_init(void) +{ + proc_create("loadavg", 0, NULL, &loadavg_proc_fops); + return 0; +} +module_init(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c new file mode 100644 index 0000000..b1675c4 --- /dev/null +++ b/fs/proc/meminfo.c @@ -0,0 +1,168 @@ +#include <linux/fs.h> +#include <linux/hugetlb.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/mmzone.h> +#include <linux/proc_fs.h> +#include <linux/quicklist.h> +#include <linux/seq_file.h> +#include <linux/swap.h> +#include <linux/vmstat.h> +#include <asm/atomic.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include "internal.h" + +void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) +{ +} + +static int meminfo_proc_show(struct seq_file *m, void *v) +{ + struct sysinfo i; + unsigned long committed; + unsigned long allowed; + struct vmalloc_info vmi; + long cached; + unsigned long pages[NR_LRU_LISTS]; + int lru; + +/* + * display in kilobytes. 
+ */ +#define K(x) ((x) << (PAGE_SHIFT - 10)) + si_meminfo(&i); + si_swapinfo(&i); + committed = atomic_long_read(&vm_committed_space); + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100) + total_swap_pages; + + cached = global_page_state(NR_FILE_PAGES) - + total_swapcache_pages - i.bufferram; + if (cached < 0) + cached = 0; + + get_vmalloc_info(&vmi); + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + + /* + * Tagged format, for easy grepping and expansion. + */ + seq_printf(m, + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "Buffers: %8lu kB\n" + "Cached: %8lu kB\n" + "SwapCached: %8lu kB\n" + "Active: %8lu kB\n" + "Inactive: %8lu kB\n" + "Active(anon): %8lu kB\n" + "Inactive(anon): %8lu kB\n" + "Active(file): %8lu kB\n" + "Inactive(file): %8lu kB\n" +#ifdef CONFIG_UNEVICTABLE_LRU + "Unevictable: %8lu kB\n" + "Mlocked: %8lu kB\n" +#endif +#ifdef CONFIG_HIGHMEM + "HighTotal: %8lu kB\n" + "HighFree: %8lu kB\n" + "LowTotal: %8lu kB\n" + "LowFree: %8lu kB\n" +#endif + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n" + "Dirty: %8lu kB\n" + "Writeback: %8lu kB\n" + "AnonPages: %8lu kB\n" + "Mapped: %8lu kB\n" + "Slab: %8lu kB\n" + "SReclaimable: %8lu kB\n" + "SUnreclaim: %8lu kB\n" + "PageTables: %8lu kB\n" +#ifdef CONFIG_QUICKLIST + "Quicklists: %8lu kB\n" +#endif + "NFS_Unstable: %8lu kB\n" + "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" + "CommitLimit: %8lu kB\n" + "Committed_AS: %8lu kB\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", + K(i.totalram), + K(i.freeram), + K(i.bufferram), + K(cached), + K(total_swapcache_pages), + K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), + K(pages[LRU_ACTIVE_ANON]), + K(pages[LRU_INACTIVE_ANON]), + K(pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_FILE]), +#ifdef CONFIG_UNEVICTABLE_LRU + K(pages[LRU_UNEVICTABLE]), + 
K(global_page_state(NR_MLOCK)), +#endif +#ifdef CONFIG_HIGHMEM + K(i.totalhigh), + K(i.freehigh), + K(i.totalram-i.totalhigh), + K(i.freeram-i.freehigh), +#endif + K(i.totalswap), + K(i.freeswap), + K(global_page_state(NR_FILE_DIRTY)), + K(global_page_state(NR_WRITEBACK)), + K(global_page_state(NR_ANON_PAGES)), + K(global_page_state(NR_FILE_MAPPED)), + K(global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE)), + K(global_page_state(NR_SLAB_RECLAIMABLE)), + K(global_page_state(NR_SLAB_UNRECLAIMABLE)), + K(global_page_state(NR_PAGETABLE)), +#ifdef CONFIG_QUICKLIST + K(quicklist_total_size()), +#endif + K(global_page_state(NR_UNSTABLE_NFS)), + K(global_page_state(NR_BOUNCE)), + K(global_page_state(NR_WRITEBACK_TEMP)), + K(allowed), + K(committed), + (unsigned long)VMALLOC_TOTAL >> 10, + vmi.used >> 10, + vmi.largest_chunk >> 10 + ); + + hugetlb_report_meminfo(m); + + arch_report_meminfo(m); + + return 0; +#undef K +} + +static int meminfo_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, meminfo_proc_show, NULL); +} + +static const struct file_operations meminfo_proc_fops = { + .open = meminfo_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_meminfo_init(void) +{ + proc_create("meminfo", 0, NULL, &meminfo_proc_fops); + return 0; +} +module_init(proc_meminfo_init); diff --git a/fs/proc/page.c b/fs/proc/page.c new file mode 100644 index 0000000..767d95a --- /dev/null +++ b/fs/proc/page.c @@ -0,0 +1,147 @@ +#include <linux/bootmem.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <asm/uaccess.h> +#include "internal.h" + +#define KPMSIZE sizeof(u64) +#define KPMMASK (KPMSIZE - 1) +/* /proc/kpagecount - an array exposing page counts + * + * Each entry is a u64 representing the corresponding + * physical page 
count. + */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 pcount; + + pfn = src / KPMSIZE; + count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + ppage = NULL; + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + pfn++; + if (!ppage) + pcount = 0; + else + pcount = page_mapcount(ppage); + + if (put_user(pcount, out++)) { + ret = -EFAULT; + break; + } + + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpagecount_operations = { + .llseek = mem_lseek, + .read = kpagecount_read, +}; + +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. 
+ */ + +/* These macros are used to decouple internal flags from exported ones */ + +#define KPF_LOCKED 0 +#define KPF_ERROR 1 +#define KPF_REFERENCED 2 +#define KPF_UPTODATE 3 +#define KPF_DIRTY 4 +#define KPF_LRU 5 +#define KPF_ACTIVE 6 +#define KPF_SLAB 7 +#define KPF_WRITEBACK 8 +#define KPF_RECLAIM 9 +#define KPF_BUDDY 10 + +#define kpf_copy_bit(flags, dstpos, srcpos) ((((flags) >> (srcpos)) & 1) << (dstpos)) + +static ssize_t kpageflags_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 kflags, uflags; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + ppage = NULL; + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + pfn++; + if (!ppage) + kflags = 0; + else + kflags = ppage->flags; + + uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) | + kpf_copy_bit(kflags, KPF_ERROR, PG_error) | + kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) | + kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) | + kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) | + kpf_copy_bit(kflags, KPF_LRU, PG_lru) | + kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) | + kpf_copy_bit(kflags, KPF_SLAB, PG_slab) | + kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) | + kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) | + kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy); + + if (put_user(uflags, out++)) { + ret = -EFAULT; + break; + } + + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpageflags_operations = { + .llseek = mem_lseek, + .read = kpageflags_read, +}; + +static int __init proc_page_init(void) +{ + proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); + proc_create("kpageflags", S_IRUSR, NULL, 
&proc_kpageflags_operations); + return 0; +} +module_init(proc_page_init); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index eca471bc..d777789 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -4,6 +4,7 @@ * Copyright 1997 Paul Mackerras */ #include <linux/errno.h> +#include <linux/init.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> @@ -214,7 +215,7 @@ void proc_device_tree_add_node(struct device_node *np, /* * Called on initialization to set up the /proc/device-tree subtree */ -void proc_device_tree_init(void) +void __init proc_device_tree_init(void) { struct device_node *root; if ( !have_of ) diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c deleted file mode 100644 index 59ea42e..0000000 --- a/fs/proc/proc_misc.c +++ /dev/null @@ -1,914 +0,0 @@ -/* - * linux/fs/proc/proc_misc.c - * - * linux/fs/proc/array.c - * Copyright (C) 1992 by Linus Torvalds - * based on ideas by Darren Senn - * - * This used to be the part of array.c. See the rest of history and credits - * there. I took this into a separate file and switched the thing to generic - * proc_file_inode_operations, leaving in array.c only per-process stuff. - * Inumbers allocation made dynamic (via create_proc_entry()). AV, May 1999. - * - * Changes: - * Fulton Green : Encapsulated position metric calculations. 
- * <kernel@FultonGreen.com> - */ - -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/time.h> -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/fs.h> -#include <linux/tty.h> -#include <linux/string.h> -#include <linux/mman.h> -#include <linux/quicklist.h> -#include <linux/proc_fs.h> -#include <linux/ioport.h> -#include <linux/mm.h> -#include <linux/mmzone.h> -#include <linux/pagemap.h> -#include <linux/interrupt.h> -#include <linux/swap.h> -#include <linux/slab.h> -#include <linux/genhd.h> -#include <linux/smp.h> -#include <linux/signal.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/seq_file.h> -#include <linux/times.h> -#include <linux/profile.h> -#include <linux/utsname.h> -#include <linux/blkdev.h> -#include <linux/hugetlb.h> -#include <linux/jiffies.h> -#include <linux/vmalloc.h> -#include <linux/crash_dump.h> -#include <linux/pid_namespace.h> -#include <linux/bootmem.h> -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/io.h> -#include <asm/tlb.h> -#include <asm/div64.h> -#include "internal.h" - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) -/* - * Warning: stuff below (imported functions) assumes that its output will fit - * into one page. For some of those functions it may be wrong. Moreover, we - * have a way to deal with that gracefully. Right now I used straightforward - * wrappers, but this needs further analysis wrt potential overflows. 
- */ -extern int get_hardware_list(char *); -extern int get_stram_list(char *); -extern int get_exec_domain_list(char *); - -static int proc_calc_metrics(char *page, char **start, off_t off, - int count, int *eof, int len) -{ - if (len <= off+count) *eof = 1; - *start = page + off; - len -= off; - if (len>count) len = count; - if (len<0) len = 0; - return len; -} - -static int loadavg_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int a, b, c; - int len; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - a = avenrun[0] + (FIXED_1/200); - b = avenrun[1] + (FIXED_1/200); - c = avenrun[2] + (FIXED_1/200); - } while (read_seqretry(&xtime_lock, seq)); - - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", - LOAD_INT(a), LOAD_FRAC(a), - LOAD_INT(b), LOAD_FRAC(b), - LOAD_INT(c), LOAD_FRAC(c), - nr_running(), nr_threads, - task_active_pid_ns(current)->last_pid); - return proc_calc_metrics(page, start, off, count, eof, len); -} - -static int uptime_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct timespec uptime; - struct timespec idle; - int len; - cputime_t idletime = cputime_add(init_task.utime, init_task.stime); - - do_posix_clock_monotonic_gettime(&uptime); - monotonic_to_bootbased(&uptime); - cputime_to_timespec(idletime, &idle); - len = sprintf(page,"%lu.%02lu %lu.%02lu\n", - (unsigned long) uptime.tv_sec, - (uptime.tv_nsec / (NSEC_PER_SEC / 100)), - (unsigned long) idle.tv_sec, - (idle.tv_nsec / (NSEC_PER_SEC / 100))); - - return proc_calc_metrics(page, start, off, count, eof, len); -} - -int __attribute__((weak)) arch_report_meminfo(char *page) -{ - return 0; -} - -static int meminfo_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct sysinfo i; - int len; - unsigned long committed; - unsigned long allowed; - struct vmalloc_info vmi; - long cached; - -/* - * display in kilobytes. 
- */ -#define K(x) ((x) << (PAGE_SHIFT - 10)) - si_meminfo(&i); - si_swapinfo(&i); - committed = atomic_long_read(&vm_committed_space); - allowed = ((totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100) + total_swap_pages; - - cached = global_page_state(NR_FILE_PAGES) - - total_swapcache_pages - i.bufferram; - if (cached < 0) - cached = 0; - - get_vmalloc_info(&vmi); - - /* - * Tagged format, for easy grepping and expansion. - */ - len = sprintf(page, - "MemTotal: %8lu kB\n" - "MemFree: %8lu kB\n" - "Buffers: %8lu kB\n" - "Cached: %8lu kB\n" - "SwapCached: %8lu kB\n" - "Active: %8lu kB\n" - "Inactive: %8lu kB\n" -#ifdef CONFIG_HIGHMEM - "HighTotal: %8lu kB\n" - "HighFree: %8lu kB\n" - "LowTotal: %8lu kB\n" - "LowFree: %8lu kB\n" -#endif - "SwapTotal: %8lu kB\n" - "SwapFree: %8lu kB\n" - "Dirty: %8lu kB\n" - "Writeback: %8lu kB\n" - "AnonPages: %8lu kB\n" - "Mapped: %8lu kB\n" - "Slab: %8lu kB\n" - "SReclaimable: %8lu kB\n" - "SUnreclaim: %8lu kB\n" - "PageTables: %8lu kB\n" -#ifdef CONFIG_QUICKLIST - "Quicklists: %8lu kB\n" -#endif - "NFS_Unstable: %8lu kB\n" - "Bounce: %8lu kB\n" - "WritebackTmp: %8lu kB\n" - "CommitLimit: %8lu kB\n" - "Committed_AS: %8lu kB\n" - "VmallocTotal: %8lu kB\n" - "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n", - K(i.totalram), - K(i.freeram), - K(i.bufferram), - K(cached), - K(total_swapcache_pages), - K(global_page_state(NR_ACTIVE)), - K(global_page_state(NR_INACTIVE)), -#ifdef CONFIG_HIGHMEM - K(i.totalhigh), - K(i.freehigh), - K(i.totalram-i.totalhigh), - K(i.freeram-i.freehigh), -#endif - K(i.totalswap), - K(i.freeswap), - K(global_page_state(NR_FILE_DIRTY)), - K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), - K(global_page_state(NR_FILE_MAPPED)), - K(global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)), - K(global_page_state(NR_SLAB_RECLAIMABLE)), - K(global_page_state(NR_SLAB_UNRECLAIMABLE)), - K(global_page_state(NR_PAGETABLE)), -#ifdef 
CONFIG_QUICKLIST - K(quicklist_total_size()), -#endif - K(global_page_state(NR_UNSTABLE_NFS)), - K(global_page_state(NR_BOUNCE)), - K(global_page_state(NR_WRITEBACK_TEMP)), - K(allowed), - K(committed), - (unsigned long)VMALLOC_TOTAL >> 10, - vmi.used >> 10, - vmi.largest_chunk >> 10 - ); - - len += hugetlb_report_meminfo(page + len); - - len += arch_report_meminfo(page + len); - - return proc_calc_metrics(page, start, off, count, eof, len); -#undef K -} - -static int fragmentation_open(struct inode *inode, struct file *file) -{ - (void)inode; - return seq_open(file, &fragmentation_op); -} - -static const struct file_operations fragmentation_file_operations = { - .open = fragmentation_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int pagetypeinfo_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &pagetypeinfo_op); -} - -static const struct file_operations pagetypeinfo_file_ops = { - .open = pagetypeinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int zoneinfo_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &zoneinfo_op); -} - -static const struct file_operations proc_zoneinfo_file_operations = { - .open = zoneinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int version_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - - len = snprintf(page, PAGE_SIZE, linux_proc_banner, - utsname()->sysname, - utsname()->release, - utsname()->version); - return proc_calc_metrics(page, start, off, count, eof, len); -} - -extern const struct seq_operations cpuinfo_op; -static int cpuinfo_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &cpuinfo_op); -} - -static const struct file_operations proc_cpuinfo_operations = { - .open = cpuinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int 
devinfo_show(struct seq_file *f, void *v) -{ - int i = *(loff_t *) v; - - if (i < CHRDEV_MAJOR_HASH_SIZE) { - if (i == 0) - seq_printf(f, "Character devices:\n"); - chrdev_show(f, i); - } -#ifdef CONFIG_BLOCK - else { - i -= CHRDEV_MAJOR_HASH_SIZE; - if (i == 0) - seq_printf(f, "\nBlock devices:\n"); - blkdev_show(f, i); - } -#endif - return 0; -} - -static void *devinfo_start(struct seq_file *f, loff_t *pos) -{ - if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) - return pos; - return NULL; -} - -static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) -{ - (*pos)++; - if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) - return NULL; - return pos; -} - -static void devinfo_stop(struct seq_file *f, void *v) -{ - /* Nothing to do */ -} - -static const struct seq_operations devinfo_ops = { - .start = devinfo_start, - .next = devinfo_next, - .stop = devinfo_stop, - .show = devinfo_show -}; - -static int devinfo_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &devinfo_ops); -} - -static const struct file_operations proc_devinfo_operations = { - .open = devinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int vmstat_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &vmstat_op); -} -static const struct file_operations proc_vmstat_file_operations = { - .open = vmstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#ifdef CONFIG_PROC_HARDWARE -static int hardware_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len = get_hardware_list(page); - return proc_calc_metrics(page, start, off, count, eof, len); -} -#endif - -#ifdef CONFIG_STRAM_PROC -static int stram_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len = get_stram_list(page); - return proc_calc_metrics(page, start, off, count, eof, len); -} -#endif - -#ifdef CONFIG_BLOCK 
-static int partitions_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &partitions_op); -} -static const struct file_operations proc_partitions_operations = { - .open = partitions_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int diskstats_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &diskstats_op); -} -static const struct file_operations proc_diskstats_operations = { - .open = diskstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - -#ifdef CONFIG_MODULES -extern const struct seq_operations modules_op; -static int modules_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &modules_op); -} -static const struct file_operations proc_modules_operations = { - .open = modules_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - -#ifdef CONFIG_SLABINFO -static int slabinfo_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slabinfo_op); -} -static const struct file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - .read = seq_read, - .write = slabinfo_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -#ifdef CONFIG_DEBUG_SLAB_LEAK -extern const struct seq_operations slabstats_op; -static int slabstats_open(struct inode *inode, struct file *file) -{ - unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); - int ret = -ENOMEM; - if (n) { - ret = seq_open(file, &slabstats_op); - if (!ret) { - struct seq_file *m = file->private_data; - *n = PAGE_SIZE / (2 * sizeof(unsigned long)); - m->private = n; - n = NULL; - } - kfree(n); - } - return ret; -} - -static const struct file_operations proc_slabstats_operations = { - .open = slabstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; -#endif -#endif - -#ifdef CONFIG_MMU -static int vmalloc_open(struct inode *inode, struct file *file) -{ - unsigned int 
*ptr = NULL; - int ret; - - if (NUMA_BUILD) - ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); - ret = seq_open(file, &vmalloc_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = ptr; - } else - kfree(ptr); - return ret; -} - -static const struct file_operations proc_vmalloc_operations = { - .open = vmalloc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; -#endif - -#ifndef arch_irq_stat_cpu -#define arch_irq_stat_cpu(cpu) 0 -#endif -#ifndef arch_irq_stat -#define arch_irq_stat() 0 -#endif - -static int show_stat(struct seq_file *p, void *v) -{ - int i; - unsigned long jif; - cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; - cputime64_t guest; - u64 sum = 0; - struct timespec boottime; - unsigned int *per_irq_sum; - - per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL); - if (!per_irq_sum) - return -ENOMEM; - - user = nice = system = idle = iowait = - irq = softirq = steal = cputime64_zero; - guest = cputime64_zero; - getboottime(&boottime); - jif = boottime.tv_sec; - - for_each_possible_cpu(i) { - int j; - - user = cputime64_add(user, kstat_cpu(i).cpustat.user); - nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); - system = cputime64_add(system, kstat_cpu(i).cpustat.system); - idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); - iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); - irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); - softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); - steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); - guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - for (j = 0; j < NR_IRQS; j++) { - unsigned int temp = kstat_cpu(i).irqs[j]; - sum += temp; - per_irq_sum[j] += temp; - } - sum += arch_irq_stat_cpu(i); - } - sum += arch_irq_stat(); - - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", - (unsigned long long)cputime64_to_clock_t(user), - (unsigned long 
long)cputime64_to_clock_t(nice), - (unsigned long long)cputime64_to_clock_t(system), - (unsigned long long)cputime64_to_clock_t(idle), - (unsigned long long)cputime64_to_clock_t(iowait), - (unsigned long long)cputime64_to_clock_t(irq), - (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest)); - for_each_online_cpu(i) { - - /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kstat_cpu(i).cpustat.user; - nice = kstat_cpu(i).cpustat.nice; - system = kstat_cpu(i).cpustat.system; - idle = kstat_cpu(i).cpustat.idle; - iowait = kstat_cpu(i).cpustat.iowait; - irq = kstat_cpu(i).cpustat.irq; - softirq = kstat_cpu(i).cpustat.softirq; - steal = kstat_cpu(i).cpustat.steal; - guest = kstat_cpu(i).cpustat.guest; - seq_printf(p, - "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", - i, - (unsigned long long)cputime64_to_clock_t(user), - (unsigned long long)cputime64_to_clock_t(nice), - (unsigned long long)cputime64_to_clock_t(system), - (unsigned long long)cputime64_to_clock_t(idle), - (unsigned long long)cputime64_to_clock_t(iowait), - (unsigned long long)cputime64_to_clock_t(irq), - (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest)); - } - seq_printf(p, "intr %llu", (unsigned long long)sum); - - for (i = 0; i < NR_IRQS; i++) - seq_printf(p, " %u", per_irq_sum[i]); - - seq_printf(p, - "\nctxt %llu\n" - "btime %lu\n" - "processes %lu\n" - "procs_running %lu\n" - "procs_blocked %lu\n", - nr_context_switches(), - (unsigned long)jif, - total_forks, - nr_running(), - nr_iowait()); - - kfree(per_irq_sum); - return 0; -} - -static int stat_open(struct inode *inode, struct file *file) -{ - unsigned size = 4096 * (1 + num_possible_cpus() / 32); - char *buf; - struct seq_file *m; - int res; - - /* don't ask for more than the kmalloc() max size, currently 128 KB */ - if 
(size > 128 * 1024) - size = 128 * 1024; - buf = kmalloc(size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - res = single_open(file, show_stat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; -} -static const struct file_operations proc_stat_operations = { - .open = stat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* - * /proc/interrupts - */ -static void *int_seq_start(struct seq_file *f, loff_t *pos) -{ - return (*pos <= NR_IRQS) ? pos : NULL; -} - -static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) -{ - (*pos)++; - if (*pos > NR_IRQS) - return NULL; - return pos; -} - -static void int_seq_stop(struct seq_file *f, void *v) -{ - /* Nothing to do */ -} - - -static const struct seq_operations int_seq_ops = { - .start = int_seq_start, - .next = int_seq_next, - .stop = int_seq_stop, - .show = show_interrupts -}; - -static int interrupts_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &int_seq_ops); -} - -static const struct file_operations proc_interrupts_operations = { - .open = interrupts_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int filesystems_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len = get_filesystem_list(page); - return proc_calc_metrics(page, start, off, count, eof, len); -} - -static int cmdline_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - - len = sprintf(page, "%s\n", saved_command_line); - return proc_calc_metrics(page, start, off, count, eof, len); -} - -#ifdef CONFIG_FILE_LOCKING -static int locks_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &locks_seq_operations); -} - -static const struct file_operations proc_locks_operations = { - .open = locks_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; 
-#endif /* CONFIG_FILE_LOCKING */ - -static int execdomains_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len = get_exec_domain_list(page); - return proc_calc_metrics(page, start, off, count, eof, len); -} - -#ifdef CONFIG_PROC_PAGE_MONITOR -#define KPMSIZE sizeof(u64) -#define KPMMASK (KPMSIZE - 1) -/* /proc/kpagecount - an array exposing page counts - * - * Each entry is a u64 representing the corresponding - * physical page count. - */ -static ssize_t kpagecount_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - u64 __user *out = (u64 __user *)buf; - struct page *ppage; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - u64 pcount; - - pfn = src / KPMSIZE; - count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - - while (count > 0) { - ppage = NULL; - if (pfn_valid(pfn)) - ppage = pfn_to_page(pfn); - pfn++; - if (!ppage) - pcount = 0; - else - pcount = page_mapcount(ppage); - - if (put_user(pcount, out++)) { - ret = -EFAULT; - break; - } - - count -= KPMSIZE; - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; -} - -static struct file_operations proc_kpagecount_operations = { - .llseek = mem_lseek, - .read = kpagecount_read, -}; - -/* /proc/kpageflags - an array exposing page flags - * - * Each entry is a u64 representing the corresponding - * physical page flags. 
- */ - -/* These macros are used to decouple internal flags from exported ones */ - -#define KPF_LOCKED 0 -#define KPF_ERROR 1 -#define KPF_REFERENCED 2 -#define KPF_UPTODATE 3 -#define KPF_DIRTY 4 -#define KPF_LRU 5 -#define KPF_ACTIVE 6 -#define KPF_SLAB 7 -#define KPF_WRITEBACK 8 -#define KPF_RECLAIM 9 -#define KPF_BUDDY 10 - -#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos) - -static ssize_t kpageflags_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - u64 __user *out = (u64 __user *)buf; - struct page *ppage; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - u64 kflags, uflags; - - pfn = src / KPMSIZE; - count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - - while (count > 0) { - ppage = NULL; - if (pfn_valid(pfn)) - ppage = pfn_to_page(pfn); - pfn++; - if (!ppage) - kflags = 0; - else - kflags = ppage->flags; - - uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) | - kpf_copy_bit(kflags, KPF_ERROR, PG_error) | - kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) | - kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) | - kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) | - kpf_copy_bit(kflags, KPF_LRU, PG_lru) | - kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) | - kpf_copy_bit(kflags, KPF_SLAB, PG_slab) | - kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) | - kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) | - kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy); - - if (put_user(uflags, out++)) { - ret = -EFAULT; - break; - } - - count -= KPMSIZE; - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; -} - -static struct file_operations proc_kpageflags_operations = { - .llseek = mem_lseek, - .read = kpageflags_read, -}; -#endif /* CONFIG_PROC_PAGE_MONITOR */ - -struct proc_dir_entry *proc_root_kcore; - -void __init proc_misc_init(void) -{ - static struct { - char *name; - int 
(*read_proc)(char*,char**,off_t,int,int*,void*); - } *p, simple_ones[] = { - {"loadavg", loadavg_read_proc}, - {"uptime", uptime_read_proc}, - {"meminfo", meminfo_read_proc}, - {"version", version_read_proc}, -#ifdef CONFIG_PROC_HARDWARE - {"hardware", hardware_read_proc}, -#endif -#ifdef CONFIG_STRAM_PROC - {"stram", stram_read_proc}, -#endif - {"filesystems", filesystems_read_proc}, - {"cmdline", cmdline_read_proc}, - {"execdomains", execdomains_read_proc}, - {NULL,} - }; - for (p = simple_ones; p->name; p++) - create_proc_read_entry(p->name, 0, NULL, p->read_proc, NULL); - - proc_symlink("mounts", NULL, "self/mounts"); - - /* And now for trickier ones */ -#ifdef CONFIG_PRINTK - proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); -#endif -#ifdef CONFIG_FILE_LOCKING - proc_create("locks", 0, NULL, &proc_locks_operations); -#endif - proc_create("devices", 0, NULL, &proc_devinfo_operations); - proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); -#ifdef CONFIG_BLOCK - proc_create("partitions", 0, NULL, &proc_partitions_operations); -#endif - proc_create("stat", 0, NULL, &proc_stat_operations); - proc_create("interrupts", 0, NULL, &proc_interrupts_operations); -#ifdef CONFIG_SLABINFO - proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); -#ifdef CONFIG_DEBUG_SLAB_LEAK - proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); -#endif -#endif -#ifdef CONFIG_MMU - proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); -#endif - proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); - proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); - proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); - proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); -#ifdef CONFIG_BLOCK - proc_create("diskstats", 0, NULL, &proc_diskstats_operations); -#endif -#ifdef CONFIG_MODULES - proc_create("modules", 0, NULL, &proc_modules_operations); -#endif -#ifdef 
CONFIG_SCHEDSTATS - proc_create("schedstat", 0, NULL, &proc_schedstat_operations); -#endif -#ifdef CONFIG_PROC_KCORE - proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations); - if (proc_root_kcore) - proc_root_kcore->size = - (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; -#endif -#ifdef CONFIG_PROC_PAGE_MONITOR - proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); - proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); -#endif -#ifdef CONFIG_PROC_VMCORE - proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); -#endif -} diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 945a810..94fcfff 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1,7 +1,7 @@ /* * /proc/sys support */ - +#include <linux/init.h> #include <linux/sysctl.h> #include <linux/proc_fs.h> #include <linux/security.h> @@ -298,13 +298,19 @@ static int proc_sys_permission(struct inode *inode, int mask) * sysctl entries that are not writeable, * are _NOT_ writeable, capabilities or not. */ - struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; + struct ctl_table_header *head; + struct ctl_table *table; int error; + /* Executable files are not allowed under /proc/sys/ */ + if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) + return -EACCES; + + head = grab_header(inode); if (IS_ERR(head)) return PTR_ERR(head); + table = PROC_I(inode)->sysctl_entry; if (!table) /* global root - r-xr-xr-x */ error = mask & MAY_WRITE ? 
-EACCES : 0; else /* Use the permissions on the sysctl table entry */ @@ -353,6 +359,7 @@ static const struct file_operations proc_sys_file_operations = { static const struct file_operations proc_sys_dir_file_operations = { .readdir = proc_sys_readdir, + .llseek = generic_file_llseek, }; static const struct inode_operations proc_sys_inode_operations = { @@ -395,7 +402,7 @@ static struct dentry_operations proc_sys_dentry_operations = { .d_compare = proc_sys_compare, }; -int proc_sys_init(void) +int __init proc_sys_init(void) { struct proc_dir_entry *proc_sys_root; diff --git a/fs/proc/root.c b/fs/proc/root.c index 9511753..7761602 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -104,9 +104,9 @@ static struct file_system_type proc_fs_type = { void __init proc_root_init(void) { - int err = proc_init_inodecache(); - if (err) - return; + int err; + + proc_init_inodecache(); err = register_filesystem(&proc_fs_type); if (err) return; @@ -117,7 +117,7 @@ void __init proc_root_init(void) return; } - proc_misc_init(); + proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); diff --git a/fs/proc/stat.c b/fs/proc/stat.c new file mode 100644 index 0000000..81904f0 --- /dev/null +++ b/fs/proc/stat.c @@ -0,0 +1,153 @@ +#include <linux/cpumask.h> +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/proc_fs.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <asm/cputime.h> + +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif +#ifndef arch_irq_stat +#define arch_irq_stat() 0 +#endif + +static int show_stat(struct seq_file *p, void *v) +{ + int i, j; + unsigned long jif; + cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; + cputime64_t guest; + u64 sum = 0; + struct timespec boottime; + unsigned int per_irq_sum; + + user = nice = system = idle = iowait = + irq = softirq = 
steal = cputime64_zero; + guest = cputime64_zero; + getboottime(&boottime); + jif = boottime.tv_sec; + + for_each_possible_cpu(i) { + user = cputime64_add(user, kstat_cpu(i).cpustat.user); + nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); + system = cputime64_add(system, kstat_cpu(i).cpustat.system); + idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); + iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); + irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); + softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); + steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); + guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); + + for_each_irq_nr(j) + sum += kstat_irqs_cpu(j, i); + + sum += arch_irq_stat_cpu(i); + } + sum += arch_irq_stat(); + + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cputime64_to_clock_t(idle), + (unsigned long long)cputime64_to_clock_t(iowait), + (unsigned long long)cputime64_to_clock_t(irq), + (unsigned long long)cputime64_to_clock_t(softirq), + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(guest)); + for_each_online_cpu(i) { + + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ + user = kstat_cpu(i).cpustat.user; + nice = kstat_cpu(i).cpustat.nice; + system = kstat_cpu(i).cpustat.system; + idle = kstat_cpu(i).cpustat.idle; + iowait = kstat_cpu(i).cpustat.iowait; + irq = kstat_cpu(i).cpustat.irq; + softirq = kstat_cpu(i).cpustat.softirq; + steal = kstat_cpu(i).cpustat.steal; + guest = kstat_cpu(i).cpustat.guest; + seq_printf(p, + "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + i, + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long 
long)cputime64_to_clock_t(idle), + (unsigned long long)cputime64_to_clock_t(iowait), + (unsigned long long)cputime64_to_clock_t(irq), + (unsigned long long)cputime64_to_clock_t(softirq), + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(guest)); + } + seq_printf(p, "intr %llu", (unsigned long long)sum); + + /* sum again ? it could be updated? */ + for_each_irq_nr(j) { + per_irq_sum = 0; + + for_each_possible_cpu(i) + per_irq_sum += kstat_irqs_cpu(j, i); + + seq_printf(p, " %u", per_irq_sum); + } + + seq_printf(p, + "\nctxt %llu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + nr_context_switches(), + (unsigned long)jif, + total_forks, + nr_running(), + nr_iowait()); + + return 0; +} + +static int stat_open(struct inode *inode, struct file *file) +{ + unsigned size = 4096 * (1 + num_possible_cpus() / 32); + char *buf; + struct seq_file *m; + int res; + + /* don't ask for more than the kmalloc() max size, currently 128 KB */ + if (size > 128 * 1024) + size = 128 * 1024; + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + res = single_open(file, show_stat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +static const struct file_operations proc_stat_operations = { + .open = stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_stat_init(void) +{ + proc_create("stat", 0, NULL, &proc_stat_operations); + return 0; +} +module_init(proc_stat_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4806830..b770c09 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -198,11 +198,8 @@ static int do_maps_open(struct inode *inode, struct file *file, return ret; } -static int show_map(struct seq_file *m, void *v) +static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { - struct proc_maps_private *priv = 
m->private; - struct task_struct *task = priv->task; - struct vm_area_struct *vma = v; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; int flags = vma->vm_flags; @@ -254,6 +251,15 @@ static int show_map(struct seq_file *m, void *v) } } seq_putc(m, '\n'); +} + +static int show_map(struct seq_file *m, void *v) +{ + struct vm_area_struct *vma = v; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; + + show_map_vma(m, vma); if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task))? vma->vm_start: 0; @@ -364,9 +370,10 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, static int show_smap(struct seq_file *m, void *v) { + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; struct vm_area_struct *vma = v; struct mem_size_stats mss; - int ret; struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range, .mm = vma->vm_mm, @@ -378,9 +385,7 @@ static int show_smap(struct seq_file *m, void *v) if (vma->vm_mm && !is_vm_hugetlb_page(vma)) walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); - ret = show_map(m, v); - if (ret) - return ret; + show_map_vma(m, vma); seq_printf(m, "Size: %8lu kB\n" @@ -402,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v) mss.referenced >> 10, mss.swap >> 10); - return ret; + if (m->count < m->size) /* vma is copied successfully */ + m->version = (vma != get_gate_vma(task)) ? 
vma->vm_start : 0; + return 0; } static const struct seq_operations proc_pid_smaps_op = { diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c new file mode 100644 index 0000000..0c10a0b --- /dev/null +++ b/fs/proc/uptime.c @@ -0,0 +1,43 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/time.h> +#include <asm/cputime.h> + +static int uptime_proc_show(struct seq_file *m, void *v) +{ + struct timespec uptime; + struct timespec idle; + cputime_t idletime = cputime_add(init_task.utime, init_task.stime); + + do_posix_clock_monotonic_gettime(&uptime); + monotonic_to_bootbased(&uptime); + cputime_to_timespec(idletime, &idle); + seq_printf(m, "%lu.%02lu %lu.%02lu\n", + (unsigned long) uptime.tv_sec, + (uptime.tv_nsec / (NSEC_PER_SEC / 100)), + (unsigned long) idle.tv_sec, + (idle.tv_nsec / (NSEC_PER_SEC / 100))); + return 0; +} + +static int uptime_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, uptime_proc_show, NULL); +} + +static const struct file_operations uptime_proc_fops = { + .open = uptime_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_uptime_init(void) +{ + proc_create("uptime", 0, NULL, &uptime_proc_fops); + return 0; +} +module_init(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c new file mode 100644 index 0000000..76817a6 --- /dev/null +++ b/fs/proc/version.c @@ -0,0 +1,34 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/utsname.h> + +static int version_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, linux_proc_banner, + utsname()->sysname, + utsname()->release, + utsname()->version); + return 0; +} + +static int version_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, version_proc_show, NULL); +} + 
+static const struct file_operations version_proc_fops = { + .open = version_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_version_init(void) +{ + proc_create("version", 0, NULL, &version_proc_fops); + return 0; +} +module_init(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 841368b..03ec595 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -32,10 +32,7 @@ static size_t elfcorebuf_sz; /* Total size of vmcore file. */ static u64 vmcore_size; -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - -struct proc_dir_entry *proc_vmcore = NULL; +static struct proc_dir_entry *proc_vmcore = NULL; /* Reads a page from the oldmem device from given offset. */ static ssize_t read_from_oldmem(char *buf, size_t count, @@ -165,7 +162,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } -const struct file_operations proc_vmcore_operations = { +static const struct file_operations proc_vmcore_operations = { .read = read_vmcore, }; @@ -647,7 +644,7 @@ static int __init vmcore_init(void) int rc = 0; /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ - if (!(elfcorehdr_addr < ELFCORE_ADDR_MAX)) + if (!(is_vmcore_usable())) return rc; rc = parse_crash_elf_headers(); if (rc) { @@ -655,7 +652,7 @@ static int __init vmcore_init(void) return rc; } - /* Initialize /proc/vmcore size if proc is already up. 
*/ + proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); if (proc_vmcore) proc_vmcore->size = vmcore_size; return 0; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 5145cb9..76acdbc 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -112,12 +112,12 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) goto add_error; if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); + __pagevec_lru_add_file(&lru_pvec); unlock_page(page); } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); return 0; fsize_exceeded: diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index b131234..f031d1c 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -61,6 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { default: diff --git a/fs/read_write.c b/fs/read_write.c index 9ba495d..969a6d9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -31,39 +31,61 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); +/** + * generic_file_llseek_unlocked - lockless generic llseek implementation + * @file: file structure to seek on + * @offset: file offset to seek to + * @origin: type of seek + * + * Updates the file offset to the value specified by @offset and @origin. + * Locking must be provided by the caller. 
+ */ loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) { - loff_t retval; struct inode *inode = file->f_mapping->host; switch (origin) { - case SEEK_END: - offset += inode->i_size; - break; - case SEEK_CUR: - offset += file->f_pos; + case SEEK_END: + offset += inode->i_size; + break; + case SEEK_CUR: + offset += file->f_pos; + break; } - retval = -EINVAL; - if (offset>=0 && offset<=inode->i_sb->s_maxbytes) { - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - retval = offset; + + if (offset < 0 || offset > inode->i_sb->s_maxbytes) + return -EINVAL; + + /* Special lock needed here? */ + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; } - return retval; + + return offset; } EXPORT_SYMBOL(generic_file_llseek_unlocked); +/** + * generic_file_llseek - generic llseek implementation for regular files + * @file: file structure to seek on + * @offset: file offset to seek to + * @origin: type of seek + * + * This is a generic implementation of ->llseek useable for all normal local + * filesystems. It just updates the file offset to the value specified by + * @offset and @origin under i_mutex. 
+ */ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) { - loff_t n; + loff_t rval; + mutex_lock(&file->f_dentry->d_inode->i_mutex); - n = generic_file_llseek_unlocked(file, offset, origin); + rval = generic_file_llseek_unlocked(file, offset, origin); mutex_unlock(&file->f_dentry->d_inode->i_mutex); - return n; + + return rval; } EXPORT_SYMBOL(generic_file_llseek); diff --git a/fs/readdir.c b/fs/readdir.c index 93a7559..b318d9b 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -117,7 +117,7 @@ asmlinkage long old_readdir(unsigned int fd, struct old_linux_dirent __user * di buf.dirent = dirent; error = vfs_readdir(file, fillonedir, &buf); - if (error >= 0) + if (buf.result) error = buf.result; fput(file); @@ -209,9 +209,8 @@ asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user * diren buf.error = 0; error = vfs_readdir(file, filldir, &buf); - if (error < 0) - goto out_putf; - error = buf.error; + if (error >= 0) + error = buf.error; lastdirent = buf.previous; if (lastdirent) { if (put_user(file->f_pos, &lastdirent->d_off)) @@ -219,8 +218,6 @@ asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user * diren else error = count - buf.count; } - -out_putf: fput(file); out: return error; @@ -293,19 +290,16 @@ asmlinkage long sys_getdents64(unsigned int fd, struct linux_dirent64 __user * d buf.error = 0; error = vfs_readdir(file, filldir64, &buf); - if (error < 0) - goto out_putf; - error = buf.error; + if (error >= 0) + error = buf.error; lastdirent = buf.previous; if (lastdirent) { typeof(lastdirent->d_off) d_off = file->f_pos; - error = -EFAULT; if (__put_user(d_off, &lastdirent->d_off)) - goto out_putf; - error = count - buf.count; + error = -EFAULT; + else + error = count - buf.count; } - -out_putf: fput(file); out: return error; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index a804903..3340841 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -296,6 +296,7 @@ const struct file_operations 
reiserfs_file_operations = { .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, + .llseek = generic_file_llseek, }; const struct inode_operations reiserfs_file_inode_operations = { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 5699171..6c4c2c6 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1522,7 +1522,6 @@ static struct dentry *reiserfs_get_dentry(struct super_block *sb, { struct cpu_key key; - struct dentry *result; struct inode *inode; key.on_disk_key.k_objectid = objectid; @@ -1535,16 +1534,8 @@ static struct dentry *reiserfs_get_dentry(struct super_block *sb, inode = NULL; } reiserfs_write_unlock(sb); - if (!inode) - inode = ERR_PTR(-ESTALE); - if (IS_ERR(inode)) - return ERR_CAST(inode); - result = d_alloc_anon(inode); - if (!result) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - return result; + + return d_obtain_alias(inode); } struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index c21df71..9643c3b 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2575,7 +2575,7 @@ static int release_journal_dev(struct super_block *super, if (journal->j_dev_bd != NULL) { if (journal->j_dev_bd->bd_dev != super->s_dev) bd_release(journal->j_dev_bd); - result = blkdev_put(journal->j_dev_bd); + result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); journal->j_dev_bd = NULL; } @@ -2593,7 +2593,7 @@ static int journal_init_dev(struct super_block *super, { int result; dev_t jdev; - int blkdev_mode = FMODE_READ | FMODE_WRITE; + fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE; char b[BDEVNAME_SIZE]; result = 0; @@ -2608,6 +2608,7 @@ static int journal_init_dev(struct super_block *super, /* there is no "jdev" option and journal is on separate device */ if ((!jdev_name || !jdev_name[0])) { journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode); + journal->j_dev_mode = 
blkdev_mode; if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; @@ -2618,7 +2619,7 @@ static int journal_init_dev(struct super_block *super, } else if (jdev != super->s_dev) { result = bd_claim(journal->j_dev_bd, journal); if (result) { - blkdev_put(journal->j_dev_bd); + blkdev_put(journal->j_dev_bd, blkdev_mode); return result; } @@ -2628,7 +2629,9 @@ static int journal_init_dev(struct super_block *super, return 0; } - journal->j_dev_bd = open_bdev_excl(jdev_name, 0, journal); + journal->j_dev_mode = blkdev_mode; + journal->j_dev_bd = open_bdev_exclusive(jdev_name, + blkdev_mode, journal); if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index c1add28..f89ebb9 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -383,7 +383,6 @@ struct dentry *reiserfs_get_parent(struct dentry *child) struct inode *inode = NULL; struct reiserfs_dir_entry de; INITIALIZE_PATH(path_to_entry); - struct dentry *parent; struct inode *dir = child->d_inode; if (dir->i_nlink == 0) { @@ -401,15 +400,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child) inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); reiserfs_write_unlock(dir->i_sb); - if (!inode || IS_ERR(inode)) { - return ERR_PTR(-EACCES); - } - parent = d_alloc_anon(inode); - if (!parent) { - iput(inode); - parent = ERR_PTR(-ENOMEM); - } - return parent; + return d_obtain_alias(inode); } /* add entry to the directory (entry can be hidden). 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index d318c7e..663a91f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2058,10 +2058,10 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, - char *path, int remount) + char *name, int remount) { int err; - struct nameidata nd; + struct path path; struct inode *inode; struct reiserfs_transaction_handle th; @@ -2069,16 +2069,16 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, return -EINVAL; /* No more checks needed? Path and format_id are bogus anyway... */ if (remount) - return vfs_quota_on(sb, type, format_id, path, 1); - err = path_lookup(path, LOOKUP_FOLLOW, &nd); + return vfs_quota_on(sb, type, format_id, name, 1); + err = kern_path(name, LOOKUP_FOLLOW, &path); if (err) return err; /* Quotafile not on the same filesystem? */ - if (nd.path.mnt->mnt_sb != sb) { + if (path.mnt->mnt_sb != sb) { err = -EXDEV; goto out; } - inode = nd.path.dentry->d_inode; + inode = path.dentry->d_inode; /* We must not pack tails for quota files on reiserfs for quota IO to work */ if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { err = reiserfs_unpack(inode, NULL); @@ -2094,7 +2094,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, /* Journaling quota? */ if (REISERFS_SB(sb)->s_qf_names[type]) { /* Quotafile not of fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) + if (path.dentry->d_parent != sb->s_root) reiserfs_warning(sb, "reiserfs: Quota file not on filesystem root. 
" "Journalled quota will not work."); @@ -2113,9 +2113,9 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (err) goto out; } - err = vfs_quota_on_path(sb, type, format_id, &nd.path); + err = vfs_quota_on_path(sb, type, format_id, &path); out: - path_put(&nd.path); + path_put(&path); return err; } diff --git a/fs/select.c b/fs/select.c index da0e882..448e440 100644 --- a/fs/select.c +++ b/fs/select.c @@ -24,9 +24,64 @@ #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> +#include <linux/hrtimer.h> #include <asm/uaccess.h> + +/* + * Estimate expected accuracy in ns from a timeval. + * + * After quite a bit of churning around, we've settled on + * a simple thing of taking 0.1% of the timeout as the + * slack, with a cap of 100 msec. + * "nice" tasks get a 0.5% slack instead. + * + * Consider this comment an open invitation to come up with even + * better solutions.. + */ + +static long __estimate_accuracy(struct timespec *tv) +{ + long slack; + int divfactor = 1000; + + if (task_nice(current) > 0) + divfactor = divfactor / 5; + + slack = tv->tv_nsec / divfactor; + slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); + + if (slack > 100 * NSEC_PER_MSEC) + slack = 100 * NSEC_PER_MSEC; + + if (slack < 0) + slack = 0; + return slack; +} + +static long estimate_accuracy(struct timespec *tv) +{ + unsigned long ret; + struct timespec now; + + /* + * Realtime tasks get a slack of 0 for obvious reasons. 
+ */ + + if (rt_task(current)) + return 0; + + ktime_get_ts(&now); + now = timespec_sub(*tv, now); + ret = __estimate_accuracy(&now); + if (ret < current->timer_slack_ns) + return current->timer_slack_ns; + return ret; +} + + + struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; @@ -130,6 +185,79 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, add_wait_queue(wait_address, &entry->wait); } +/** + * poll_select_set_timeout - helper function to setup the timeout value + * @to: pointer to timespec variable for the final timeout + * @sec: seconds (from user space) + * @nsec: nanoseconds (from user space) + * + * Note, we do not use a timespec for the user space value here, That + * way we can use the function for timeval and compat interfaces as well. + * + * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. + */ +int poll_select_set_timeout(struct timespec *to, long sec, long nsec) +{ + struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec}; + + if (!timespec_valid(&ts)) + return -EINVAL; + + /* Optimize for the zero timeout value here */ + if (!sec && !nsec) { + to->tv_sec = to->tv_nsec = 0; + } else { + ktime_get_ts(to); + *to = timespec_add_safe(*to, ts); + } + return 0; +} + +static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec rts; + struct timeval rtv; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&rts); + rts = timespec_sub(*end_time, rts); + if (rts.tv_sec < 0) + rts.tv_sec = rts.tv_nsec = 0; + + if (timeval) { + rtv.tv_sec = rts.tv_sec; + rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + + } else if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + + /* + * If an application puts its timeval in 
read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. + */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) @@ -182,11 +310,13 @@ get_max: #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) #define POLLEX_SET (POLLPRI) -int do_select(int n, fd_set_bits *fds, s64 *timeout) +int do_select(int n, fd_set_bits *fds, struct timespec *end_time) { + ktime_t expire, *to = NULL; struct poll_wqueues table; poll_table *wait; - int retval, i; + int retval, i, timed_out = 0; + unsigned long slack = 0; rcu_read_lock(); retval = max_select_fd(n, fds); @@ -198,12 +328,17 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) poll_initwait(&table); wait = &table.pt; - if (!*timeout) + if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { wait = NULL; + timed_out = 1; + } + + if (end_time && !timed_out) + slack = estimate_accuracy(end_time); + retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; - long __timeout; set_current_state(TASK_INTERRUPTIBLE); @@ -259,27 +394,25 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) cond_resched(); } wait = NULL; - if (retval || !*timeout || signal_pending(current)) + if (retval || timed_out || signal_pending(current)) break; if (table.error) { retval = table.error; break; } - if (*timeout < 0) { - /* Wait indefinitely */ - __timeout = MAX_SCHEDULE_TIMEOUT; - } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) { - /* Wait for longer than MAX_SCHEDULE_TIMEOUT. 
Do it in a loop */ - __timeout = MAX_SCHEDULE_TIMEOUT - 1; - *timeout -= __timeout; - } else { - __timeout = *timeout; - *timeout = 0; + /* + * If this is the first loop and we have a timeout + * given, then we convert to ktime_t and set the to + * pointer to the expiry value. + */ + if (end_time && !to) { + expire = timespec_to_ktime(*end_time); + to = &expire; } - __timeout = schedule_timeout(__timeout); - if (*timeout >= 0) - *timeout += __timeout; + + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + timed_out = 1; } __set_current_state(TASK_RUNNING); @@ -300,7 +433,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, s64 *timeout) + fd_set __user *exp, struct timespec *end_time) { fd_set_bits fds; void *bits; @@ -351,7 +484,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); - ret = do_select(n, &fds, timeout); + ret = do_select(n, &fds, end_time); if (ret < 0) goto out; @@ -377,7 +510,7 @@ out_nofds: asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp) { - s64 timeout = -1; + struct timespec end_time, *to = NULL; struct timeval tv; int ret; @@ -385,43 +518,14 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; - if (tv.tv_sec < 0 || tv.tv_usec < 0) + to = &end_time; + if (poll_select_set_timeout(to, tv.tv_sec, + tv.tv_usec * NSEC_PER_USEC)) return -EINVAL; - - /* Cast to u64 to make GCC stop complaining */ - if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS) - timeout = -1; /* infinite */ - else { - timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ); - timeout += tv.tv_sec * HZ; - } } - ret = core_sys_select(n, inp, outp, exp, &timeout); - - if (tvp) { - struct timeval rtv; - - if 
(current->personality & STICKY_TIMEOUTS) - goto sticky; - rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); - rtv.tv_sec = timeout; - if (timeval_compare(&rtv, &tv) >= 0) - rtv = tv; - if (copy_to_user(tvp, &rtv, sizeof(rtv))) { -sticky: - /* - * If an application puts its timeval in read-only - * memory, we don't want the Linux-specific update to - * the timeval to cause a fault after the select has - * completed successfully. However, because we're not - * updating the timeval, we can't restart the system - * call. - */ - if (ret == -ERESTARTNOHAND) - ret = -EINTR; - } - } + ret = core_sys_select(n, inp, outp, exp, to); + ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); return ret; } @@ -431,25 +535,17 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize) { - s64 timeout = MAX_SCHEDULE_TIMEOUT; sigset_t ksigmask, sigsaved; - struct timespec ts; + struct timespec ts, end_time, *to = NULL; int ret; if (tsp) { if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; - if (ts.tv_sec < 0 || ts.tv_nsec < 0) + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; - - /* Cast to u64 to make GCC stop complaining */ - if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS) - timeout = -1; /* infinite */ - else { - timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ); - timeout += ts.tv_sec * HZ; - } } if (sigmask) { @@ -463,32 +559,8 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); } - ret = core_sys_select(n, inp, outp, exp, &timeout); - - if (tsp) { - struct timespec rts; - - if (current->personality & STICKY_TIMEOUTS) - goto sticky; - rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * - 1000; - rts.tv_sec = timeout; - if (timespec_compare(&rts, &ts) >= 0) - rts = ts; - if (copy_to_user(tsp, &rts, 
sizeof(rts))) { -sticky: - /* - * If an application puts its timeval in read-only - * memory, we don't want the Linux-specific update to - * the timeval to cause a fault after the select has - * completed successfully. However, because we're not - * updating the timeval, we can't restart the system - * call. - */ - if (ret == -ERESTARTNOHAND) - ret = -EINTR; - } - } + ret = core_sys_select(n, inp, outp, exp, &end_time); + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); if (ret == -ERESTARTNOHAND) { /* @@ -574,18 +646,24 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) } static int do_poll(unsigned int nfds, struct poll_list *list, - struct poll_wqueues *wait, s64 *timeout) + struct poll_wqueues *wait, struct timespec *end_time) { - int count = 0; poll_table* pt = &wait->pt; + ktime_t expire, *to = NULL; + int timed_out = 0, count = 0; + unsigned long slack = 0; /* Optimise the no-wait case */ - if (!(*timeout)) + if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { pt = NULL; + timed_out = 1; + } + + if (end_time && !timed_out) + slack = estimate_accuracy(end_time); for (;;) { struct poll_list *walk; - long __timeout; set_current_state(TASK_INTERRUPTIBLE); for (walk = list; walk != NULL; walk = walk->next) { @@ -617,27 +695,21 @@ static int do_poll(unsigned int nfds, struct poll_list *list, if (signal_pending(current)) count = -EINTR; } - if (count || !*timeout) + if (count || timed_out) break; - if (*timeout < 0) { - /* Wait indefinitely */ - __timeout = MAX_SCHEDULE_TIMEOUT; - } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) { - /* - * Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in - * a loop - */ - __timeout = MAX_SCHEDULE_TIMEOUT - 1; - *timeout -= __timeout; - } else { - __timeout = *timeout; - *timeout = 0; + /* + * If this is the first loop and we have a timeout + * given, then we convert to ktime_t and set the to + * pointer to the expiry value. 
+ */ + if (end_time && !to) { + expire = timespec_to_ktime(*end_time); + to = &expire; } - __timeout = schedule_timeout(__timeout); - if (*timeout >= 0) - *timeout += __timeout; + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + timed_out = 1; } __set_current_state(TASK_RUNNING); return count; @@ -646,7 +718,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list, #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) -int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout) +int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, + struct timespec *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount, len, size; @@ -686,7 +759,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout) } poll_initwait(&table); - fdcount = do_poll(nfds, head, &table, timeout); + fdcount = do_poll(nfds, head, &table, end_time); poll_freewait(&table); for (walk = head; walk; walk = walk->next) { @@ -712,16 +785,21 @@ out_fds: static long do_restart_poll(struct restart_block *restart_block) { - struct pollfd __user *ufds = (struct pollfd __user*)restart_block->arg0; - int nfds = restart_block->arg1; - s64 timeout = ((s64)restart_block->arg3<<32) | (s64)restart_block->arg2; + struct pollfd __user *ufds = restart_block->poll.ufds; + int nfds = restart_block->poll.nfds; + struct timespec *to = NULL, end_time; int ret; - ret = do_sys_poll(ufds, nfds, &timeout); + if (restart_block->poll.has_timeout) { + end_time.tv_sec = restart_block->poll.tv_sec; + end_time.tv_nsec = restart_block->poll.tv_nsec; + to = &end_time; + } + + ret = do_sys_poll(ufds, nfds, to); + if (ret == -EINTR) { restart_block->fn = do_restart_poll; - restart_block->arg2 = timeout & 0xFFFFFFFF; - restart_block->arg3 = (u64)timeout >> 32; ret = -ERESTART_RESTARTBLOCK; } return ret; @@ -730,31 +808,32 @@ static long do_restart_poll(struct restart_block *restart_block) asmlinkage long sys_poll(struct 
pollfd __user *ufds, unsigned int nfds, long timeout_msecs) { - s64 timeout_jiffies; + struct timespec end_time, *to = NULL; int ret; - if (timeout_msecs > 0) { -#if HZ > 1000 - /* We can only overflow if HZ > 1000 */ - if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ) - timeout_jiffies = -1; - else -#endif - timeout_jiffies = msecs_to_jiffies(timeout_msecs) + 1; - } else { - /* Infinite (< 0) or no (0) timeout */ - timeout_jiffies = timeout_msecs; + if (timeout_msecs >= 0) { + to = &end_time; + poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, + NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); } - ret = do_sys_poll(ufds, nfds, &timeout_jiffies); + ret = do_sys_poll(ufds, nfds, to); + if (ret == -EINTR) { struct restart_block *restart_block; + restart_block = &current_thread_info()->restart_block; restart_block->fn = do_restart_poll; - restart_block->arg0 = (unsigned long)ufds; - restart_block->arg1 = nfds; - restart_block->arg2 = timeout_jiffies & 0xFFFFFFFF; - restart_block->arg3 = (u64)timeout_jiffies >> 32; + restart_block->poll.ufds = ufds; + restart_block->poll.nfds = nfds; + + if (timeout_msecs >= 0) { + restart_block->poll.tv_sec = end_time.tv_sec; + restart_block->poll.tv_nsec = end_time.tv_nsec; + restart_block->poll.has_timeout = 1; + } else + restart_block->poll.has_timeout = 0; + ret = -ERESTART_RESTARTBLOCK; } return ret; @@ -766,21 +845,16 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, size_t sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts; - s64 timeout = -1; + struct timespec ts, end_time, *to = NULL; int ret; if (tsp) { if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; - /* Cast to u64 to make GCC stop complaining */ - if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS) - timeout = -1; /* infinite */ - else { - timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ); - timeout += ts.tv_sec * HZ; - } + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return 
-EINVAL; } if (sigmask) { @@ -794,7 +868,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); } - ret = do_sys_poll(ufds, nfds, &timeout); + ret = do_sys_poll(ufds, nfds, to); /* We can restart this syscall, usually */ if (ret == -EINTR) { @@ -812,31 +886,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, } else if (sigmask) sigprocmask(SIG_SETMASK, &sigsaved, NULL); - if (tsp && timeout >= 0) { - struct timespec rts; - - if (current->personality & STICKY_TIMEOUTS) - goto sticky; - /* Yes, we know it's actually an s64, but it's also positive. */ - rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * - 1000; - rts.tv_sec = timeout; - if (timespec_compare(&rts, &ts) >= 0) - rts = ts; - if (copy_to_user(tsp, &rts, sizeof(rts))) { - sticky: - /* - * If an application puts its timeval in read-only - * memory, we don't want the Linux-specific update to - * the timeval to cause a fault after the select has - * completed successfully. However, because we're not - * updating the timeval, we can't restart the system - * call. 
- */ - if (ret == -ERESTARTNOHAND && timeout >= 0) - ret = -EINTR; - } - } + ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); return ret; } diff --git a/fs/seq_file.c b/fs/seq_file.c index bd20f7f..eba2eab 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -452,17 +452,34 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits) { - size_t len = bitmap_scnprintf_len(nr_bits); + if (m->count < m->size) { + int len = bitmap_scnprintf(m->buf + m->count, + m->size - m->count, bits, nr_bits); + if (m->count + len < m->size) { + m->count += len; + return 0; + } + } + m->count = m->size; + return -1; +} +EXPORT_SYMBOL(seq_bitmap); - if (m->count + len < m->size) { - bitmap_scnprintf(m->buf + m->count, m->size - m->count, - bits, nr_bits); - m->count += len; - return 0; +int seq_bitmap_list(struct seq_file *m, unsigned long *bits, + unsigned int nr_bits) +{ + if (m->count < m->size) { + int len = bitmap_scnlistprintf(m->buf + m->count, + m->size - m->count, bits, nr_bits); + if (m->count + len < m->size) { + m->count += len; + return 0; + } } m->count = m->size; return -1; } +EXPORT_SYMBOL(seq_bitmap_list); static void *single_start(struct seq_file *p, loff_t *pos) { @@ -682,7 +682,7 @@ void emergency_remount(void) * filesystems which don't use real block-devices. 
-- jrs */ -static struct idr unnamed_dev_idr; +static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ int set_anon_super(struct super_block *s, void *data) @@ -691,10 +691,10 @@ int set_anon_super(struct super_block *s, void *data) int error; retry: - if (idr_pre_get(&unnamed_dev_idr, GFP_ATOMIC) == 0) + if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0) return -ENOMEM; spin_lock(&unnamed_dev_lock); - error = idr_get_new(&unnamed_dev_idr, NULL, &dev); + error = ida_get_new(&unnamed_dev_ida, &dev); spin_unlock(&unnamed_dev_lock); if (error == -EAGAIN) /* We raced and lost with another CPU. */ @@ -704,7 +704,7 @@ int set_anon_super(struct super_block *s, void *data) if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); - idr_remove(&unnamed_dev_idr, dev); + ida_remove(&unnamed_dev_ida, dev); spin_unlock(&unnamed_dev_lock); return -EMFILE; } @@ -720,17 +720,12 @@ void kill_anon_super(struct super_block *sb) generic_shutdown_super(sb); spin_lock(&unnamed_dev_lock); - idr_remove(&unnamed_dev_idr, slot); + ida_remove(&unnamed_dev_ida, slot); spin_unlock(&unnamed_dev_lock); } EXPORT_SYMBOL(kill_anon_super); -void __init unnamed_dev_init(void) -{ - idr_init(&unnamed_dev_idr); -} - void kill_litter_super(struct super_block *sb) { if (sb->s_root) @@ -760,9 +755,13 @@ int get_sb_bdev(struct file_system_type *fs_type, { struct block_device *bdev; struct super_block *s; + fmode_t mode = FMODE_READ; int error = 0; - bdev = open_bdev_excl(dev_name, flags, fs_type); + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + bdev = open_bdev_exclusive(dev_name, mode, fs_type); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -785,11 +784,12 @@ int get_sb_bdev(struct file_system_type *fs_type, goto error_bdev; } - close_bdev_excl(bdev); + close_bdev_exclusive(bdev, mode); } else { char b[BDEVNAME_SIZE]; s->s_flags = flags; + s->s_mode = mode; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, 
block_size(bdev)); error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); @@ -807,7 +807,7 @@ int get_sb_bdev(struct file_system_type *fs_type, error_s: error = PTR_ERR(s); error_bdev: - close_bdev_excl(bdev); + close_bdev_exclusive(bdev, mode); error: return error; } @@ -817,10 +817,11 @@ EXPORT_SYMBOL(get_sb_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; + fmode_t mode = sb->s_mode; generic_shutdown_super(sb); sync_blockdev(bdev); - close_bdev_excl(bdev); + close_bdev_exclusive(bdev, mode); } EXPORT_SYMBOL(kill_block_super); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 3a05a59..82d3b79 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -983,4 +983,5 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) const struct file_operations sysfs_dir_operations = { .read = generic_read_dir, .readdir = sysfs_readdir, + .llseek = generic_file_llseek, }; diff --git a/fs/timerfd.c b/fs/timerfd.c index c502c60..0862f0e 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -52,11 +52,9 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) { - ktime_t now, remaining; - - now = ctx->tmr.base->get_time(); - remaining = ktime_sub(ctx->tmr.expires, now); + ktime_t remaining; + remaining = hrtimer_expires_remaining(&ctx->tmr); return remaining.tv64 < 0 ? 
ktime_set(0, 0): remaining; } @@ -74,7 +72,7 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int flags, ctx->ticks = 0; ctx->tintv = timespec_to_ktime(ktmr->it_interval); hrtimer_init(&ctx->tmr, ctx->clockid, htmode); - ctx->tmr.expires = texp; + hrtimer_set_expires(&ctx->tmr, texp); ctx->tmr.function = timerfd_tmrproc; if (texp.tv64 != 0) hrtimer_start(&ctx->tmr, texp, htmode); diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 73db464..1a4973e 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -414,19 +414,21 @@ static int do_budget_space(struct ubifs_info *c) * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt - * @c->lst.taken_empty_lebs * - * @empty_lebs are available because they are empty. @freeable_cnt are - * available because they contain only free and dirty space and the - * index allocation always occurs after wbufs are synch'ed. - * @idx_gc_cnt are available because they are index LEBs that have been - * garbage collected (including trivial GC) and are awaiting the commit - * before they can be unmapped - note that the in-the-gaps method will - * grab these if it needs them. @taken_empty_lebs are empty_lebs that - * have already been allocated for some purpose (also includes those - * LEBs on the @idx_gc list). + * @c->lst.empty_lebs are available because they are empty. + * @c->freeable_cnt are available because they contain only free and + * dirty space, @c->idx_gc_cnt are available because they are index + * LEBs that have been garbage collected and are awaiting the commit + * before they can be used. And the in-the-gaps method will grab these + * if it needs them. @c->lst.taken_empty_lebs are empty LEBs that have + * already been allocated for some purpose. * - * Note, @taken_empty_lebs may temporarily be higher by one because of - * the way we serialize LEB allocations and budgeting. See a comment in - * 'ubifs_find_free_space()'. 
+ * Note, @c->idx_gc_cnt is included to both @c->lst.empty_lebs (because + * these LEBs are empty) and to @c->lst.taken_empty_lebs (because they + * are taken until after the commit). + * + * Note, @c->lst.taken_empty_lebs may temporarily be higher by one + * because of the way we serialize LEB allocations and budgeting. See a + * comment in 'ubifs_find_free_space()'. */ lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - c->lst.taken_empty_lebs; diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 5bb51da..a0ada59 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -91,8 +91,6 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; * * Note, if the input buffer was not compressed, it is copied to the output * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. - * - * This functions returns %0 on success or a negative error code on failure. */ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, int *compr_type) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index d7f7645..7186400 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -222,30 +222,38 @@ void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) { const struct ubifs_inode *ui = ubifs_inode(inode); - printk(KERN_DEBUG "inode %lu\n", inode->i_ino); - printk(KERN_DEBUG "size %llu\n", + printk(KERN_DEBUG "Dump in-memory inode:"); + printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino); + printk(KERN_DEBUG "\tsize %llu\n", (unsigned long long)i_size_read(inode)); - printk(KERN_DEBUG "nlink %u\n", inode->i_nlink); - printk(KERN_DEBUG "uid %u\n", (unsigned int)inode->i_uid); - printk(KERN_DEBUG "gid %u\n", (unsigned int)inode->i_gid); - printk(KERN_DEBUG "atime %u.%u\n", + printk(KERN_DEBUG "\tnlink %u\n", inode->i_nlink); + printk(KERN_DEBUG "\tuid %u\n", (unsigned int)inode->i_uid); + printk(KERN_DEBUG "\tgid %u\n", (unsigned int)inode->i_gid); + printk(KERN_DEBUG "\tatime %u.%u\n", (unsigned 
int)inode->i_atime.tv_sec, (unsigned int)inode->i_atime.tv_nsec); - printk(KERN_DEBUG "mtime %u.%u\n", + printk(KERN_DEBUG "\tmtime %u.%u\n", (unsigned int)inode->i_mtime.tv_sec, (unsigned int)inode->i_mtime.tv_nsec); - printk(KERN_DEBUG "ctime %u.%u\n", + printk(KERN_DEBUG "\tctime %u.%u\n", (unsigned int)inode->i_ctime.tv_sec, (unsigned int)inode->i_ctime.tv_nsec); - printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum); - printk(KERN_DEBUG "xattr_size %u\n", ui->xattr_size); - printk(KERN_DEBUG "xattr_cnt %u\n", ui->xattr_cnt); - printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names); - printk(KERN_DEBUG "dirty %u\n", ui->dirty); - printk(KERN_DEBUG "xattr %u\n", ui->xattr); - printk(KERN_DEBUG "flags %d\n", ui->flags); - printk(KERN_DEBUG "compr_type %d\n", ui->compr_type); - printk(KERN_DEBUG "data_len %d\n", ui->data_len); + printk(KERN_DEBUG "\tcreat_sqnum %llu\n", ui->creat_sqnum); + printk(KERN_DEBUG "\txattr_size %u\n", ui->xattr_size); + printk(KERN_DEBUG "\txattr_cnt %u\n", ui->xattr_cnt); + printk(KERN_DEBUG "\txattr_names %u\n", ui->xattr_names); + printk(KERN_DEBUG "\tdirty %u\n", ui->dirty); + printk(KERN_DEBUG "\txattr %u\n", ui->xattr); + printk(KERN_DEBUG "\tbulk_read %u\n", ui->xattr); + printk(KERN_DEBUG "\tsynced_i_size %llu\n", + (unsigned long long)ui->synced_i_size); + printk(KERN_DEBUG "\tui_size %llu\n", + (unsigned long long)ui->ui_size); + printk(KERN_DEBUG "\tflags %d\n", ui->flags); + printk(KERN_DEBUG "\tcompr_type %d\n", ui->compr_type); + printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read); + printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row); + printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len); } void dbg_dump_node(const struct ubifs_info *c, const void *node) @@ -647,6 +655,43 @@ void dbg_dump_lprops(struct ubifs_info *c) } } +void dbg_dump_lpt_info(struct ubifs_info *c) +{ + int i; + + spin_lock(&dbg_lock); + printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); + printk(KERN_DEBUG "\tpnode_sz: %d\n", 
c->pnode_sz); + printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); + printk(KERN_DEBUG "\tltab_sz: %d\n", c->ltab_sz); + printk(KERN_DEBUG "\tlsave_sz: %d\n", c->lsave_sz); + printk(KERN_DEBUG "\tbig_lpt: %d\n", c->big_lpt); + printk(KERN_DEBUG "\tlpt_hght: %d\n", c->lpt_hght); + printk(KERN_DEBUG "\tpnode_cnt: %d\n", c->pnode_cnt); + printk(KERN_DEBUG "\tnnode_cnt: %d\n", c->nnode_cnt); + printk(KERN_DEBUG "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); + printk(KERN_DEBUG "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); + printk(KERN_DEBUG "\tlsave_cnt: %d\n", c->lsave_cnt); + printk(KERN_DEBUG "\tspace_bits: %d\n", c->space_bits); + printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); + printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); + printk(KERN_DEBUG "\tlpt_spc_bits: %d\n", c->lpt_spc_bits); + printk(KERN_DEBUG "\tpcnt_bits: %d\n", c->pcnt_bits); + printk(KERN_DEBUG "\tlnum_bits: %d\n", c->lnum_bits); + printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); + printk(KERN_DEBUG "\tLPT head is at %d:%d\n", + c->nhead_lnum, c->nhead_offs); + printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", + c->lsave_lnum, c->lsave_offs); + for (i = 0; i < c->lpt_lebs; i++) + printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d " + "cmt %d\n", i + c->lpt_first, c->ltab[i].free, + c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt); + spin_unlock(&dbg_lock); +} + void dbg_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 50315fc..33d6b95 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -224,6 +224,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst); void dbg_dump_budg(struct ubifs_info *c); void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); void dbg_dump_lprops(struct ubifs_info *c); +void dbg_dump_lpt_info(struct ubifs_info *c); void 
dbg_dump_leb(const struct ubifs_info *c, int lnum); void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode); @@ -249,6 +250,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); int dbg_check_cats(struct ubifs_info *c); int dbg_check_ltab(struct ubifs_info *c); +int dbg_chk_lpt_free_spc(struct ubifs_info *c); +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len); int dbg_check_synced_i_size(struct inode *inode); int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); int dbg_check_tnc(struct ubifs_info *c, int extra); @@ -367,6 +370,7 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, #define dbg_dump_budg(c) ({}) #define dbg_dump_lprop(c, lp) ({}) #define dbg_dump_lprops(c) ({}) +#define dbg_dump_lpt_info(c) ({}) #define dbg_dump_leb(c, lnum) ({}) #define dbg_dump_znode(c, znode) ({}) #define dbg_dump_heap(c, heap, cat) ({}) @@ -379,6 +383,8 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, #define dbg_check_old_index(c, zroot) 0 #define dbg_check_cats(c) 0 #define dbg_check_ltab(c) 0 +#define dbg_chk_lpt_free_spc(c) 0 +#define dbg_chk_lpt_sz(c, action, len) 0 #define dbg_check_synced_i_size(inode) 0 #define dbg_check_dir_size(c, dir) 0 #define dbg_check_tnc(c, x) 0 diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 3d698e2..51cf511 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -147,6 +147,12 @@ static int do_readpage(struct page *page) err = ret; if (err != -ENOENT) break; + } else if (block + 1 == beyond) { + int dlen = le32_to_cpu(dn->size); + int ilen = i_size & (UBIFS_BLOCK_SIZE - 1); + + if (ilen && ilen < dlen) + memset(addr + ilen, 0, dlen - ilen); } } if (++i >= UBIFS_BLOCKS_PER_PAGE) @@ -577,8 +583,262 @@ out: return copied; } +/** + * populate_page - copy data nodes into a page for bulk-read. 
+ * @c: UBIFS file-system description object + * @page: page + * @bu: bulk-read information + * @n: next zbranch slot + * + * This function returns %0 on success and a negative error code on failure. + */ +static int populate_page(struct ubifs_info *c, struct page *page, + struct bu_info *bu, int *n) +{ + int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + unsigned int page_block; + void *addr, *zaddr; + pgoff_t end_index; + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + + addr = zaddr = kmap(page); + + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (!i_size || page->index > end_index) { + hole = 1; + memset(addr, 0, PAGE_CACHE_SIZE); + goto out_hole; + } + + page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + while (1) { + int err, len, out_len, dlen; + + if (nn >= bu->cnt) { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else if (key_block(c, &bu->zbranch[nn].key) == page_block) { + struct ubifs_data_node *dn; + + dn = bu->buf + (bu->zbranch[nn].offs - offs); + + ubifs_assert(dn->ch.sqnum > + ubifs_inode(inode)->creat_sqnum); + + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto out_err; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto out_err; + + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + nn += 1; + read = (i << UBIFS_BLOCK_SHIFT) + len; + } else if (key_block(c, &bu->zbranch[nn].key) < page_block) { + nn += 1; + continue; + } else { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + addr += UBIFS_BLOCK_SIZE; + page_block += 1; + } + + if (end_index == page->index) { + int len = i_size & (PAGE_CACHE_SIZE - 1); + 
+ if (len && len < read) + memset(zaddr + len, 0, read - len); + } + +out_hole: + if (hole) { + SetPageChecked(page); + dbg_gen("hole"); + } + + SetPageUptodate(page); + ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + *n = nn; + return 0; + +out_err: + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + ubifs_err("bad data node (block %u, inode %lu)", + page_block, inode->i_ino); + return -EINVAL; +} + +/** + * ubifs_do_bulk_read - do bulk-read. + * @c: UBIFS file-system description object + * @page1: first page + * + * This function returns %1 if the bulk-read is done, otherwise %0 is returned. + */ +static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1) +{ + pgoff_t offset = page1->index, end_index; + struct address_space *mapping = page1->mapping; + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + struct bu_info *bu; + int err, page_idx, page_cnt, ret = 0, n = 0; + loff_t isize; + + bu = kmalloc(sizeof(struct bu_info), GFP_NOFS); + if (!bu) + return 0; + + bu->buf_len = c->bulk_read_buf_size; + bu->buf = kmalloc(bu->buf_len, GFP_NOFS); + if (!bu->buf) + goto out_free; + + data_key_init(c, &bu->key, inode->i_ino, + offset << UBIFS_BLOCKS_PER_PAGE_SHIFT); + + err = ubifs_tnc_get_bu_keys(c, bu); + if (err) + goto out_warn; + + if (bu->eof) { + /* Turn off bulk-read at the end of the file */ + ui->read_in_a_row = 1; + ui->bulk_read = 0; + } + + page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT; + if (!page_cnt) { + /* + * This happens when there are multiple blocks per page and the + * blocks for the first page we are looking for, are not + * together. If all the pages were like this, bulk-read would + * reduce performance, so we turn it off for a while. 
+ */ + ui->read_in_a_row = 0; + ui->bulk_read = 0; + goto out_free; + } + + if (bu->cnt) { + err = ubifs_tnc_bulk_read(c, bu); + if (err) + goto out_warn; + } + + err = populate_page(c, page1, bu, &n); + if (err) + goto out_warn; + + unlock_page(page1); + ret = 1; + + isize = i_size_read(inode); + if (isize == 0) + goto out_free; + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + + for (page_idx = 1; page_idx < page_cnt; page_idx++) { + pgoff_t page_offset = offset + page_idx; + struct page *page; + + if (page_offset > end_index) + break; + page = find_or_create_page(mapping, page_offset, + GFP_NOFS | __GFP_COLD); + if (!page) + break; + if (!PageUptodate(page)) + err = populate_page(c, page, bu, &n); + unlock_page(page); + page_cache_release(page); + if (err) + break; + } + + ui->last_page_read = offset + page_idx - 1; + +out_free: + kfree(bu->buf); + kfree(bu); + return ret; + +out_warn: + ubifs_warn("ignoring error %d and skipping bulk-read", err); + goto out_free; +} + +/** + * ubifs_bulk_read - determine whether to bulk-read and, if so, do it. + * @page: page from which to start bulk-read. + * + * Some flash media are capable of reading sequentially at faster rates. UBIFS + * bulk-read facility is designed to take advantage of that, by reading in one + * go consecutive data nodes that are also located consecutively in the same + * LEB. This function returns %1 if a bulk-read is done and %0 otherwise. + */ +static int ubifs_bulk_read(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = page->index, last_page_read = ui->last_page_read; + int ret = 0; + + ui->last_page_read = index; + + if (!c->bulk_read) + return 0; + /* + * Bulk-read is protected by ui_mutex, but it is an optimization, so + * don't bother if we cannot lock the mutex. 
+ */ + if (!mutex_trylock(&ui->ui_mutex)) + return 0; + if (index != last_page_read + 1) { + /* Turn off bulk-read if we stop reading sequentially */ + ui->read_in_a_row = 1; + if (ui->bulk_read) + ui->bulk_read = 0; + goto out_unlock; + } + if (!ui->bulk_read) { + ui->read_in_a_row += 1; + if (ui->read_in_a_row < 3) + goto out_unlock; + /* Three reads in a row, so switch on bulk-read */ + ui->bulk_read = 1; + } + ret = ubifs_do_bulk_read(c, page); +out_unlock: + mutex_unlock(&ui->ui_mutex); + return ret; +} + static int ubifs_readpage(struct file *file, struct page *page) { + if (ubifs_bulk_read(page)) + return 0; do_readpage(page); unlock_page(page); return 0; diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 47814cd..717d79c 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -901,11 +901,11 @@ static int get_idx_gc_leb(struct ubifs_info *c) * it is needed now for this commit. */ lp = ubifs_lpt_lookup_dirty(c, lnum); - if (unlikely(IS_ERR(lp))) + if (IS_ERR(lp)) return PTR_ERR(lp); lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, lp->flags | LPROPS_INDEX, -1); - if (unlikely(IS_ERR(lp))) + if (IS_ERR(lp)) return PTR_ERR(lp); dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty, lp->free, lp->flags); diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 02aba36..0bef650 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -96,6 +96,48 @@ static int switch_gc_head(struct ubifs_info *c) } /** + * joinup - bring data nodes for an inode together. + * @c: UBIFS file-system description object + * @sleb: describes scanned LEB + * @inum: inode number + * @blk: block number + * @data: list to which to add data nodes + * + * This function looks at the first few nodes in the scanned LEB @sleb and adds + * them to @data if they are data nodes from @inum and have a larger block + * number than @blk. This function returns %0 on success and a negative error + * code on failure. 
+ */ +static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, + unsigned int blk, struct list_head *data) +{ + int err, cnt = 6, lnum = sleb->lnum, offs; + struct ubifs_scan_node *snod, *tmp; + union ubifs_key *key; + + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + key = &snod->key; + if (key_inum(c, key) == inum && + key_type(c, key) == UBIFS_DATA_KEY && + key_block(c, key) > blk) { + offs = snod->offs; + err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); + if (err < 0) + return err; + list_del(&snod->list); + if (err) { + list_add_tail(&snod->list, data); + blk = key_block(c, key); + } else + kfree(snod); + cnt = 6; + } else if (--cnt == 0) + break; + } + return 0; +} + +/** * move_nodes - move nodes. * @c: UBIFS file-system description object * @sleb: describes nodes to move @@ -116,16 +158,21 @@ static int switch_gc_head(struct ubifs_info *c) static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) { struct ubifs_scan_node *snod, *tmp; - struct list_head large, medium, small; + struct list_head data, large, medium, small; struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; int avail, err, min = INT_MAX; + unsigned int blk = 0; + ino_t inum = 0; + INIT_LIST_HEAD(&data); INIT_LIST_HEAD(&large); INIT_LIST_HEAD(&medium); INIT_LIST_HEAD(&small); - list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { - struct list_head *lst; + while (!list_empty(&sleb->nodes)) { + struct list_head *lst = sleb->nodes.next; + + snod = list_entry(lst, struct ubifs_scan_node, list); ubifs_assert(snod->type != UBIFS_IDX_NODE); ubifs_assert(snod->type != UBIFS_REF_NODE); @@ -136,7 +183,6 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) if (err < 0) goto out; - lst = &snod->list; list_del(lst); if (!err) { /* The node is obsolete, remove it from the list */ @@ -145,15 +191,30 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) } /* - * Sort the list of nodes so that large nodes go 
first, and - * small nodes go last. + * Sort the list of nodes so that data nodes go first, large + * nodes go second, and small nodes go last. */ - if (snod->len > MEDIUM_NODE_WM) - list_add(lst, &large); + if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { + if (inum != key_inum(c, &snod->key)) { + if (inum) { + /* + * Try to move data nodes from the same + * inode together. + */ + err = joinup(c, sleb, inum, blk, &data); + if (err) + goto out; + } + inum = key_inum(c, &snod->key); + blk = key_block(c, &snod->key); + } + list_add_tail(lst, &data); + } else if (snod->len > MEDIUM_NODE_WM) + list_add_tail(lst, &large); else if (snod->len > SMALL_NODE_WM) - list_add(lst, &medium); + list_add_tail(lst, &medium); else - list_add(lst, &small); + list_add_tail(lst, &small); /* And find the smallest node */ if (snod->len < min) @@ -164,6 +225,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) * Join the tree lists so that we'd have one roughly sorted list * ('large' will be the head of the joined list). 
*/ + list_splice(&data, &large); list_splice(&medium, large.prev); list_splice(&small, large.prev); @@ -653,7 +715,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) */ while (1) { lp = ubifs_fast_find_freeable(c); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } @@ -665,7 +727,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) if (err) goto out; lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } @@ -680,7 +742,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) /* Record index freeable LEBs for unmapping after commit */ while (1) { lp = ubifs_fast_find_frdi_idx(c); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } @@ -696,7 +758,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) /* Don't release the LEB until after the next commit */ flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX; lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); kfree(idx_gc); goto out; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 054363f..0168271 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -62,6 +62,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) { if (!c->ro_media) { c->ro_media = 1; + c->no_chk_data_crc = 0; ubifs_warn("switched to read-only mode, error %d", err); dbg_dump_stack(); } @@ -74,6 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) * @lnum: logical eraseblock number * @offs: offset within the logical eraseblock * @quiet: print no messages + * @chk_crc: indicates whether to always check the CRC * * This function checks node magic number and CRC checksum. This function also * validates node length to prevent UBIFS from becoming crazy when an attacker @@ -85,7 +87,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) * or magic. 
*/ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet) + int offs, int quiet, int chk_crc) { int err = -EINVAL, type, node_len; uint32_t crc, node_crc, magic; @@ -121,6 +123,10 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, node_len > c->ranges[type].max_len) goto out_len; + if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc) + if (c->no_chk_data_crc) + return 0; + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); node_crc = le32_to_cpu(ch->crc); if (crc != node_crc) { @@ -722,7 +728,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0); + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); if (err) { ubifs_err("expected node type %d", type); return err; @@ -781,7 +787,7 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0); + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); if (err) { ubifs_err("expected node type %d", type); return err; diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index 8f74760..9ee6508 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -484,7 +484,7 @@ static inline void key_copy(const struct ubifs_info *c, * @key2: the second key to compare * * This function compares 2 keys and returns %-1 if @key1 is less than - * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2. + * @key2, %0 if the keys are equivalent and %1 if @key1 is greater than @key2. */ static inline int keys_cmp(const struct ubifs_info *c, const union ubifs_key *key1, @@ -503,6 +503,26 @@ static inline int keys_cmp(const struct ubifs_info *c, } /** + * keys_eq - determine if keys are equivalent. 
+ * @c: UBIFS file-system description object + * @key1: the first key to compare + * @key2: the second key to compare + * + * This function compares 2 keys and returns %1 if @key1 is equal to @key2 and + * %0 if not. + */ +static inline int keys_eq(const struct ubifs_info *c, + const union ubifs_key *key1, + const union ubifs_key *key2) +{ + if (key1->u32[0] != key2->u32[0]) + return 0; + if (key1->u32[1] != key2->u32[1]) + return 0; + return 1; +} + +/** * is_hash_key - is a key vulnerable to hash collisions. * @c: UBIFS file-system description object * @key: key diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 2ba93da..f27176e 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -125,6 +125,7 @@ static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, } } } + /* Not greater than parent, so compare to children */ while (1) { /* Compare to left child */ @@ -460,18 +461,6 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) } /** - * ubifs_get_lprops - get reference to LEB properties. - * @c: the UBIFS file-system description object - * - * This function locks lprops. Lprops have to be unlocked by - * 'ubifs_release_lprops()'. - */ -void ubifs_get_lprops(struct ubifs_info *c) -{ - mutex_lock(&c->lp_mutex); -} - -/** * calc_dark - calculate LEB dark space size. * @c: the UBIFS file-system description object * @spc: amount of free and dirty space in the LEB @@ -576,7 +565,6 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7)); spin_lock(&c->space_lock); - if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) c->lst.taken_empty_lebs -= 1; @@ -637,31 +625,12 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, c->lst.taken_empty_lebs += 1; change_category(c, lprops); - c->idx_gc_cnt += idx_gc_cnt; - spin_unlock(&c->space_lock); - return lprops; } /** - * ubifs_release_lprops - release lprops lock. 
- * @c: the UBIFS file-system description object - * - * This function has to be called after each 'ubifs_get_lprops()' call to - * unlock lprops. - */ -void ubifs_release_lprops(struct ubifs_info *c) -{ - ubifs_assert(mutex_is_locked(&c->lp_mutex)); - ubifs_assert(c->lst.empty_lebs >= 0 && - c->lst.empty_lebs <= c->main_lebs); - - mutex_unlock(&c->lp_mutex); -} - -/** * ubifs_get_lp_stats - get lprops statistics. * @c: UBIFS file-system description object * @st: return statistics @@ -1262,7 +1231,6 @@ static int scan_check_cb(struct ubifs_info *c, } ubifs_scan_destroy(sleb); - return LPT_SCAN_CONTINUE; out_print: diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 9ff2463..db8bd0e 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -109,7 +109,8 @@ static void do_calc_lpt_geom(struct ubifs_info *c) c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; c->lpt_sz += c->ltab_sz; - c->lpt_sz += c->lsave_sz; + if (c->big_lpt) + c->lpt_sz += c->lsave_sz; /* Add wastage */ sz = c->lpt_sz; @@ -287,25 +288,56 @@ uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits) const int k = 32 - nrbits; uint8_t *p = *addr; int b = *pos; - uint32_t val; + uint32_t uninitialized_var(val); + const int bytes = (nrbits + b + 7) >> 3; ubifs_assert(nrbits > 0); ubifs_assert(nrbits <= 32); ubifs_assert(*pos >= 0); ubifs_assert(*pos < 8); if (b) { - val = p[1] | ((uint32_t)p[2] << 8) | ((uint32_t)p[3] << 16) | - ((uint32_t)p[4] << 24); + switch (bytes) { + case 2: + val = p[1]; + break; + case 3: + val = p[1] | ((uint32_t)p[2] << 8); + break; + case 4: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16); + break; + case 5: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16) | + ((uint32_t)p[4] << 24); + } val <<= (8 - b); val |= *p >> b; nrbits += b; - } else - val = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | - ((uint32_t)p[3] << 24); + } else { + switch (bytes) { + case 1: + val = p[0]; + break; + 
case 2: + val = p[0] | ((uint32_t)p[1] << 8); + break; + case 3: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16); + break; + case 4: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16) | + ((uint32_t)p[3] << 24); + break; + } + } val <<= k; val >>= k; b = nrbits & 7; - p += nrbits / 8; + p += nrbits >> 3; *addr = p; *pos = b; ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32); diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 5f0b83e..eed5a00 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -177,8 +177,6 @@ static int alloc_lpt_leb(struct ubifs_info *c, int *lnum) return 0; } } - dbg_err("last LEB %d", *lnum); - dump_stack(); return -ENOSPC; } @@ -193,6 +191,9 @@ static int layout_cnodes(struct ubifs_info *c) int lnum, offs, len, alen, done_lsave, done_ltab, err; struct ubifs_cnode *cnode; + err = dbg_chk_lpt_sz(c, 0, 0); + if (err) + return err; cnode = c->lpt_cnext; if (!cnode) return 0; @@ -206,6 +207,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lsave_lnum = lnum; c->lsave_offs = offs; offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } if (offs + c->ltab_sz <= c->leb_size) { @@ -213,6 +215,7 @@ static int layout_cnodes(struct ubifs_info *c) c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } do { @@ -226,9 +229,10 @@ static int layout_cnodes(struct ubifs_info *c) while (offs + len > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, alen - offs); err = alloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -238,6 +242,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lsave_lnum = lnum; c->lsave_offs = offs; offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); continue; } if (!done_ltab) { @@ -245,6 +250,7 @@ static int layout_cnodes(struct ubifs_info *c) c->ltab_lnum = 
lnum; c->ltab_offs = offs; offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); continue; } break; @@ -257,6 +263,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lpt_offs = offs; } offs += len; + dbg_chk_lpt_sz(c, 1, len); cnode = cnode->cnext; } while (cnode && cnode != c->lpt_cnext); @@ -265,9 +272,10 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->lsave_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, alen - offs); err = alloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -276,6 +284,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lsave_lnum = lnum; c->lsave_offs = offs; offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } /* Make sure to place LPT's own lprops table */ @@ -283,9 +292,10 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->ltab_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, alen - offs); err = alloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -294,11 +304,23 @@ static int layout_cnodes(struct ubifs_info *c) c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 4, alen - offs); + err = dbg_chk_lpt_sz(c, 3, alen); + if (err) + return err; return 0; + +no_space: + ubifs_err("LPT out of space"); + dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " + "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + dbg_dump_lpt_info(c); + return err; } /** @@ -333,8 +355,6 @@ static int realloc_lpt_leb(struct ubifs_info *c, int *lnum) *lnum = i + c->lpt_first; return 0; } - dbg_err("last LEB %d", *lnum); - 
dump_stack(); return -ENOSPC; } @@ -369,12 +389,14 @@ static int write_cnodes(struct ubifs_info *c) done_lsave = 1; ubifs_pack_lsave(c, buf + offs, c->lsave); offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } if (offs + c->ltab_sz <= c->leb_size) { done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } /* Loop for each cnode */ @@ -392,10 +414,12 @@ static int write_cnodes(struct ubifs_info *c) alen, UBI_SHORTTERM); if (err) return err; + dbg_chk_lpt_sz(c, 4, alen - wlen); } + dbg_chk_lpt_sz(c, 2, 0); err = realloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; from = 0; ubifs_assert(lnum >= c->lpt_first && @@ -408,12 +432,14 @@ static int write_cnodes(struct ubifs_info *c) done_lsave = 1; ubifs_pack_lsave(c, buf + offs, c->lsave); offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); continue; } if (!done_ltab) { done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); continue; } break; @@ -435,6 +461,7 @@ static int write_cnodes(struct ubifs_info *c) clear_bit(COW_ZNODE, &cnode->flags); smp_mb__after_clear_bit(); offs += len; + dbg_chk_lpt_sz(c, 1, len); cnode = cnode->cnext; } while (cnode && cnode != c->lpt_cnext); @@ -448,9 +475,10 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; + dbg_chk_lpt_sz(c, 2, alen - wlen); err = realloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -461,6 +489,7 @@ static int write_cnodes(struct ubifs_info *c) done_lsave = 1; ubifs_pack_lsave(c, buf + offs, c->lsave); offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } /* Make sure to place LPT's own lprops table */ @@ -473,9 +502,10 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; + dbg_chk_lpt_sz(c, 2, alen - wlen); err = realloc_lpt_leb(c, &lnum); if (err) - 
return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -486,6 +516,7 @@ static int write_cnodes(struct ubifs_info *c) done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } /* Write remaining data in buffer */ @@ -495,6 +526,12 @@ static int write_cnodes(struct ubifs_info *c) err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM); if (err) return err; + + dbg_chk_lpt_sz(c, 4, alen - wlen); + err = dbg_chk_lpt_sz(c, 3, ALIGN(offs, c->min_io_size)); + if (err) + return err; + c->nhead_lnum = lnum; c->nhead_offs = ALIGN(offs, c->min_io_size); @@ -503,7 +540,15 @@ static int write_cnodes(struct ubifs_info *c) dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); if (c->big_lpt) dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + return 0; + +no_space: + ubifs_err("LPT out of space mismatch"); + dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " + "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + dbg_dump_lpt_info(c); + return err; } /** @@ -1044,6 +1089,8 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) int pos = 0, node_type, node_len; uint16_t crc, calc_crc; + if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8) + return 0; node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); if (node_type == UBIFS_LPT_NOT_A_NODE) return 0; @@ -1156,6 +1203,9 @@ int ubifs_lpt_start_commit(struct ubifs_info *c) dbg_lp(""); mutex_lock(&c->lp_mutex); + err = dbg_chk_lpt_free_spc(c); + if (err) + goto out; err = dbg_check_ltab(c); if (err) goto out; @@ -1645,4 +1695,121 @@ int dbg_check_ltab(struct ubifs_info *c) return 0; } +/** + * dbg_chk_lpt_free_spc - check LPT free space is enough to write entire LPT. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +int dbg_chk_lpt_free_spc(struct ubifs_info *c) +{ + long long free = 0; + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + } + if (free < c->lpt_sz) { + dbg_err("LPT space error: free %lld lpt_sz %lld", + free, c->lpt_sz); + dbg_dump_lpt_info(c); + return -EINVAL; + } + return 0; +} + +/** + * dbg_chk_lpt_sz - check LPT does not write more than LPT size. + * @c: the UBIFS file-system description object + * @action: action + * @len: length written + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) +{ + long long chk_lpt_sz, lpt_sz; + int err = 0; + + switch (action) { + case 0: + c->chk_lpt_sz = 0; + c->chk_lpt_sz2 = 0; + c->chk_lpt_lebs = 0; + c->chk_lpt_wastage = 0; + if (c->dirty_pn_cnt > c->pnode_cnt) { + dbg_err("dirty pnodes %d exceed max %d", + c->dirty_pn_cnt, c->pnode_cnt); + err = -EINVAL; + } + if (c->dirty_nn_cnt > c->nnode_cnt) { + dbg_err("dirty nnodes %d exceed max %d", + c->dirty_nn_cnt, c->nnode_cnt); + err = -EINVAL; + } + return err; + case 1: + c->chk_lpt_sz += len; + return 0; + case 2: + c->chk_lpt_sz += len; + c->chk_lpt_wastage += len; + c->chk_lpt_lebs += 1; + return 0; + case 3: + chk_lpt_sz = c->leb_size; + chk_lpt_sz *= c->chk_lpt_lebs; + chk_lpt_sz += len - c->nhead_offs; + if (c->chk_lpt_sz != chk_lpt_sz) { + dbg_err("LPT wrote %lld but space used was %lld", + c->chk_lpt_sz, chk_lpt_sz); + err = -EINVAL; + } + if (c->chk_lpt_sz > c->lpt_sz) { + dbg_err("LPT wrote %lld but lpt_sz is %lld", + c->chk_lpt_sz, c->lpt_sz); + err = -EINVAL; + } + if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) { + dbg_err("LPT layout size %lld but wrote %lld", + c->chk_lpt_sz, c->chk_lpt_sz2); + err = -EINVAL; + } + if (c->chk_lpt_sz2 && c->new_nhead_offs 
!= len) { + dbg_err("LPT new nhead offs: expected %d was %d", + c->new_nhead_offs, len); + err = -EINVAL; + } + lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; + lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; + lpt_sz += c->ltab_sz; + if (c->big_lpt) + lpt_sz += c->lsave_sz; + if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) { + dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", + c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz); + err = -EINVAL; + } + if (err) + dbg_dump_lpt_info(c); + c->chk_lpt_sz2 = c->chk_lpt_sz; + c->chk_lpt_sz = 0; + c->chk_lpt_wastage = 0; + c->chk_lpt_lebs = 0; + c->new_nhead_offs = len; + return err; + case 4: + c->chk_lpt_sz += len; + c->chk_lpt_wastage += len; + return 0; + default: + return -EINVAL; + } +} + #endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 4c12a92..4fa81d8 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -310,4 +310,31 @@ static inline int ubifs_tnc_lookup(struct ubifs_info *c, return ubifs_tnc_locate(c, key, node, NULL, NULL); } +/** + * ubifs_get_lprops - get reference to LEB properties. + * @c: the UBIFS file-system description object + * + * This function locks lprops. Lprops have to be unlocked by + * 'ubifs_release_lprops()'. + */ +static inline void ubifs_get_lprops(struct ubifs_info *c) +{ + mutex_lock(&c->lp_mutex); +} + +/** + * ubifs_release_lprops - release lprops lock. + * @c: the UBIFS file-system description object + * + * This function has to be called after each 'ubifs_get_lprops()' call to + * unlock lprops. 
+ */ +static inline void ubifs_release_lprops(struct ubifs_info *c) +{ + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + mutex_unlock(&c->lp_mutex); +} + #endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index acf5c5f..0ed8247 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -87,7 +87,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, dbg_scan("scanning %s", dbg_ntype(ch->node_type)); - if (ubifs_check_node(c, buf, lnum, offs, quiet)) + if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) return SCANNED_A_CORRUPT_NODE; if (ch->node_type == UBIFS_PAD_NODE) { diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 9a92203..8780efb 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -401,6 +401,16 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) else if (c->mount_opts.unmount_mode == 1) seq_printf(s, ",norm_unmount"); + if (c->mount_opts.bulk_read == 2) + seq_printf(s, ",bulk_read"); + else if (c->mount_opts.bulk_read == 1) + seq_printf(s, ",no_bulk_read"); + + if (c->mount_opts.chk_data_crc == 2) + seq_printf(s, ",chk_data_crc"); + else if (c->mount_opts.chk_data_crc == 1) + seq_printf(s, ",no_chk_data_crc"); + return 0; } @@ -408,13 +418,26 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) { struct ubifs_info *c = sb->s_fs_info; int i, ret = 0, err; + long long bud_bytes; - if (c->jheads) + if (c->jheads) { for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); if (err && !ret) ret = err; } + + /* Commit the journal unless it has too little data */ + spin_lock(&c->buds_lock); + bud_bytes = c->bud_bytes; + spin_unlock(&c->buds_lock); + if (bud_bytes > c->leb_size) { + err = ubifs_run_commit(c); + if (err) + return err; + } + } + /* * We ought to call sync for c->ubi but it does not have one. 
If it had * it would in turn call mtd->sync, however mtd operations are @@ -538,6 +561,18 @@ static int init_constants_early(struct ubifs_info *c) * calculations when reporting free space. */ c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; + /* Buffer size for bulk-reads */ + c->bulk_read_buf_size = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ; + if (c->bulk_read_buf_size > c->leb_size) + c->bulk_read_buf_size = c->leb_size; + if (c->bulk_read_buf_size > 128 * 1024) { + /* Check if we can kmalloc more than 128KiB */ + void *try = kmalloc(c->bulk_read_buf_size, GFP_KERNEL); + + kfree(try); + if (!try) + c->bulk_read_buf_size = 128 * 1024; + } return 0; } @@ -840,17 +875,29 @@ static int check_volume_empty(struct ubifs_info *c) * * Opt_fast_unmount: do not run a journal commit before un-mounting * Opt_norm_unmount: run a journal commit before un-mounting + * Opt_bulk_read: enable bulk-reads + * Opt_no_bulk_read: disable bulk-reads + * Opt_chk_data_crc: check CRCs when reading data nodes + * Opt_no_chk_data_crc: do not check CRCs when reading data nodes * Opt_err: just end of array marker */ enum { Opt_fast_unmount, Opt_norm_unmount, + Opt_bulk_read, + Opt_no_bulk_read, + Opt_chk_data_crc, + Opt_no_chk_data_crc, Opt_err, }; static const match_table_t tokens = { {Opt_fast_unmount, "fast_unmount"}, {Opt_norm_unmount, "norm_unmount"}, + {Opt_bulk_read, "bulk_read"}, + {Opt_no_bulk_read, "no_bulk_read"}, + {Opt_chk_data_crc, "chk_data_crc"}, + {Opt_no_chk_data_crc, "no_chk_data_crc"}, {Opt_err, NULL}, }; @@ -888,6 +935,22 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, c->mount_opts.unmount_mode = 1; c->fast_unmount = 0; break; + case Opt_bulk_read: + c->mount_opts.bulk_read = 2; + c->bulk_read = 1; + break; + case Opt_no_bulk_read: + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + break; + case Opt_chk_data_crc: + c->mount_opts.chk_data_crc = 2; + c->no_chk_data_crc = 0; + break; + case Opt_no_chk_data_crc: + c->mount_opts.chk_data_crc = 
1; + c->no_chk_data_crc = 1; + break; default: ubifs_err("unrecognized mount option \"%s\" " "or missing value", p); @@ -996,6 +1059,8 @@ static int mount_ubifs(struct ubifs_info *c) goto out_free; } + c->always_chk_crc = 1; + err = ubifs_read_superblock(c); if (err) goto out_free; @@ -1032,8 +1097,6 @@ static int mount_ubifs(struct ubifs_info *c) /* Create background thread */ c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); - if (!c->bgt) - c->bgt = ERR_PTR(-EINVAL); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1139,24 +1202,28 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_infos; + c->always_chk_crc = 0; + ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", c->vi.ubi_num, c->vi.vol_id, c->vi.name); if (mounted_read_only) ubifs_msg("mounted read-only"); x = (long long)c->main_lebs * c->leb_size; - ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", - x, x >> 10, x >> 20, c->main_lebs); + ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " + "LEBs)", x, x >> 10, x >> 20, c->main_lebs); x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; - ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", - x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); - ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); - ubifs_msg("media format %d, latest format %d", + ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " + "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); + ubifs_msg("media format: %d (latest is %d)", c->fmt_version, UBIFS_FORMAT_VERSION); + ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); + ubifs_msg("reserved for root: %llu bytes (%llu KiB)", + c->report_rp_size, c->report_rp_size >> 10); dbg_msg("compiled on: " __DATE__ " at " __TIME__); dbg_msg("min. 
I/O unit size: %d bytes", c->min_io_size); dbg_msg("LEB size: %d bytes (%d KiB)", - c->leb_size, c->leb_size / 1024); + c->leb_size, c->leb_size >> 10); dbg_msg("data journal heads: %d", c->jhead_cnt - NONDATA_JHEADS_CNT); dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X" @@ -1282,6 +1349,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) mutex_lock(&c->umount_mutex); c->remounting_rw = 1; + c->always_chk_crc = 1; /* Check for enough free space */ if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { @@ -1345,20 +1413,20 @@ static int ubifs_remount_rw(struct ubifs_info *c) /* Create background thread */ c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); - if (!c->bgt) - c->bgt = ERR_PTR(-EINVAL); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; ubifs_err("cannot spawn \"%s\", error %d", c->bgt_name, err); - return err; + goto out; } wake_up_process(c->bgt); c->orph_buf = vmalloc(c->leb_size); - if (!c->orph_buf) - return -ENOMEM; + if (!c->orph_buf) { + err = -ENOMEM; + goto out; + } /* Check for enough log space */ lnum = c->lhead_lnum + 1; @@ -1385,6 +1453,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) dbg_gen("re-mounted read-write"); c->vfs_sb->s_flags &= ~MS_RDONLY; c->remounting_rw = 0; + c->always_chk_crc = 0; mutex_unlock(&c->umount_mutex); return 0; @@ -1400,6 +1469,7 @@ out: c->ileb_buf = NULL; ubifs_lpt_free(c, 1); c->remounting_rw = 0; + c->always_chk_crc = 0; mutex_unlock(&c->umount_mutex); return err; } @@ -1408,12 +1478,9 @@ out: * commit_on_unmount - commit the journal when un-mounting. * @c: UBIFS file-system description object * - * This function is called during un-mounting and it commits the journal unless - * the "fast unmount" mode is enabled. It also avoids committing the journal if - * it contains too few data. - * - * Sometimes recovery requires the journal to be committed at least once, and - * this function takes care about this. 
+ * This function is called during un-mounting and re-mounting, and it commits + * the journal unless the "fast unmount" mode is enabled. It also avoids + * committing the journal if it contains too few data. */ static void commit_on_unmount(struct ubifs_info *c) { diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 7634c59..d27fd91 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -284,7 +284,7 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, } zn = copy_znode(c, znode); - if (unlikely(IS_ERR(zn))) + if (IS_ERR(zn)) return zn; if (zbr->len) { @@ -470,6 +470,10 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, if (node_len != len) return 0; + if (type == UBIFS_DATA_NODE && !c->always_chk_crc) + if (c->no_chk_data_crc) + return 0; + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); node_crc = le32_to_cpu(ch->crc); if (crc != node_crc) @@ -1128,7 +1132,7 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, ubifs_assert(znode == c->zroot.znode); znode = dirty_cow_znode(c, &c->zroot); } - if (unlikely(IS_ERR(znode)) || !p) + if (IS_ERR(znode) || !p) break; ubifs_assert(path[p - 1] >= 0); ubifs_assert(path[p - 1] < znode->child_cnt); @@ -1492,6 +1496,289 @@ out: } /** + * ubifs_tnc_get_bu_keys - lookup keys for bulk-read. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * Lookup consecutive data node keys for the same inode that reside + * consecutively in the same LEB. 
+ */ +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) +{ + int n, err = 0, lnum = -1, uninitialized_var(offs); + int uninitialized_var(len); + unsigned int block = key_block(c, &bu->key); + struct ubifs_znode *znode; + + bu->cnt = 0; + bu->blk_cnt = 0; + bu->eof = 0; + + mutex_lock(&c->tnc_mutex); + /* Find first key */ + err = ubifs_lookup_level0(c, &bu->key, &znode, &n); + if (err < 0) + goto out; + if (err) { + /* Key found */ + len = znode->zbranch[n].len; + /* The buffer must be big enough for at least 1 node */ + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + /* Add this key */ + bu->zbranch[bu->cnt++] = znode->zbranch[n]; + bu->blk_cnt += 1; + lnum = znode->zbranch[n].lnum; + offs = ALIGN(znode->zbranch[n].offs + len, 8); + } + while (1) { + struct ubifs_zbranch *zbr; + union ubifs_key *key; + unsigned int next_block; + + /* Find next key */ + err = tnc_next(c, &znode, &n); + if (err) + goto out; + zbr = &znode->zbranch[n]; + key = &zbr->key; + /* See if there is another data key for this file */ + if (key_inum(c, key) != key_inum(c, &bu->key) || + key_type(c, key) != UBIFS_DATA_KEY) { + err = -ENOENT; + goto out; + } + if (lnum < 0) { + /* First key found */ + lnum = zbr->lnum; + offs = ALIGN(zbr->offs + zbr->len, 8); + len = zbr->len; + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + } else { + /* + * The data nodes must be in consecutive positions in + * the same LEB. 
+ */ + if (zbr->lnum != lnum || zbr->offs != offs) + goto out; + offs += ALIGN(zbr->len, 8); + len = ALIGN(len, 8) + zbr->len; + /* Must not exceed buffer length */ + if (len > bu->buf_len) + goto out; + } + /* Allow for holes */ + next_block = key_block(c, key); + bu->blk_cnt += (next_block - block - 1); + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + block = next_block; + /* Add this key */ + bu->zbranch[bu->cnt++] = *zbr; + bu->blk_cnt += 1; + /* See if we have room for more */ + if (bu->cnt >= UBIFS_MAX_BULK_READ) + goto out; + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + } +out: + if (err == -ENOENT) { + bu->eof = 1; + err = 0; + } + bu->gc_seq = c->gc_seq; + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + /* + * An enormous hole could cause bulk-read to encompass too many + * page cache pages, so limit the number here. + */ + if (bu->blk_cnt > UBIFS_MAX_BULK_READ) + bu->blk_cnt = UBIFS_MAX_BULK_READ; + /* + * Ensure that bulk-read covers a whole number of page cache + * pages. + */ + if (UBIFS_BLOCKS_PER_PAGE == 1 || + !(bu->blk_cnt & (UBIFS_BLOCKS_PER_PAGE - 1))) + return 0; + if (bu->eof) { + /* At the end of file we can round up */ + bu->blk_cnt += UBIFS_BLOCKS_PER_PAGE - 1; + return 0; + } + /* Exclude data nodes that do not make up a whole page cache page */ + block = key_block(c, &bu->key) + bu->blk_cnt; + block &= ~(UBIFS_BLOCKS_PER_PAGE - 1); + while (bu->cnt) { + if (key_block(c, &bu->zbranch[bu->cnt - 1].key) < block) + break; + bu->cnt -= 1; + } + return 0; +} + +/** + * read_wbuf - bulk-read from a LEB with a wbuf. + * @wbuf: wbuf that may overlap the read + * @buf: buffer into which to read + * @len: read length + * @lnum: LEB number from which to read + * @offs: offset from which to read + * + * This functions returns %0 on success or a negative error code on failure. 
+ */ +static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum, + int offs) +{ + const struct ubifs_info *c = wbuf->c; + int rlen, overlap; + + dbg_io("LEB %d:%d, length %d", lnum, offs, len); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(offs + len <= c->leb_size); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubi_read(c->ubi, lnum, buf, offs, len); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) + /* Read everything that goes before write-buffer */ + return ubi_read(c->ubi, lnum, buf, offs, rlen); + + return 0; +} + +/** + * validate_data_node - validate data nodes for bulk-read. + * @c: UBIFS file-system description object + * @buf: buffer containing data node to validate + * @zbr: zbranch of data node to validate + * + * This function returns %0 on success or a negative error code on failure. 
+ */ +static int validate_data_node(struct ubifs_info *c, void *buf, + struct ubifs_zbranch *zbr) +{ + union ubifs_key key1; + struct ubifs_ch *ch = buf; + int err, len; + + if (ch->node_type != UBIFS_DATA_NODE) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, UBIFS_DATA_NODE); + goto out_err; + } + + err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", UBIFS_DATA_NODE); + goto out; + } + + len = le32_to_cpu(ch->len); + if (len != zbr->len) { + ubifs_err("bad node length %d, expected %d", len, zbr->len); + goto out_err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, buf + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, &zbr->key, &key1)) { + ubifs_err("bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnc("looked for key %s found node's key %s", + DBGKEY(&zbr->key), DBGKEY1(&key1)); + goto out_err; + } + + return 0; + +out_err: + err = -EINVAL; +out: + ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return err; +} + +/** + * ubifs_tnc_bulk_read - read a number of data nodes in one go. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * This function reads and validates the data nodes that were identified by the + * 'ubifs_tnc_get_bu_keys()' function. This function returns %0 on success, + * -EAGAIN to indicate a race with GC, or another negative error code on + * failure. 
+ */ +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) +{ + int lnum = bu->zbranch[0].lnum, offs = bu->zbranch[0].offs, len, err, i; + struct ubifs_wbuf *wbuf; + void *buf; + + len = bu->zbranch[bu->cnt - 1].offs; + len += bu->zbranch[bu->cnt - 1].len - offs; + if (len > bu->buf_len) { + ubifs_err("buffer too small %d vs %d", bu->buf_len, len); + return -EINVAL; + } + + /* Do the read */ + wbuf = ubifs_get_wbuf(c, lnum); + if (wbuf) + err = read_wbuf(wbuf, bu->buf, len, lnum, offs); + else + err = ubi_read(c->ubi, lnum, bu->buf, offs, len); + + /* Check for a race with GC */ + if (maybe_leb_gced(c, lnum, bu->gc_seq)) + return -EAGAIN; + + if (err && err != -EBADMSG) { + ubifs_err("failed to read from LEB %d:%d, error %d", + lnum, offs, err); + dbg_dump_stack(); + dbg_tnc("key %s", DBGKEY(&bu->key)); + return err; + } + + /* Validate the nodes read */ + buf = bu->buf; + for (i = 0; i < bu->cnt; i++) { + err = validate_data_node(c, buf, &bu->zbranch[i]); + if (err) + return err; + buf = buf + ALIGN(bu->zbranch[i].len, 8); + } + + return 0; +} + +/** * do_lookup_nm- look up a "hashed" node. * @c: UBIFS file-system description object * @key: node key to lookup @@ -1675,7 +1962,7 @@ static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode, { struct ubifs_znode *zn, *zi, *zp; int i, keep, move, appending = 0; - union ubifs_key *key = &zbr->key; + union ubifs_key *key = &zbr->key, *key1; ubifs_assert(n >= 0 && n <= c->fanout); @@ -1716,20 +2003,33 @@ again: zn->level = znode->level; /* Decide where to split */ - if (znode->level == 0 && n == c->fanout && - key_type(c, key) == UBIFS_DATA_KEY) { - union ubifs_key *key1; - - /* - * If this is an inode which is being appended - do not split - * it because no other zbranches can be inserted between - * zbranches of consecutive data nodes anyway. 
- */ - key1 = &znode->zbranch[n - 1].key; - if (key_inum(c, key1) == key_inum(c, key) && - key_type(c, key1) == UBIFS_DATA_KEY && - key_block(c, key1) == key_block(c, key) - 1) - appending = 1; + if (znode->level == 0 && key_type(c, key) == UBIFS_DATA_KEY) { + /* Try not to split consecutive data keys */ + if (n == c->fanout) { + key1 = &znode->zbranch[n - 1].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) + appending = 1; + } else + goto check_split; + } else if (appending && n != c->fanout) { + /* Try not to split consecutive data keys */ + appending = 0; +check_split: + if (n >= (c->fanout + 1) / 2) { + key1 = &znode->zbranch[0].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) { + key1 = &znode->zbranch[n].key; + if (key_inum(c, key1) != key_inum(c, key) || + key_type(c, key1) != UBIFS_DATA_KEY) { + keep = n; + move = c->fanout - keep; + zi = znode; + goto do_split; + } + } + } } if (appending) { @@ -1759,6 +2059,8 @@ again: zbr->znode->parent = zn; } +do_split: + __set_bit(DIRTY_ZNODE, &zn->flags); atomic_long_inc(&c->dirty_zn_cnt); @@ -1785,14 +2087,11 @@ again: /* Insert new znode (produced by spitting) into the parent */ if (zp) { - i = n; + if (n == 0 && zi == znode && znode->iip == 0) + correct_parent_keys(c, znode); + /* Locate insertion point */ n = znode->iip + 1; - if (appending && n != c->fanout) - appending = 0; - - if (i == 0 && zi == znode && znode->iip == 0) - correct_parent_keys(c, znode); /* Tail recursion */ zbr->key = zn->zbranch[0].key; diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index a25c1cc..b48db99 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -480,8 +480,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, } /* Make sure the key of the read node is correct */ - key_read(c, key, &key1); - if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) { + key_read(c, node + UBIFS_KEY_OFFSET, &key1); + if 
(!keys_eq(c, key, &key1)) { ubifs_err("bad key in node at LEB %d:%d", zbr->lnum, zbr->offs); dbg_tnc("looked for key %s found node's key %s", diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index a9ecbd9..0b37804 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -75,7 +75,6 @@ */ #define UBIFS_BLOCK_SIZE 4096 #define UBIFS_BLOCK_SHIFT 12 -#define UBIFS_BLOCK_MASK 0x00000FFF /* UBIFS padding byte pattern (must not be first or last byte of node magic) */ #define UBIFS_PADDING_BYTE 0xCE diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 17c620b..a7bd32f 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -142,6 +142,9 @@ /* Maximum expected tree height for use by bottom_up_buf */ #define BOTTOM_UP_HEIGHT 64 +/* Maximum number of data nodes to bulk-read */ +#define UBIFS_MAX_BULK_READ 32 + /* * Lockdep classes for UBIFS inode @ui_mutex. */ @@ -328,9 +331,10 @@ struct ubifs_gced_idx_leb { * this inode * @dirty: non-zero if the inode is dirty * @xattr: non-zero if this is an extended attribute inode + * @bulk_read: non-zero if bulk-read should be used * @ui_mutex: serializes inode write-back with the rest of VFS operations, - * serializes "clean <-> dirty" state changes, protects @dirty, - * @ui_size, and @xattr_size + * serializes "clean <-> dirty" state changes, serializes bulk-read, + * protects @dirty, @bulk_read, @ui_size, and @xattr_size * @ui_lock: protects @synced_i_size * @synced_i_size: synchronized size of inode, i.e. 
the value of inode size * currently stored on the flash; used only for regular file @@ -338,6 +342,8 @@ struct ubifs_gced_idx_leb { * @ui_size: inode size used by UBIFS when writing to flash * @flags: inode flags (@UBIFS_COMPR_FL, etc) * @compr_type: default compression type used for this inode + * @last_page_read: page number of last page read (for bulk read) + * @read_in_a_row: number of consecutive pages read in a row (for bulk read) * @data_len: length of the data attached to the inode * @data: inode's data * @@ -379,12 +385,15 @@ struct ubifs_inode { unsigned int xattr_names; unsigned int dirty:1; unsigned int xattr:1; + unsigned int bulk_read:1; struct mutex ui_mutex; spinlock_t ui_lock; loff_t synced_i_size; loff_t ui_size; int flags; int compr_type; + pgoff_t last_page_read; + pgoff_t read_in_a_row; int data_len; void *data; }; @@ -698,8 +707,8 @@ struct ubifs_jhead { * struct ubifs_zbranch - key/coordinate/length branch stored in znodes. * @key: key * @znode: znode address in memory - * @lnum: LEB number of the indexing node - * @offs: offset of the indexing node within @lnum + * @lnum: LEB number of the target node (indexing node or data node) + * @offs: target node offset within @lnum * @len: target node length */ struct ubifs_zbranch { @@ -744,6 +753,28 @@ struct ubifs_znode { }; /** + * struct bu_info - bulk-read information + * @key: first data node key + * @zbranch: zbranches of data nodes to bulk read + * @buf: buffer to read into + * @buf_len: buffer length + * @gc_seq: GC sequence number to detect races with GC + * @cnt: number of data nodes for bulk read + * @blk_cnt: number of data blocks including holes + * @eof: end of file reached + */ +struct bu_info { + union ubifs_key key; + struct ubifs_zbranch zbranch[UBIFS_MAX_BULK_READ]; + void *buf; + int buf_len; + int gc_seq; + int cnt; + int blk_cnt; + int eof; +}; + +/** * struct ubifs_node_range - node length range description data structure.
* @len: fixed node length * @min_len: minimum possible node length @@ -862,9 +893,13 @@ struct ubifs_orphan { /** * struct ubifs_mount_opts - UBIFS-specific mount options information. * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) + * @bulk_read: enable bulk-reads + * @chk_data_crc: check CRCs when reading data nodes */ struct ubifs_mount_opts { unsigned int unmount_mode:2; + unsigned int bulk_read:2; + unsigned int chk_data_crc:2; }; /** @@ -905,13 +940,12 @@ struct ubifs_mount_opts { * @cmt_state: commit state * @cs_lock: commit state lock * @cmt_wq: wait queue to sleep on if the log is full and a commit is running + * * @fast_unmount: do not run journal commit before un-mounting * @big_lpt: flag that LPT is too big to write whole during commit - * @check_lpt_free: flag that indicates LPT GC may be needed - * @nospace: non-zero if the file-system does not have flash space (used as - * optimization) - * @nospace_rp: the same as @nospace, but additionally means that even reserved - * pool is full + * @no_chk_data_crc: do not check CRCs when reading data nodes (except during + * recovery) + * @bulk_read: enable bulk-reads * * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and * @calc_idx_sz @@ -935,6 +969,7 @@ struct ubifs_mount_opts { * @mst_node: master node * @mst_offs: offset of valid master node * @mst_mutex: protects the master node area, @mst_node, and @mst_offs + * @bulk_read_buf_size: buffer size for bulk-reads * * @log_lebs: number of logical eraseblocks in the log * @log_bytes: log size in bytes @@ -977,12 +1012,17 @@ struct ubifs_mount_opts { * but which still have to be taken into account because * the index has not been committed so far * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth, - * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, and @lst; + * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst, + * @nospace, and @nospace_rp; * @min_idx_lebs: minimum number of 
LEBs required for the index * @old_idx_sz: size of index on flash * @calc_idx_sz: temporary variable which is used to calculate new index size * (contains accurate new index size at end of TNC commit start) * @lst: lprops statistics + * @nospace: non-zero if the file-system does not have flash space (used as + * optimization) + * @nospace_rp: the same as @nospace, but additionally means that even reserved + * pool is full * * @page_budget: budget for a page * @inode_budget: budget for an inode @@ -1061,6 +1101,7 @@ struct ubifs_mount_opts { * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab * @dirty_nn_cnt: number of dirty nnodes * @dirty_pn_cnt: number of dirty pnodes + * @check_lpt_free: flag that indicates LPT GC may be needed * @lpt_sz: LPT size * @lpt_nod_buf: buffer for an on-flash nnode or pnode * @lpt_buf: buffer of LEB size used by LPT @@ -1102,6 +1143,7 @@ struct ubifs_mount_opts { * @rcvrd_mst_node: recovered master node to write when mounting ro to rw * @size_tree: inode size information for recovery * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) + * @always_chk_crc: always check CRCs (while mounting and remounting rw) * @mount_opts: UBIFS-specific mount options * * @dbg_buf: a buffer of LEB size used for debugging purposes @@ -1146,11 +1188,11 @@ struct ubifs_info { int cmt_state; spinlock_t cs_lock; wait_queue_head_t cmt_wq; + unsigned int fast_unmount:1; unsigned int big_lpt:1; - unsigned int check_lpt_free:1; - unsigned int nospace:1; - unsigned int nospace_rp:1; + unsigned int no_chk_data_crc:1; + unsigned int bulk_read:1; struct mutex tnc_mutex; struct ubifs_zbranch zroot; @@ -1175,6 +1217,7 @@ struct ubifs_info { struct ubifs_mst_node *mst_node; int mst_offs; struct mutex mst_mutex; + int bulk_read_buf_size; int log_lebs; long long log_bytes; @@ -1218,6 +1261,8 @@ struct ubifs_info { unsigned long long old_idx_sz; unsigned long long calc_idx_sz; struct ubifs_lp_stats lst; + unsigned int nospace:1; + 
unsigned int nospace_rp:1; int page_budget; int inode_budget; @@ -1294,6 +1339,7 @@ struct ubifs_info { int lpt_drty_flgs; int dirty_nn_cnt; int dirty_pn_cnt; + int check_lpt_free; long long lpt_sz; void *lpt_nod_buf; void *lpt_buf; @@ -1335,6 +1381,7 @@ struct ubifs_info { struct ubifs_mst_node *rcvrd_mst_node; struct rb_root size_tree; int remounting_rw; + int always_chk_crc; struct ubifs_mount_opts mount_opts; #ifdef CONFIG_UBIFS_FS_DEBUG @@ -1347,6 +1394,12 @@ struct ubifs_info { unsigned long fail_timeout; unsigned int fail_cnt; unsigned int fail_cnt_max; + long long chk_lpt_sz; + long long chk_lpt_sz2; + long long chk_lpt_wastage; + int chk_lpt_lebs; + int new_nhead_lnum; + int new_nhead_offs; #endif }; @@ -1377,7 +1430,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, int offs, int dtype); int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet); + int offs, int quiet, int chk_crc); void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); int ubifs_io_init(struct ubifs_info *c); @@ -1490,6 +1543,8 @@ void destroy_old_idx(struct ubifs_info *c); int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, int lnum, int offs); int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode); +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu); +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu); /* tnc_misc.c */ struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, @@ -1586,12 +1641,10 @@ int ubifs_lpt_post_commit(struct ubifs_info *c); void ubifs_lpt_free(struct ubifs_info *c, int wr_only); /* lprops.c */ -void ubifs_get_lprops(struct ubifs_info *c); const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, const struct ubifs_lprops 
*lp, int free, int dirty, int flags, int idx_gc_cnt); -void ubifs_release_lprops(struct ubifs_info *c); void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats); void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, int cat); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 649bec7..cfd31e2 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -446,7 +446,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) int type; xent = ubifs_tnc_next_ent(c, &key, &nm); - if (unlikely(IS_ERR(xent))) { + if (IS_ERR(xent)) { err = PTR_ERR(xent); break; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index d3231947..082409c 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -142,7 +142,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, } static struct fileIdentDesc *udf_find_entry(struct inode *dir, - struct dentry *dentry, + struct qstr *child, struct udf_fileident_bh *fibh, struct fileIdentDesc *cfi) { @@ -159,8 +159,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, sector_t offset; struct extent_position epos = {}; struct udf_inode_info *dinfo = UDF_I(dir); - int isdotdot = dentry->d_name.len == 2 && - dentry->d_name.name[0] == '.' && dentry->d_name.name[1] == '.'; + int isdotdot = child->len == 2 && + child->name[0] == '.' 
&& child->name[1] == '.'; size = udf_ext0_offset(dir) + dir->i_size; f_pos = udf_ext0_offset(dir); @@ -238,8 +238,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, continue; flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); - if (flen && udf_match(flen, fname, dentry->d_name.len, - dentry->d_name.name)) + if (flen && udf_match(flen, fname, child->len, child->name)) goto out_ok; } @@ -283,7 +282,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, } else #endif /* UDF_RECOVERY */ - if (udf_find_entry(dir, dentry, &fibh, &cfi)) { + if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) { if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); @@ -783,7 +782,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) retval = -ENOENT; lock_kernel(); - fi = udf_find_entry(dir, dentry, &fibh, &cfi); + fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); if (!fi) goto out; @@ -829,7 +828,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) retval = -ENOENT; lock_kernel(); - fi = udf_find_entry(dir, dentry, &fibh, &cfi); + fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); if (!fi) goto out; @@ -1113,7 +1112,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct udf_inode_info *old_iinfo = UDF_I(old_inode); lock_kernel(); - ofi = udf_find_entry(old_dir, old_dentry, &ofibh, &ocfi); + ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { if (ofibh.sbh != ofibh.ebh) brelse(ofibh.ebh); @@ -1124,7 +1123,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, != old_inode->i_ino) goto end_rename; - nfi = udf_find_entry(new_dir, new_dentry, &nfibh, &ncfi); + nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); if (nfi) { if (!new_inode) { if (nfibh.sbh != nfibh.ebh) @@ -1192,7 +1191,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, udf_write_fi(new_dir, &ncfi, nfi, &nfibh, 
NULL, NULL); /* The old fid may have moved - find it again */ - ofi = udf_find_entry(old_dir, old_dentry, &ofibh, &ocfi); + ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); udf_delete_entry(old_dir, ofi, &ofibh, &ocfi); if (new_inode) { @@ -1243,15 +1242,11 @@ end_rename: static struct dentry *udf_get_parent(struct dentry *child) { - struct dentry *parent; struct inode *inode = NULL; - struct dentry dotdot; + struct qstr dotdot = {.name = "..", .len = 2}; struct fileIdentDesc cfi; struct udf_fileident_bh fibh; - dotdot.d_name.name = ".."; - dotdot.d_name.len = 2; - lock_kernel(); if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) goto out_unlock; @@ -1266,13 +1261,7 @@ static struct dentry *udf_get_parent(struct dentry *child) goto out_unlock; unlock_kernel(); - parent = d_alloc_anon(inode); - if (!parent) { - iput(inode); - parent = ERR_PTR(-ENOMEM); - } - - return parent; + return d_obtain_alias(inode); out_unlock: unlock_kernel(); return ERR_PTR(-EACCES); @@ -1283,7 +1272,6 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, u16 partref, __u32 generation) { struct inode *inode; - struct dentry *result; kernel_lb_addr loc; if (block == 0) @@ -1300,12 +1288,7 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, iput(inode); return ERR_PTR(-ESTALE); } - result = d_alloc_anon(inode); - if (!result) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - return result; + return d_obtain_alias(inode); } static struct dentry *udf_fh_to_dentry(struct super_block *sb, diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index df0bef1..dbbbc46 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -667,4 +667,5 @@ const struct file_operations ufs_dir_operations = { .read = generic_read_dir, .readdir = ufs_readdir, .fsync = file_fsync, + .llseek = generic_file_llseek, }; diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 24fd598..7f7abec 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ 
b/fs/xfs/linux-2.6/xfs_export.c @@ -148,7 +148,6 @@ xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid, { struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; struct inode *inode = NULL; - struct dentry *result; if (fh_len < xfs_fileid_length(fileid_type)) return NULL; @@ -164,16 +163,7 @@ xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid, break; } - if (!inode) - return NULL; - if (IS_ERR(inode)) - return ERR_CAST(inode); - result = d_alloc_anon(inode); - if (!result) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - return result; + return d_obtain_alias(inode); } STATIC struct dentry * @@ -182,7 +172,6 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, { struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; struct inode *inode = NULL; - struct dentry *result; switch (fileid_type) { case FILEID_INO32_GEN_PARENT: @@ -195,16 +184,7 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, break; } - if (!inode) - return NULL; - if (IS_ERR(inode)) - return ERR_CAST(inode); - result = d_alloc_anon(inode); - if (!result) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - return result; + return d_obtain_alias(inode); } STATIC struct dentry * @@ -213,18 +193,12 @@ xfs_fs_get_parent( { int error; struct xfs_inode *cip; - struct dentry *parent; error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL); if (unlikely(error)) return ERR_PTR(-error); - parent = d_alloc_anon(VFS_I(cip)); - if (unlikely(!parent)) { - iput(VFS_I(cip)); - return ERR_PTR(-ENOMEM); - } - return parent; + return d_obtain_alias(VFS_I(cip)); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 5311c1a..3fee790 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -204,15 +204,6 @@ xfs_file_fsync( return -xfs_fsync(XFS_I(dentry->d_inode)); } -/* - * Unfortunately we can't just use the clean and simple readdir implementation - * below, because nfs 
might call back into ->lookup from the filldir callback - * and that will deadlock the low-level btree code. - * - * Hopefully we'll find a better workaround that allows to use the optimal - * version at least for local readdirs for 2.6.25. - */ -#if 0 STATIC int xfs_file_readdir( struct file *filp, @@ -244,125 +235,6 @@ xfs_file_readdir( return -error; return 0; } -#else - -struct hack_dirent { - u64 ino; - loff_t offset; - int namlen; - unsigned int d_type; - char name[]; -}; - -struct hack_callback { - char *dirent; - size_t len; - size_t used; -}; - -STATIC int -xfs_hack_filldir( - void *__buf, - const char *name, - int namlen, - loff_t offset, - u64 ino, - unsigned int d_type) -{ - struct hack_callback *buf = __buf; - struct hack_dirent *de = (struct hack_dirent *)(buf->dirent + buf->used); - unsigned int reclen; - - reclen = ALIGN(sizeof(struct hack_dirent) + namlen, sizeof(u64)); - if (buf->used + reclen > buf->len) - return -EINVAL; - - de->namlen = namlen; - de->offset = offset; - de->ino = ino; - de->d_type = d_type; - memcpy(de->name, name, namlen); - buf->used += reclen; - return 0; -} - -STATIC int -xfs_file_readdir( - struct file *filp, - void *dirent, - filldir_t filldir) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - xfs_inode_t *ip = XFS_I(inode); - struct hack_callback buf; - struct hack_dirent *de; - int error; - loff_t size; - int eof = 0; - xfs_off_t start_offset, curr_offset, offset; - - /* - * Try fairly hard to get memory - */ - buf.len = PAGE_CACHE_SIZE; - do { - buf.dirent = kmalloc(buf.len, GFP_KERNEL); - if (buf.dirent) - break; - buf.len >>= 1; - } while (buf.len >= 1024); - - if (!buf.dirent) - return -ENOMEM; - - curr_offset = filp->f_pos; - if (curr_offset == 0x7fffffff) - offset = 0xffffffff; - else - offset = filp->f_pos; - - while (!eof) { - unsigned int reclen; - - start_offset = offset; - - buf.used = 0; - error = -xfs_readdir(ip, &buf, buf.len, &offset, - xfs_hack_filldir); - if (error || offset == start_offset) { - 
size = 0; - break; - } - - size = buf.used; - de = (struct hack_dirent *)buf.dirent; - while (size > 0) { - curr_offset = de->offset /* & 0x7fffffff */; - if (filldir(dirent, de->name, de->namlen, - curr_offset & 0x7fffffff, - de->ino, de->d_type)) { - goto done; - } - - reclen = ALIGN(sizeof(struct hack_dirent) + de->namlen, - sizeof(u64)); - size -= reclen; - de = (struct hack_dirent *)((char *)de + reclen); - } - } - - done: - if (!error) { - if (size == 0) - filp->f_pos = offset & 0x7fffffff; - else if (de) - filp->f_pos = curr_offset; - } - - kfree(buf.dirent); - return error; -} -#endif STATIC int xfs_file_mmap( diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 48799ba..d3438c7 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -311,11 +311,10 @@ xfs_open_by_handle( return new_fd; } - dentry = d_alloc_anon(inode); - if (dentry == NULL) { - iput(inode); + dentry = d_obtain_alias(inode); + if (IS_ERR(dentry)) { put_unused_fd(new_fd); - return -XFS_ERROR(ENOMEM); + return PTR_ERR(dentry); } /* Ensure umount returns EBUSY on umounts while this file is open. */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index e390136..37ebe36 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -589,7 +589,7 @@ xfs_blkdev_get( { int error = 0; - *bdevp = open_bdev_excl(name, 0, mp); + *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); printk("XFS: Invalid device [%s], error=%d\n", name, error); @@ -603,7 +603,7 @@ xfs_blkdev_put( struct block_device *bdev) { if (bdev) - close_bdev_excl(bdev); + close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); } /* |