From 2f56f56ad991edd51ffd0baf1182245ee1277a04 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 27 Oct 2010 20:59:49 -0700 Subject: Revert "ceph: update issue_seq on cap grant" This reverts commit d91f2438d881514e4a923fd786dbd94b764a9440. The intent of issue_seq is to distinguish between mds->client messages that (re)create the cap and those that do not, which means we should _only_ be updating that value in the create paths. By updating it in handle_cap_grant, we reset it to zero, which then breaks release. The larger question is what workload/problem made me think it should be updated here... Signed-off-by: Sage Weil --- fs/ceph/caps.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 98ab13e..6e0942f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2273,8 +2273,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; - unsigned seq = le32_to_cpu(grant->seq); - unsigned issue_seq = le32_to_cpu(grant->issue_seq); + int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); int issued, implemented, used, wanted, dirty; u64 size = le64_to_cpu(grant->size); @@ -2286,8 +2285,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, int revoked_rdcache = 0; int queue_invalidate = 0; - dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", - inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", + inode, cap, mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); @@ -2383,7 +2382,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } cap->seq = seq; - cap->issue_seq = issue_seq; /* file layout may have changed */ ci->i_layout = grant->layout; -- cgit v1.1 From 50ae28f0144a790fc63a5b89b9aca3ffa9f88522 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 1 Nov 2010 16:08:55 +0100 Subject: FS: cifs, remove unneeded NULL tests Stanse found that pSMBFile in cifs_ioctl and file->f_path.dentry in cifs_user_write are dereferenced prior their test to NULL. The alternative is not to dereference them before the tests. The patch is to point out the problem, you have to decide. While at it we cache the inode in cifs_user_write to a local variable and use all over the function. Signed-off-by: Jiri Slaby Cc: Steve French Cc: linux-cifs@vger.kernel.org Cc: Jeff Layton Cc: Christoph Hellwig Signed-off-by: Steve French --- fs/cifs/file.c | 25 +++++++++++-------------- fs/cifs/ioctl.c | 4 ---- 2 files changed, 11 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ae82159..5d06eb3 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -956,6 +956,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t *poffset) { + struct inode *inode = file->f_path.dentry->d_inode; int rc = 0; unsigned int bytes_written = 0; unsigned int total_written; @@ -963,7 +964,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, struct cifsTconInfo *pTcon; int xid, long_op; struct cifsFileInfo *open_file; - struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(inode); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); @@ -1029,21 +1030,17 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, cifs_stats_bytes_written(pTcon, total_written); - /* since the write may have blocked check these pointers again */ - if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) { - struct inode *inode = file->f_path.dentry->d_inode; /* Do not update local mtime - server will set its actual value on write - * inode->i_ctime = inode->i_mtime = - * current_fs_time(inode->i_sb);*/ - if (total_written > 0) { - spin_lock(&inode->i_lock); - if (*poffset > file->f_path.dentry->d_inode->i_size) - i_size_write(file->f_path.dentry->d_inode, - *poffset); - spin_unlock(&inode->i_lock); - } - mark_inode_dirty_sync(file->f_path.dentry->d_inode); + * inode->i_ctime = inode->i_mtime = + * current_fs_time(inode->i_sb);*/ + if (total_written > 0) { + spin_lock(&inode->i_lock); + if (*poffset > inode->i_size) + i_size_write(inode, *poffset); + spin_unlock(&inode->i_lock); } + mark_inode_dirty_sync(inode); + FreeXid(xid); return total_written; } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 077bf75..2fa22f2 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -63,8 +63,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) #ifdef CONFIG_CIFS_POSIX case FS_IOC_GETFLAGS: if (CIFS_UNIX_EXTATTR_CAP & caps) { - if (pSMBFile == NULL) - break; rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, &ExtAttrBits, &ExtAttrMask); if (rc == 0) @@ -80,8 +78,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EFAULT; break; } - if (pSMBFile == NULL) - break; /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, extAttrBits, &ExtAttrMask);*/ } -- cgit v1.1 From f4245bd4ebf903541ba758ad06c118626d8c6f18 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 2 Nov 2010 14:07:17 -0400 Subject: ext4: fix lazyinit hang after removing request When the request has been removed from the list and no other request has been issued, we will end up with next wakeup scheduled to MAX_JIFFY_OFFSET which is bad. So check for that. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 40131b7..8d1d942 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2740,7 +2740,8 @@ cont_thread: if (freezing(current)) refrigerator(); - if (time_after_eq(jiffies, next_wakeup)) { + if ((time_after_eq(jiffies, next_wakeup)) || + (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } -- cgit v1.1 From b2c78cd09b6ef78c8f20190f0b3e6df1d3651b70 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 2 Nov 2010 14:19:30 -0400 Subject: ext4: "ret" may be used uninitialized in ext4_lazyinit_thread() Newer GCC's reported the following build warning: fs/ext4/super.c: In function 'ext4_lazyinit_thread': fs/ext4/super.c:2702: warning: 'ret' may be used uninitialized in this function Fix it by removing the need for the ret variable in the first place. Signed-off-by: "Lukas Czerner" Reported-by: "Stefan Richter" Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8d1d942..4d7ef31 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2699,7 +2699,6 @@ static int ext4_lazyinit_thread(void *arg) struct ext4_li_request *elr; unsigned long next_wakeup; DEFINE_WAIT(wait); - int ret; BUG_ON(NULL == eli); @@ -2723,13 +2722,12 @@ cont_thread: elr = list_entry(pos, struct ext4_li_request, lr_request); - if (time_after_eq(jiffies, elr->lr_next_sched)) - ret = ext4_run_li_request(elr); - - if (ret) { - ret = 0; - ext4_remove_li_request(elr); - continue; + if (time_after_eq(jiffies, elr->lr_next_sched)) { + if (ext4_run_li_request(elr) != 0) { + /* error, remove the lazy_init job */ + ext4_remove_li_request(elr); + continue; + } } if (time_before(elr->lr_next_sched, next_wakeup)) -- cgit v1.1 From e66673e39ac9d4749bd9676dd1caf928095409f5 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 2 Nov 2010 12:00:42 +0300 Subject: CIFS: Add cifs_set_oplock_level Simplify many places when we need to set oplock level on an inode. Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 1 + fs/cifs/file.c | 38 +++++++++----------------------------- fs/cifs/misc.c | 23 ++++++++++++++++++++--- 3 files changed, 30 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index edb6d90..7f050f4 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -104,6 +104,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); +extern void cifs_set_oplock_level(struct inode *inode, __u32 oplock); extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, struct file *file, struct tcon_link *tlink, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5d06eb3..a566f15 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -146,12 +146,7 @@ client_can_cache: rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, xid, NULL); - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", inode); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + cifs_set_oplock_level(inode, oplock); return rc; } @@ -253,12 +248,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); spin_unlock(&cifs_file_list_lock); - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock inode %p", inode); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + cifs_set_oplock_level(inode, oplock); file->private_data = pCifsFile; return pCifsFile; @@ -271,8 +261,10 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { + struct inode *inode = cifs_file->dentry->d_inode; struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); - struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsLockInfo *li, *tmp; spin_lock(&cifs_file_list_lock); @@ -288,8 +280,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (list_empty(&cifsi->openFileList)) { cFYI(1, "closing last open instance for inode %p", cifs_file->dentry->d_inode); - cifsi->clientCanCacheRead = false; - cifsi->clientCanCacheAll = false; + cifs_set_oplock_level(inode, 0); } spin_unlock(&cifs_file_list_lock); @@ -607,8 +598,6 @@ reopen_success: rc = filemap_write_and_wait(inode->i_mapping); mapping_set_error(inode->i_mapping, rc); - pCifsInode->clientCanCacheAll = false; - pCifsInode->clientCanCacheRead = false; if (tcon->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, xid); @@ -622,18 +611,9 @@ reopen_success: invalidate the current end of file on the server we can not go to the server to get the new inod info */ - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", - pCifsFile->dentry->d_inode); - } else if ((oplock & 0xF) == OPLOCK_READ) { - pCifsInode->clientCanCacheRead = true; - pCifsInode->clientCanCacheAll = false; - } else { - pCifsInode->clientCanCacheRead = false; - pCifsInode->clientCanCacheAll = false; - } + + cifs_set_oplock_level(inode, oplock); + cifs_relock_file(pCifsFile); reopen_error_exit: diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index c4e296f..d3b9dde 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -569,10 +569,9 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) cFYI(1, "file id match, oplock break"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - pCifsInode->clientCanCacheAll = false; - if (pSMB->OplockLevel == 0) - pCifsInode->clientCanCacheRead = false; + cifs_set_oplock_level(netfile->dentry->d_inode, + pSMB->OplockLevel); /* * cifs_oplock_break_put() can't be called * from here. Get reference after queueing @@ -722,3 +721,21 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) cifs_sb_master_tcon(cifs_sb)->treeName); } } + +void cifs_set_oplock_level(struct inode *inode, __u32 oplock) +{ + struct cifsInodeInfo *cinode = CIFS_I(inode); + + if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { + cinode->clientCanCacheAll = true; + cinode->clientCanCacheRead = true; + cFYI(1, "Exclusive Oplock granted on inode %p", inode); + } else if ((oplock & 0xF) == OPLOCK_READ) { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = true; + cFYI(1, "Level II Oplock granted on inode %p", inode); + } else { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = false; + } +} -- cgit v1.1 From df098db12ada832c0232ee1f91eff21a8701889c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 30 Oct 2010 17:06:21 -0400 Subject: cifs: trivial doc fix: note setlease implemented Signed-off-by: J. Bruce Fields Signed-off-by: Steve French --- fs/cifs/TODO | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/TODO b/fs/cifs/TODO index 5aff46c..355abcdc 100644 --- a/fs/cifs/TODO +++ b/fs/cifs/TODO @@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for v) mount check for unmatched uids -w) Add support for new vfs entry points for setlease and fallocate +w) Add support for new vfs entry point for fallocate x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of processes can proceed better in parallel (on the server) -- cgit v1.1 From 413e661c136c52290de1ee19a1b049a4da9dbf51 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 28 Oct 2010 13:33:38 -0400 Subject: cifs: store pointer to master tlink in superblock (try #2) This is the second version of this patch, the only difference between it and the first one is that this explicitly makes cifs_sb_master_tlink a static inline. Instead of keeping a tag on the master tlink in the tree, just keep a pointer to the master in the superblock. That eliminates the need for using the radix tree to look up a tagged entry. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_fs_sb.h | 2 +- fs/cifs/connect.c | 20 ++++---------------- 2 files changed, 5 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 525ba59..79576da 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -43,8 +43,8 @@ struct cifs_sb_info { struct radix_tree_root tlink_tree; -#define CIFS_TLINK_MASTER_TAG 0 /* is "master" (mount) tcon */ spinlock_t tlink_tree_lock; + struct tcon_link *master_tlink; struct nls_table *local_nls; unsigned int rsize; unsigned int wsize; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9eb327d..197ac57 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2914,11 +2914,11 @@ remote_path_check: spin_lock(&cifs_sb->tlink_tree_lock); radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink); - radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid, - CIFS_TLINK_MASTER_TAG); spin_unlock(&cifs_sb->tlink_tree_lock); radix_tree_preload_end(); + cifs_sb->master_tlink = tlink; + queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); @@ -3271,22 +3271,10 @@ out: return tcon; } -static struct tcon_link * +static inline struct tcon_link * cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) { - struct tcon_link *tlink; - unsigned int ret; - - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink, - 0, 1, CIFS_TLINK_MASTER_TAG); - spin_unlock(&cifs_sb->tlink_tree_lock); - - /* the master tcon should always be present */ - if (ret == 0) - BUG(); - - return tlink; + return cifs_sb->master_tlink; } struct cifsTconInfo * -- cgit v1.1 From b647c35f77af9c07d336247b23014596e9f0a593 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 28 Oct 2010 11:16:44 -0400 Subject: cifs: convert tlink_tree to a rbtree Radix trees are ideal when you want to track a bunch of pointers and can't embed a tracking structure within the target of those pointers. The tradeoff is an increase in memory, particularly if the tree is sparse. In CIFS, we use the tlink_tree to track tcon_link structs. A tcon_link can never be in more than one tlink_tree, so there's no impediment to using a rb_tree here instead of a radix tree. Convert the new multiuser mount code to use a rb_tree instead. This should reduce the memory required to manage the tlink_tree. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_fs_sb.h | 4 +- fs/cifs/cifsfs.c | 2 +- fs/cifs/cifsglob.h | 3 +- fs/cifs/connect.c | 177 ++++++++++++++++++++++++++++----------------------- 4 files changed, 101 insertions(+), 85 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 79576da..e9a393c 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -15,7 +15,7 @@ * the GNU Lesser General Public License for more details. * */ -#include +#include #ifndef _CIFS_FS_SB_H #define _CIFS_FS_SB_H @@ -42,7 +42,7 @@ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ struct cifs_sb_info { - struct radix_tree_root tlink_tree; + struct rb_root tlink_tree; spinlock_t tlink_tree_lock; struct tcon_link *master_tlink; struct nls_table *local_nls; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 75c4eaa..38526a6 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -116,7 +116,7 @@ cifs_read_super(struct super_block *sb, void *data, return -ENOMEM; spin_lock_init(&cifs_sb->tlink_tree_lock); - INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL); + cifs_sb->tlink_tree = RB_ROOT; rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); if (rc) { diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f259e4d..b577bf0 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -336,7 +336,8 @@ struct cifsTconInfo { * "get" on the container. */ struct tcon_link { - unsigned long tl_index; + struct rb_node tl_rbnode; + uid_t tl_uid; unsigned long tl_flags; #define TCON_LINK_MASTER 0 #define TCON_LINK_PENDING 1 diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 197ac57..c9699ce 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -116,6 +116,7 @@ struct smb_vol { static int ipv4_connect(struct TCP_Server_Info *server); static int ipv6_connect(struct TCP_Server_Info *server); +static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); static void cifs_prune_tlinks(struct work_struct *work); /* @@ -2900,24 +2901,16 @@ remote_path_check: goto mount_fail_check; } - tlink->tl_index = pSesInfo->linux_uid; + tlink->tl_uid = pSesInfo->linux_uid; tlink->tl_tcon = tcon; tlink->tl_time = jiffies; set_bit(TCON_LINK_MASTER, &tlink->tl_flags); set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); - rc = radix_tree_preload(GFP_KERNEL); - if (rc == -ENOMEM) { - kfree(tlink); - goto mount_fail_check; - } - + cifs_sb->master_tlink = tlink; spin_lock(&cifs_sb->tlink_tree_lock); - radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink); + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); - - cifs_sb->master_tlink = tlink; queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); @@ -3107,32 +3100,25 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, int cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) { - int i, ret; + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node; + struct tcon_link *tlink; char *tmp; - struct tcon_link *tlink[8]; - unsigned long index = 0; cancel_delayed_work_sync(&cifs_sb->prune_tlinks); - do { - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, - (void **)tlink, index, - ARRAY_SIZE(tlink)); - /* increment index for next pass */ - if (ret > 0) - index = tlink[ret - 1]->tl_index + 1; - for (i = 0; i < ret; i++) { - cifs_get_tlink(tlink[i]); - clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); - radix_tree_delete(&cifs_sb->tlink_tree, - tlink[i]->tl_index); - } - spin_unlock(&cifs_sb->tlink_tree_lock); + spin_lock(&cifs_sb->tlink_tree_lock); + while ((node = rb_first(root))) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(node, root); - for (i = 0; i < ret; i++) - cifs_put_tlink(tlink[i]); - } while (ret != 0); + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); tmp = cifs_sb->prepath; cifs_sb->prepathlen = 0; @@ -3290,6 +3276,47 @@ cifs_sb_tcon_pending_wait(void *unused) return signal_pending(current) ? -ERESTARTSYS : 0; } +/* find and return a tlink with given uid */ +static struct tcon_link * +tlink_rb_search(struct rb_root *root, uid_t uid) +{ + struct rb_node *node = root->rb_node; + struct tcon_link *tlink; + + while (node) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + + if (tlink->tl_uid > uid) + node = node->rb_left; + else if (tlink->tl_uid < uid) + node = node->rb_right; + else + return tlink; + } + return NULL; +} + +/* insert a tcon_link into the tree */ +static void +tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct tcon_link *tlink; + + while (*new) { + tlink = rb_entry(*new, struct tcon_link, tl_rbnode); + parent = *new; + + if (tlink->tl_uid > new_tlink->tl_uid) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&new_tlink->tl_rbnode, parent, new); + rb_insert_color(&new_tlink->tl_rbnode, root); +} + /* * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the * current task. @@ -3310,14 +3337,14 @@ struct tcon_link * cifs_sb_tlink(struct cifs_sb_info *cifs_sb) { int ret; - unsigned long fsuid = (unsigned long) current_fsuid(); + uid_t fsuid = current_fsuid(); struct tcon_link *tlink, *newtlink; if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); spin_lock(&cifs_sb->tlink_tree_lock); - tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); if (tlink) cifs_get_tlink(tlink); spin_unlock(&cifs_sb->tlink_tree_lock); @@ -3326,36 +3353,24 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb) newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL); if (newtlink == NULL) return ERR_PTR(-ENOMEM); - newtlink->tl_index = fsuid; + newtlink->tl_uid = fsuid; newtlink->tl_tcon = ERR_PTR(-EACCES); set_bit(TCON_LINK_PENDING, &newtlink->tl_flags); set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags); cifs_get_tlink(newtlink); - ret = radix_tree_preload(GFP_KERNEL); - if (ret != 0) { - kfree(newtlink); - return ERR_PTR(ret); - } - spin_lock(&cifs_sb->tlink_tree_lock); /* was one inserted after previous search? */ - tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); if (tlink) { cifs_get_tlink(tlink); spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); kfree(newtlink); goto wait_for_construction; } - ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink); - spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); - if (ret) { - kfree(newtlink); - return ERR_PTR(ret); - } tlink = newtlink; + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); } else { wait_for_construction: ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, @@ -3401,39 +3416,39 @@ cifs_prune_tlinks(struct work_struct *work) { struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, prune_tlinks.work); - struct tcon_link *tlink[8]; - unsigned long now = jiffies; - unsigned long index = 0; - int i, ret; + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node = rb_first(root); + struct rb_node *tmp; + struct tcon_link *tlink; - do { - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, - (void **)tlink, index, - ARRAY_SIZE(tlink)); - /* increment index for next pass */ - if (ret > 0) - index = tlink[ret - 1]->tl_index + 1; - for (i = 0; i < ret; i++) { - if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) || - atomic_read(&tlink[i]->tl_count) != 0 || - time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE, - now)) { - tlink[i] = NULL; - continue; - } - cifs_get_tlink(tlink[i]); - clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); - radix_tree_delete(&cifs_sb->tlink_tree, - tlink[i]->tl_index); - } - spin_unlock(&cifs_sb->tlink_tree_lock); + /* + * Because we drop the spinlock in the loop in order to put the tlink + * it's not guarded against removal of links from the tree. The only + * places that remove entries from the tree are this function and + * umounts. Because this function is non-reentrant and is canceled + * before umount can proceed, this is safe. + */ + spin_lock(&cifs_sb->tlink_tree_lock); + node = rb_first(root); + while (node != NULL) { + tmp = node; + node = rb_next(tmp); + tlink = rb_entry(tmp, struct tcon_link, tl_rbnode); + + if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) || + atomic_read(&tlink->tl_count) != 0 || + time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies)) + continue; - for (i = 0; i < ret; i++) { - if (tlink[i] != NULL) - cifs_put_tlink(tlink[i]); - } - } while (ret != 0); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(tmp, root); + + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); -- cgit v1.1 From 54eeafe1e4fb7b11da17adacacb1fbe279e0cf6e Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 2 Nov 2010 19:22:45 +0000 Subject: [CIFS] Cleanup unused variable build warning Signed-off-by: Steve French --- fs/cifs/file.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a566f15..71185d1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -264,7 +264,6 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct inode *inode = cifs_file->dentry->d_inode; struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsLockInfo *li, *tmp; spin_lock(&cifs_file_list_lock); -- cgit v1.1 From 21b75b019983dfa5c2dda588f4b60b4ca69844a4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 26 Oct 2010 10:07:17 -0400 Subject: nfsd4: fix 4.1 connection registration race If a connection is closed just after a sequence or create_session is sent over it, we could end up trying to register a callback that will never get called since the xprt is already marked dead. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f1e5ec6..ad2bfa6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -673,16 +673,17 @@ static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) spin_unlock(&clp->cl_lock); } -static void nfsd4_register_conn(struct nfsd4_conn *conn) +static int nfsd4_register_conn(struct nfsd4_conn *conn) { conn->cn_xpt_user.callback = nfsd4_conn_lost; - register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); + return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); } static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) { struct nfsd4_conn *conn; u32 flags = NFS4_CDFC4_FORE; + int ret; if (ses->se_flags & SESSION4_BACK_CHAN) flags |= NFS4_CDFC4_BACK; @@ -690,7 +691,10 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) if (!conn) return nfserr_jukebox; nfsd4_hash_conn(conn, ses); - nfsd4_register_conn(conn); + ret = nfsd4_register_conn(conn); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&conn->cn_xpt_user); return nfs_ok; } @@ -1644,6 +1648,7 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi { struct nfs4_client *clp = ses->se_client; struct nfsd4_conn *c; + int ret; spin_lock(&clp->cl_lock); c = __nfsd4_find_conn(new->cn_xprt, ses); @@ -1654,7 +1659,10 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi } __nfsd4_hash_conn(new, ses); spin_unlock(&clp->cl_lock); - nfsd4_register_conn(new); + ret = nfsd4_register_conn(new); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&new->cn_xpt_user); return; } -- cgit v1.1 From ce7e010aef63dc6b37a2354f7c9f5f4aedb37978 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Nov 2010 12:03:21 -0400 Subject: ext4: initialize the percpu counters before replaying the journal We now initialize the percpu counters before replaying the journal, but after the journal, we recalculate the global counters, to deal with the possibility of the per-blockgroup counts getting updated by the journal replay. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 65 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4d7ef31..04352e9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3347,6 +3347,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + } + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount3; + } + sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_max_writeback_mb_bump = 128; @@ -3445,22 +3463,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); -no_journal: - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); - goto failed_mount_wq; - } + /* + * The journal may have updated the bg summary counts, so we + * need to update the global counters. + */ + percpu_counter_set(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + percpu_counter_set(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + percpu_counter_set(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); +no_journal: EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); @@ -3610,10 +3625,6 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount3: if (sbi->s_flex_groups) { if (is_vmalloc_addr(sbi->s_flex_groups)) @@ -3621,6 +3632,10 @@ failed_mount3: else kfree(sbi->s_flex_groups); } + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -3948,13 +3963,11 @@ static int ext4_commit_super(struct super_block *sb, int sync) else es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter)) - ext4_free_blocks_count_set(es, percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeblocks_counter)); - if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) - es->s_free_inodes_count = - cpu_to_le32(percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeinodes_counter)); + ext4_free_blocks_count_set(es, percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeblocks_counter)); + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); -- cgit v1.1 From 6ef933a38ade555a175ecab9d803e6bb73399763 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Wed, 3 Nov 2010 10:53:49 +0530 Subject: cifs: trivial comment fix: tlink_tree is now a rbtree Noticed while reviewing (late) the rbtree conversion patchset (which has been merged already). Cc: Jeff Layton Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index c9699ce..251a17c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3324,7 +3324,7 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) * If the superblock doesn't refer to a multiuser mount, then just return * the master tcon for the mount. * - * First, search the radix tree for an existing tcon for this fsuid. If one + * First, search the rbtree for an existing tcon for this fsuid. If one * exists, then check to see if it's pending construction. If it is then wait * for construction to complete. Once it's no longer pending, check to see if * it failed and either return an error or retry construction, depending on -- cgit v1.1 From d38922949d377da7d47473c7868334408ae3b373 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 2 Nov 2010 16:22:50 -0400 Subject: cifs: dereferencing first then checking This patch is based on Dan's original patch. His original description is below: Smatch complained about a couple checking for NULL after dereferencing bugs. I'm not super familiar with the code so I did the conservative thing and move the dereferences after the checks. The dereferences in cifs_lock() and cifs_fsync() were added in ba00ba64cf0 "cifs: make various routines use the cifsFileInfo->tcon pointer". The dereference in find_writable_file() was added in 6508d904e6f "cifs: have find_readable/writable_file filter by fsuid". The comments there say it's possible to trigger the NULL dereference under stress. Signed-off-by: Dan Carpenter Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 71185d1..777e7f4 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -754,12 +754,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); - - if (file->private_data == NULL) { - rc = -EBADF; - FreeXid(xid); - return rc; - } netfid = ((struct cifsFileInfo *)file->private_data)->netfid; if ((tcon->ses->capabilities & CAP_UNIX) && @@ -1154,7 +1148,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) { struct cifsFileInfo *open_file; - struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_sb_info *cifs_sb; bool any_available = false; int rc; @@ -1168,6 +1162,8 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, return NULL; } + cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; -- cgit v1.1 From c67236281c5d749741f5414103903a7c1b9c4636 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 3 Nov 2010 10:58:57 +0300 Subject: cifs: make cifs_set_oplock_level() take a cifsInodeInfo pointer All the callers already have a pointer to struct cifsInodeInfo. Use it. Signed-off-by: Suresh Jayaraman Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 3 +-- fs/cifs/cifsproto.h | 2 +- fs/cifs/file.c | 8 ++++---- fs/cifs/misc.c | 16 +++++++++------- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 38526a6..9c37897 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -321,8 +321,7 @@ cifs_alloc_inode(struct super_block *sb) /* Until the file is open and we have gotten oplock info back from the server, can not assume caching of file data or metadata */ - cifs_inode->clientCanCacheRead = false; - cifs_inode->clientCanCacheAll = false; + cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 7f050f4..7ed69b6b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -104,7 +104,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); -extern void cifs_set_oplock_level(struct inode *inode, __u32 oplock); +extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, struct file *file, struct tcon_link *tlink, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 777e7f4..06c3e83 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -146,7 +146,7 @@ client_can_cache: rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, xid, NULL); - cifs_set_oplock_level(inode, oplock); + cifs_set_oplock_level(pCifsInode, oplock); return rc; } @@ -248,7 +248,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); spin_unlock(&cifs_file_list_lock); - cifs_set_oplock_level(inode, oplock); + cifs_set_oplock_level(pCifsInode, oplock); file->private_data = pCifsFile; return pCifsFile; @@ -279,7 +279,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (list_empty(&cifsi->openFileList)) { cFYI(1, "closing last open instance for inode %p", cifs_file->dentry->d_inode); - cifs_set_oplock_level(inode, 0); + cifs_set_oplock_level(cifsi, 0); } spin_unlock(&cifs_file_list_lock); @@ -611,7 +611,7 @@ reopen_success: we can not go to the server to get the new inod info */ - cifs_set_oplock_level(inode, oplock); + cifs_set_oplock_level(pCifsInode, oplock); cifs_relock_file(pCifsFile); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index d3b9dde..43f1028 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -570,7 +570,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) cFYI(1, "file id match, oplock break"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - cifs_set_oplock_level(netfile->dentry->d_inode, + cifs_set_oplock_level(pCifsInode, pSMB->OplockLevel); /* * cifs_oplock_break_put() can't be called @@ -722,18 +722,20 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) } } -void cifs_set_oplock_level(struct inode *inode, __u32 oplock) +void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) { - struct cifsInodeInfo *cinode = CIFS_I(inode); + oplock &= 0xF; - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { + if (oplock == OPLOCK_EXCLUSIVE) { cinode->clientCanCacheAll = true; cinode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", inode); - } else if ((oplock & 0xF) == OPLOCK_READ) { + cFYI(1, "Exclusive Oplock granted on inode %p", + &cinode->vfs_inode); + } else if (oplock == OPLOCK_READ) { cinode->clientCanCacheAll = false; cinode->clientCanCacheRead = true; - cFYI(1, "Level II Oplock granted on inode %p", inode); + cFYI(1, "Level II Oplock granted on inode %p", + &cinode->vfs_inode); } else { cinode->clientCanCacheAll = false; cinode->clientCanCacheRead = false; -- cgit v1.1 From d8b16b3d1c9d8d9124d647d05797383d35e2d645 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 6 Nov 2010 12:41:16 -0700 Subject: ceph: fix bad pointer dereference in ceph_fill_trace We dereference *in a few lines down, but only set it on rename. It is apparently pretty rare for this to trigger, but I have been hitting it with a clustered MDSs. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1d6a45b..cd0432c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1055,7 +1055,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ininfo = rinfo->targeti.in; vino.ino = le64_to_cpu(ininfo->ino); vino.snap = le64_to_cpu(ininfo->snapid); - if (!dn->d_inode) { + in = dn->d_inode; + if (!in) { in = ceph_get_inode(sb, vino); if (IS_ERR(in)) { pr_err("fill_trace bad get_inode " -- cgit v1.1 From 7421ab8041d98363edfb85955fa3b9849ffae366 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 7 Nov 2010 09:07:15 -0800 Subject: ceph: fix open for write on clustered mds Normally when we open a file we already have a cap, and simply update the wanted set. However, if we open a file for write, but don't have an auth cap, that doesn't work; we need to open a new cap with the auth MDS. Only reuse existing caps if we are opening for read or the existing cap is auth. Signed-off-by: Sage Weil --- fs/ceph/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e77c28c..87ee944 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file) } /* - * No need to block if we have any caps. Update wanted set + * No need to block if we have caps on the auth MDS (for + * write) or any MDS (for read). Update wanted set * asynchronously. */ spin_lock(&inode->i_lock); - if (__ceph_is_any_real_caps(ci)) { + if (__ceph_is_any_real_caps(ci) && + (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { int mds_wanted = __ceph_caps_mds_wanted(ci); int issued = __ceph_caps_issued(ci, NULL); -- cgit v1.1 From 912a9b0319a8eb9e0834b19a25e01013ab2d6a9f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 7 Nov 2010 09:37:25 -0800 Subject: ceph: only let auth caps update max_size Only the auth MDS has a meaningful max_size value for us, so only update it in fill_inode if we're being issued an auth cap. Otherwise, a random stat result from a non-auth MDS can clobber a meaningful max_size, get the client<->mds cap state out of sync, and make writes hang. Specifically, even if the client re-requests a larger max_size (which it will), the MDS won't respond because as far as it knows we already have a sufficiently large value. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index cd0432c..0a49ffd 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -606,7 +606,14 @@ static int fill_inode(struct inode *inode, le32_to_cpu(info->time_warp_seq), &ctime, &mtime, &atime); - ci->i_max_size = le64_to_cpu(info->max_size); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + ci->i_layout = info->layout; inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; -- cgit v1.1 From feb4cc9bb433bf1491ac5ffbba133f3258dacf06 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 7 Nov 2010 09:39:00 -0800 Subject: ceph: re-request max_size if cap auth changes If the auth cap migrates to another MDS, clear requested_max_size so that we resend any pending max_size increase requests. This fixes potential hangs on writes that extend a file and race with an cap migration between MDSs. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6e0942f..04b207b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2689,6 +2689,11 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, NULL /* no caps context */); try_flush_caps(inode, session, NULL); up_read(&mdsc->snap_rwsem); + + /* make sure we re-request max_size, if necessary */ + spin_lock(&inode->i_lock); + ci->i_requested_max_size = 0; + spin_unlock(&inode->i_lock); } /* -- cgit v1.1 From cd045cb42a266882ac24bc21a3a8d03683c72954 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 4 Nov 2010 11:05:05 -0700 Subject: ceph: fix rdcache_gen usage and invalidate We used to use rdcache_gen to indicate whether we "might" have cached pages. Now we just look at the mapping to determine that. However, some old behavior remains from that transition. First, rdcache_gen == 0 no longer means we have no pages. That can happen at any time (presumably when we carry FILE_CACHE). We should not reset it to zero, and we should not check that it is zero. That means that the only purpose for rdcache_revoking is to resolve races between new issues of FILE_CACHE and an async invalidate. If they are equal, we should invalidate. On success, we decrement rdcache_revoking, so that it is no longer equal to rdcache_gen. Similarly, if we success in doing a sync invalidate, set revoking = gen - 1. (This is a small optimization to avoid doing unnecessary invalidate work and does not affect correctness.) Signed-off-by: Sage Weil --- fs/ceph/caps.c | 4 ++-- fs/ceph/inode.c | 16 +++++++--------- fs/ceph/super.h | 4 +--- 3 files changed, 10 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 04b207b..60d27bc 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode) invalidating_gen == ci->i_rdcache_gen) { /* success. */ dout("try_nonblocking_invalidate %p success\n", inode); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + /* save any racing async invalidate some trouble */ + ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } dout("try_nonblocking_invalidate %p failed\n", inode); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0a49ffd..5a9f907 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1394,11 +1394,8 @@ static void ceph_invalidate_work(struct work_struct *work) spin_lock(&inode->i_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); - if (ci->i_rdcache_gen == 0 || - ci->i_rdcache_revoking != ci->i_rdcache_gen) { - BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen); + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { /* nevermind! */ - ci->i_rdcache_revoking = 0; spin_unlock(&inode->i_lock); goto out; } @@ -1408,15 +1405,16 @@ static void ceph_invalidate_work(struct work_struct *work) ceph_invalidate_nondirty_pages(inode->i_mapping); spin_lock(&inode->i_lock); - if (orig_gen == ci->i_rdcache_gen) { + if (orig_gen == ci->i_rdcache_gen && + orig_gen == ci->i_rdcache_revoking) { dout("invalidate_pages %p gen %d successful\n", inode, ci->i_rdcache_gen); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + ci->i_rdcache_revoking--; check = 1; } else { - dout("invalidate_pages %p gen %d raced, gen now %d\n", - inode, orig_gen, ci->i_rdcache_gen); + dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", + inode, orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); } spin_unlock(&inode->i_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1886294..7f01728 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -293,9 +293,7 @@ struct ceph_inode_info { int i_rd_ref, i_rdcache_ref, i_wr_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; u32 i_shared_gen; /* increment each time we get FILE_SHARED */ - u32 i_rdcache_gen; /* we increment this each time we get - FILE_CACHE. If it's non-zero, we - _may_ have cached pages. */ + u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ struct list_head i_unsafe_writes; /* uncommitted sync writes */ -- cgit v1.1 From cb4276cca4695670916a82e359f2e3776f0a9138 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 8 Nov 2010 07:28:52 -0800 Subject: ceph: fix uid/gid on resent mds requests MDS requests can be rebuilt and resent in non-process context, but were filling in uid/gid from current_fsuid/gid. Put that information in the request struct on request setup. This fixes incorrect (and root) uid/gid getting set for requests that are forwarded between MDSs, usually due to metadata migrations. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 7 +++++-- fs/ceph/mds_client.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3142b15..d22fbbe 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -529,6 +529,9 @@ static void __register_request(struct ceph_mds_client *mdsc, ceph_mdsc_get_request(req); __insert_request(mdsc, req); + req->r_uid = current_fsuid(); + req->r_gid = current_fsgid(); + if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -1588,8 +1591,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(current_fsuid()); - head->caller_gid = cpu_to_le32(current_fsgid()); + head->caller_uid = cpu_to_le32(req->r_uid); + head->caller_gid = cpu_to_le32(req->r_gid); head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index d66d63c..9341fd4 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -170,6 +170,8 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ + uid_t r_uid; + gid_t r_gid; /* for choosing which mds to send this request to */ int r_direct_mode; -- cgit v1.1 From 8bd59e0188c04f6540f00e13f633f22e4804ce06 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 8 Nov 2010 09:23:12 -0800 Subject: ceph: fix version check on racing inode updates We may get updates on the same inode from multiple MDSs; generally we only pay attention if the update is newer than what we already have. The exception is when an MDS sense unstable information, in which case we always update. The old > check got this wrong when our version was odd (e.g. 3) and the reply version was even (e.g. 2): the older stale (v2) info would be applied. Fixed and clarified the comment. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5a9f907..425c5b1 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -567,12 +567,17 @@ static int fill_inode(struct inode *inode, /* * provided version will be odd if inode value is projected, - * even if stable. skip the update if we have a newer info - * (e.g., due to inode info racing form multiple MDSs), or if - * we are getting projected (unstable) inode info. + * even if stable. skip the update if we have newer stable + * info (ours>=theirs, e.g. due to racing mds replies), unless + * we are getting projected (unstable) info (in which case the + * version is odd, and we want ours>theirs). + * us them + * 2 2 skip + * 3 2 skip + * 3 3 update */ if (le64_to_cpu(info->version) > 0 && - (ci->i_version & ~1) > le64_to_cpu(info->version)) + (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; issued = __ceph_caps_issued(ci, &implemented); -- cgit v1.1 From d8672d64b88cdb7aa8139fb6d218f40b8cbf60af Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 8 Nov 2010 09:24:34 -0800 Subject: ceph: fix update of ctime from MDS The client can have a newer ctime than the MDS due to AUTH_EXCL and XATTR_EXCL caps as well; update the check in ceph_fill_file_time appropriately. This fixes cases where ctime/mtime goes backward under the right sequence of local updates (e.g. chmod) and mds replies (e.g. subsequent stat that goes to the MDS). Signed-off-by: Sage Weil --- fs/ceph/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 425c5b1..7bc0fbd 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -471,7 +471,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, if (issued & (CEPH_CAP_FILE_EXCL| CEPH_CAP_FILE_WR| - CEPH_CAP_FILE_BUFFER)) { + CEPH_CAP_FILE_BUFFER| + CEPH_CAP_AUTH_EXCL| + CEPH_CAP_XATTR_EXCL)) { if (timespec_compare(ctime, &inode->i_ctime) > 0) { dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, @@ -511,7 +513,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, warn = 1; } } else { - /* we have no write caps; whatever the MDS says is true */ + /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { inode->i_ctime = *ctime; inode->i_mtime = *mtime; -- cgit v1.1 From f7ad6d2e9201a6e1c9ee6530a291452eb695feb8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 8 Nov 2010 13:43:33 -0500 Subject: ext4: handle writeback of inodes which are being freed The following BUG can occur when an inode which is getting freed when it still has dirty pages outstanding, and it gets deleted (in this because it was the target of a rename). In ordered mode, we need to make sure the data pages are written just in case we crash before the rename (or unlink) is committed. If the inode is being freed then when we try to igrab the inode, we end up tripping the BUG_ON at fs/ext4/page-io.c:146. To solve this problem, we need to keep track of the number of io callbacks which are pending, and avoid destroying the inode until they have all been completed. That way we don't have to bump the inode count to keep the inode from being destroyed; an approach which doesn't work because the count could have already been dropped down to zero before the inode writeback has started (at which point we're not allowed to bump the count back up to 1, since it's already started getting freed). Thanks to Dave Chinner for suggesting this approach, which is also used by XFS. kernel BUG at /scratch_space/linux-2.6/fs/ext4/page-io.c:146! Call Trace: [] ext4_bio_write_page+0x172/0x307 [] mpage_da_submit_io+0x2f9/0x37b [] mpage_da_map_and_submit+0x2cc/0x2e2 [] mpage_add_bh_to_extent+0xc6/0xd5 [] write_cache_pages_da+0x2a4/0x3ac [] ext4_da_writepages+0x2d6/0x44d [] do_writepages+0x1c/0x25 [] __filemap_fdatawrite_range+0x4b/0x4d [] filemap_fdatawrite_range+0xe/0x10 [] jbd2_journal_begin_ordered_truncate+0x7b/0xa2 [] ext4_evict_inode+0x57/0x24c [] evict+0x22/0x92 [] iput+0x212/0x249 [] dentry_iput+0xa1/0xb9 [] d_kill+0x3d/0x5d [] dput+0x13a/0x147 [] sys_renameat+0x1b5/0x258 [] ? _atomic_dec_and_lock+0x2d/0x4c [] ? cp_new_stat+0xde/0xea [] ? sys_newlstat+0x2d/0x38 [] sys_rename+0x16/0x18 [] system_call_fastpath+0x16/0x1b Reported-by: Nick Bowler Signed-off-by: "Theodore Ts'o" Tested-by: Nick Bowler --- fs/ext4/ext4.h | 2 ++ fs/ext4/page-io.c | 59 ++++++++++++++++++++++++++++++++----------------------- fs/ext4/super.c | 2 ++ 3 files changed, 38 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8b5dd636..670d134 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -858,6 +858,7 @@ struct ext4_inode_info { spinlock_t i_completed_io_lock; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ /* * Transactions that contain inode's metadata needed to complete @@ -2060,6 +2061,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); +extern void ext4_ioend_wait(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern int ext4_end_io_nolock(ext4_io_end_t *io); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 46a7d6a..a24c8cc 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -32,8 +32,14 @@ static struct kmem_cache *io_page_cachep, *io_end_cachep; +#define WQ_HASH_SZ 37 +#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ]) +static wait_queue_head_t ioend_wq[WQ_HASH_SZ]; + int __init ext4_init_pageio(void) { + int i; + io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); if (io_page_cachep == NULL) return -ENOMEM; @@ -42,6 +48,8 @@ int __init ext4_init_pageio(void) kmem_cache_destroy(io_page_cachep); return -ENOMEM; } + for (i = 0; i < WQ_HASH_SZ; i++) + init_waitqueue_head(&ioend_wq[i]); return 0; } @@ -52,9 +60,17 @@ void ext4_exit_pageio(void) kmem_cache_destroy(io_page_cachep); } +void ext4_ioend_wait(struct inode *inode) +{ + wait_queue_head_t *wq = to_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); +} + void ext4_free_io_end(ext4_io_end_t *io) { int i; + wait_queue_head_t *wq; BUG_ON(!io); if (io->page) @@ -69,7 +85,10 @@ void ext4_free_io_end(ext4_io_end_t *io) } } io->num_io_pages = 0; - iput(io->inode); + wq = to_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && + waitqueue_active(wq)) + wake_up_all(wq); kmem_cache_free(io_end_cachep, io); } @@ -142,8 +161,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) io = kmem_cache_alloc(io_end_cachep, flags); if (io) { memset(io, 0, sizeof(*io)); - io->inode = igrab(inode); - BUG_ON(!io->inode); + atomic_inc(&EXT4_I(inode)->i_ioend_count); + io->inode = inode; INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -171,35 +190,15 @@ static void ext4_end_bio(struct bio *bio, int error) struct workqueue_struct *wq; struct inode *inode; unsigned long flags; - ext4_fsblk_t err_block; int i; BUG_ON(!io_end); - inode = io_end->inode; bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - err_block = bio->bi_sector >> (inode->i_blkbits - 9); bio_put(bio); - if (!(inode->i_sb->s_flags & MS_ACTIVE)) { - pr_err("sb umounted, discard end_io request for inode %lu\n", - io_end->inode->i_ino); - ext4_free_io_end(io_end); - return; - } - - if (error) { - io_end->flag |= EXT4_IO_END_ERROR; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " - "(offset %llu size %ld starting block %llu)", - inode->i_ino, - (unsigned long long) io_end->offset, - (long) io_end->size, - (unsigned long long) err_block); - } - for (i = 0; i < io_end->num_io_pages; i++) { struct page *page = io_end->pages[i]->p_page; struct buffer_head *bh, *head; @@ -254,8 +253,19 @@ static void ext4_end_bio(struct bio *bio, int error) if (!partial_write) SetPageUptodate(page); } - io_end->num_io_pages = 0; + inode = io_end->inode; + + if (error) { + io_end->flag |= EXT4_IO_END_ERROR; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + "(offset %llu size %ld starting block %llu)", + inode->i_ino, + (unsigned long long) io_end->offset, + (long) io_end->size, + (unsigned long long) + bio->bi_sector >> (inode->i_blkbits - 9)); + } /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); @@ -305,7 +315,6 @@ static int io_submit_init(struct ext4_io_submit *io, bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - io_end->inode = inode; io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); io->io_bio = bio; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 04352e9..45653af 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -828,12 +828,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; + atomic_set(&ei->i_ioend_count, 0); return &ei->vfs_inode; } static void ext4_destroy_inode(struct inode *inode) { + ext4_ioend_wait(inode); if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", -- cgit v1.1 From 83668e7141c7a0aa4035bde94344b81f9cf966ab Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 8 Nov 2010 13:45:33 -0500 Subject: ext4: fix potential race when freeing ext4_io_page structures Use an atomic_t and make sure we don't free the structure while we might still be submitting I/O for that page. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- fs/ext4/page-io.c | 38 +++++++++++++++----------------------- 2 files changed, 16 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 670d134..6a5edea 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -177,7 +177,7 @@ struct mpage_da_data { struct ext4_io_page { struct page *p_page; - int p_count; + atomic_t p_count; }; #define MAX_IO_PAGES 128 diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a24c8cc..7f5451c 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -67,6 +67,15 @@ void ext4_ioend_wait(struct inode *inode) wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); } +static void put_io_page(struct ext4_io_page *io_page) +{ + if (atomic_dec_and_test(&io_page->p_count)) { + end_page_writeback(io_page->p_page); + put_page(io_page->p_page); + kmem_cache_free(io_page_cachep, io_page); + } +} + void ext4_free_io_end(ext4_io_end_t *io) { int i; @@ -75,15 +84,8 @@ void ext4_free_io_end(ext4_io_end_t *io) BUG_ON(!io); if (io->page) put_page(io->page); - for (i = 0; i < io->num_io_pages; i++) { - if (--io->pages[i]->p_count == 0) { - struct page *page = io->pages[i]->p_page; - - end_page_writeback(page); - put_page(page); - kmem_cache_free(io_page_cachep, io->pages[i]); - } - } + for (i = 0; i < io->num_io_pages; i++) + put_io_page(io->pages[i]); io->num_io_pages = 0; wq = to_ioend_wq(io->inode); if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && @@ -235,13 +237,7 @@ static void ext4_end_bio(struct bio *bio, int error) } while (bh != head); } - if (--io_end->pages[i]->p_count == 0) { - struct page *page = io_end->pages[i]->p_page; - - end_page_writeback(page); - put_page(page); - kmem_cache_free(io_page_cachep, io_end->pages[i]); - } + put_io_page(io_end->pages[i]); /* * If this is a partial write which happened to make @@ -369,7 +365,7 @@ submit_and_retry: if ((io_end->num_io_pages == 0) || (io_end->pages[io_end->num_io_pages-1] != io_page)) { io_end->pages[io_end->num_io_pages++] = io_page; - io_page->p_count++; + atomic_inc(&io_page->p_count); } return 0; } @@ -398,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, return -ENOMEM; } io_page->p_page = page; - io_page->p_count = 0; + atomic_set(&io_page->p_count, 1); get_page(page); for (bh = head = page_buffers(page), block_start = 0; @@ -430,10 +426,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * PageWriteback bit from the page to prevent the system from * wedging later on. */ - if (io_page->p_count == 0) { - put_page(page); - end_page_writeback(page); - kmem_cache_free(io_page_cachep, io_page); - } + put_io_page(io_page); return ret; } -- cgit v1.1 From 87009d86dc045d228e21242467a67a5f99347553 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 8 Nov 2010 13:47:33 -0500 Subject: ext4: do not try to grab the s_umount semaphore in ext4_quota_off It's not needed to sync the filesystem, and it fixes a lock_dep complaint. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/super.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 45653af..ee91e29d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4570,12 +4570,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, static int ext4_quota_off(struct super_block *sb, int type) { - /* Force all delayed allocation blocks to be allocated */ - if (test_opt(sb, DELALLOC)) { - down_read(&sb->s_umount); + /* Force all delayed allocation blocks to be allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) sync_filesystem(sb); - up_read(&sb->s_umount); - } return dquot_quota_off(sb, type); } -- cgit v1.1 From b56ff9d397cecdaad6c98c9d57cc6fea475e1f50 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 8 Nov 2010 13:49:33 -0500 Subject: ext4: Don't call sb_issue_discard() in ext4_free_blocks() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 5c521830cf (ext4: Support discard requests when running in no-journal mode) attempts to add sb_issue_discard() for data blocks (in data=writeback mode) and in no-journal mode. Unfortunately, this no longer works, because in commit dd3932eddf (block: remove BLKDEV_IFL_WAIT), sb_issue_discard() only presents a synchronous interface, and there are times when we call ext4_free_blocks() when we are are holding a spinlock, or are otherwise in an atomic context. For now, I've removed the call to sb_issue_discard() to prevent a deadlock or (if spinlock debugging is enabled) failures like this: BUG: scheduling while atomic: rc.sysinit/1376/0x00000002 Pid: 1376, comm: rc.sysinit Not tainted 2.6.36-ARCH #1 Call Trace: [] __schedule_bug+0x5e/0x70 [] schedule+0x950/0xa70 [] ? insert_work+0x7d/0x90 [] ? queue_work_on+0x1d/0x30 [] ? queue_work+0x37/0x60 [] schedule_timeout+0x21d/0x360 [] ? generic_make_request+0x2c3/0x540 [] wait_for_common+0xc0/0x150 [] ? default_wake_function+0x0/0x10 [] ? submit_bio+0x7c/0x100 [] ? wake_bit_function+0x0/0x40 [] wait_for_completion+0x18/0x20 [] blkdev_issue_discard+0x1b9/0x210 [] ext4_free_blocks+0x68e/0xb60 [] ? __ext4_handle_dirty_metadata+0x110/0x120 [] ext4_ext_truncate+0x8cc/0xa70 [] ? pagevec_lookup+0x1e/0x30 [] ext4_truncate+0x178/0x5d0 [] ? unmap_mapping_range+0xab/0x280 [] vmtruncate+0x56/0x70 [] ext4_setattr+0x14b/0x460 [] notify_change+0x194/0x380 [] do_truncate+0x60/0x90 [] ? security_inode_permission+0x1a/0x20 [] ? tomoyo_path_truncate+0x11/0x20 [] do_last+0x5d9/0x770 [] do_filp_open+0x1ed/0x680 [] ? page_fault+0x1f/0x30 [] ? alloc_fd+0xec/0x140 [] do_sys_open+0x61/0x120 [] sys_open+0x1b/0x20 [] system_call_fastpath+0x16/0x1b https://bugzilla.kernel.org/show_bug.cgi?id=22302 Reported-by: Mathias Burén Signed-off-by: "Theodore Ts'o" Cc: jiayingz@google.com --- fs/ext4/mballoc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c58eba34..5b4d4e3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4640,8 +4640,6 @@ do_more: * with group lock held. generate_buddy look at * them with group lock_held */ - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, block_group, bit, count); ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); -- cgit v1.1 From 7ff9c073dd4d7200399076554f7ab9b876f196f6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 8 Nov 2010 13:51:33 -0500 Subject: ext4: Add new ext4 inode tracepoints Add ext4_evict_inode, ext4_drop_inode, ext4_mark_inode_dirty, and ext4_begin_ordered_truncate() Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 +++ fs/ext4/super.c | 10 ++++++++++ 2 files changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1916164..846e1e9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -53,6 +53,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { + trace_ext4_begin_ordered_truncate(inode, new_size); return jbd2_journal_begin_ordered_truncate( EXT4_SB(inode->i_sb)->s_journal, &EXT4_I(inode)->jinode, @@ -178,6 +179,7 @@ void ext4_evict_inode(struct inode *inode) handle_t *handle; int err; + trace_ext4_evict_inode(inode); if (inode->i_nlink) { truncate_inode_pages(&inode->i_data, 0); goto no_delete; @@ -5649,6 +5651,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) int err, ret; might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ee91e29d..61182fe 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -833,6 +833,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } +static int ext4_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + static void ext4_destroy_inode(struct inode *inode) { ext4_ioend_wait(inode); @@ -1175,6 +1183,7 @@ static const struct super_operations ext4_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, @@ -1196,6 +1205,7 @@ static const struct super_operations ext4_nojournal_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .write_super = ext4_write_super, .put_super = ext4_put_super, -- cgit v1.1 From 618763958b2291a09057dbfa553da6ded93dcfad Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 8 Nov 2010 07:28:32 -0500 Subject: cifs: make cifs_ioctl handle NULL filp->private_data correctly Commit 13cfb7334e made cifs_ioctl use the tlink attached to the cifsFileInfo for a filp. This ignores the case of an open directory however, which in CIFS can have a NULL private_data until a readdir is done on it. This patch re-adds the NULL pointer checks that were removed in commit 50ae28f01 and moves the setting of tcon and "caps" variables lower. Long term, a better fix would be to establish a f_op->open routine for directories that populates that field at open time, but that requires some other changes to how readdir calls are handled. Reported-by: Kjell Rune Skaaraas Reviewed-and-Tested-by: Suresh Jayaraman Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/ioctl.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 2fa22f2..0c98672 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -38,10 +38,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) struct cifs_sb_info *cifs_sb; #ifdef CONFIG_CIFS_POSIX struct cifsFileInfo *pSMBFile = filep->private_data; - struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink); + struct cifsTconInfo *tcon; __u64 ExtAttrBits = 0; __u64 ExtAttrMask = 0; - __u64 caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + __u64 caps; #endif /* CONFIG_CIFS_POSIX */ xid = GetXid(); @@ -62,6 +62,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; #ifdef CONFIG_CIFS_POSIX case FS_IOC_GETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); if (CIFS_UNIX_EXTATTR_CAP & caps) { rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, &ExtAttrBits, &ExtAttrMask); @@ -73,6 +77,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; case FS_IOC_SETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); if (CIFS_UNIX_EXTATTR_CAP & caps) { if (get_user(ExtAttrBits, (int __user *)arg)) { rc = -EFAULT; -- cgit v1.1 From 0e15482566b752718e7225168380904f1d0cdfa3 Mon Sep 17 00:00:00 2001 From: Meelis Roos Date: Mon, 8 Nov 2010 13:38:14 -0800 Subject: sparc: fix openpromfs compile Fix openpromfs compilation by adding a missing semicolon in fs/openpromfs/inode.c openprom_mount(). Signed-off-by: Meelis Roos Signed-off-by: David S. Miller Signed-off-by: Linus Torvalds --- fs/openpromfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index ddb1f41..911e61f 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -418,7 +418,7 @@ out_no_root: static struct dentry *openprom_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_single(fs_type, flags, data, openprom_fill_super) + return mount_single(fs_type, flags, data, openprom_fill_super); } static struct file_system_type openprom_fs_type = { -- cgit v1.1 From 3565bd46b1c6a3dbf1f670d3275aa4018a4c65ae Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Tue, 9 Nov 2010 12:27:41 +0530 Subject: cifs: fix a memleak in cifs_setattr_nounix() Andrew Hendry reported a kmemleak warning in 2.6.37-rc1 while editing a text file with gedit over cifs. unreferenced object 0xffff88022ee08b40 (size 32): comm "gedit", pid 2524, jiffies 4300160388 (age 2633.655s) hex dump (first 32 bytes): 5c 2e 67 6f 75 74 70 75 74 73 74 72 65 61 6d 2d \.goutputstream- 35 42 41 53 4c 56 00 de 09 00 00 00 2c 26 78 ee 5BASLV......,&x. backtrace: [] kmemleak_alloc+0x2d/0x60 [] __kmalloc+0xe3/0x1d0 [] build_path_from_dentry+0xf0/0x230 [cifs] [] cifs_setattr+0x9e/0x770 [cifs] [] notify_change+0x170/0x2e0 [] sys_fchmod+0x10b/0x140 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff The commit 1025774c that removed inode_setattr() seems to have introduced this memleak by returning early without freeing 'full_path'. Reported-by: Andrew Hendry Cc: Christoph Hellwig Reviewed-by: Jeff Layton Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 39869c3c..ef3a55b 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2177,7 +2177,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) setattr_copy(inode, attrs); mark_inode_dirty(inode); - return 0; cifs_setattr_exit: kfree(full_path); -- cgit v1.1 From e98b6fed84d0f0155d7b398e0dfeac74c792f2d0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Nov 2010 12:24:53 -0800 Subject: ceph: fix comment, remove extraneous args The offset/length arguments aren't used. Signed-off-by: Sage Weil --- fs/ceph/file.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 87ee944..603fd00 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -376,21 +376,19 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, dout("sync_read on file %p %llu~%u %s\n", file, off, len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, off, len); - - /* - * flush any page cache pages in this range. this - * will make concurrent normal and O_DIRECT io slow, - * but it will at least behave sensibly when they are - * in sequence. - */ - } else { + if (file->f_flags & O_DIRECT) + pages = ceph_get_direct_page_vector(data, num_pages); + else pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); - } if (IS_ERR(pages)) return PTR_ERR(pages); + /* + * flush any page cache pages in this range. this + * will make concurrent normal and sync io slow, + * but it will at least behave sensibly when they are + * in sequence. + */ ret = filemap_write_and_wait(inode->i_mapping); if (ret < 0) goto done; -- cgit v1.1 From b7495fc2ff941db6a118a93ab8d61149e3f4cef8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Nov 2010 12:43:12 -0800 Subject: ceph: make page alignment explicit in osd interface We used to infer alignment of IOs within a page based on the file offset, which assumed they matched. This broke with direct IO that was not aligned to pages (e.g., 512-byte aligned IO). We were also trusting the alignment specified in the OSD reply, which could have been adjusted by the server. Explicitly specify the page alignment when setting up OSD IO requests. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 6 +++--- fs/ceph/file.c | 26 +++++++++++++++++++++----- fs/ceph/inode.c | 2 +- 3 files changed, 25 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 51bcc5c..4aa85776 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, page->index << PAGE_CACHE_SHIFT, &len, ci->i_truncate_seq, ci->i_truncate_size, - &page, 1); + &page, 1, 0); if (err == -ENOENT) err = 0; if (err < 0) { @@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, offset, &len, ci->i_truncate_seq, ci->i_truncate_size, - pages, nr_pages); + pages, nr_pages, 0); if (rc == -ENOENT) rc = 0; if (rc < 0) @@ -782,7 +782,7 @@ get_more_pages: snapc, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &inode->i_mtime, true, 1); + &inode->i_mtime, true, 1, 0); max_pages = req->r_num_pages; alloc_page_vec(fsc, req); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 603fd00..8d79b89 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -282,11 +282,12 @@ int ceph_release(struct inode *inode, struct file *file) static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof) + int *checkeof, bool align_to_pages) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; + int io_align, page_align; int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ int left, pages_left; int read; @@ -302,14 +303,19 @@ static int striped_read(struct inode *inode, page_pos = pages; pages_left = num_pages; read = 0; + io_align = off & ~PAGE_MASK; more: + if (align_to_pages) + page_align = (pos - io_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; this_len = left; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, ci->i_truncate_seq, ci->i_truncate_size, - page_pos, pages_left); + page_pos, pages_left, page_align); hit_stripe = this_len < left; was_short = ret >= 0 && ret < this_len; if (ret == -ENOENT) @@ -393,7 +399,8 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, if (ret < 0) goto done; - ret = striped_read(inode, off, len, pages, num_pages, checkeof); + ret = striped_read(inode, off, len, pages, num_pages, checkeof, + file->f_flags & O_DIRECT); if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) ret = ceph_copy_page_vector_to_user(pages, data, off, ret); @@ -448,6 +455,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int flags; int do_sync = 0; int check_caps = 0; + int page_align, io_align; int ret; struct timespec mtime = CURRENT_TIME; @@ -462,6 +470,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, else pos = *offset; + io_align = pos & ~PAGE_MASK; + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) return ret; @@ -486,20 +496,26 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, */ more: len = left; + if (file->f_flags & O_DIRECT) + /* write from beginning of first page, regardless of + io alignment */ + page_align = (pos - io_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), pos, &len, CEPH_OSD_OP_WRITE, flags, ci->i_snap_realm->cached_context, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false, 2); + &mtime, false, 2, page_align); if (!req) return -ENOMEM; num_pages = calc_pages_for(pos, len); if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, pos, len); + pages = ceph_get_direct_page_vector(data, num_pages); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7bc0fbd..8153ee5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1752,7 +1752,7 @@ int ceph_do_getattr(struct inode *inode, int mask) return 0; } - dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); + dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) return 0; -- cgit v1.1 From f3f63c1c28bc861a931fac283b5bc3585efb8967 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Oct 2010 11:46:56 -0600 Subject: block: limit vec count in bio_kmalloc() and bio_alloc_map_data() Reported-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: Jens Axboe --- fs/bio.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 8abb2df..8317a2c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) { struct bio *bio; + if (nr_iovecs > UIO_MAXIOV) + return NULL; + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), gfp_mask); if (unlikely(!bio)) @@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd) static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, gfp_t gfp_mask) { - struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); + struct bio_map_data *bmd; + + if (iov_count > UIO_MAXIOV) + return NULL; + bmd = kmalloc(sizeof(*bmd), gfp_mask); if (!bmd) return NULL; -- cgit v1.1 From cb4644cac4a2797afc847e6c92736664d4b0ea34 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Nov 2010 14:36:25 +0100 Subject: bio: take care not overflow page count when mapping/copying user data If the iovec is being set up in a way that causes uaddr + PAGE_SIZE to overflow, we could end up attempting to map a huge number of pages. Check for this invalid input type. Reported-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: Jens Axboe --- fs/bio.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 8317a2c..4bd454f 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -834,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q, end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; len += iov[i].iov_len; } @@ -962,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; /* * buffer must be aligned to at least hardsector size for now @@ -989,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long start = uaddr >> PAGE_SHIFT; const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; - + ret = get_user_pages_fast(uaddr, local_nr_pages, write_to_vm, &pages[cur_page]); if (ret < local_nr_pages) { -- cgit v1.1 From 1447399b3e34af016c368b4178db7ef0e04e15b0 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 9 Nov 2010 21:33:02 +0100 Subject: ioprio: fix RCU locking around task dereference With 2.6.37-rc1, I observe sys_ioprio_set not taking the RCU lock [1] across access to the task credentials. Inspecting the code in fs/ioprio.c, the tasklist_lock is held for read across the __task_cred call, which is presumably sufficient to prevent the task credentials becoming stale. =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- kernel/pid.c:419 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 1 lock held by start-stop-daem/2246: #0: (tasklist_lock){.?.?..}, at: [] sys_ioprio_set+0x8a/0x400 stack backtrace: Pid: 2246, comm: start-stop-daem Not tainted 2.6.37-rc1-330cd+ #2 Call Trace: [] lockdep_rcu_dereference+0xa4/0xc0 [] find_task_by_pid_ns+0x81/0x90 [] find_task_by_vpid+0x1d/0x20 [] sys_ioprio_set+0x3f0/0x400 [] ? trace_hardirqs_on_thunk+0x3a/0x3f [] system_call_fastpath+0x16/0x1b Take the RCU lock for read across acquiring the pointer to the task credentials and dereferencing it. Signed-off-by: Daniel J Blueman Fixed up by Jens to fix missing rcu_read_unlock() on mismatches. Signed-off-by: Jens Axboe --- fs/ioprio.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ioprio.c b/fs/ioprio.c index 748cfb9..8def14e 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -139,7 +139,12 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) break; do_each_thread(g, p) { - if (__task_cred(p)->uid != who) + int match; + + rcu_read_lock(); + match = __task_cred(p)->uid == who; + rcu_read_unlock(); + if (!match) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -232,7 +237,12 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) break; do_each_thread(g, p) { - if (__task_cred(p)->uid != user->uid) + int match; + + rcu_read_lock(); + match = __task_cred(p)->uid == user->uid; + rcu_read_unlock(); + if (!match) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) -- cgit v1.1 From f85acd81aa623e3dcf268c90e5cd8ecf36830984 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 9 Nov 2010 21:26:56 +0100 Subject: ioprio: rcu_read_lock/unlock protect find_task_by_vpid call (V2) Commit 4221a9918e38b7494cee341dda7b7b4bb8c04bde "Add RCU check for find_task_by_vpid()" introduced rcu_lockdep_assert to find_task_by_pid_ns= Assertion failed in sys_ioprio_get. The patch is fixing assertion failure in ioprio_set as well. kernel/pid.c:419 invoked rcu_dereference_check() without protection! stack backtrace: Pid: 4254, comm: iotop Not tainted Call Trace: [] lockdep_rcu_dereference+0xaa/0xb2 [] find_task_by_pid_ns+0x4f/0x68 [] find_task_by_vpid+0x1d/0x1f [] sys_ioprio_get+0x50/0x2da [] system_call_fastpath+0x16/0x1b V2: rcu critical section expanded according to comment by Paul E. McKenney Signed-off-by: Sergey Senozhatsky Acked-by: Paul E. McKenney Signed-off-by: Jens Axboe --- fs/ioprio.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ioprio.c b/fs/ioprio.c index 8def14e..2f7d05c 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -111,12 +111,14 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) read_lock(&tasklist_lock); switch (which) { case IOPRIO_WHO_PROCESS: + rcu_read_lock(); if (!who) p = current; else p = find_task_by_vpid(who); if (p) ret = set_task_ioprio(p, ioprio); + rcu_read_unlock(); break; case IOPRIO_WHO_PGRP: if (!who) @@ -205,12 +207,14 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) read_lock(&tasklist_lock); switch (which) { case IOPRIO_WHO_PROCESS: + rcu_read_lock(); if (!who) p = current; else p = find_task_by_vpid(who); if (p) ret = get_task_ioprio(p); + rcu_read_unlock(); break; case IOPRIO_WHO_PGRP: if (!who) -- cgit v1.1 From 5d0af85cd0964bb845b63d5059bb20e8f7731e65 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Oct 2010 21:37:10 +0000 Subject: xfs: remove experimental tag from the delaylog option We promised to do this for 2.6.37, and the code looks stable enough to keep that promise. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_super.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9f3a78f..064f964 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -353,9 +353,6 @@ xfs_parseargs( mp->m_qflags &= ~XFS_OQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { mp->m_flags |= XFS_MOUNT_DELAYLOG; - cmn_err(CE_WARN, - "Enabling EXPERIMENTAL delayed logging feature " - "- use at your own risk.\n"); } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { -- cgit v1.1 From 6762b938eac878a30a90e770ac655874c36bc642 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Sat, 30 Oct 2010 14:26:17 +0000 Subject: xfs: xfs_ioctl: fix information leak to userland al_hreq is copied from userland. If al_hreq.buflen is not properly aligned then xfs_attr_list will ignore the last bytes of kbuf. These bytes are unitialized. It leads to leaking of contents of kernel stack memory. Signed-off-by: Vasiliy Kulikov Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 2ea238f6..ad442d9 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -416,7 +416,7 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); + kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL); if (!kbuf) goto out_dput; -- cgit v1.1 From f83282a8ef799c0bdcb0c32971487087da1bc216 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Nov 2010 08:55:04 +0000 Subject: xfs: fix per-ag reference counting in inode reclaim tree walking The walk fails to decrement the per-ag reference count when the non-blocking walk fails to obtain the per-ag reclaim lock, leading to an assert failure on debug kernels when unmounting a filesystem. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 1 + fs/xfs/xfs_mount.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 37d3325..afb0d7c 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -853,6 +853,7 @@ restart: if (trylock) { if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { skipped++; + xfs_perag_put(pag); continue; } first_index = pag->pag_ici_reclaim_cursor; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b1498ab..19e9dfa 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -275,6 +275,7 @@ xfs_free_perag( pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); call_rcu(&pag->rcu_head, __xfs_free_perag); } } -- cgit v1.1 From bfe2741967eaa3434fa9b3d8f24b1422d4540e7d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Nov 2010 08:55:05 +0000 Subject: xfs: move delayed write buffer trace The delayed write buffer split trace currently issues a trace for every buffer it scans. These buffers are not necessarily queued for delayed write. Indeed, when buffers are pinned, there can be thousands of traces of buffers that aren't actually queued for delayed write and the ones that are are lost in the noise. Move the trace point to record only buffers that are split out for IO to be issued on. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 63fd2c0..aa1d353 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1781,7 +1781,6 @@ xfs_buf_delwri_split( INIT_LIST_HEAD(list); spin_lock(dwlk); list_for_each_entry_safe(bp, n, dwq, b_list) { - trace_xfs_buf_delwri_split(bp, _RET_IP_); ASSERT(bp->b_flags & XBF_DELWRI); if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { @@ -1795,6 +1794,7 @@ xfs_buf_delwri_split( _XBF_RUN_QUEUES); bp->b_flags |= XBF_WRITE; list_move_tail(&bp->b_list, list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); } else skipped++; } -- cgit v1.1 From 785ce41805ea7b6a9b2775ed9f4cf10cd7a90c03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 6 Nov 2010 11:42:44 +0000 Subject: xfs: tell lockdep about parent iolock usage in filestreams The filestreams code may take the iolock on the parent inode while holding it on a child. This is the only place in XFS where we take both the child and parent iolock, so just telling lockdep about it is enough. The lock flag required for that was already added as part of the ilock lockdep annotations and unused so far. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_filestream.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 9b715dc..9124425 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -744,9 +744,15 @@ xfs_filestream_new_ag( * If the file's parent directory is known, take its iolock in exclusive * mode to prevent two sibling files from racing each other to migrate * themselves and their parent to different AGs. + * + * Note that we lock the parent directory iolock inside the child + * iolock here. That's fine as we never hold both parent and child + * iolock in any other place. This is different from the ilock, + * which requires locking of the child after the parent for namespace + * operations. */ if (pip) - xfs_ilock(pip, XFS_IOLOCK_EXCL); + xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); /* * A new AG needs to be found for the file. If the file's parent -- cgit v1.1 From 5d2bf8a55e03b0e59ed5a4ac2ff7f9ee3ba7e40d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 6 Nov 2010 11:42:56 +0000 Subject: xfs: fix a few compiler warnings with CONFIG_XFS_QUOTA=n Andi Kleen reported that gcc-4.5 gives lots of warnings for him inside the XFS code. It turned out most of them are due to the quota stubs beeing macros, and gcc now complaining about macros evaluating to 0 that are not assigned to variables. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_quota.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index e0e64b1..9bb6eda 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) -#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) -#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) +static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, + struct xfs_inode *ip, long nblks, long ninos, uint flags) +{ + return 0; +} +static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, + struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, long nblks, long nions, uint flags) +{ + return 0; +} #define xfs_qm_vop_create_dqattach(tp, ip, u, g) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) @@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, #define xfs_qm_dqdetach(ip) #define xfs_qm_dqrele(d) #define xfs_qm_statvfs(ip, s) -#define xfs_qm_sync(mp, fl) (0) +static inline int xfs_qm_sync(struct xfs_mount *mp, int flags) +{ + return 0; +} #define xfs_qm_newmount(mp, a, b) (0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) -#define xfs_qm_unmount_quotas(mp) (0) +#define xfs_qm_unmount_quotas(mp) #endif /* CONFIG_XFS_QUOTA */ #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ -- cgit v1.1 From c6f6cd0608b1826ee1797cf57a808416e4bdb806 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 6 Nov 2010 11:43:08 +0000 Subject: xfs: use hlist_add_fake XFS does not need it's inodes to actuall be hashed in the VFS inode cache, but we require the inode to be marked hashed for the writeback code to work. Insted of using insert_inode_hash, which requires a second inode_lock roundtrip after the partial merge of the inode scalability patches in 2.6.37-rc simply use the new hlist_add_fake helper to mark it hashed without requiring a lock or touching a global cache line. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_iops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 96107ef..94d5fd6 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -762,7 +762,8 @@ xfs_setup_inode( inode->i_state = I_NEW; inode_sb_list_add(inode); - insert_inode_hash(inode); + /* make the inode look hashed for the writeback code */ + hlist_add_fake(&inode->i_hash); inode->i_mode = ip->i_d.di_mode; inode->i_nlink = ip->i_d.di_nlink; -- cgit v1.1 From 3df057ac9afe83c4af84016df3baf3a0eb1d3d33 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 3 Nov 2010 16:49:44 -0400 Subject: locks: fix leak on merging leases We must also free the passed-in lease in the case it wasn't used because an existing lease was upgrade/downgraded or already existed. Note the nfsd caller doesn't care because it's fl_change callback returns an error in those cases. Signed-off-by: J. Bruce Fields --- fs/locks.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 65765cb..61c22f7 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1504,7 +1504,7 @@ static int do_fcntl_delete_lease(struct file *filp) static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) { - struct file_lock *fl; + struct file_lock *fl, *ret; struct fasync_struct *new; struct inode *inode = filp->f_path.dentry->d_inode; int error; @@ -1518,6 +1518,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) locks_free_lock(fl); return -ENOMEM; } + ret = fl; lock_flocks(); error = __vfs_setlease(filp, arg, &fl); if (error) { @@ -1525,6 +1526,8 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) locks_free_lock(fl); goto out_free_fasync; } + if (ret != fl) + locks_free_lock(fl); /* * fasync_insert_entry() returns the old entry if any. @@ -1532,7 +1535,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) * inserted it into the fasync list. Clear new so that * we don't release it here. */ - if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new)) + if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new)) new = NULL; if (error < 0) { -- cgit v1.1 From 8896b93f42459b18b145c69d399b62870df48061 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 3 Nov 2010 18:09:18 -0400 Subject: locks: remove dead lease error-handling code A minor oversight from f7347ce4ee7c65415f84be915c018473e7076f31, "fasync: re-organize fasync entry insertion to allow it under a spinlock": this cleanup-on-error was only needed to handle -ENOMEM. Now that we're preallocating it's unneeded. Signed-off-by: J. Bruce Fields --- fs/locks.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 61c22f7..0e62dd3 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1506,7 +1506,6 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) { struct file_lock *fl, *ret; struct fasync_struct *new; - struct inode *inode = filp->f_path.dentry->d_inode; int error; fl = lease_alloc(filp, arg); @@ -1520,7 +1519,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) } ret = fl; lock_flocks(); - error = __vfs_setlease(filp, arg, &fl); + error = __vfs_setlease(filp, arg, &ret); if (error) { unlock_flocks(); locks_free_lock(fl); @@ -1538,14 +1537,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new)) new = NULL; - if (error < 0) { - /* remove lease just inserted by setlease */ - fl->fl_type = F_UNLCK | F_INPROGRESS; - fl->fl_break_time = jiffies - 10; - time_out_leases(inode); - } else { - error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - } + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); unlock_flocks(); out_free_fasync: -- cgit v1.1 From ece413f59f257682de4a2e2e42af33b016af53f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Nov 2010 21:39:11 +0000 Subject: xfs: remove incorrect assert in xfs_vm_writepage In commit 20cb52ebd1b5ca6fa8a5d9b6b1392292f5ca8a45, titled "xfs: simplify xfs_vm_writepage" I added an assert that any !mapped and uptodate buffers are not dirty. That asserts turns out to trigger a lot when running fsx on filesystems with small block sizes. The reason for that is that the assert is simply incorrect. !mapped and uptodate just mean this buffer covers a hole, and whenever we do a set_page_dirty we mark all blocks in the page dirty, no matter if they have data or not. So remove the assert, and update the comment above the condition to match reality. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_aops.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c9af48f..7d287af 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1111,11 +1111,12 @@ xfs_vm_writepage( uptodate = 0; /* - * A hole may still be marked uptodate because discard_buffer - * leaves the flag set. + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so skip + * buffers covering holes here. */ if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - ASSERT(!buffer_dirty(bh)); imap_valid = 0; continue; } -- cgit v1.1 From a1629c3b24f26ec1b0f534874af674a6b4c1540b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 11 Nov 2010 15:24:06 -0800 Subject: ceph: fix dangling pointer Clear fi->last_name when it's freed. The only caller is rewinddir() (or equivalent lseek). Signed-off-by: Sage Weil --- fs/ceph/dir.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e0a2dc6..1e11ed7 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -414,6 +414,7 @@ static void reset_readdir(struct ceph_file_info *fi) fi->last_readdir = NULL; } kfree(fi->last_name); + fi->last_name = NULL; fi->next_offset = 2; /* compensate for . and .. */ if (fi->dentry) { dput(fi->dentry); -- cgit v1.1 From 7b88dadc13e0004947de52df128dbd5b0754ed0a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 11 Nov 2010 16:48:59 -0800 Subject: ceph: fix frag offset for non-leftmost frags We start at offset 2 for the leftmost frag, and 0 for subsequent frags. When we reach the end (rightmost), we go back to 2. This fixes readdir on fragmented (large) directories. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 1e11ed7..5f67728 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -336,7 +336,10 @@ more: if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 2; + if (ceph_frag_is_rightmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; } else { rinfo = &req->r_reply_info; err = note_last_dentry(fi, -- cgit v1.1 From 52ca0e84b05595cf74f1ff772b3f9807256b1b27 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 11 Nov 2010 14:05:20 -0800 Subject: hugetlbfs: lessen the impact of a deprecation warning WARN_ONCE is a bit strong for a deprecation warning, given that it spews a huge backtrace. Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d6cfac1..a5fe681 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -932,8 +932,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { *user = current_user(); if (user_shm_lock(size, *user)) { - WARN_ONCE(1, - "Using mlock ulimits for SHM_HUGETLB deprecated\n"); + printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n"); } else { *user = NULL; return ERR_PTR(-EPERM); -- cgit v1.1 From 1c66b360fe26204e2aa14e45086b4a6b8890b1a2 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 13 Nov 2010 16:22:02 +0800 Subject: ocfs2: Change some lock status member in ocfs2_lock_res to char. Commit 83fd9c7 changes l_level, l_requested and l_blocking of ocfs2_lock_res from int to unsigned char. But actually it is initially as -1(ocfs2_lock_res_init_common) which correspoding to 255 for unsigned char. So the whole dlm lock mechanism doesn't work now which means a disaster to ocfs2. Cc: Goldwyn Rodrigues Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/ocfs2.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d840821..1efea36 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -159,7 +159,9 @@ struct ocfs2_lock_res { char l_name[OCFS2_LOCK_ID_MAX_LEN]; unsigned int l_ro_holders; unsigned int l_ex_holders; - unsigned char l_level; + char l_level; + char l_requested; + char l_blocking; /* Data packed - type enum ocfs2_lock_type */ unsigned char l_type; @@ -169,8 +171,6 @@ struct ocfs2_lock_res { unsigned char l_action; /* Data packed - enum type ocfs2_unlock_action */ unsigned char l_unlock_action; - unsigned char l_requested; - unsigned char l_blocking; unsigned int l_pending_gen; spinlock_t l_lock; -- cgit v1.1 From d69b78ba1deaaa95ffa8dac5a9ca819ce454d52e Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Mon, 15 Nov 2010 10:20:52 +0100 Subject: ioprio: grab rcu_read_lock in sys_ioprio_{set,get}() Using: - CONFIG_LOCKUP_DETECTOR=y - CONFIG_PREEMPT=y - CONFIG_LOCKDEP=y - CONFIG_PROVE_LOCKING=y - CONFIG_PROVE_RCU=y found a missing rcu lock during boot on a 512 MiB x86_64 ubuntu vm: =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- kernel/pid.c:419 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 1 lock held by ureadahead/1355: #0: (tasklist_lock){.+.+..}, at: [] sys_ioprio_set+0x7f/0x29e stack backtrace: Pid: 1355, comm: ureadahead Not tainted 2.6.37-dbg-DEV #1 Call Trace: [] lockdep_rcu_dereference+0xaa/0xb3 [] find_task_by_pid_ns+0x44/0x5d [] find_task_by_vpid+0x22/0x24 [] sys_ioprio_set+0xb4/0x29e [] ? trace_hardirqs_off_thunk+0x3a/0x3c [] sysenter_dispatch+0x7/0x2c [] ? trace_hardirqs_on_thunk+0x3a/0x3f The fix is to: a) grab rcu lock in sys_ioprio_{set,get}() and b) avoid grabbing tasklist_lock. Discussion in: http://marc.info/?l=linux-kernel&m=128951324702889 Signed-off-by: Greg Thelen Acked-by: Paul E. McKenney Reviewed-by: Oleg Nesterov Modified by Jens to remove the now redundant inner rcu lock and unlock since they are now protected by the outer lock. Signed-off-by: Jens Axboe --- fs/ioprio.c | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ioprio.c b/fs/ioprio.c index 2f7d05c..7da2a06 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -103,22 +103,15 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) } ret = -ESRCH; - /* - * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic", - * so we can't use rcu_read_lock(). See re-copy of ->ioprio - * in copy_process(). - */ - read_lock(&tasklist_lock); + rcu_read_lock(); switch (which) { case IOPRIO_WHO_PROCESS: - rcu_read_lock(); if (!who) p = current; else p = find_task_by_vpid(who); if (p) ret = set_task_ioprio(p, ioprio); - rcu_read_unlock(); break; case IOPRIO_WHO_PGRP: if (!who) @@ -141,12 +134,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) break; do_each_thread(g, p) { - int match; - - rcu_read_lock(); - match = __task_cred(p)->uid == who; - rcu_read_unlock(); - if (!match) + if (__task_cred(p)->uid != who) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -160,7 +148,7 @@ free_uid: ret = -EINVAL; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } @@ -204,17 +192,15 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) int ret = -ESRCH; int tmpio; - read_lock(&tasklist_lock); + rcu_read_lock(); switch (which) { case IOPRIO_WHO_PROCESS: - rcu_read_lock(); if (!who) p = current; else p = find_task_by_vpid(who); if (p) ret = get_task_ioprio(p); - rcu_read_unlock(); break; case IOPRIO_WHO_PGRP: if (!who) @@ -241,12 +227,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) break; do_each_thread(g, p) { - int match; - - rcu_read_lock(); - match = __task_cred(p)->uid == user->uid; - rcu_read_unlock(); - if (!match) + if (__task_cred(p)->uid != user->uid) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) @@ -264,6 +245,6 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) ret = -EINVAL; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } -- cgit v1.1 From 044b9414c7caf9a26192c73a5b88fa1a8a32a1c1 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 3 Nov 2010 20:01:07 +0000 Subject: GFS2: Fix inode deallocation race This area of the code has always been a bit delicate due to the subtleties of lock ordering. The problem is that for "normal" alloc/dealloc, we always grab the inode locks first and the rgrp lock later. In order to ensure no races in looking up the unlinked, but still allocated inodes, we need to hold the rgrp lock when we do the lookup, which means that we can't take the inode glock. The solution is to borrow the technique already used by NFS to solve what is essentially the same problem (given an inode number, look up the inode carefully, checking that it really is in the expected state). We cannot do that directly from the allocation code (lock ordering again) so we give the job to the pre-existing delete workqueue and carry on with the allocation as normal. If we find there is no space, we do a journal flush (required anyway if space from a deallocation is to be released) which should block against the pending deallocations, so we should always get the space back. Signed-off-by: Steven Whitehouse --- fs/gfs2/export.c | 46 ++--------------- fs/gfs2/glock.c | 21 ++++---- fs/gfs2/inode.c | 152 +++++++++++++------------------------------------------ fs/gfs2/inode.h | 4 +- fs/gfs2/rgrp.c | 91 +++++++++++++++++---------------- 5 files changed, 98 insertions(+), 216 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 06d5827..5ab3839 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -138,10 +138,8 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, struct gfs2_inum_host *inum) { struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_holder i_gh; struct inode *inode; struct dentry *dentry; - int error; inode = gfs2_ilookup(sb, inum->no_addr); if (inode) { @@ -152,52 +150,16 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, goto out_inode; } - error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, - LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return ERR_PTR(error); - - error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE); - if (error) - goto fail; - - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - goto fail; - } - - error = gfs2_inode_refresh(GFS2_I(inode)); - if (error) { - iput(inode); - goto fail; - } - - /* Pick up the works we bypass in gfs2_inode_lookup */ - if (inode->i_state & I_NEW) - gfs2_set_iop(inode); - - if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { - iput(inode); - goto fail; - } - - error = -EIO; - if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) { - iput(inode); - goto fail; - } - - gfs2_glock_dq_uninit(&i_gh); + inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino, + GFS2_BLKST_DINODE); + if (IS_ERR(inode)) + return ERR_CAST(inode); out_inode: dentry = d_obtain_alias(inode); if (!IS_ERR(dentry)) dentry->d_op = &gfs2_dops; return dentry; -fail: - gfs2_glock_dq_uninit(&i_gh); - return ERR_PTR(error); } static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 8777885..f92c177 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -686,21 +686,20 @@ static void delete_work_func(struct work_struct *work) { struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); struct gfs2_sbd *sdp = gl->gl_sbd; - struct gfs2_inode *ip = NULL; + struct gfs2_inode *ip; struct inode *inode; - u64 no_addr = 0; + u64 no_addr = gl->gl_name.ln_number; + + ip = gl->gl_object; + /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ - spin_lock(&gl->gl_spin); - ip = (struct gfs2_inode *)gl->gl_object; if (ip) - no_addr = ip->i_no_addr; - spin_unlock(&gl->gl_spin); - if (ip) { inode = gfs2_ilookup(sdp->sd_vfs, no_addr); - if (inode) { - d_prune_aliases(inode); - iput(inode); - } + else + inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); + if (inode && !IS_ERR(inode)) { + d_prune_aliases(inode); + iput(inode); } gfs2_glock_put(gl); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 06370f8..e1213f7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -73,49 +73,6 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); } -struct gfs2_skip_data { - u64 no_addr; - int skipped; -}; - -static int iget_skip_test(struct inode *inode, void *opaque) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_skip_data *data = opaque; - - if (ip->i_no_addr == data->no_addr) { - if (inode->i_state & (I_FREEING|I_WILL_FREE)){ - data->skipped = 1; - return 0; - } - return 1; - } - return 0; -} - -static int iget_skip_set(struct inode *inode, void *opaque) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_skip_data *data = opaque; - - if (data->skipped) - return 1; - inode->i_ino = (unsigned long)(data->no_addr); - ip->i_no_addr = data->no_addr; - return 0; -} - -static struct inode *gfs2_iget_skip(struct super_block *sb, - u64 no_addr) -{ - struct gfs2_skip_data data; - unsigned long hash = (unsigned long)no_addr; - - data.no_addr = no_addr; - data.skipped = 0; - return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data); -} - /** * GFS2 lookup code fills in vfs inode contents based on info obtained * from directory entry inside gfs2_inode_lookup(). This has caused issues @@ -243,93 +200,54 @@ fail: return ERR_PTR(error); } -/** - * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation - * and try to reclaim it by doing iput. - * - * This function assumes no rgrp locks are currently held. - * - * @sb: The super block - * no_addr: The inode number - * - */ - -void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) +struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, unsigned int blktype) { - struct gfs2_sbd *sdp; - struct gfs2_inode *ip; - struct gfs2_glock *io_gl = NULL; - int error; - struct gfs2_holder gh; + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder i_gh; struct inode *inode; + int error; - inode = gfs2_iget_skip(sb, no_addr); - - if (!inode) - return; - - /* If it's not a new inode, someone's using it, so leave it alone. */ - if (!(inode->i_state & I_NEW)) { - iput(inode); - return; - } - - ip = GFS2_I(inode); - sdp = GFS2_SB(inode); - ip->i_no_formal_ino = -1; + error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops, + LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return ERR_PTR(error); - error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); - if (unlikely(error)) + error = gfs2_check_blk_type(sdp, no_addr, blktype); + if (error) goto fail; - ip->i_gl->gl_object = ip; - error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); - if (unlikely(error)) - goto fail_put; - - set_bit(GIF_INVALID, &ip->i_flags); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, - &ip->i_iopen_gh); - if (unlikely(error)) - goto fail_iopen; + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0); + if (IS_ERR(inode)) + goto fail; - ip->i_iopen_gh.gh_gl->gl_object = ip; - gfs2_glock_put(io_gl); - io_gl = NULL; + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_iput; - inode->i_mode = DT2IF(DT_UNKNOWN); + /* Pick up the works we bypass in gfs2_inode_lookup */ + if (inode->i_state & I_NEW) + gfs2_set_iop(inode); - /* - * We must read the inode in order to work out its type in - * this case. Note that this doesn't happen often as we normally - * know the type beforehand. This code path only occurs during - * unlinked inode recovery (where it is safe to do this glock, - * which is not true in the general case). - */ - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, - &gh); - if (unlikely(error)) - goto fail_glock; + /* Two extra checks for NFS only */ + if (no_formal_ino) { + error = -ESTALE; + if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino) + goto fail_iput; - /* Inode is now uptodate */ - gfs2_glock_dq_uninit(&gh); - gfs2_set_iop(inode); + error = -EIO; + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) + goto fail_iput; - /* The iput will cause it to be deleted. */ - iput(inode); - return; + error = 0; + } -fail_glock: - gfs2_glock_dq(&ip->i_iopen_gh); -fail_iopen: - if (io_gl) - gfs2_glock_put(io_gl); -fail_put: - ip->i_gl->gl_object = NULL; - gfs2_glock_put(ip->i_gl); fail: - iget_failed(inode); - return; + gfs2_glock_dq_uninit(&i_gh); + return error ? ERR_PTR(error) : inode; +fail_iput: + iput(inode); + goto fail; } static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 6720d7d..d8499fa 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -99,7 +99,9 @@ err: extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino); -extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); +extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, + unsigned int blktype); extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index bef3ab6..33c8407 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -963,17 +963,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) * The inode, if one has been found, in inode. */ -static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, - u64 skip) +static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip) { u32 goal = 0, block; u64 no_addr; struct gfs2_sbd *sdp = rgd->rd_sbd; unsigned int n; + struct gfs2_glock *gl; + struct gfs2_inode *ip; + int error; + int found = 0; - for(;;) { - if (goal >= rgd->rd_data) - break; + while (goal < rgd->rd_data) { down_write(&sdp->sd_log_flush_lock); n = 1; block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, @@ -990,11 +991,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, if (no_addr == skip) continue; *last_unlinked = no_addr; - return no_addr; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl); + if (error) + continue; + + /* If the inode is already in cache, we can ignore it here + * because the existing inode disposal code will deal with + * it when all refs have gone away. Accessing gl_object like + * this is not safe in general. Here it is ok because we do + * not dereference the pointer, and we only need an approx + * answer to whether it is NULL or not. + */ + ip = gl->gl_object; + + if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put(gl); + else + found++; + + /* Limit reclaim to sensible number of tasks */ + if (found > 2*NR_CPUS) + return; } rgd->rd_flags &= ~GFS2_RDF_CHECK; - return 0; + return; } /** @@ -1075,11 +1097,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) * Try to acquire rgrp in way which avoids contending with others. * * Returns: errno - * unlinked: the block address of an unlinked block to be reclaimed */ -static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, - u64 *last_unlinked) +static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; @@ -1089,7 +1109,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, int loops = 0; int error, rg_locked; - *unlinked = 0; rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); while (rgd) { @@ -1106,17 +1125,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, case 0: if (try_rgrp_fit(rgd, al)) goto out; - /* If the rg came in already locked, there's no - way we can recover from a failed try_rgrp_unlink - because that would require an iput which can only - happen after the rgrp is unlocked. */ - if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) - *unlinked = try_rgrp_unlink(rgd, last_unlinked, - ip->i_no_addr); + if (rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (*unlinked) - return -EAGAIN; /* fall through */ case GLR_TRYFAILED: rgd = recent_rgrp_next(rgd); @@ -1145,13 +1157,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, case 0: if (try_rgrp_fit(rgd, al)) goto out; - if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) - *unlinked = try_rgrp_unlink(rgd, last_unlinked, - ip->i_no_addr); + if (rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (*unlinked) - return -EAGAIN; break; case GLR_TRYFAILED: @@ -1204,12 +1213,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; int error = 0; - u64 last_unlinked = NO_BLOCK, unlinked; + u64 last_unlinked = NO_BLOCK; + int tries = 0; if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; -try_again: if (hold_rindex) { /* We need to hold the rindex unless the inode we're using is the rindex itself, in which case it's already held. */ @@ -1218,31 +1227,23 @@ try_again: else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ error = gfs2_ri_update_special(ip); + if (error) + return error; } - if (error) - return error; + do { + error = get_local_rgrp(ip, &last_unlinked); + /* If there is no space, flushing the log may release some */ + if (error) + gfs2_log_flush(sdp, NULL); + } while (error && tries++ < 3); - /* Find an rgrp suitable for allocation. If it encounters any unlinked - dinodes along the way, error will equal -EAGAIN and unlinked will - contains it block address. We then need to look up that inode and - try to free it, and try the allocation again. */ - error = get_local_rgrp(ip, &unlinked, &last_unlinked); if (error) { if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); - if (error != -EAGAIN) - return error; - - gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked); - /* regardless of whether or not gfs2_process_unlinked_inode - was successful, we don't want to repeat it again. */ - last_unlinked = unlinked; - gfs2_log_flush(sdp, NULL); - error = 0; - - goto try_again; + return error; } + /* no error, so we have the rgrp set in the inode's allocation. */ al->al_file = file; al->al_line = line; -- cgit v1.1 From 8e35f8e7c61c88f9a979a4e6f7f4ffd4c158a88a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Nov 2010 09:11:55 -0400 Subject: NLM: Fix a regression in lockd Nick Bowler reports: There are no unusual messages on the client... but I just logged into the server and I see lots of messages of the following form: nfsd: request from insecure port (192.168.8.199:35766)! nfsd: request from insecure port (192.168.8.199:35766)! nfsd: request from insecure port (192.168.8.199:35766)! nfsd: request from insecure port (192.168.8.199:35766)! nfsd: request from insecure port (192.168.8.199:35766)! Bisected to commit 9247685088398cf21bcb513bd2832b4cd42516c4 (SUNRPC: Properly initialize sock_xprt.srcaddr in all cases) Apparently, removing the 'transport->srcaddr.ss_family = family' from xs_create_sock() triggers this due to nlmclnt_lookup_host() incorrectly initialising the srcaddr family to AF_UNSPEC. Reported-by: Nick Bowler Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 25e21e4..ed0c59f 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -124,7 +124,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) continue; if (host->h_server != ni->server) continue; - if (ni->server && + if (ni->server && ni->src_len != 0 && !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap)) continue; @@ -167,6 +167,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) host->h_addrlen = ni->salen; rpc_set_port(nlm_addr(host), 0); memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); + host->h_srcaddrlen = ni->src_len; host->h_version = ni->version; host->h_proto = ni->protocol; host->h_rpcclnt = NULL; @@ -238,9 +239,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, const char *hostname, int noresvport) { - const struct sockaddr source = { - .sa_family = AF_UNSPEC, - }; struct nlm_lookup_host_info ni = { .server = 0, .sap = sap, @@ -249,8 +247,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, .version = version, .hostname = hostname, .hostname_len = strlen(hostname), - .src_sap = &source, - .src_len = sizeof(source), .noresvport = noresvport, }; @@ -357,7 +353,6 @@ nlm_bind_host(struct nlm_host *host) .protocol = host->h_proto, .address = nlm_addr(host), .addrsize = host->h_addrlen, - .saddress = nlm_srcaddr(host), .timeout = &timeparms, .servername = host->h_name, .program = &nlm_program, @@ -376,6 +371,8 @@ nlm_bind_host(struct nlm_host *host) args.flags |= RPC_CLNT_CREATE_HARDRTRY; if (host->h_noresvport) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (host->h_srcaddrlen) + args.saddress = nlm_srcaddr(host); clnt = rpc_create(&args); if (!IS_ERR(clnt)) -- cgit v1.1 From 1e657bd51f313d87fbbb22d1edf625dba87ef353 Mon Sep 17 00:00:00 2001 From: Paulius Zaleckas Date: Sun, 31 Oct 2010 18:21:05 +0200 Subject: Regression: fix mounting NFS when NFSv3 support is not compiled Trying to mount NFS (root partition in my case) fails if CONFIG_NFS_V3 is not selected. nfs_validate_mount_data() returns EPROTONOSUPPORT, because of this check: #ifndef CONFIG_NFS_V3 if (args->version == 3) goto out_v3_not_compiled; #endif /* !CONFIG_NFS_V3 */ and args->version was always initialized to 3. It was working in 2.6.36 Signed-off-by: Paulius Zaleckas Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0a42e8f..9587506 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -67,6 +67,12 @@ #define NFSDBG_FACILITY NFSDBG_VFS +#ifdef CONFIG_NFS_V3 +#define NFS_DEFAULT_VERSION 3 +#else +#define NFS_DEFAULT_VERSION 2 +#endif + enum { /* Mount options that take no arguments */ Opt_soft, Opt_hard, @@ -2277,7 +2283,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, }; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(3); + data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) goto out_free_fh; -- cgit v1.1 From 23ebbd9acf5756b6eb783df84403e3ab668a6bce Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Nov 2010 10:24:16 -0400 Subject: Revert "NFSv4: Fall back to ordinary lookup if nfs4_atomic_open() returns EISDIR" This reverts commit 80e60639f1b7c121a7fea53920c5a4b94009361a. This change requires further fixes to ensure that the open doesn't succeed if the lookup later results in a regular file being created. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 07ac384..635ff65 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1345,12 +1345,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ - case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; + /* case -EISDIR: */ /* case -EINVAL: */ default: res = ERR_CAST(inode); -- cgit v1.1 From 8cd51a0ccd1beda4482507769887c0be9d70f8c1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Nov 2010 20:26:22 -0500 Subject: NFS: Fix a couple of regressions in readdir. Fix up the issue that array->eof_index needs to be able to be set even if array->size == 0. Ensure that we catch all important memory allocation error conditions and/or kmap() failures. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 90 +++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 635ff65..c6ce8af 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -194,9 +194,13 @@ typedef struct { static struct nfs_cache_array *nfs_readdir_get_array(struct page *page) { + void *ptr; if (page == NULL) return ERR_PTR(-EIO); - return (struct nfs_cache_array *)kmap(page); + ptr = kmap(page); + if (ptr == NULL) + return ERR_PTR(-ENOMEM); + return ptr; } static @@ -213,6 +217,9 @@ int nfs_readdir_clear_array(struct page *page, gfp_t mask) { struct nfs_cache_array *array = nfs_readdir_get_array(page); int i; + + if (IS_ERR(array)) + return PTR_ERR(array); for (i = 0; i < array->size; i++) kfree(array->array[i].string.name); nfs_readdir_release_array(page); @@ -244,7 +251,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (IS_ERR(array)) return PTR_ERR(array); - ret = -EIO; + ret = -ENOSPC; if (array->size >= MAX_READDIR_ARRAY) goto out; @@ -255,9 +262,9 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (ret) goto out; array->last_cookie = entry->cookie; + array->size++; if (entry->eof == 1) array->eof_index = array->size; - array->size++; out: nfs_readdir_release_array(page); return ret; @@ -272,7 +279,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri if (diff < 0) goto out_eof; if (diff >= array->size) { - if (array->eof_index > 0) + if (array->eof_index >= 0) goto out_eof; desc->current_index += array->size; return -EAGAIN; @@ -281,8 +288,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri index = (unsigned int)diff; *desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; - if (index == array->eof_index) - desc->eof = 1; return 0; out_eof: desc->eof = 1; @@ -296,17 +301,17 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des int status = -EAGAIN; for (i = 0; i < array->size; i++) { - if (i == array->eof_index) { - desc->eof = 1; - status = -EBADCOOKIE; - } if (array->array[i].cookie == *desc->dir_cookie) { desc->cache_entry_index = i; status = 0; - break; + goto out; } } - + if (i == array->eof_index) { + desc->eof = 1; + status = -EBADCOOKIE; + } +out: return status; } @@ -449,7 +454,7 @@ out: /* Perform conversion from xdr to cache array */ static -void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, +int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, void *xdr_page, struct page *page, unsigned int buflen) { struct xdr_stream stream; @@ -471,21 +476,29 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e do { status = xdr_decode(desc, entry, &stream); - if (status != 0) + if (status != 0) { + if (status == -EAGAIN) + status = 0; break; + } - if (nfs_readdir_add_to_array(entry, page) == -1) - break; if (desc->plus == 1) nfs_prime_dcache(desc->file->f_path.dentry, entry); + + status = nfs_readdir_add_to_array(entry, page); + if (status != 0) + break; } while (!entry->eof); if (status == -EBADCOOKIE && entry->eof) { array = nfs_readdir_get_array(page); - array->eof_index = array->size - 1; - status = 0; - nfs_readdir_release_array(page); + if (!IS_ERR(array)) { + array->eof_index = array->size; + status = 0; + nfs_readdir_release_array(page); + } } + return status; } static @@ -537,7 +550,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct nfs_entry entry; struct file *file = desc->file; struct nfs_cache_array *array; - int status = 0; + int status = -ENOMEM; unsigned int array_size = ARRAY_SIZE(pages); entry.prev_cookie = 0; @@ -549,6 +562,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out; array = nfs_readdir_get_array(page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; @@ -560,8 +577,13 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, if (status < 0) break; - nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE); - } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY); + status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE); + if (status < 0) { + if (status == -ENOSPC) + status = 0; + break; + } + } while (array->eof_index < 0); nfs_readdir_free_large_page(pages_ptr, pages, array_size); out_release_array: @@ -582,8 +604,10 @@ static int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) { struct inode *inode = desc->file->f_path.dentry->d_inode; + int ret; - if (nfs_readdir_xdr_to_array(desc, page, inode) < 0) + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) goto error; SetPageUptodate(page); @@ -595,7 +619,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) return 0; error: unlock_page(page); - return -EIO; + return ret; } static @@ -608,12 +632,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc) static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - struct page *page; - page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, + return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); - if (IS_ERR(page)) - desc->eof = 1; - return page; } /* @@ -639,8 +659,10 @@ int find_cache_page(nfs_readdir_descriptor_t *desc) static inline int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) { - int res = -EAGAIN; + int res; + if (desc->page_index == 0) + desc->current_index = 0; while (1) { res = find_cache_page(desc); if (res != -EAGAIN) @@ -670,6 +692,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, struct dentry *dentry = NULL; array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) + return PTR_ERR(array); for (i = desc->cache_entry_index; i < array->size; i++) { d_type = DT_UNKNOWN; @@ -685,11 +709,9 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, *desc->dir_cookie = array->array[i+1].cookie; else *desc->dir_cookie = array->last_cookie; - if (i == array->eof_index) { - desc->eof = 1; - break; - } } + if (i == array->eof_index) + desc->eof = 1; nfs_readdir_release_array(desc->page); cache_page_release(desc); -- cgit v1.1 From ac39612824e1fad8baf82c2841e42b2142af3445 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 15 Nov 2010 20:26:22 -0500 Subject: NFS: readdir shouldn't read beyond the reply returned by the server Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 +++- fs/nfs/nfs2xdr.c | 4 ++-- fs/nfs/nfs3xdr.c | 4 ++-- fs/nfs/nfs4proc.c | 4 +++- fs/nfs/nfs4xdr.c | 2 +- 5 files changed, 11 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c6ce8af..c9196c9 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -573,11 +573,13 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, if (!pages_ptr) goto out_release_array; do { + unsigned int pglen; status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); if (status < 0) break; - status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE); + pglen = status; + status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen); if (status < 0) { if (status == -ENOSPC) status = 0; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index e6bf457..2563f76 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -423,7 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) struct page **page; size_t hdrlen; unsigned int pglen, recvd; - int status, nr = 0; + int status; if ((status = ntohl(*p++))) return nfs_stat_to_errno(status); @@ -443,7 +443,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) if (pglen > recvd) pglen = recvd; page = rcvbuf->pages; - return nr; + return pglen; } static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index d9a5e83..748dc91 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -555,7 +555,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res struct page **page; size_t hdrlen; u32 recvd, pglen; - int status, nr = 0; + int status; status = ntohl(*p++); /* Decode post_op_attrs */ @@ -586,7 +586,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res pglen = recvd; page = rcvbuf->pages; - return nr; + return pglen; } __be32 * diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0f24cdf..6a653ff 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2852,8 +2852,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); res.pgbase = args.pgbase; status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); - if (status == 0) + if (status >= 0) { memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); + status += args.pgbase; + } nfs_invalidate_atime(dir); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index f313c4c..b7a204f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4518,7 +4518,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n xdr_read_pages(xdr, pglen); - return 0; + return pglen; } static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) -- cgit v1.1 From 04e4bd1c67f941d81bff78a3b6b94194f081b7df Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 11 Nov 2010 12:53:47 +0000 Subject: nfs: Ignore kmemleak false positive in nfs_readdir_make_qstr Strings allocated via kmemdup() in nfs_readdir_make_qstr() are referenced from the nfs_cache_array which is stored in a page cache page. Kmemleak does not scan such pages and it reports several false positives. This patch annotates the string->name pointer so that kmemleak does not consider it a real leak. Signed-off-by: Catalin Marinas Cc: Bryan Schumaker Cc: Trond Myklebust Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c9196c9..662df2a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "delegation.h" #include "iostat.h" @@ -238,6 +239,11 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le string->name = kmemdup(name, len, GFP_KERNEL); if (string->name == NULL) return -ENOMEM; + /* + * Avoid a kmemleak false positive. The pointer to the name is stored + * in a page cache page which kmemleak does not scan. + */ + kmemleak_not_leak(string->name); string->hash = full_name_hash(name, len); return 0; } -- cgit v1.1 From 451a3c24b0135bce54542009b5fde43846c7cf67 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Nov 2010 16:26:55 +0100 Subject: BKL: remove extraneous #include The big kernel lock has been removed from all these files at some point, leaving only the #include. Remove this too as a cleanup. Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- fs/block_dev.c | 1 - fs/ceph/inode.c | 1 - fs/ceph/mds_client.c | 1 - fs/compat_ioctl.c | 1 - fs/ecryptfs/super.c | 1 - fs/ext3/super.c | 1 - fs/ioctl.c | 1 - fs/lockd/clntlock.c | 1 - fs/lockd/clntproc.c | 1 - fs/lockd/svc4proc.c | 1 - fs/lockd/svclock.c | 1 - fs/lockd/svcproc.c | 1 - fs/locks.c | 1 - fs/namespace.c | 1 - fs/ncpfs/dir.c | 1 - fs/ncpfs/file.c | 1 - fs/ncpfs/inode.c | 1 - fs/ncpfs/ioctl.c | 1 - fs/nfs/callback.c | 1 - fs/nfs/delegation.c | 1 - fs/nfs/super.c | 1 - fs/ocfs2/super.c | 1 - fs/proc/inode.c | 1 - fs/read_write.c | 1 - fs/reiserfs/inode.c | 1 - fs/reiserfs/ioctl.c | 1 - fs/reiserfs/journal.c | 1 - fs/reiserfs/super.c | 1 - 28 files changed, 28 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 06e8ff1..4230252 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1d6a45b..524b80b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2,7 +2,6 @@ #include #include -#include #include #include #include diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3142b15..7799cac 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -6,7 +6,6 @@ #include #include #include -#include #include "super.h" #include "mds_client.h" diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 410ed18..a60579b 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 2537323..2720178 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include "ecryptfs_kernel.h" diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2fedaf8..acf8695 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ioctl.c b/fs/ioctl.c index e92fdbb..4f46752 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index d5bb868..25509eb 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 47ea1e1..332c54c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -7,7 +7,6 @@ */ #include -#include #include #include #include diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index a336e83..38d2611 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c462d34..ef5659b 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index c3069f3..0caea53 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/fs/locks.c b/fs/locks.c index 0e62dd3..8729347 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -122,7 +122,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/namespace.c b/fs/namespace.c index 8a415c9..3dbfc07 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index aac8832e..f22b12e7 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -19,7 +19,6 @@ #include #include #include -#include #include diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 6c754f7..cb50aaf 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include "ncplib_kernel.h" diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index d290545..8fb93b6 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index c2a1f9a..d40a547 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index aeec017..93a8b3b 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 232a7ee..1fd62fc 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9587506..3c04504 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f02c0ef..cfeab7c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -41,7 +41,6 @@ #include #include #include -#include #define MLOG_MASK_PREFIX ML_SUPER #include diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 9c2b5f4..3ddb606 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/fs/read_write.c b/fs/read_write.c index 431a0ed..5d431ba 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 41656d4..0bae036 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index adf22b4..bd9763e 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -9,7 +9,6 @@ #include #include #include -#include #include /* diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 076c8b1..d31bce1 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3bf7a64..b243117 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -28,7 +28,6 @@ #include #include #include -#include struct file_system_type reiserfs_fs_type; -- cgit v1.1 From 460781b54253e3ed10a0a2a433bdc548ec952269 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Nov 2010 16:26:56 +0100 Subject: BKL: remove references to lock_kernel from comments Lock_kernel is gone from the code, so the comments should be updated, too. nfsd now uses lock_flocks instead of lock_kernel to protect against posix file locks. Signed-off-by: Arnd Bergmann Acked-by: J. Bruce Fields Cc: linux-nfs@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ad2bfa6..116cab9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2262,7 +2262,7 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access) * Spawn a thread to perform a recall on the delegation represented * by the lease (file_lock) * - * Called from break_lease() with lock_kernel() held. + * Called from break_lease() with lock_flocks() held. * Note: we assume break_lease will only call this *once* for any given * lease. */ @@ -2286,7 +2286,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl) list_add_tail(&dp->dl_recall_lru, &del_recall_lru); spin_unlock(&recall_lock); - /* only place dl_time is set. protected by lock_kernel*/ + /* only place dl_time is set. protected by lock_flocks*/ dp->dl_time = get_seconds(); /* @@ -2303,7 +2303,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl) /* * The file_lock is being reapd. * - * Called by locks_free_lock() with lock_kernel() held. + * Called by locks_free_lock() with lock_flocks() held. */ static void nfsd_release_deleg_cb(struct file_lock *fl) @@ -2318,7 +2318,7 @@ void nfsd_release_deleg_cb(struct file_lock *fl) } /* - * Called from setlease() with lock_kernel() held + * Called from setlease() with lock_flocks() held */ static int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try) -- cgit v1.1 From 08da1193d2c8c7a25d0cef7f85d0b9f1ad7c583a Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Wed, 17 Nov 2010 21:46:06 -0500 Subject: ext4: fix setting random pages PageUptodate ext4_end_bio calls put_page and kmem_cache_free before calling SetPageUpdate(). This can result in setting the PageUptodate bit on random pages and causes the following BUG: BUG: Bad page state in process rm pfn:52e54 page:ffffea0001222260 count:0 mapcount:0 mapping: (null) index:0x0 arch kernel: page flags: 0x4000000000000008(uptodate) Fix the problem by moving put_io_page() after the SetPageUpdate() call. Thanks to Hugh Dickins for analyzing this problem. Reported-by: Markus Trippelsdorf Tested-by: Markus Trippelsdorf Signed-off-by: Markus Trippelsdorf Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7f5451c..beacce1 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -237,8 +237,6 @@ static void ext4_end_bio(struct bio *bio, int error) } while (bh != head); } - put_io_page(io_end->pages[i]); - /* * If this is a partial write which happened to make * all buffers uptodate then we can optimize away a @@ -248,6 +246,8 @@ static void ext4_end_bio(struct bio *bio, int error) */ if (!partial_write) SetPageUptodate(page); + + put_io_page(io_end->pages[i]); } io_end->num_io_pages = 0; inode = io_end->inode; -- cgit v1.1 From f4c8cc652d9f70680dd91be60a7a455040d0a282 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 17 Nov 2010 21:46:25 -0500 Subject: ext4: missing unlock in ext4_clear_request_list() If the the li_request_list was empty then it returned with the lock held. Instead of adding a "goto unlock" I just removed that special case and let it go past the empty list_for_each_safe(). Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 61182fe..ef09d14 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2799,9 +2799,6 @@ static void ext4_clear_request_list(void) struct ext4_li_request *elr; mutex_lock(&ext4_li_info->li_list_mtx); - if (list_empty(&ext4_li_info->li_request_list)) - return; - list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { elr = list_entry(pos, struct ext4_li_request, lr_request); -- cgit v1.1 From 0587aa3d11f9769a301b21bff2c3ed8365606b8d Mon Sep 17 00:00:00 2001 From: yangsheng Date: Wed, 17 Nov 2010 21:46:26 -0500 Subject: jbd2: fix /proc/fs/jbd2/ when using an external journal In jbd2_journal_init_dev(), we need make sure the journal structure is fully initialzied before calling jbd2_stats_proc_init(). Reviewed-by: Andreas Dilger Signed-off-by: yangsheng Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c590d15..f837ba9 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -899,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, /* journal descriptor can store up to n blocks -bzzz */ journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; jbd2_stats_proc_init(journal); n = journal->j_blocksize / sizeof(journal_block_tag_t); journal->j_wbufsize = n; @@ -908,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, __func__); goto out_err; } - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_maxlen = len; - bdevname(journal->j_dev, journal->j_devname); - p = journal->j_devname; - while ((p = strchr(p, '/'))) - *p = '!'; bh = __getblk(journal->j_dev, start, journal->j_blocksize); if (!bh) { -- cgit v1.1 From 3105c19c450ac7c18ab28c19d364b588767261b3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Nov 2010 09:15:07 -0800 Subject: ceph: fix readdir EOVERFLOW on 32-bit archs One of the readdir filldir_t callers was passing the raw ceph 64-bit ino instead of the hashed 32-bit one, producing an EOVERFLOW in the filler callback. Fix this by calling the ceph_vino_to_ino() helper to do the conversion. Reported-by: Jan Smets Tested-by: Jan Smets Signed-off-by: Sage Weil --- fs/ceph/dir.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 5f67728..7d447af 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -358,18 +358,22 @@ more: u64 pos = ceph_make_fpos(frag, off); struct ceph_mds_reply_inode *in = rinfo->dir_in[off - fi->offset].in; + struct ceph_vino vino; + ino_t ino; + dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", off, off - fi->offset, rinfo->dir_nr, pos, rinfo->dir_dname_len[off - fi->offset], rinfo->dir_dname[off - fi->offset], in); BUG_ON(!in); ftype = le32_to_cpu(in->mode) >> 12; + vino.ino = le64_to_cpu(in->ino); + vino.snap = le64_to_cpu(in->snapid); + ino = ceph_vino_to_ino(vino); if (filldir(dirent, rinfo->dir_dname[off - fi->offset], rinfo->dir_dname_len[off - fi->offset], - pos, - le64_to_cpu(in->ino), - ftype) < 0) { + pos, ino, ftype) < 0) { dout("filldir stopping us...\n"); return 0; } -- cgit v1.1 From 14870b457524e745f1a118e17873d104b1a47b70 Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Thu, 18 Nov 2010 11:24:24 -0500 Subject: GFS2: Userland expects quota limit/warn/usage in 512b blocks Userland programs using the quotactl() syscall assume limit/warn/usage block counts in 512b basic blocks which were instead being read/written in fs blocksize in gfs2. With this patch, gfs2 correctly interacts with the syscall using 512b blocks. Signed-off-by: Abhi Das Reviewed-by: Christoph Hellwig Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 58a9b99..f606baf 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -631,6 +631,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, struct fs_disk_quota *fdq) { struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *mapping = inode->i_mapping; unsigned long index = loc >> PAGE_CACHE_SHIFT; unsigned offset = loc & (PAGE_CACHE_SIZE - 1); @@ -658,11 +659,11 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, qd->qd_qb.qb_value = qp->qu_value; if (fdq) { if (fdq->d_fieldmask & FS_DQ_BSOFT) { - qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); + qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); qd->qd_qb.qb_warn = qp->qu_warn; } if (fdq->d_fieldmask & FS_DQ_BHARD) { - qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); + qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); qd->qd_qb.qb_limit = qp->qu_limit; } } @@ -1497,9 +1498,9 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, fdq->d_version = FS_DQUOT_VERSION; fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; fdq->d_id = id; - fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); - fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); - fdq->d_bcount = be64_to_cpu(qlvb->qb_value); + fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; + fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; + fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; gfs2_glock_dq_uninit(&q_gh); out: @@ -1566,10 +1567,10 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, /* If nothing has changed, this is a no-op */ if ((fdq->d_fieldmask & FS_DQ_BSOFT) && - (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn))) + ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) fdq->d_fieldmask ^= FS_DQ_BSOFT; if ((fdq->d_fieldmask & FS_DQ_BHARD) && - (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit))) + ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) fdq->d_fieldmask ^= FS_DQ_BHARD; if (fdq->d_fieldmask == 0) goto out_i; -- cgit v1.1 From 5a9ae68a349aa076bc8557ee2fcf865574459282 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 19 Nov 2010 09:56:44 -0500 Subject: ext4: ext4_fill_super shouldn't return 0 on corruption At the start of ext4_fill_super, ret is set to -EINVAL, and any failure path out of that function returns ret. However, the generic_check_addressable clause sets ret = 0 (if it passes), which means that a subsequent failure (e.g. a group checksum error) returns 0 even though the mount should fail. This causes vfs_kern_mount in turn to think that the mount succeeded, leading to an oops. A simple fix is to avoid using ret for the generic_check_addressable check, which was last changed in commit 30ca22c70e3ef0a96ff84de69cd7e8561b416cb2. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ef09d14..14ada8c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3265,13 +3265,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ - ret = generic_check_addressable(sb->s_blocksize_bits, + err = generic_check_addressable(sb->s_blocksize_bits, ext4_blocks_count(es)); - if (ret) { + if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + ret = err; goto failed_mount; } -- cgit v1.1 From 93bb41f4f8b89ac8b4d0a734bc59634cb0a29a89 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 19 Nov 2010 21:18:35 -0500 Subject: fs: Do not dispatch FITRIM through separate super_operation There was concern that FITRIM ioctl is not common enough to be included in core vfs ioctl, as Christoph Hellwig pointed out there's no real point in dispatching this out to a separate vector instead of just through ->ioctl. So this commit removes ioctl_fstrim() from vfs ioctl and trim_fs from super_operation structure. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 - fs/ioctl.c | 39 --------------------------------------- 2 files changed, 40 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 14ada8c..e32195d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1197,7 +1197,6 @@ static const struct super_operations ext4_sops = { .quota_write = ext4_quota_write, #endif .bdev_try_to_free_page = bdev_try_to_free_page, - .trim_fs = ext4_trim_fs }; static const struct super_operations ext4_nojournal_sops = { diff --git a/fs/ioctl.c b/fs/ioctl.c index e92fdbb..f855ea4 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -530,41 +530,6 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } -static int ioctl_fstrim(struct file *filp, void __user *argp) -{ - struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; - struct fstrim_range range; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - /* If filesystem doesn't support trim feature, return. */ - if (sb->s_op->trim_fs == NULL) - return -EOPNOTSUPP; - - /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ - if (sb->s_bdev == NULL) - return -EINVAL; - - if (argp == NULL) { - range.start = 0; - range.len = ULLONG_MAX; - range.minlen = 0; - } else if (copy_from_user(&range, argp, sizeof(range))) - return -EFAULT; - - ret = sb->s_op->trim_fs(sb, &range); - if (ret < 0) - return ret; - - if ((argp != NULL) && - (copy_to_user(argp, &range, sizeof(range)))) - return -EFAULT; - - return 0; -} - /* * When you add any new common ioctls to the switches above and below * please update compat_sys_ioctl() too. @@ -615,10 +580,6 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, error = ioctl_fsthaw(filp); break; - case FITRIM: - error = ioctl_fstrim(filp, argp); - break; - case FS_IOC_FIEMAP: return ioctl_fiemap(filp, arg); -- cgit v1.1 From e681c047e47c0abe67bf95857f23814372793cb0 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 19 Nov 2010 21:47:07 -0500 Subject: ext4: Add EXT4_IOC_TRIM ioctl to handle batched discard Filesystem independent ioctl was rejected as not common enough to be in core vfs ioctl. Since we still need to access to this functionality this commit adds ext4 specific ioctl EXT4_IOC_TRIM to dispatch ext4_trim_fs(). It takes fstrim_range structure as an argument. fstrim_range is definec in the include/linux/fs.h and its definition is as follows. struct fstrim_range { __u64 start; __u64 len; __u64 minlen; } start - first Byte to trim len - number of Bytes to trim from start minlen - minimum extent length to trim, free extents shorter than this number of Bytes will be ignored. This will be rounded up to fs block size. After the FITRIM is done, the number of actually discarded Bytes is stored in fstrim_range.len to give the user better insight on how much storage space has been really released for wear-leveling. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/ioctl.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bf5ae88..eb3bc2f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -331,6 +331,30 @@ mext_out: return err; } + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext4_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + default: return -ENOTTY; } -- cgit v1.1 From 784b4e29a26617589edd290dd2919735e190c06e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sun, 21 Nov 2010 22:20:49 -0500 Subject: Btrfs: add migrate page for metadata inode Migrate page will directly call the btrfs btree writepage function, which isn't actually allowed. Our writepage assumes that you have locked the extent_buffer and flagged the block as written. Without doing these steps, we can corrupt metadata blocks. A later commit will remove the btree writepage function since it is really only safely used internally by btrfs. We use writepages for everything else. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b40dfe4..a67b98d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -355,6 +356,8 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, btrfs_header_generation(eb)); BUG_ON(ret); + WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)); + found_start = btrfs_header_bytenr(eb); if (found_start != start) { WARN_ON(1); @@ -693,6 +696,26 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, __btree_submit_bio_done); } +static int btree_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + /* + * we can't safely write a btree page from here, + * we haven't done the locking hook + */ + if (PageDirty(page)) + return -EAGAIN; + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + + return migrate_page(mapping, newpage, page); +} + static int btree_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; @@ -707,8 +730,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc) } redirty_page_for_writepage(wbc, page); - eb = btrfs_find_tree_block(root, page_offset(page), - PAGE_CACHE_SIZE); + eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE); WARN_ON(!eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); @@ -799,6 +821,7 @@ static const struct address_space_operations btree_aops = { .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, .sync_page = block_sync_page, + .migratepage = btree_migratepage, }; int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, -- cgit v1.1 From 0c56fa9662927354255f2f64617d1de61fc03db9 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 22 Nov 2010 03:01:39 +0000 Subject: btrfs: fix free dip and dip->csums twice bio_endio() will free dip and dip->csums, so dip and dip->csums twice will be freed twice. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5132c9a..8c027aa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5731,7 +5731,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); if (ret) - goto out_err; + goto free_ordered; if (write && !skip_sum) { ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, @@ -5740,7 +5740,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); if (ret) - goto out_err; + goto free_ordered; return; } else if (!skip_sum) btrfs_lookup_bio_sums_dio(root, inode, bio, @@ -5748,11 +5748,8 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, ret = btrfs_map_bio(root, rw, bio, 0, 1); if (ret) - goto out_err; + goto free_ordered; return; -out_err: - kfree(dip->csums); - kfree(dip); free_ordered: /* * If this is a write, we need to clean up the reserved space and kill -- cgit v1.1 From 88f794ede7fadd4b63135b94d1561c1f2d5eb5f5 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 22 Nov 2010 03:02:55 +0000 Subject: btrfs: cleanup duplicate bio allocating functions extent_bio_alloc() and compressed_bio_alloc() are similar, cleanup similar source code. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/compression.c | 15 +-------------- fs/btrfs/extent_io.c | 8 ++++---- fs/btrfs/extent_io.h | 3 +++ 3 files changed, 8 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7845d1f..b50bc4b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -91,23 +91,10 @@ static inline int compressed_bio_size(struct btrfs_root *root, static struct bio *compressed_bio_alloc(struct block_device *bdev, u64 first_byte, gfp_t gfp_flags) { - struct bio *bio; int nr_vecs; nr_vecs = bio_get_nr_vecs(bdev); - bio = bio_alloc(gfp_flags, nr_vecs); - - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); - } - - if (bio) { - bio->bi_size = 0; - bio->bi_bdev = bdev; - bio->bi_sector = first_byte >> 9; - } - return bio; + return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags); } static int check_compressed_csum(struct inode *inode, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3b7eaee..f60aa3c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1828,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err) bio_put(bio); } -static struct bio * -extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags) +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) { struct bio *bio; @@ -1919,7 +1919,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, else nr = bio_get_nr_vecs(bdev); - bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1c6d4f3..4183c81 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -310,4 +310,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, unsigned long op); +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags); #endif -- cgit v1.1 From e65e1535542931e51189832264cd282e5899e4b9 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 22 Nov 2010 03:04:43 +0000 Subject: btrfs: fix panic caused by direct IO btrfs paniced when we write >64KB data by direct IO at one time. Reproduce steps: # mkfs.btrfs /dev/sda5 /dev/sda6 # mount /dev/sda5 /mnt # dd if=/dev/zero of=/mnt/tmpfile bs=100K count=1 oflag=direct Then btrfs paniced: mapping failed logical 1103155200 bio len 69632 len 12288 ------------[ cut here ]------------ kernel BUG at fs/btrfs/volumes.c:3010! [SNIP] Pid: 1992, comm: btrfs-worker-0 Not tainted 2.6.37-rc1 #1 D2399/PRIMERGY RIP: 0010:[] [] btrfs_map_bio+0x202/0x210 [btrfs] [SNIP] Call Trace: [] __btrfs_submit_bio_done+0x1b/0x20 [btrfs] [] run_one_async_done+0x9f/0xb0 [btrfs] [] run_ordered_completions+0x80/0xc0 [btrfs] [] worker_loop+0x154/0x5f0 [btrfs] [] ? worker_loop+0x0/0x5f0 [btrfs] [] ? worker_loop+0x0/0x5f0 [btrfs] [] kthread+0x96/0xa0 [] kernel_thread_helper+0x4/0x10 [] ? kthread+0x0/0xa0 [] ? kernel_thread_helper+0x0/0x10 We fix this problem by splitting bios when we submit bios. Reported-by: Tsutomu Itoh Signed-off-by: Miao Xie Tested-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 184 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8c027aa..a47e4fa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5535,13 +5535,21 @@ struct btrfs_dio_private { u64 bytes; u32 *csums; void *private; + + /* number of bios pending for this dio */ + atomic_t pending_bios; + + /* IO errors */ + int errors; + + struct bio *orig_bio; }; static void btrfs_endio_direct_read(struct bio *bio, int err) { + struct btrfs_dio_private *dip = bio->bi_private; struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; struct bio_vec *bvec = bio->bi_io_vec; - struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; u64 start; @@ -5684,6 +5692,176 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, return 0; } +static void btrfs_end_dio_bio(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + + if (err) { + printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu " + "disk_bytenr %lu len %u err no %d\n", + dip->inode->i_ino, bio->bi_rw, bio->bi_sector, + bio->bi_size, err); + dip->errors = 1; + + /* + * before atomic variable goto zero, we must make sure + * dip->errors is perceived to be set. + */ + smp_mb__before_atomic_dec(); + } + + /* if there are more bios still pending for this dio, just exit */ + if (!atomic_dec_and_test(&dip->pending_bios)) + goto out; + + if (dip->errors) + bio_io_error(dip->orig_bio); + else { + set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); + bio_endio(dip->orig_bio, 0); + } +out: + bio_put(bio); +} + +static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, + u64 first_sector, gfp_t gfp_flags) +{ + int nr_vecs = bio_get_nr_vecs(bdev); + return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); +} + +static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + int rw, u64 file_offset, int skip_sum, + u32 *csums) +{ + int write = rw & REQ_WRITE; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + bio_get(bio); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto err; + + if (write && !skip_sum) { + ret = btrfs_wq_submit_bio(root->fs_info, + inode, rw, bio, 0, 0, + file_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + goto err; + } else if (!skip_sum) + btrfs_lookup_bio_sums_dio(root, inode, bio, + file_offset, csums); + + ret = btrfs_map_bio(root, rw, bio, 0, 1); +err: + bio_put(bio); + return ret; +} + +static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, + int skip_sum) +{ + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct bio *bio; + struct bio *orig_bio = dip->orig_bio; + struct bio_vec *bvec = orig_bio->bi_io_vec; + u64 start_sector = orig_bio->bi_sector; + u64 file_offset = dip->logical_offset; + u64 submit_len = 0; + u64 map_length; + int nr_pages = 0; + u32 *csums = dip->csums; + int ret = 0; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); + if (!bio) + return -ENOMEM; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + atomic_inc(&dip->pending_bios); + + map_length = orig_bio->bi_size; + ret = btrfs_map_block(map_tree, READ, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(bio); + return -EIO; + } + + while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { + if (unlikely(map_length < submit_len + bvec->bv_len || + bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len)) { + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the dip might get freed + * before we're done setting it up + */ + atomic_inc(&dip->pending_bios); + ret = __btrfs_submit_dio_bio(bio, inode, rw, + file_offset, skip_sum, + csums); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } + + if (!skip_sum) + csums = csums + nr_pages; + start_sector += submit_len >> 9; + file_offset += submit_len; + + submit_len = 0; + nr_pages = 0; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, + start_sector, GFP_NOFS); + if (!bio) + goto out_err; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + + map_length = orig_bio->bi_size; + ret = btrfs_map_block(map_tree, READ, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(bio); + goto out_err; + } + } else { + submit_len += bvec->bv_len; + nr_pages ++; + bvec++; + } + } + + ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, + csums); + if (!ret) + return 0; + + bio_put(bio); +out_err: + dip->errors = 1; + /* + * before atomic variable goto zero, we must + * make sure dip->errors is perceived to be set. + */ + smp_mb__before_atomic_dec(); + if (atomic_dec_and_test(&dip->pending_bios)) + bio_io_error(dip->orig_bio); + + /* bio_end_io() will handle error, so we needn't return it */ + return 0; +} + static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, loff_t file_offset) { @@ -5723,33 +5901,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, dip->disk_bytenr = (u64)bio->bi_sector << 9; bio->bi_private = dip; + dip->errors = 0; + dip->orig_bio = bio; + atomic_set(&dip->pending_bios, 0); if (write) bio->bi_end_io = btrfs_endio_direct_write; else bio->bi_end_io = btrfs_endio_direct_read; - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - if (ret) - goto free_ordered; - - if (write && !skip_sum) { - ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, 0, 0, - dip->logical_offset, - __btrfs_submit_bio_start_direct_io, - __btrfs_submit_bio_done); - if (ret) - goto free_ordered; + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); + if (!ret) return; - } else if (!skip_sum) - btrfs_lookup_bio_sums_dio(root, inode, bio, - dip->logical_offset, dip->csums); - - ret = btrfs_map_bio(root, rw, bio, 0, 1); - if (ret) - goto free_ordered; - return; free_ordered: /* * If this is a write, we need to clean up the reserved space and kill -- cgit v1.1 From 6f33434850ed87dc5e56b60ebbad3d3cf405f296 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 12 Nov 2010 23:17:56 +0000 Subject: btrfs: Fix early enospc because 'unused' calculated with wrong sign. 'unused' calculated with wrong sign in reserve_metadata_bytes(). This might have lead to unwanted over-reservations. Signed-off-by: Arne Jansen Reviewed-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a541bc8..ddaf634 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3413,7 +3413,7 @@ again: * our reservation. */ if (unused <= space_info->total_bytes) { - unused -= space_info->total_bytes; + unused = space_info->total_bytes - unused; if (unused >= num_bytes) { if (!reserved) space_info->bytes_reserved += orig_bytes; -- cgit v1.1 From 0de90876c6cb774d4a424dafc1fc9ec50071b81b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Nov 2010 13:40:41 +0000 Subject: Btrfs: handle the space_cache option properly When I added the clear_cache option I screwed up and took the break out of the space_cache case statement, so whenever you mount with space_cache you also get clear_cache, which does you no good if you say set space_cache in fstab so it always gets set. This patch adds the break back in properly. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 718b10d..66e4612 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -244,6 +244,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_space_cache: printk(KERN_INFO "btrfs: enabling disk space caching\n"); btrfs_set_opt(info->mount_opt, SPACE_CACHE); + break; case Opt_clear_cache: printk(KERN_INFO "btrfs: force clearing of disk cache\n"); btrfs_set_opt(info->mount_opt, CLEAR_CACHE); -- cgit v1.1 From 2a6b8daedaf3682bed3fc1d4e2390491f6e19c49 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 19 Nov 2010 01:36:10 +0000 Subject: btrfs: Check if dest_offset is block-size aligned before cloning file We've done the check for src_offset and src_length, and We should also check dest_offset, otherwise we'll corrupt the destination file: (After cloning file1 to file2 with unaligned dest_offset) # cat /mnt/file2 cat: /mnt/file2: Input/output error Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 463d91b..81b47bd 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1669,12 +1669,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, olen = len = src->i_size - off; /* if we extend to eof, continue to block boundary */ if (off + len == src->i_size) - len = ((src->i_size + bs-1) & ~(bs-1)) - - off; + len = ALIGN(src->i_size, bs) - off; /* verify the end result is block aligned */ - if ((off & (bs-1)) || - ((off + len) & (bs-1))) + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || + !IS_ALIGNED(destoff, bs)) goto out_unlock; /* do any pending delalloc/csum calc on src, one way or -- cgit v1.1 From 5f3888ff6f0b9dce60705765752b788a92557644 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 19 Nov 2010 01:36:34 +0000 Subject: btrfs: Set file size correctly in file clone Set src_offset = 0, src_length = 20K, dest_offset = 20K. And the original filesize of the dest file 'file2' is 30K: # ls -l /mnt/file2 -rw-r--r-- 1 root root 30720 Nov 18 16:42 /mnt/file2 Now clone file1 to file2, the dest file should be 40K, but it still shows 30K: # ls -l /mnt/file2 -rw-r--r-- 1 root root 30720 Nov 18 16:42 /mnt/file2 Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 81b47bd..6b4bfa7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1873,8 +1873,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * but shouldn't round up the file size */ endoff = new_key.offset + datal; - if (endoff > off+olen) - endoff = off+olen; + if (endoff > destoff+olen) + endoff = destoff+olen; if (endoff > inode->i_size) btrfs_i_size_write(inode, endoff); -- cgit v1.1 From f209561ad83c5ffd561dc4bc3a3c90b704fe9231 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 19 Nov 2010 02:05:24 +0000 Subject: btrfs: Show device attr correctly for symlinks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symlinks and files of other types show different device numbers, though they are on the same partition: $ touch tmp; ln -s tmp tmp2; stat tmp tmp2 File: `tmp' Size: 0 Blocks: 0 IO Block: 4096 regular empty file Device: 15h/21d Inode: 984027 Links: 1 --- snip --- File: `tmp2' -> `tmp' Size: 3 Blocks: 0 IO Block: 4096 symbolic link Device: 13h/19d Inode: 984028 Links: 1 Reported-by: Toke Høiland-Jørgensen Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a47e4fa..eed357f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7299,6 +7299,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .getattr = btrfs_getattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, .getxattr = btrfs_getxattr, -- cgit v1.1 From 0410c94aff109c02b6774a0ed00114987cda7ce5 Mon Sep 17 00:00:00 2001 From: Mariusz Kozlowski Date: Sat, 20 Nov 2010 12:03:07 +0000 Subject: btrfs: make 1-bit signed fileds unsigned Fixes these sparse warnings: fs/btrfs/ctree.h:811:17: error: dubious one-bit signed bitfield fs/btrfs/ctree.h:812:20: error: dubious one-bit signed bitfield fs/btrfs/ctree.h:813:19: error: dubious one-bit signed bitfield Signed-off-by: Mariusz Kozlowski Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8db9234..af52f6d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -808,9 +808,9 @@ struct btrfs_block_group_cache { int extents_thresh; int free_extents; int total_bitmaps; - int ro:1; - int dirty:1; - int iref:1; + unsigned int ro:1; + unsigned int dirty:1; + unsigned int iref:1; int disk_cache_state; -- cgit v1.1 From 2ede0daf01549cecf4bb0962c46dc47382047523 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 17 Nov 2010 18:54:54 +0000 Subject: Btrfs: handle NFS lookups properly People kept reporting NFS issues, specifically getting ESTALE alot. I figured out how to reproduce the problem SERVER mkfs.btrfs /dev/sda1 mount /dev/sda1 /mnt/btrfs-test btrfs subvol create /mnt/btrfs-test/foo service nfs start CLIENT mount server:/mnt/btrfs /mnt/test cd /mnt/test/foo ls SERVER echo 3 > /proc/sys/vm/drop_caches CLIENT ls <-- get an ESTALE here This is because the standard way to lookup a name in nfsd is to use readdir, and what it does is do a readdir on the parent directory looking for the inode of the child. So in this case the parent being / and the child being foo. Well subvols all have the same inode number, so doing a readdir of / looking for inode 256 will return '.', which obviously doesn't match foo. So instead we need to have our own .get_name so that we can find the right name. Our .get_name will either lookup the inode backref or the root backref, whichever we're looking for, and return the name we find. Running the above reproducer with this patch results in everything acting the way its supposed to. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/export.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 951ef09..6f04444 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -232,9 +232,85 @@ fail: return ERR_PTR(ret); } +static int btrfs_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = child->d_inode; + struct inode *dir = parent->d_inode; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode_ref *iref; + struct btrfs_root_ref *rref; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + int name_len; + int ret; + + if (!dir || !inode) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode)) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = inode->i_ino; + key.offset = dir->i_ino; + key.type = BTRFS_INODE_REF_KEY; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } else if (ret > 0) { + if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + path->slots[0]--; + } else { + btrfs_free_path(path); + return -ENOENT; + } + } + leaf = path->nodes[0]; + + if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + rref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + name_ptr = (unsigned long)(rref + 1); + name_len = btrfs_root_ref_name_len(leaf, rref); + } else { + iref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_ref); + name_ptr = (unsigned long)(iref + 1); + name_len = btrfs_inode_ref_name_len(leaf, iref); + } + + read_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_free_path(path); + + /* + * have to add the null termination to make sure that reconnect_path + * gets the right len for strlen + */ + name[name_len] = '\0'; + + return 0; +} + const struct export_operations btrfs_export_ops = { .encode_fh = btrfs_encode_fh, .fh_to_dentry = btrfs_fh_to_dentry, .fh_to_parent = btrfs_fh_to_parent, .get_parent = btrfs_get_parent, + .get_name = btrfs_get_name, }; -- cgit v1.1 From 76195853903ca613ba722203db9b747d70478fc7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Nov 2010 02:18:02 +0000 Subject: Btrfs: fix more ESTALE problems with NFS When creating new inodes we don't setup inode->i_generation. So if we generate an fh with a newly created inode we save the generation of 0, but if we flush the inode to disk and have to read it back when getting the inode on the server we'll have the right i_generation, so gens wont match and we get ESTALE. This patch properly sets inode->i_generation when we create the new inode and now I'm no longer getting ESTALE. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eed357f..fc22f55 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4501,6 +4501,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; + inode->i_generation = BTRFS_I(inode)->generation; btrfs_set_inode_space_info(root, inode); if (mode & S_IFDIR) -- cgit v1.1 From 6a912213046ecb6511fdf35531a0c7de3de963c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 20 Nov 2010 09:48:00 +0000 Subject: Btrfs: use dget_parent where we can UPDATED There are lots of places where we do dentry->d_parent->d_inode without holding the dentry->d_lock. This could cause problems with rename. So instead we need to use dget_parent() and hold the reference to the parent as long as we are going to use it's inode and then dput it at the end. Signed-off-by: Josef Bacik Cc: raven@themaw.net Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 9 ++++++--- fs/btrfs/ioctl.c | 20 ++++++++++++++++---- fs/btrfs/transaction.c | 5 ++++- fs/btrfs/tree-log.c | 21 +++++++++++++++++---- 4 files changed, 43 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fc22f55..c0faf47 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4811,10 +4811,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) { drop_inode = 1; } else { + struct dentry *parent = dget_parent(dentry); btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); BUG_ON(err); - btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); + btrfs_log_new_name(trans, inode, NULL, parent); + dput(parent); } nr = trans->blocks_used; @@ -6768,8 +6770,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BUG_ON(ret); if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_log_new_name(trans, old_inode, old_dir, - new_dentry->d_parent); + struct dentry *parent = dget_parent(new_dentry); + btrfs_log_new_name(trans, old_inode, old_dir, parent); + dput(parent); btrfs_end_log_trans(root); } out_fail: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6b4bfa7..f1c9bb4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -233,7 +233,8 @@ static noinline int create_subvol(struct btrfs_root *root, struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *new_root; - struct inode *dir = dentry->d_parent->d_inode; + struct dentry *parent = dget_parent(dentry); + struct inode *dir; int ret; int err; u64 objectid; @@ -242,8 +243,13 @@ static noinline int create_subvol(struct btrfs_root *root, ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, 0, &objectid); - if (ret) + if (ret) { + dput(parent); return ret; + } + + dir = parent->d_inode; + /* * 1 - inode item * 2 - refs @@ -251,8 +257,10 @@ static noinline int create_subvol(struct btrfs_root *root, * 2 - dir items */ trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + dput(parent); return PTR_ERR(trans); + } leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, objectid, NULL, 0, 0, 0); @@ -339,6 +347,7 @@ static noinline int create_subvol(struct btrfs_root *root, d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: + dput(parent); if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); @@ -354,6 +363,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid) { struct inode *inode; + struct dentry *parent; struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; int ret; @@ -396,7 +406,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, btrfs_orphan_cleanup(pending_snapshot->snap); - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); + parent = dget_parent(dentry); + inode = btrfs_lookup_dentry(parent->d_inode, dentry); + dput(parent); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1fffbc0..f50e931 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -902,6 +902,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct inode *parent_inode; + struct dentry *parent; struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; @@ -941,7 +942,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trans->block_rsv = &pending->block_rsv; dentry = pending->dentry; - parent_inode = dentry->d_parent->d_inode; + parent = dget_parent(dentry); + parent_inode = parent->d_inode; parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); @@ -989,6 +991,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_inode->i_ino, index, dentry->d_name.name, dentry->d_name.len); BUG_ON(ret); + dput(parent); key.offset = (u64)-1; pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a29f193..054744a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2869,6 +2869,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_root *root; + struct dentry *old_parent = NULL; /* * for regular files, if its inode is already on disk, we don't @@ -2910,10 +2911,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, if (IS_ROOT(parent)) break; - parent = parent->d_parent; + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; inode = parent->d_inode; } + dput(old_parent); out: return ret; } @@ -2945,6 +2949,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; + struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; @@ -3016,10 +3021,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (IS_ROOT(parent)) break; - parent = parent->d_parent; + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; } ret = 0; end_trans: + dput(old_parent); if (ret < 0) { BUG_ON(ret != -ENOSPC); root->fs_info->last_trans_log_full_commit = trans->transid; @@ -3039,8 +3047,13 @@ end_no_trans: int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry) { - return btrfs_log_inode_parent(trans, root, dentry->d_inode, - dentry->d_parent, 0); + struct dentry *parent = dget_parent(dentry); + int ret; + + ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); + dput(parent); + + return ret; } /* -- cgit v1.1 From 495e86779f4f319828bc10dfc0c9ac2161868077 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Nov 2010 20:36:10 +0000 Subject: Btrfs: hold i_mutex when calling btrfs_log_dentry_safe Since we walk up the path logging all of the parts of the inode's path, we need to hold i_mutex to make sure that the inode is not renamed while we're logging everything. btrfs_log_dentry_safe does dget_parent and all of that jazz, but we may get unexpected results if the rename changes the inode's location while we're higher up the path logging those dentries, so do this for safety reasons. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e354c33..c1faded 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1047,8 +1047,14 @@ out: if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + num_written = PTR_ERR(trans); + goto done; + } + mutex_lock(&inode->i_mutex); ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + mutex_unlock(&inode->i_mutex); if (ret == 0) { ret = btrfs_sync_log(trans, root); if (ret == 0) @@ -1067,6 +1073,7 @@ out: (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); } } +done: current->backing_dev_info = NULL; return num_written ? num_written : err; } -- cgit v1.1 From a1b075d28da563c5e2325577f282c042494254ba Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Nov 2010 20:36:11 +0000 Subject: Btrfs: make btrfs_add_nondir take parent inode as an argument Everybody who calls btrfs_add_nondir just passes in the dentry of the new file and then dereference dentry->d_parent->d_inode, but everybody who calls btrfs_add_nondir() are already passed the parent's inode. So instead of dereferencing dentry->d_parent, just make btrfs_add_nondir take the dir inode as an argument and pass that along so we don't have to worry about d_parent. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c0faf47..37cc177 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4623,12 +4623,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, } static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct dentry *dentry, struct inode *inode, - int backref, u64 index) + struct inode *dir, struct dentry *dentry, + struct inode *inode, int backref, u64 index) { - int err = btrfs_add_link(trans, dentry->d_parent->d_inode, - inode, dentry->d_name.name, - dentry->d_name.len, backref, index); + int err = btrfs_add_link(trans, dir, inode, + dentry->d_name.name, dentry->d_name.len, + backref, index); if (!err) { d_instantiate(dentry, inode); return 0; @@ -4669,8 +4669,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, + dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, mode, &index); err = PTR_ERR(inode); if (IS_ERR(inode)) @@ -4683,7 +4682,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { @@ -4731,10 +4730,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, - objectid, BTRFS_I(dir)->block_group, mode, - &index); + dentry->d_name.len, dir->i_ino, objectid, + BTRFS_I(dir)->block_group, mode, &index); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_unlock; @@ -4746,7 +4743,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, } btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { @@ -4806,7 +4803,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_set_trans_block_group(trans, dir); atomic_inc(&inode->i_count); - err = btrfs_add_nondir(trans, dentry, inode, 1, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); if (err) { drop_inode = 1; @@ -4856,8 +4853,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, + dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, S_IFDIR | mode, &index); if (IS_ERR(inode)) { @@ -4880,9 +4876,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (err) goto out_fail; - err = btrfs_add_link(trans, dentry->d_parent->d_inode, - inode, dentry->d_name.name, - dentry->d_name.len, 0, index); + err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, + dentry->d_name.len, 0, index); if (err) goto out_fail; @@ -6922,8 +6917,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, + dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, &index); err = PTR_ERR(inode); @@ -6937,7 +6931,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, } btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { -- cgit v1.1 From 45f49bce99d008d6864a20324548f35936ba46fb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sun, 21 Nov 2010 22:27:44 -0500 Subject: Btrfs: avoid NULL pointer deref in try_release_extent_buffer If we fail to find a pointer in the radix tree, don't try to deref the NULL one we do have. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f60aa3c..143d3f5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3837,8 +3837,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) spin_lock(&tree->buffer_lock); eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (!eb) - goto out; + if (!eb) { + spin_unlock(&tree->buffer_lock); + return ret; + } if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { ret = 0; -- cgit v1.1 From b47d19de2c714020ba8f5545a6e7d4968f37eb45 Mon Sep 17 00:00:00 2001 From: Arun Bharadwaj Date: Thu, 18 Nov 2010 10:36:43 +0000 Subject: Pure nfs client performance using odirect. When an application opens a file with O_DIRECT flag, if the size of the data that is written is equal to wsize, the client sends a WRITE RPC with stable flag set to UNSTABLE followed by a single COMMIT RPC rather than sending a single WRITE RPC with the stable flag set to FILE_SYNC. This a bug. Patch to fix this. Signed-off-by: Arun R Bharadwaj Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 84d3c8b..e6ace0d 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -867,7 +867,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, goto out; nfs_alloc_commit_data(dreq); - if (dreq->commit_data == NULL || count < wsize) + if (dreq->commit_data == NULL || count <= wsize) sync = NFS_FILE_SYNC; dreq->inode = inode; -- cgit v1.1 From 463a376eae1c92a66c912af539bfd4bbefa37673 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 20 Nov 2010 12:22:20 -0500 Subject: NFS: Buffer overflow in ->decode_dirent() should not be fatal Overflowing the buffer in the readdir ->decode_dirent() should not lead to a fatal error, but rather to an attempt to reread the record in question. Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 2 +- fs/nfs/nfs3xdr.c | 2 +- fs/nfs/nfs4xdr.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 2563f76..ab593773 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -495,7 +495,7 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EIO); + return ERR_PTR(-EAGAIN); } /* diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 748dc91..e79e4f5 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -656,7 +656,7 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s out_overflow: print_overflow_msg(__func__, xdr); out_overflow_exit: - return ERR_PTR(-EIO); + return ERR_PTR(-EAGAIN); } /* diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index b7a204f..a3b39cb 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6221,7 +6221,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EIO); + return ERR_PTR(-EAGAIN); } /* -- cgit v1.1 From 5c346854d8ce6ca91931f8fc9177934257a667d0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 20 Nov 2010 12:43:45 -0500 Subject: NFS: Assume eof if the server returns no readdir records Some servers are known to be buggy w.r.t. this. Deal with them... Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 662df2a..2789cb3 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -466,8 +466,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en struct xdr_stream stream; struct xdr_buf buf; __be32 *ptr = xdr_page; - int status; struct nfs_cache_array *array; + unsigned int count = 0; + int status; buf.head->iov_base = xdr_page; buf.head->iov_len = buflen; @@ -488,6 +489,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en break; } + count++; + if (desc->plus == 1) nfs_prime_dcache(desc->file->f_path.dentry, entry); @@ -496,13 +499,14 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en break; } while (!entry->eof); - if (status == -EBADCOOKIE && entry->eof) { + if (count == 0 || (status == -EBADCOOKIE && entry->eof == 1)) { array = nfs_readdir_get_array(page); if (!IS_ERR(array)) { array->eof_index = array->size; status = 0; nfs_readdir_release_array(page); - } + } else + status = PTR_ERR(array); } return status; } -- cgit v1.1 From e7c58e974a0318fcca5368e7b3570e10e9ae9028 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 20 Nov 2010 13:22:24 -0500 Subject: NFS: Fix a page leak in nfs_do_filldir() nfs_do_filldir() must always free desc->page when it is done, otherwise we end up leaking the page. Also remove unused variable 'dentry'. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2789cb3..42e66e9 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -701,11 +701,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, int res = 0; struct nfs_cache_array *array = NULL; unsigned int d_type = DT_UNKNOWN; - struct dentry *dentry = NULL; array = nfs_readdir_get_array(desc->page); - if (IS_ERR(array)) - return PTR_ERR(array); + if (IS_ERR(array)) { + res = PTR_ERR(array); + goto out; + } for (i = desc->cache_entry_index; i < array->size; i++) { d_type = DT_UNKNOWN; @@ -726,9 +727,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, desc->eof = 1; nfs_readdir_release_array(desc->page); +out: cache_page_release(desc); - if (dentry != NULL) - dput(dentry); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (unsigned long long)*desc->dir_cookie, res); return res; -- cgit v1.1 From 7a8e1dc34f52fd2927dbf7e520d7cd8eadc51336 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 20 Nov 2010 13:24:46 -0500 Subject: NFS: Fix a page leak in uncached_readdir() Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 42e66e9..353f47c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -763,13 +763,14 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, goto out; } + desc->page_index = 0; + desc->page = page; + if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) { status = -EIO; goto out_release; } - desc->page_index = 0; - desc->page = page; status = nfs_do_filldir(desc, dirent, filldir); out: -- cgit v1.1 From 85f8607e163f8d281fb407357279cb4ac6df12e6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 20 Nov 2010 13:24:49 -0500 Subject: NFS: Fix the error handling in "uncached_readdir()" Currently, uncached_readdir() is broken because if fails to handle the results from nfs_readdir_xdr_to_array() correctly. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 353f47c..2492bac 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -766,10 +766,9 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, desc->page_index = 0; desc->page = page; - if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) { - status = -EIO; + status = nfs_readdir_xdr_to_array(desc, page, inode); + if (status < 0) goto out_release; - } status = nfs_do_filldir(desc, dirent, filldir); -- cgit v1.1 From ece0b4233b6b915d1f63add2bd9f2733aec6317a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 20 Nov 2010 13:55:33 -0500 Subject: NFS: Don't ignore errors from nfs_do_filldir() We should ignore the errors from the filldir callback, and just interpret them as meaning we should exit, however we should definitely pass back ENOMEM errors. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2492bac..ddc2e43 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -709,13 +709,15 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, } for (i = desc->cache_entry_index; i < array->size; i++) { + struct nfs_cache_array_entry *ent; d_type = DT_UNKNOWN; - res = filldir(dirent, array->array[i].string.name, - array->array[i].string.len, file->f_pos, - nfs_compat_user_ino64(array->array[i].ino), d_type); - if (res < 0) + ent = &array->array[i]; + if (filldir(dirent, ent->string.name, ent->string.len, + file->f_pos, nfs_compat_user_ino64(ent->ino), d_type) < 0) { + desc->eof = 1; break; + } file->f_pos++; desc->cache_entry_index = i; if (i < (array->size-1)) @@ -820,14 +822,14 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { + res = 0; /* This means either end of directory */ if (*desc->dir_cookie && desc->eof == 0) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc, dirent, filldir); - if (res >= 0) + if (res == 0) continue; } - res = 0; break; } if (res == -ETOOSMALL && desc->plus) { @@ -842,10 +844,8 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) break; res = nfs_do_filldir(desc, dirent, filldir); - if (res < 0) { - res = 0; + if (res < 0) break; - } } out: nfs_unblock_sillyrename(dentry); -- cgit v1.1 From 3020093f578fb6c9acc6914dfd887a1ebd1db659 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 20 Nov 2010 15:18:22 -0500 Subject: NFS: Correct the array bound calculation in nfs_readdir_add_to_array It looks as if the array size calculation in MAX_READDIR_ARRAY does not take the alignment of struct nfs_cache_array_entry into account. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ddc2e43..ced7291 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -171,8 +171,6 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[0]; }; -#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry)) - typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); typedef struct { struct file *file; @@ -257,11 +255,14 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (IS_ERR(array)) return PTR_ERR(array); + + cache_entry = &array->array[array->size]; + + /* Check that this entry lies within the page bounds */ ret = -ENOSPC; - if (array->size >= MAX_READDIR_ARRAY) + if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) goto out; - cache_entry = &array->array[array->size]; cache_entry->cookie = entry->prev_cookie; cache_entry->ino = entry->ino; ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); -- cgit v1.1 From 0b26a0bf6ff398185546432420bb772bcfdf8d94 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 20 Nov 2010 14:26:44 -0500 Subject: NFS: Ensure we return the dirent->d_type when it is known Store the dirent->d_type in the struct nfs_cache_array_entry so that we can use it in getdents() calls. This fixes a regression with the new readdir code. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 7 ++++--- fs/nfs/internal.h | 9 +++++++++ fs/nfs/nfs2xdr.c | 2 ++ fs/nfs/nfs3xdr.c | 2 ++ fs/nfs/nfs4xdr.c | 4 ++++ 5 files changed, 21 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ced7291..8ea4a41 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -162,6 +162,7 @@ struct nfs_cache_array_entry { u64 cookie; u64 ino; struct qstr string; + unsigned char d_type; }; struct nfs_cache_array { @@ -265,6 +266,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) cache_entry->cookie = entry->prev_cookie; cache_entry->ino = entry->ino; + cache_entry->d_type = entry->d_type; ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); if (ret) goto out; @@ -701,7 +703,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, int i = 0; int res = 0; struct nfs_cache_array *array = NULL; - unsigned int d_type = DT_UNKNOWN; array = nfs_readdir_get_array(desc->page); if (IS_ERR(array)) { @@ -711,11 +712,11 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; - d_type = DT_UNKNOWN; ent = &array->array[i]; if (filldir(dirent, ent->string.name, ent->string.len, - file->f_pos, nfs_compat_user_ino64(ent->ino), d_type) < 0) { + file->f_pos, nfs_compat_user_ino64(ent->ino), + ent->d_type) < 0) { desc->eof = 1; break; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index db08ff3..e6356b7 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -362,6 +362,15 @@ unsigned int nfs_page_length(struct page *page) } /* + * Convert a umode to a dirent->d_type + */ +static inline +unsigned char nfs_umode_to_dtype(umode_t mode) +{ + return (mode >> 12) & 15; +} + +/* * Determine the number of pages in an array of length 'len' and * with a base offset of 'base' */ diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index ab593773..5914a19 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -485,6 +485,8 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se entry->prev_cookie = entry->cookie; entry->cookie = ntohl(*p++); + entry->d_type = DT_UNKNOWN; + p = xdr_inline_peek(xdr, 8); if (p != NULL) entry->eof = !p[0] && p[1]; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index e79e4f5..f6cc60f 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -622,11 +622,13 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s entry->prev_cookie = entry->cookie; p = xdr_decode_hyper(p, &entry->cookie); + entry->d_type = DT_UNKNOWN; if (plus) { entry->fattr->valid = 0; p = xdr_decode_post_op_attr_stream(xdr, entry->fattr); if (IS_ERR(p)) goto out_overflow_exit; + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); /* In fact, a post_op_fh3: */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a3b39cb..9f1826b 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6208,6 +6208,10 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) entry->ino = entry->fattr->fileid; + entry->d_type = DT_UNKNOWN; + if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + if (verify_attr_len(xdr, p, len) < 0) goto out_overflow; -- cgit v1.1 From 103cfcf522cefe00d8c322c6beac9a711acbf235 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 23 Nov 2010 09:26:02 +0300 Subject: nilfs2: nilfs_iget_for_gc() returns ERR_PTR nilfs_iget_for_gc() returns an ERR_PTR() on failure and doesn't return NULL. Signed-off-by: Dan Carpenter Signed-off-by: Ryusuke Konishi --- fs/nilfs2/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 3e90f86..e00d945 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -349,8 +349,8 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, ino = vdesc->vd_ino; cno = vdesc->vd_cno; inode = nilfs_iget_for_gc(sb, ino, cno); - if (unlikely(inode == NULL)) { - ret = -ENOMEM; + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); goto failed; } do { -- cgit v1.1 From f6c26ec5085be805c9dc72d074ef5f504b9cd7df Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 24 Nov 2010 02:18:59 +0900 Subject: nilfs2: fix typo in comment of nilfs_dat_move function Fixes a typo: "uncommited" -> "uncommitted". Signed-off-by: Ryusuke Konishi --- fs/nilfs2/dat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 49c844d..59e5fe7 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -335,7 +335,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) * the device at this point. * * To prevent nilfs_dat_translate() from returning the - * uncommited block number, this makes a copy of the entry + * uncommitted block number, this makes a copy of the entry * buffer and redirects nilfs_dat_translate() to the copy. */ if (!buffer_nilfs_redirected(entry_bh)) { -- cgit v1.1 From a0822c55779d9319939eac69f00bb729ea9d23da Mon Sep 17 00:00:00 2001 From: Ken Sumrall Date: Wed, 24 Nov 2010 12:57:00 -0800 Subject: fuse: fix attributes after open(O_TRUNC) The attribute cache for a file was not being cleared when a file is opened with O_TRUNC. If the filesystem's open operation truncates the file ("atomic_o_trunc" feature flag is set) then the kernel should invalidate the cached st_mtime and st_ctime attributes. Also i_size should be explicitly be set to zero as it is used sometimes without refreshing the cache. Signed-off-by: Ken Sumrall Cc: Anfei Cc: "Anand V. Avati" Signed-off-by: Miklos Szeredi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/file.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c822458..9242d29 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -134,6 +134,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open); void fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); if (ff->open_flags & FOPEN_DIRECT_IO) file->f_op = &fuse_direct_io_file_operations; @@ -141,6 +142,15 @@ void fuse_finish_open(struct inode *inode, struct file *file) invalidate_inode_pages2(inode->i_mapping); if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + i_size_write(inode, 0); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + } } int fuse_open_common(struct inode *inode, struct file *file, bool isdir) -- cgit v1.1 From ea251c1d5c481cda1cf6b0c9e4965f04a6cf2ffc Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 24 Nov 2010 12:57:13 -0800 Subject: pagemap: set pagemap walk limit to PMD boundary Currently one pagemap_read() call walks in PAGEMAP_WALK_SIZE bytes (== 512 pages.) But there is a corner case where walk_pmd_range() accidentally runs over a VMA associated with a hugetlbfs file. For example, when a process has mappings to VMAs as shown below: # cat /proc//maps ... 3a58f6d000-3a58f72000 rw-p 00000000 00:00 0 7fbd51853000-7fbd51855000 rw-p 00000000 00:00 0 7fbd5186c000-7fbd5186e000 rw-p 00000000 00:00 0 7fbd51a00000-7fbd51c00000 rw-s 00000000 00:12 8614 /hugepages/test then pagemap_read() goes into walk_pmd_range() path and walks in the range 0x7fbd51853000-0x7fbd51a53000, but the hugetlbfs VMA should be handled by walk_hugetlb_range(). Otherwise PMD for the hugepage is considered bad and cleared, which causes undesirable results. This patch fixes it by separating pagemap walk range into one PMD. Signed-off-by: Naoya Horiguchi Cc: Jun'ichi Nomura Acked-by: KAMEZAWA Hiroyuki Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index da6b01d..c126c83 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -706,6 +706,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * skip over unmapped regions. */ #define PAGEMAP_WALK_SIZE (PMD_SIZE) +#define PAGEMAP_WALK_MASK (PMD_MASK) static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -776,7 +777,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long end; pm.pos = 0; - end = start_vaddr + PAGEMAP_WALK_SIZE; + end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; -- cgit v1.1 From da905873effecd1c0166e578bc4b5006f041b18b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Nov 2010 12:57:15 -0800 Subject: reiserfs: fix inode mutex - reiserfs lock misordering reiserfs_unpack() locks the inode mutex with reiserfs_mutex_lock_safe() to protect against reiserfs lock dependency. However this protection requires to have the reiserfs lock to be locked. This is the case if reiserfs_unpack() is called by reiserfs_ioctl but not from reiserfs_quota_on() when it tries to unpack tails of quota files. Fix the ordering of the two locks in reiserfs_unpack() to fix this issue. Signed-off-by: Frederic Weisbecker Reported-by: Markus Gapp Reported-by: Jan Kara Cc: Jeff Mahoney Cc: [2.6.36.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/ioctl.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index bd9763e..79265fd 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -183,12 +183,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) return 0; } - /* we need to make sure nobody is changing the file size beneath - ** us - */ - reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); depth = reiserfs_write_lock_once(inode->i_sb); + /* we need to make sure nobody is changing the file size beneath us */ + reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); + write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ if (write_from == 0) { -- cgit v1.1 From 55a61d1d06a3dc443d0db8aaa613365dcb83b98a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Nov 2010 18:50:32 +0000 Subject: Btrfs: fix typo in fallocate to make it honor actual size There is a typo in __btrfs_prealloc_file_range() where we set the i_size to actual_len/cur_offset, and then just set it to cur_offset again, and do the same with btrfs_ordered_update_i_size(). This fixes it back to keeping i_size in a local variable and then updating i_size properly. Tested this with xfs_io -F -f -c "falloc 0 1" -c "pwrite 0 1" foo stat'ing foo gives us a size of 1 instead of 4096 like it was. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 37cc177..0058fb3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7002,6 +7002,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; + u64 i_size; int ret = 0; bool own_trans = true; @@ -7043,11 +7044,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, (actual_len > inode->i_size) && (cur_offset > inode->i_size)) { if (cur_offset > actual_len) - i_size_write(inode, actual_len); + i_size = actual_len; else - i_size_write(inode, cur_offset); - i_size_write(inode, cur_offset); - btrfs_ordered_update_i_size(inode, cur_offset, NULL); + i_size = cur_offset; + i_size_write(inode, i_size); + btrfs_ordered_update_i_size(inode, i_size, NULL); } ret = btrfs_update_inode(trans, root, inode); -- cgit v1.1 From 0ed42a63f3edb144b091d9528401fce95c3c4d8d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Nov 2010 18:55:39 +0000 Subject: Btrfs: make sure new inode size is ok in fallocate We have been failing xfstest 228 forever, because we don't check to make sure the new inode size is acceptable as far as RLIMIT is concerned. Just check to make sure it's ok to create a inode with this new size and error out if not. With this patch we now pass 228. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0058fb3..0eeacd9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7102,6 +7102,10 @@ static long btrfs_fallocate(struct inode *inode, int mode, btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, alloc_start); if (ret) -- cgit v1.1 From bc1cbf1f86aa2501efa9ca637c736fce6bcc4b1d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Nov 2010 19:50:59 +0000 Subject: Btrfs: update inode ctime when using links Currently we fail xfstest 236 because we're not updating the inode ctime on link. This is a simple fix, and makes it so we pass 236 now. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0eeacd9..6df921f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4785,6 +4785,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, return -EPERM; btrfs_inc_nlink(inode); + inode->i_ctime = CURRENT_TIME; err = btrfs_set_inode_index(dir, &index); if (err) -- cgit v1.1 From 619c8c763928841b1112e1d417f88bc1d44daecb Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 22 Nov 2010 02:21:38 +0000 Subject: Btrfs - fix race between btrfs_get_sb() and umount When mounting a btrfs file system btrfs_test_super() may attempt to use sb->s_fs_info, the btrfs root, of a super block that is going away and that has had the btrfs root set to NULL in its ->put_super(). But if the super block is going away it cannot be an existing super block so we can return false in this case. Signed-off-by: Ian Kent Signed-off-by: Chris Mason --- fs/btrfs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 66e4612..141fb31 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -566,6 +566,12 @@ static int btrfs_test_super(struct super_block *s, void *data) struct btrfs_fs_devices *test_fs_devices = data; struct btrfs_root *root = btrfs_sb(s); + /* + * If this super block is going away, return false as it + * can't match as an existing super block. + */ + if (!atomic_read(&s->s_active)) + return 0; return root->fs_info->fs_devices == test_fs_devices; } -- cgit v1.1 From 975f84fee2e8a77ee5f41bfe7c5682bf29366b10 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Nov 2010 19:36:57 +0000 Subject: Btrfs: fix fiemap There are two big problems currently with FIEMAP 1) We return extents for holes. This isn't supposed to happen, we just don't return extents for holes and then userspace interprets the lack of an extent as a hole. 2) We sometimes don't set FIEMAP_EXTENT_LAST properly. This is because we wait to see a EXTENT_FLAG_VACANCY flag on the em, but this won't happen if say we ask fiemap to map up to the last extent in a file, and there is nothing but holes up to the i_size. To fix this we need to lookup the last extent in this file and save the logical offset, so if we happen to try and map that extent we can be sure to set FIEMAP_EXTENT_LAST. With this patch we now pass xfstest 225, which we never have before. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 63 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 143d3f5..5e7a94d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2901,21 +2901,53 @@ out: int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { - int ret; + int ret = 0; u64 off = start; u64 max = start + len; u32 flags = 0; + u32 found_type; + u64 last; u64 disko = 0; + struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_file_extent_item *item; int end = 0; u64 em_start = 0, em_len = 0; unsigned long emflags; - ret = 0; + int hole = 0; if (len == 0) return -EINVAL; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, + path, inode->i_ino, -1, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + WARN_ON(!ret); + path->slots[0]--; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + + /* No extents, just return */ + if (found_key.objectid != inode->i_ino || + found_type != BTRFS_EXTENT_DATA_KEY) { + btrfs_free_path(path); + return 0; + } + last = found_key.offset; + btrfs_free_path(path); + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); em = get_extent(inode, NULL, 0, off, max - off, 0); @@ -2925,11 +2957,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = PTR_ERR(em); goto out; } + while (!end) { + hole = 0; off = em->start + em->len; if (off >= max) end = 1; + if (em->block_start == EXTENT_MAP_HOLE) { + hole = 1; + goto next; + } + em_start = em->start; em_len = em->len; @@ -2939,8 +2978,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; - } else if (em->block_start == EXTENT_MAP_HOLE) { - flags |= FIEMAP_EXTENT_UNWRITTEN; } else if (em->block_start == EXTENT_MAP_INLINE) { flags |= (FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED); @@ -2953,10 +2990,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; +next: emflags = em->flags; free_extent_map(em); em = NULL; - if (!end) { em = get_extent(inode, NULL, 0, off, max - off, 0); if (!em) @@ -2967,15 +3004,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } emflags = em->flags; } + if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); - if (ret) - goto out_free; + if (em_start == last) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + + if (!hole) { + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; + } } out_free: free_extent_map(em); -- cgit v1.1 From 450ba0ea06b6ed3612d27f2b7127a9de4160f285 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Nov 2010 14:59:15 -0500 Subject: Btrfs: setup blank root and fs_info for mount time There is a problem with how we use sget, it searches through the list of supers attached to the fs_type looking for a super with the same fs_devices as what we're trying to mount. This depends on sb->s_fs_info being filled, but we don't fill that in until we get to btrfs_fill_super, so we could hit supers on the fs_type super list that have a null s_fs_info. In order to fix that we need to go ahead and setup a blank root with a blank fs_info to hold fs_devices, that way our test will work out right and then we can set s_fs_info in btrfs_set_super, and then open_ctree will simply use our pre-allocated root and fs_info when setting everything up. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 6 ++---- fs/btrfs/super.c | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a67b98d..57c9d8e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1561,10 +1561,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, GFP_NOFS); struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), - GFP_NOFS); + struct btrfs_root *tree_root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 141fb31..47bf67c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -563,7 +563,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) static int btrfs_test_super(struct super_block *s, void *data) { - struct btrfs_fs_devices *test_fs_devices = data; + struct btrfs_root *test_root = data; struct btrfs_root *root = btrfs_sb(s); /* @@ -572,9 +572,17 @@ static int btrfs_test_super(struct super_block *s, void *data) */ if (!atomic_read(&s->s_active)) return 0; - return root->fs_info->fs_devices == test_fs_devices; + return root->fs_info->fs_devices == test_root->fs_info->fs_devices; } +static int btrfs_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + + return set_anon_super(s, data); +} + + /* * Find a superblock for the given device / mount point. * @@ -588,6 +596,8 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; + struct btrfs_root *tree_root = NULL; + struct btrfs_fs_info *fs_info = NULL; fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; @@ -615,8 +625,24 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, goto error_close_devices; } + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * it for searching for existing supers, so this lets us do that and + * then open_ctree will properly initialize everything later. + */ + fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info || !tree_root) { + error = -ENOMEM; + goto error_close_devices; + } + fs_info->tree_root = tree_root; + fs_info->fs_devices = fs_devices; + tree_root->fs_info = fs_info; + bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); if (IS_ERR(s)) goto error_s; @@ -685,6 +711,8 @@ error_s: error = PTR_ERR(s); error_close_devices: btrfs_close_devices(fs_devices); + kfree(fs_info); + kfree(tree_root); error_free_subvol_name: kfree(subvol_name); return error; -- cgit v1.1 From 71993e62a47dabddf10302807d6aa260455503f4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Nov 2010 13:56:09 -0800 Subject: Rename 'pipe_info()' to 'get_pipe_info()' .. and change it to take the 'file' pointer instead of an inode, since that's what all users want anyway. The renaming is preparatory to exporting it to other users. The old 'pipe_info()' name was too generic and is already used elsewhere, so before making the function public we need to use a more specific name. Cc: Jens Axboe Cc: Andrew Morton Cc: Al Viro Cc: Dave Jones Signed-off-by: Linus Torvalds --- fs/splice.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 8f1dfae..0d92dab 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1316,12 +1316,11 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, * location, so checking ->i_pipe is not enough to verify that this is a * pipe. */ -static inline struct pipe_inode_info *pipe_info(struct inode *inode) +static inline struct pipe_inode_info *get_pipe_info(struct file *file) { - if (S_ISFIFO(inode->i_mode)) - return inode->i_pipe; + struct inode *i = file->f_path.dentry->d_inode; - return NULL; + return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; } /* @@ -1336,8 +1335,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, loff_t offset, *off; long ret; - ipipe = pipe_info(in->f_path.dentry->d_inode); - opipe = pipe_info(out->f_path.dentry->d_inode); + ipipe = get_pipe_info(in); + opipe = get_pipe_info(out); if (ipipe && opipe) { if (off_in || off_out) @@ -1555,7 +1554,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, int error; long ret; - pipe = pipe_info(file->f_path.dentry->d_inode); + pipe = get_pipe_info(file); if (!pipe) return -EBADF; @@ -1642,7 +1641,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, }; long ret; - pipe = pipe_info(file->f_path.dentry->d_inode); + pipe = get_pipe_info(file); if (!pipe) return -EBADF; @@ -2022,8 +2021,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, static long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); - struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); + struct pipe_inode_info *ipipe = get_pipe_info(in); + struct pipe_inode_info *opipe = get_pipe_info(out); int ret = -EINVAL; /* -- cgit v1.1 From c66fb347946ebdd5b10908866ecc9fa05ee2cf3d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Nov 2010 14:09:57 -0800 Subject: Export 'get_pipe_info()' to other users And in particular, use it in 'pipe_fcntl()'. The other pipe functions do not need to use the 'careful' version, since they are only ever called for things that are already known to be pipes. The normal read/write/ioctl functions are called through the file operations structures, so if a file isn't a pipe, they'd never get called. But pipe_fcntl() is special, and called directly from the generic fcntl code, and needs to use the same careful function that the splice code is using. Cc: Jens Axboe Cc: Andrew Morton Cc: Al Viro Cc: Dave Jones Signed-off-by: Linus Torvalds --- fs/pipe.c | 2 +- fs/splice.c | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index a8012a9..b8997f8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1204,7 +1204,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) struct pipe_inode_info *pipe; long ret; - pipe = file->f_path.dentry->d_inode->i_pipe; + pipe = get_pipe_info(file); if (!pipe) return -EBADF; diff --git a/fs/splice.c b/fs/splice.c index 0d92dab..ce2f025 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1311,17 +1311,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); -/* - * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same - * location, so checking ->i_pipe is not enough to verify that this is a - * pipe. - */ -static inline struct pipe_inode_info *get_pipe_info(struct file *file) -{ - struct inode *i = file->f_path.dentry->d_inode; - - return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; -} /* * Determine where to splice to/from. -- cgit v1.1 From 72083646528d4887b920deb71b37e09bc7d227bb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Nov 2010 16:27:19 -0800 Subject: Un-inline get_pipe_info() helper function This avoids some include-file hell, and the function isn't really important enough to be inlined anyway. Reported-by: Ingo Molnar Signed-off-by: Linus Torvalds --- fs/pipe.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index b8997f8..04629f3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1199,6 +1199,18 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, return ret; } +/* + * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same + * location, so checking ->i_pipe is not enough to verify that this is a + * pipe. + */ +struct pipe_inode_info *get_pipe_info(struct file *file) +{ + struct inode *i = file->f_path.dentry->d_inode; + + return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; +} + long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe; -- cgit v1.1 From 163cf09c2a0ee5cac6285f9347975bd1e97725da Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sun, 28 Nov 2010 19:56:33 -0500 Subject: Btrfs: deal with DIO bios that span more than one ordered extent The new DIO bio splitting code has problems when the bio spans more than one ordered extent. This will happen as the generic DIO code merges our get_blocks calls together into a bigger single bio. This fixes things by walking forward in the ordered extent code finding all the overlapping ordered extents and completing them all at once. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 23 ++++++++++++++--- fs/btrfs/ordered-data.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ordered-data.h | 3 +++ 3 files changed, 89 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6df921f..0f34cae 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5602,15 +5602,18 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct btrfs_trans_handle *trans; struct btrfs_ordered_extent *ordered = NULL; struct extent_state *cached_state = NULL; + u64 ordered_offset = dip->logical_offset; + u64 ordered_bytes = dip->bytes; int ret; if (err) goto out_done; - - ret = btrfs_dec_test_ordered_pending(inode, &ordered, - dip->logical_offset, dip->bytes); +again: + ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, + &ordered_offset, + ordered_bytes); if (!ret) - goto out_done; + goto out_test; BUG_ON(!ordered); @@ -5670,8 +5673,20 @@ out_unlock: out: btrfs_delalloc_release_metadata(inode, ordered->len); btrfs_end_transaction(trans, root); + ordered_offset = ordered->file_offset + ordered->len; btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); + +out_test: + /* + * our bio might span multiple ordered extents. If we haven't + * completed the accounting for the whole dio, go back and try again + */ + if (ordered_offset < dip->logical_offset + dip->bytes) { + ordered_bytes = dip->logical_offset + dip->bytes - + ordered_offset; + goto again; + } out_done: bio->bi_private = dip->private; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f4621f6..ae7737e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -250,6 +250,73 @@ int btrfs_add_ordered_sum(struct inode *inode, /* * this is used to account for finished IO across a given range + * of the file. The IO may span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + * + * file_offset is updated to one byte past the range that is recorded as + * complete. This allows you to walk forward in the file. + */ +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + int ret; + u64 dec_end; + u64 dec_start; + u64 to_dec; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, *file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, *file_offset)) { + ret = 1; + goto out; + } + + dec_start = max(*file_offset, entry->file_offset); + dec_end = min(*file_offset + io_size, entry->file_offset + + entry->len); + *file_offset = dec_end; + if (dec_start > dec_end) { + printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", + (unsigned long long)dec_start, + (unsigned long long)dec_end); + } + to_dec = dec_end - dec_start; + if (to_dec > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)to_dec); + } + entry->bytes_left -= to_dec; + if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; +out: + if (!ret && cached && entry) { + *cached = entry; + atomic_inc(&entry->refs); + } + spin_unlock(&tree->lock); + return ret == 0; +} + +/* + * this is used to account for finished IO across a given range * of the file. The IO should not span ordered extents. If * a given ordered_extent is completely done, 1 is returned, otherwise * 0. diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 8ac3654..61dca83 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -141,6 +141,9 @@ int btrfs_remove_ordered_extent(struct inode *inode, int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, -- cgit v1.1 From 5a92bc88cef279261d3f138e25850c122df67045 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 29 Nov 2010 09:49:11 -0500 Subject: Btrfs: don't use migrate page without CONFIG_MIGRATION Fixes compile error Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 57c9d8e..33b6d45 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -712,8 +712,11 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - +#ifdef CONFIG_MIGRATION return migrate_page(mapping, newpage, page); +#else + return -ENOSYS; +#endif } static int btree_writepage(struct page *page, struct writeback_control *wbc) @@ -821,7 +824,9 @@ static const struct address_space_operations btree_aops = { .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, .sync_page = block_sync_page, +#ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, +#endif }; int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, -- cgit v1.1 From 37a09f07459753e7c98d4e21f1c61e8756923f81 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 30 Nov 2010 12:42:34 -0500 Subject: NFS: Fix a readdirplus bug When comparing filehandles in the helper nfs_same_file(), we should not be using 'strncmp()': filehandles are not null terminated strings. Instead, we should just use the existing helper nfs_compare_fh(). Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- fs/nfs/dir.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8ea4a41..f0a384e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -395,13 +395,9 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct x static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { - struct nfs_inode *node; if (dentry->d_inode == NULL) goto different; - node = NFS_I(dentry->d_inode); - if (node->fh.size != entry->fh->size) - goto different; - if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0) + if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) goto different; return 1; different: -- cgit v1.1 From 3c77f845722158206a7209c45ccddc264d19319c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 30 Nov 2010 20:55:34 +0100 Subject: exec: make argv/envp memory visible to oom-killer Brad Spengler published a local memory-allocation DoS that evades the OOM-killer (though not the virtual memory RLIMIT): http://www.grsecurity.net/~spender/64bit_dos.c execve()->copy_strings() can allocate a lot of memory, but this is not visible to oom-killer, nobody can see the nascent bprm->mm and take it into account. With this patch get_arg_page() increments current's MM_ANONPAGES counter every time we allocate the new page for argv/envp. When do_execve() succeds or fails, we change this counter back. Technically this is not 100% correct, we can't know if the new page is swapped out and turn MM_ANONPAGES into MM_SWAPENTS, but I don't think this really matters and everything becomes correct once exec changes ->mm or fails. Reported-by: Brad Spengler Reviewed-and-discussed-by: KOSAKI Motohiro Signed-off-by: Oleg Nesterov Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/exec.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 99d33a1..4303b90 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -164,6 +164,25 @@ out: #ifdef CONFIG_MMU +static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ + struct mm_struct *mm = current->mm; + long diff = (long)(pages - bprm->vma_pages); + + if (!mm || !diff) + return; + + bprm->vma_pages = pages; + +#ifdef SPLIT_RSS_COUNTING + add_mm_counter(mm, MM_ANONPAGES, diff); +#else + spin_lock(&mm->page_table_lock); + add_mm_counter(mm, MM_ANONPAGES, diff); + spin_unlock(&mm->page_table_lock); +#endif +} + static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { @@ -186,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; struct rlimit *rlim; + acct_arg_size(bprm, size / PAGE_SIZE); + /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks @@ -276,6 +297,10 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len) #else +static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ +} + static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { @@ -1003,6 +1028,7 @@ int flush_old_exec(struct linux_binprm * bprm) /* * Release all of the old mmap stuff */ + acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; @@ -1426,8 +1452,10 @@ int do_execve(const char * filename, return retval; out: - if (bprm->mm) - mmput (bprm->mm); + if (bprm->mm) { + acct_arg_size(bprm, 0); + mmput(bprm->mm); + } out_file: if (bprm->file) { -- cgit v1.1 From 114279be2120a916e8a04feeb2ac976a10016f2f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 30 Nov 2010 20:56:02 +0100 Subject: exec: copy-and-paste the fixes into compat_do_execve() paths Note: this patch targets 2.6.37 and tries to be as simple as possible. That is why it adds more copy-and-paste horror into fs/compat.c and uglifies fs/exec.c, this will be cleanuped later. compat_copy_strings() plays with bprm->vma/mm directly and thus has two problems: it lacks the RLIMIT_STACK check and argv/envp memory is not visible to oom killer. Export acct_arg_size() and get_arg_page(), change compat_copy_strings() to use get_arg_page(), change compat_do_execve() to do acct_arg_size(0) as do_execve() does. Add the fatal_signal_pending/cond_resched checks into compat_count() and compat_copy_strings(), this matches the code in fs/exec.c and certainly makes sense. Signed-off-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/compat.c | 28 +++++++++++++++------------- fs/exec.c | 8 ++++---- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index c580c32..eb1740a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1350,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max) argv++; if (i++ >= max) return -E2BIG; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); } } return i; @@ -1391,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, while (len > 0) { int offset, bytes_to_copy; + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } + cond_resched(); + offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; @@ -1407,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; -#ifdef CONFIG_STACK_GROWSUP - ret = expand_stack_downwards(bprm->vma, pos); - if (ret < 0) { - /* We've exceed the stack rlimit. */ - ret = -E2BIG; - goto out; - } -#endif - ret = get_user_pages(current, bprm->mm, pos, - 1, 1, 1, &page, NULL); - if (ret <= 0) { - /* We've exceed the stack rlimit. */ + page = get_arg_page(bprm, pos, 1); + if (!page) { ret = -E2BIG; goto out; } @@ -1539,8 +1539,10 @@ int compat_do_execve(char * filename, return retval; out: - if (bprm->mm) + if (bprm->mm) { + acct_arg_size(bprm, 0); mmput(bprm->mm); + } out_file: if (bprm->file) { diff --git a/fs/exec.c b/fs/exec.c index 4303b90..d68c378 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -164,7 +164,7 @@ out: #ifdef CONFIG_MMU -static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); @@ -183,7 +183,7 @@ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) #endif } -static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -297,11 +297,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len) #else -static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } -static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; -- cgit v1.1