From a98ee8c1c707fe3210b00ef9f806ba8e2bf35504 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 26 Nov 2008 19:32:33 +0000 Subject: [CIFS] fix regression in cifs_write_begin/cifs_write_end The conversion to write_begin/write_end interfaces had a bug where we were passing a bad parameter to cifs_readpage_worker. Rather than passing the page offset of the start of the write, we needed to pass the offset of the beginning of the page. This was reliably showing up as data corruption in the fsx-linux test from LTP. It also became evident that this code was occasionally doing unnecessary read calls. Optimize those away by using the PG_checked flag to indicate that the unwritten part of the page has been initialized. CC: Nick Piggin Acked-by: Dave Kleikamp Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 77 ++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index b691b89..f0a81e6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1475,7 +1475,11 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, cFYI(1, ("write_end for page %p from pos %lld with %d bytes", page, pos, copied)); - if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE) + if (PageChecked(page)) { + if (copied == len) + SetPageUptodate(page); + ClearPageChecked(page); + } else if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE) SetPageUptodate(page); if (!PageUptodate(page)) { @@ -2062,39 +2066,70 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, { pgoff_t index = pos >> PAGE_CACHE_SHIFT; loff_t offset = pos & (PAGE_CACHE_SIZE - 1); + loff_t page_start = pos & PAGE_MASK; + loff_t i_size; + struct page *page; + int rc = 0; cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); - *pagep = __grab_cache_page(mapping, index); - if (!*pagep) - return -ENOMEM; - - if (PageUptodate(*pagep)) - return 0; + page = __grab_cache_page(mapping, index); + if (!page) { + rc = -ENOMEM; + goto out; + } - /* If we are writing a full page it will be up to date, - no need to read from the server */ - if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE) - return 0; + if (PageUptodate(page)) + goto out; - if ((file->f_flags & O_ACCMODE) != O_WRONLY) { - int rc; + /* + * If we write a full page it will be up to date, no need to read from + * the server. If the write is short, we'll end up doing a sync write + * instead. + */ + if (len == PAGE_CACHE_SIZE) + goto out; - /* might as well read a page, it is fast enough */ - rc = cifs_readpage_worker(file, *pagep, &offset); + /* + * optimize away the read when we have an oplock, and we're not + * expecting to use any of the data we'd be reading in. That + * is, when the page lies beyond the EOF, or straddles the EOF + * and the write will cover all of the existing data. + */ + if (CIFS_I(mapping->host)->clientCanCacheRead) { + i_size = i_size_read(mapping->host); + if (page_start >= i_size || + (offset == 0 && (pos + len) >= i_size)) { + zero_user_segments(page, 0, offset, + offset + len, + PAGE_CACHE_SIZE); + /* + * PageChecked means that the parts of the page + * to which we're not writing are considered up + * to date. Once the data is copied to the + * page, it can be set uptodate. + */ + SetPageChecked(page); + goto out; + } + } - /* we do not need to pass errors back - e.g. 
if we do not have read access to the file
-	   because cifs_write_end will attempt synchronous writes
-	   -- shaggy */
+	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+		/*
+		 * might as well read a page, it is fast enough. If we get
+		 * an error, we don't need to return it. cifs_write_end will
+		 * do a sync write instead since PG_uptodate isn't set.
+		 */
+		cifs_readpage_worker(file, page, &page_start);
 	} else {
 		/* we could try using another file handle if there is one -
 		   but how would we lock it to prevent close of that handle
 		   racing with this read? In any case
 		   this will be written out by write_end so is fine */
 	}
-
-	return 0;
+out:
+	*pagep = page;
+	return rc;
 }

 const struct address_space_operations cifs_addr_ops = {
-- cgit v1.1

From 52b19ac993f1aeadbce15b55302be9a35346e235 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 23 Sep 2008 18:24:08 +0200
Subject: udf: Fix BUG_ON() in destroy_inode()

udf_clear_inode() can leave behind buffers on the mapping's i_private
list (when we truncated preallocation). Call invalidate_inode_buffers()
so that the list is properly cleaned up before we return from
udf_clear_inode(). This is ugly and suggests that we should clean up
preallocation earlier than in clear_inode(), but currently there's no
such call available, since drop_inode() is called under the inode lock
and is thus unusable for disk operations.

Signed-off-by: Jan Kara
---
 fs/buffer.c    | 1 +
 fs/udf/inode.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 6569fda..10179cf 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -878,6 +878,7 @@ void invalidate_inode_buffers(struct inode *inode)
 		spin_unlock(&buffer_mapping->private_lock);
 	}
 }
+EXPORT_SYMBOL(invalidate_inode_buffers);

 /*
  * Remove any clean buffers from the inode's buffer list. This is called
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 6e74b11..30ebde4 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -106,6 +106,7 @@ void udf_clear_inode(struct inode *inode)
 		udf_truncate_tail_extent(inode);
 		unlock_kernel();
 		write_inode_now(inode, 0);
+		invalidate_inode_buffers(inode);
 	}
 	iinfo = UDF_I(inode);
 	kfree(iinfo->i_ext.i_data);
-- cgit v1.1

From 3b5da0189c93160e44b878d2c72e9552d642497c Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Wed, 5 Nov 2008 15:16:24 +0800
Subject: ocfs2: comments typo fix

This patch fixes two typos in ocfs2 comments.

Signed-off-by: Coly Li
Signed-off-by: Mark Fasheh
---
 fs/ocfs2/dlm/userdlm.h | 2 +-
 fs/ocfs2/ocfs2.h       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h
index 39ec277..0c3cc03 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlm/userdlm.h
@@ -33,7 +33,7 @@
 #include

 /* user_lock_res->l_flags flags. */
-#define USER_LOCK_ATTACHED	(0x00000001) /* have we initialized
+#define USER_LOCK_ATTACHED	(0x00000001) /* we have initialized
					      * the lvb */
 #define USER_LOCK_BUSY		(0x00000002) /* we are currently in
					      * dlm_lock */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index fef7ece..3fed9e3 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -85,7 +85,7 @@ enum ocfs2_unlock_action {
 };

 /* ocfs2_lock_res->l_flags flags.
 */
-#define OCFS2_LOCK_ATTACHED	(0x00000001) /* have we initialized
+#define OCFS2_LOCK_ATTACHED	(0x00000001) /* we have initialized
					      * the lvb */
 #define OCFS2_LOCK_BUSY		(0x00000002) /* we are currently in
					      * dlm_lock */
-- cgit v1.1

From 66f502a416f18cd36179290746aa53736c6b2828 Mon Sep 17 00:00:00 2001
From: David Teigland
Date: Mon, 10 Nov 2008 16:24:57 -0600
Subject: ocfs2: initialize stack_user lvbptr

The locking_state dump, ocfs2_dlm_seq_show, reads the lvb on locks
where it has not yet been initialized by a lock call.

Signed-off-by: David Teigland
Acked-by: Joel Becker
Signed-off-by: Mark Fasheh
---
 fs/ocfs2/stack_user.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index faec2d8..9b76d41 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -740,6 +740,9 @@ static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)

 static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 {
+	if (!lksb->lksb_fsdlm.sb_lvbptr)
+		lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
+					     sizeof(struct dlm_lksb);
 	return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
 }
-- cgit v1.1

From 07f9eebcdfaeefc8f807fa1bcce1d7c3ae6661b1 Mon Sep 17 00:00:00 2001
From: David Teigland
Date: Mon, 17 Nov 2008 12:28:48 -0600
Subject: ocfs2: fix wake_up in unlock_ast

In ocfs2_unlock_ast(), call wake_up() on the lockres before releasing
the spin lock on it. As soon as the spin lock is released, the lockres
can be freed.

Signed-off-by: David Teigland
Signed-off-by: Mark Fasheh
---
 fs/ocfs2/dlmglue.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index ec68442..6e6cc0a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2841,9 +2841,8 @@ static void ocfs2_unlock_ast(void *opaque, int error)
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
 	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
-	spin_unlock_irqrestore(&lockres->l_lock, flags);
-
 	wake_up(&lockres->l_event);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);

 	mlog_exit_void();
 }
-- cgit v1.1

From 07d9a3954a68764aefe16855bcd0f86deeb5c825 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Mon, 17 Nov 2008 12:38:22 +0800
Subject: ocfs2: fix return value set in init_dlmfs_fs()

In init_dlmfs_fs(), if kmem_cache_create() fails, the code returns the
leftover value from the earlier bdi_init() call. The correct behavior
is to set status to -ENOMEM before going to "bail:".

Signed-off-by: Coly Li
Acked-by: Sunil Mushran
Signed-off-by: Mark Fasheh
---
 fs/ocfs2/dlm/dlmfs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 533a789..ba962d7 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -608,8 +608,10 @@ static int __init init_dlmfs_fs(void)
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 					SLAB_MEM_SPREAD),
 				dlmfs_init_once);
-	if (!dlmfs_inode_cache)
+	if (!dlmfs_inode_cache) {
+		status = -ENOMEM;
 		goto bail;
+	}
 	cleanup_inode = 1;

 	user_dlm_worker = create_singlethread_workqueue("user_dlm");
-- cgit v1.1

From d6b58f89f7257c8099c2260e2bea042a917d6cdf Mon Sep 17 00:00:00 2001
From: Mark Fasheh
Date: Fri, 21 Nov 2008 14:06:55 -0800
Subject: ocfs2: fix regression in ocfs2_read_blocks_sync()

We're panicking in ocfs2_read_blocks_sync() if a jbd-managed buffer is
seen. At first glance, this seems ok, but in reality it can happen. My
test case was to just run 'exorcist'.
A struct inode is being pushed out of memory but is then re-read at a
later time, before the buffer has been checkpointed by jbd. This causes
a BUG to be hit in ocfs2_read_blocks_sync().

Reviewed-by: Joel Becker
Signed-off-by: Mark Fasheh
---
 fs/ocfs2/buffer_head_io.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 7e947c6..3a178ec 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -112,7 +112,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 		bh = bhs[i];

 		if (buffer_jbd(bh)) {
-			mlog(ML_ERROR,
+			mlog(ML_BH_IO,
 			     "trying to sync read a jbd "
 			     "managed bh (blocknr = %llu), skipping\n",
 			     (unsigned long long)bh->b_blocknr);
@@ -147,15 +147,10 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 	for (i = nr; i > 0; i--) {
 		bh = bhs[i - 1];

-		if (buffer_jbd(bh)) {
-			mlog(ML_ERROR,
-			     "the journal got the buffer while it was "
-			     "locked for io! (blocknr = %llu)\n",
-			     (unsigned long long)bh->b_blocknr);
-			BUG();
-		}
+		/* No need to wait on the buffer if it's managed by JBD. */
+		if (!buffer_jbd(bh))
+			wait_on_buffer(bh);

-		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			/* Status won't be cleared from here on out,
 			 * so we can safely record this and loop back
@@ -251,8 +246,6 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 			ignore_cache = 1;
 		}

-		/* XXX: Can we ever get this and *not* have the cached
-		 * flag set? */
 		if (buffer_jbd(bh)) {
 			if (ignore_cache)
 				mlog(ML_BH_IO, "trying to sync read a jbd "
-- cgit v1.1

From 7ef9964e6d1b911b78709f144000aacadd0ebc21 Mon Sep 17 00:00:00 2001
From: Davide Libenzi
Date: Mon, 1 Dec 2008 13:13:55 -0800
Subject: epoll: introduce resource usage limits

It has been thought that the per-user file descriptor limit would also
limit the resources that a normal user can request via the epoll
interface. Vegard Nossum reported a very simple program (a modified
version attached) that can make a normal user request a pretty large
amount of kernel memory, well within its maximum number of fds. To
solve this problem, default limits are now imposed, and /proc based
configuration has been introduced. A new directory has been created,
named /proc/sys/fs/epoll/, and inside it there are two configuration
points:

  max_user_instances = Maximum number of devices - per user
  max_user_watches   = Maximum number of "watched" fds - per user

The current default for "max_user_watches" limits the memory used by
epoll to store "watches" to 1/32 of the low RAM. As an example, a 256MB
32-bit machine will have "max_user_watches" set to roughly 90000. That
should be enough to not break existing heavy epoll users. The default
value for "max_user_instances" is set to 128, which should be enough
too.

This also changes the userspace-visible behavior, because a new error
code can now come out of EPOLL_CTL_ADD (-ENOSPC). The EMFILE from
epoll_create() was already listed, so that should be ok.
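To make the "roughly 90000" figure above concrete, here is a minimal
user-space sketch of the sizing arithmetic. The ~90-byte per-watch cost
(sizeof(struct epitem) + sizeof(struct eppoll_entry) on a 32-bit build)
is an assumed figure for illustration, not a value quoted by the patch:

	#include <stdio.h>

	int main(void)
	{
		unsigned long low_ram = 256UL << 20; /* 256MB machine, no highmem */
		unsigned long ep_item_cost = 90;     /* assumed per-watch footprint */

		/* 1/32 of low RAM, divided by the per-watch footprint */
		unsigned long max_user_watches = (low_ram / 32) / ep_item_cost;

		printf("max_user_watches = %lu\n", max_user_watches); /* ~93000 */
		return 0;
	}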
[akpm@linux-foundation.org: use get_current_user()] Signed-off-by: Davide Libenzi Cc: Michael Kerrisk Cc: Cc: Cyrill Gorcunov Reported-by: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 77 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aec5c13f..96355d5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -102,6 +102,8 @@ #define EP_UNACTIVE_PTR ((void *) -1L) +#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) + struct epoll_filefd { struct file *file; int fd; @@ -200,6 +202,9 @@ struct eventpoll { * holding ->lock. */ struct epitem *ovflist; + + /* The user that created the eventpoll descriptor */ + struct user_struct *user; }; /* Wait structure used by the poll hooks */ @@ -227,9 +232,17 @@ struct ep_pqueue { }; /* + * Configuration options available inside /proc/sys/fs/epoll/ + */ +/* Maximum number of epoll devices, per user */ +static int max_user_instances __read_mostly; +/* Maximum number of epoll watched descriptors, per user */ +static int max_user_watches __read_mostly; + +/* * This mutex is used to serialize ep_free() and eventpoll_release_file(). */ -static struct mutex epmutex; +static DEFINE_MUTEX(epmutex); /* Safe wake up implementation */ static struct poll_safewake psw; @@ -240,6 +253,33 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; +#ifdef CONFIG_SYSCTL + +#include + +static int zero; + +ctl_table epoll_table[] = { + { + .procname = "max_user_instances", + .data = &max_user_instances, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "max_user_watches", + .data = &max_user_watches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + }, + { .ctl_name = 0 } +}; +#endif /* CONFIG_SYSCTL */ + /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, @@ -402,6 +442,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); + atomic_dec(&ep->user->epoll_watches); + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", current, ep, file)); @@ -449,6 +491,8 @@ static void ep_free(struct eventpoll *ep) mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); + atomic_dec(&ep->user->epoll_devs); + free_uid(ep->user); kfree(ep); } @@ -532,10 +576,19 @@ void eventpoll_release_file(struct file *file) static int ep_alloc(struct eventpoll **pep) { - struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL); + int error; + struct user_struct *user; + struct eventpoll *ep; - if (!ep) - return -ENOMEM; + user = get_current_user(); + error = -EMFILE; + if (unlikely(atomic_read(&user->epoll_devs) >= + max_user_instances)) + goto free_uid; + error = -ENOMEM; + ep = kzalloc(sizeof(*ep), GFP_KERNEL); + if (unlikely(!ep)) + goto free_uid; spin_lock_init(&ep->lock); mutex_init(&ep->mtx); @@ -544,12 +597,17 @@ static int ep_alloc(struct eventpoll **pep) INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT; ep->ovflist = EP_UNACTIVE_PTR; + ep->user = user; *pep = ep; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", current, ep)); return 0; + +free_uid: + free_uid(user); + return error; } /* @@ -703,9 
+761,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct epitem *epi; struct ep_pqueue epq; - error = -ENOMEM; + if (unlikely(atomic_read(&ep->user->epoll_watches) >= + max_user_watches)) + return -ENOSPC; if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) - goto error_return; + return -ENOMEM; /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); @@ -735,6 +795,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ + error = -ENOMEM; if (epi->nwait < 0) goto error_unregister; @@ -765,6 +826,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, spin_unlock_irqrestore(&ep->lock, flags); + atomic_inc(&ep->user->epoll_watches); + /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&psw, &ep->poll_wait); @@ -789,7 +852,7 @@ error_unregister: spin_unlock_irqrestore(&ep->lock, flags); kmem_cache_free(epi_cache, epi); -error_return: + return error; } @@ -1078,6 +1141,7 @@ asmlinkage long sys_epoll_create1(int flags) flags & O_CLOEXEC); if (fd < 0) ep_free(ep); + atomic_inc(&ep->user->epoll_devs); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", @@ -1299,7 +1363,12 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, static int __init eventpoll_init(void) { - mutex_init(&epmutex); + struct sysinfo si; + + si_meminfo(&si); + max_user_instances = 128; + max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) / + EP_ITEM_COST; /* Initialize the structure used to perform safe poll wait head wake ups */ ep_poll_safewake_init(&psw); -- cgit v1.1 From 03801553630c4bec6682108800c9b2de64bdbd37 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 1 Dec 2008 13:14:04 -0800 Subject: ntfs: don't fool kernel-doc kernel-doc handles macros now (it has for quite some time), so change the ntfs_debug() macro's kernel-doc to be just before the macro instead of before a phony function prototype. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Randy Dunlap Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/debug.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h index 5e6724c..2142b1c 100644 --- a/fs/ntfs/debug.h +++ b/fs/ntfs/debug.h @@ -30,7 +30,8 @@ extern int debug_msgs; -#if 0 /* Fool kernel-doc since it doesn't do macros yet */ +extern void __ntfs_debug(const char *file, int line, const char *function, + const char *format, ...) __attribute__ ((format (printf, 4, 5))); /** * ntfs_debug - write a debug level message to syslog * @f: a printf format string containing the message @@ -39,11 +40,6 @@ extern int debug_msgs; * ntfs_debug() writes a DEBUG level message to the syslog but only if the * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP. */ -static void ntfs_debug(const char *f, ...); -#endif - -extern void __ntfs_debug (const char *file, int line, const char *function, - const char *format, ...) __attribute__ ((format (printf, 4, 5))); #define ntfs_debug(f, a...) \ __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a) -- cgit v1.1
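The ntfs_debug() change above relies on a common kernel idiom: a
variadic macro captures __FILE__, __LINE__ and __func__ at the call
site and forwards them to a single helper, which is why the kernel-doc
now sits on the macro rather than on a phony prototype. A minimal
user-space sketch of the same pattern follows (names are illustrative,
not taken from the NTFS driver):

	#include <stdarg.h>
	#include <stdio.h>

	/* Helper that receives the call-site info captured by the macro. */
	static void __my_debug(const char *file, int line, const char *function,
			       const char *format, ...)
		__attribute__ ((format (printf, 4, 5)));

	static void __my_debug(const char *file, int line, const char *function,
			       const char *format, ...)
	{
		va_list args;

		fprintf(stderr, "%s:%d %s(): ", file, line, function);
		va_start(args, format);
		vfprintf(stderr, format, args);
		va_end(args);
		fputc('\n', stderr);
	}

	/* A macro, not a function, so __FILE__ and __LINE__ name the caller. */
	#define my_debug(f, a...) \
		__my_debug(__FILE__, __LINE__, __func__, f, ##a)

	int main(void)
	{
		my_debug("mounted volume with %d errors", 0);
		return 0;
	}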