diff options
Diffstat (limited to 'fs')
438 files changed, 23664 insertions, 11318 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index 74e0723..7952337 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -8,3 +8,12 @@ config 9P_FS See <http://v9fs.sf.net> for more information. If unsure, say N. + +config 9P_FSCACHE + bool "Enable 9P client caching support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y + help + Choose Y here to enable persistent, read-only local + caching support for 9p clients using FS-Cache + diff --git a/fs/9p/Makefile b/fs/9p/Makefile index bc7f0d1..1a940ec 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -8,5 +8,6 @@ obj-$(CONFIG_9P_FS) := 9p.o vfs_dir.o \ vfs_dentry.o \ v9fs.o \ - fid.o \ + fid.o +9p-$(CONFIG_9P_FSCACHE) += cache.o diff --git a/fs/9p/cache.c b/fs/9p/cache.c new file mode 100644 index 0000000..51c94e2 --- /dev/null +++ b/fs/9p/cache.c @@ -0,0 +1,474 @@ +/* + * V9FS cache definitions. + * + * Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include <linux/jiffies.h> +#include <linux/file.h> +#include <linux/stat.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <net/9p/9p.h> + +#include "v9fs.h" +#include "cache.h" + +#define CACHETAG_LEN 11 + +struct kmem_cache *vcookie_cache; + +struct fscache_netfs v9fs_cache_netfs = { + .name = "9p", + .version = 0, +}; + +static void init_once(void *foo) +{ + struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo; + vcookie->fscache = NULL; + vcookie->qid = NULL; + inode_init_once(&vcookie->inode); +} + +/** + * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain + * vcookie to inode mapping + * + * Returns 0 on success. + */ + +static int v9fs_init_vcookiecache(void) +{ + vcookie_cache = kmem_cache_create("vcookie_cache", + sizeof(struct v9fs_cookie), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (!vcookie_cache) + return -ENOMEM; + + return 0; +} + +/** + * v9fs_destroy_vcookiecache - destroy the cache of vcookies + * + */ + +static void v9fs_destroy_vcookiecache(void) +{ + kmem_cache_destroy(vcookie_cache); +} + +int __v9fs_cache_register(void) +{ + int ret; + ret = v9fs_init_vcookiecache(); + if (ret < 0) + return ret; + + return fscache_register_netfs(&v9fs_cache_netfs); +} + +void __v9fs_cache_unregister(void) +{ + v9fs_destroy_vcookiecache(); + fscache_unregister_netfs(&v9fs_cache_netfs); +} + +/** + * v9fs_random_cachetag - Generate a random tag to be associated + * with a new cache session. + * + * The value of jiffies is used for a fairly randomly cache tag. + */ + +static +int v9fs_random_cachetag(struct v9fs_session_info *v9ses) +{ + v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL); + if (!v9ses->cachetag) + return -ENOMEM; + + return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies); +} + +static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct v9fs_session_info *v9ses; + uint16_t klen = 0; + + v9ses = (struct v9fs_session_info *)cookie_netfs_data; + P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses, + buffer, bufmax); + + if (v9ses->cachetag) + klen = strlen(v9ses->cachetag); + + if (klen > bufmax) + return 0; + + memcpy(buffer, v9ses->cachetag, klen); + P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag); + return klen; +} + +const struct fscache_cookie_def v9fs_cache_session_index_def = { + .name = "9P.session", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = v9fs_cache_session_get_key, +}; + +void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) +{ + /* If no cache session tag was specified, we generate a random one. */ + if (!v9ses->cachetag) + v9fs_random_cachetag(v9ses); + + v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, + &v9fs_cache_session_index_def, + v9ses); + P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses, + v9ses->fscache); +} + +void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) +{ + P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses, + v9ses->fscache); + fscache_relinquish_cookie(v9ses->fscache, 0); + v9ses->fscache = NULL; +} + + +static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct v9fs_cookie *vcookie = cookie_netfs_data; + memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path)); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode, + vcookie->qid->path); + return sizeof(vcookie->qid->path); +} + +static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct v9fs_cookie *vcookie = cookie_netfs_data; + *size = i_size_read(&vcookie->inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode, + *size); +} + +static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t buflen) +{ + const struct v9fs_cookie *vcookie = cookie_netfs_data; + memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version)); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode, + vcookie->qid->version); + return sizeof(vcookie->qid->version); +} + +static enum +fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, + const void *buffer, + uint16_t buflen) +{ + const struct v9fs_cookie *vcookie = cookie_netfs_data; + + if (buflen != sizeof(vcookie->qid->version)) + return FSCACHE_CHECKAUX_OBSOLETE; + + if (memcmp(buffer, &vcookie->qid->version, + sizeof(vcookie->qid->version))) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) +{ + struct v9fs_cookie *vcookie = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + for (;;) { + nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +const struct fscache_cookie_def v9fs_cache_inode_index_def = { + .name = "9p.inode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = v9fs_cache_inode_get_key, + .get_attr = v9fs_cache_inode_get_attr, + .get_aux = v9fs_cache_inode_get_aux, + .check_aux = v9fs_cache_inode_check_aux, + .now_uncached = v9fs_cache_inode_now_uncached, +}; + +void v9fs_cache_inode_get_cookie(struct inode *inode) +{ + struct v9fs_cookie *vcookie; + struct v9fs_session_info *v9ses; + + if (!S_ISREG(inode->i_mode)) + return; + + vcookie = v9fs_inode2cookie(inode); + if (vcookie->fscache) + return; + + v9ses = v9fs_inode2v9ses(inode); + vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, + &v9fs_cache_inode_index_def, + vcookie); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, + vcookie->fscache); +} + +void v9fs_cache_inode_put_cookie(struct inode *inode) +{ + struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + + if (!vcookie->fscache) + return; + P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, + vcookie->fscache); + + fscache_relinquish_cookie(vcookie->fscache, 0); + vcookie->fscache = NULL; +} + +void v9fs_cache_inode_flush_cookie(struct inode *inode) +{ + struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + + if (!vcookie->fscache) + return; + P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, + vcookie->fscache); + + fscache_relinquish_cookie(vcookie->fscache, 1); + vcookie->fscache = NULL; +} + +void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) +{ + struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct p9_fid *fid; + + if (!vcookie->fscache) + return; + + spin_lock(&vcookie->lock); + fid = filp->private_data; + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + v9fs_cache_inode_flush_cookie(inode); + else + v9fs_cache_inode_get_cookie(inode); + + spin_unlock(&vcookie->lock); +} + +void v9fs_cache_inode_reset_cookie(struct inode *inode) +{ + struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_session_info *v9ses; + struct fscache_cookie *old; + + if (!vcookie->fscache) + return; + + old = vcookie->fscache; + + spin_lock(&vcookie->lock); + fscache_relinquish_cookie(vcookie->fscache, 1); + + v9ses = v9fs_inode2v9ses(inode); + vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, + &v9fs_cache_inode_index_def, + vcookie); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", + inode, old, vcookie->fscache); + + spin_unlock(&vcookie->lock); +} + +int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) +{ + struct inode *inode = page->mapping->host; + struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + + BUG_ON(!vcookie->fscache); + + if (PageFsCache(page)) { + if (fscache_check_page_write(vcookie->fscache, page)) { + if (!(gfp & __GFP_WAIT)) + return 0; + fscache_wait_on_page_write(vcookie->fscache, page); + } + + fscache_uncache_page(vcookie->fscache, page); + ClearPageFsCache(page); + } + + return 1; +} + +void __v9fs_fscache_invalidate_page(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + + BUG_ON(!vcookie->fscache); + + if (PageFsCache(page)) { + fscache_wait_on_page_write(vcookie->fscache, page); + BUG_ON(!PageLocked(page)); + fscache_uncache_page(vcookie->fscache, page); + ClearPageFsCache(page); + } +} + +static void v9fs_vfs_readpage_complete(struct page *page, void *data, + int error) +{ + if (!error) + SetPageUptodate(page); + + unlock_page(page); +} + +/** + * __v9fs_readpage_from_fscache - read a page from cache + * + * Returns 0 if the pages are in cache and a BIO is submitted, + * 1 if the pages are not in cache and -error otherwise. + */ + +int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + int ret; + const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + if (!vcookie->fscache) + return -ENOBUFS; + + ret = fscache_read_or_alloc_page(vcookie->fscache, + page, + v9fs_vfs_readpage_complete, + NULL, + GFP_KERNEL); + switch (ret) { + case -ENOBUFS: + case -ENODATA: + P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret); + return 1; + case 0: + P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + return ret; + default: + P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + return ret; + } +} + +/** + * __v9fs_readpages_from_fscache - read multiple pages from cache + * + * Returns 0 if the pages are in cache and a BIO is submitted, + * 1 if the pages are not in cache and -error otherwise. + */ + +int __v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + int ret; + const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); + if (!vcookie->fscache) + return -ENOBUFS; + + ret = fscache_read_or_alloc_pages(vcookie->fscache, + mapping, pages, nr_pages, + v9fs_vfs_readpage_complete, + NULL, + mapping_gfp_mask(mapping)); + switch (ret) { + case -ENOBUFS: + case -ENODATA: + P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret); + return 1; + case 0: + BUG_ON(!list_empty(pages)); + BUG_ON(*nr_pages != 0); + P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + return ret; + default: + P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + return ret; + } +} + +/** + * __v9fs_readpage_to_fscache - write a page to the cache + * + */ + +void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) +{ + int ret; + const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + + P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL); + P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); + if (ret != 0) + v9fs_uncache_page(inode, page); +} diff --git a/fs/9p/cache.h b/fs/9p/cache.h new file mode 100644 index 0000000..a94192b --- /dev/null +++ b/fs/9p/cache.h @@ -0,0 +1,176 @@ +/* + * V9FS cache definitions. + * + * Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#ifndef _9P_CACHE_H +#ifdef CONFIG_9P_FSCACHE +#include <linux/fscache.h> +#include <linux/spinlock.h> + +extern struct kmem_cache *vcookie_cache; + +struct v9fs_cookie { + spinlock_t lock; + struct inode inode; + struct fscache_cookie *fscache; + struct p9_qid *qid; +}; + +static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode) +{ + return container_of(inode, struct v9fs_cookie, inode); +} + +extern struct fscache_netfs v9fs_cache_netfs; +extern const struct fscache_cookie_def v9fs_cache_session_index_def; +extern const struct fscache_cookie_def v9fs_cache_inode_index_def; + +extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses); +extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses); + +extern void v9fs_cache_inode_get_cookie(struct inode *inode); +extern void v9fs_cache_inode_put_cookie(struct inode *inode); +extern void v9fs_cache_inode_flush_cookie(struct inode *inode); +extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp); +extern void v9fs_cache_inode_reset_cookie(struct inode *inode); + +extern int __v9fs_cache_register(void); +extern void __v9fs_cache_unregister(void); + +extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp); +extern void __v9fs_fscache_invalidate_page(struct page *page); +extern int __v9fs_readpage_from_fscache(struct inode *inode, + struct page *page); +extern int __v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages); +extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page); + + +/** + * v9fs_cache_register - Register v9fs file system with the cache + */ +static inline int v9fs_cache_register(void) +{ + return __v9fs_cache_register(); +} + +/** + * v9fs_cache_unregister - Unregister v9fs from the cache + */ +static inline void v9fs_cache_unregister(void) +{ + __v9fs_cache_unregister(); +} + +static inline int v9fs_fscache_release_page(struct page *page, + gfp_t gfp) +{ + return __v9fs_fscache_release_page(page, gfp); +} + +static inline void v9fs_fscache_invalidate_page(struct page *page) +{ + __v9fs_fscache_invalidate_page(page); +} + +static inline int v9fs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + return __v9fs_readpage_from_fscache(inode, page); +} + +static inline int v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return __v9fs_readpages_from_fscache(inode, mapping, pages, + nr_pages); +} + +static inline void v9fs_readpage_to_fscache(struct inode *inode, + struct page *page) +{ + if (PageFsCache(page)) + __v9fs_readpage_to_fscache(inode, page); +} + +static inline void v9fs_uncache_page(struct inode *inode, struct page *page) +{ + struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + fscache_uncache_page(vcookie->fscache, page); + BUG_ON(PageFsCache(page)); +} + +static inline void v9fs_vcookie_set_qid(struct inode *inode, + struct p9_qid *qid) +{ + struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + spin_lock(&vcookie->lock); + vcookie->qid = qid; + spin_unlock(&vcookie->lock); +} + +#else /* CONFIG_9P_FSCACHE */ + +static inline int v9fs_cache_register(void) +{ + return 1; +} + +static inline void v9fs_cache_unregister(void) {} + +static inline int v9fs_fscache_release_page(struct page *page, + gfp_t gfp) { + return 1; +} + +static inline void v9fs_fscache_invalidate_page(struct page *page) {} + +static inline int v9fs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + return -ENOBUFS; +} + +static inline int v9fs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void v9fs_readpage_to_fscache(struct inode *inode, + struct page *page) +{} + +static inline void v9fs_uncache_page(struct inode *inode, struct page *page) +{} + +static inline void v9fs_vcookie_set_qid(struct inode *inode, + struct p9_qid *qid) +{} + +#endif /* CONFIG_9P_FSCACHE */ +#endif /* _9P_CACHE_H */ diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index f7003cf..cf62b05 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -34,21 +34,25 @@ #include <net/9p/transport.h> #include "v9fs.h" #include "v9fs_vfs.h" +#include "cache.h" + +static DEFINE_SPINLOCK(v9fs_sessionlist_lock); +static LIST_HEAD(v9fs_sessionlist); /* - * Option Parsing (code inspired by NFS code) - * NOTE: each transport will parse its own options - */ + * Option Parsing (code inspired by NFS code) + * NOTE: each transport will parse its own options + */ enum { /* Options that take integer arguments */ Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid, /* String options */ - Opt_uname, Opt_remotename, Opt_trans, + Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag, /* Options that take no arguments */ Opt_nodevmap, /* Cache options */ - Opt_cache_loose, + Opt_cache_loose, Opt_fscache, /* Access options */ Opt_access, /* Error token */ @@ -63,8 +67,10 @@ static const match_table_t tokens = { {Opt_uname, "uname=%s"}, {Opt_remotename, "aname=%s"}, {Opt_nodevmap, "nodevmap"}, - {Opt_cache_loose, "cache=loose"}, + {Opt_cache, "cache=%s"}, {Opt_cache_loose, "loose"}, + {Opt_fscache, "fscache"}, + {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_err, NULL} }; @@ -89,16 +95,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) v9ses->afid = ~0; v9ses->debug = 0; v9ses->cache = 0; +#ifdef CONFIG_9P_FSCACHE + v9ses->cachetag = NULL; +#endif if (!opts) return 0; options = kstrdup(opts, GFP_KERNEL); - if (!options) { - P9_DPRINTK(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); - return -ENOMEM; - } + if (!options) + goto fail_option_alloc; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -143,16 +149,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_cache_loose: v9ses->cache = CACHE_LOOSE; break; + case Opt_fscache: + v9ses->cache = CACHE_FSCACHE; + break; + case Opt_cachetag: +#ifdef CONFIG_9P_FSCACHE + v9ses->cachetag = match_strdup(&args[0]); +#endif + break; + case Opt_cache: + s = match_strdup(&args[0]); + if (!s) + goto fail_option_alloc; + + if (strcmp(s, "loose") == 0) + v9ses->cache = CACHE_LOOSE; + else if (strcmp(s, "fscache") == 0) + v9ses->cache = CACHE_FSCACHE; + else + v9ses->cache = CACHE_NONE; + kfree(s); + break; case Opt_access: s = match_strdup(&args[0]); - if (!s) { - P9_DPRINTK(P9_DEBUG_ERROR, - "failed to allocate copy" - " of option argument\n"); - ret = -ENOMEM; - break; - } + if (!s) + goto fail_option_alloc; + v9ses->flags &= ~V9FS_ACCESS_MASK; if (strcmp(s, "user") == 0) v9ses->flags |= V9FS_ACCESS_USER; @@ -173,6 +196,11 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) } kfree(options); return ret; + +fail_option_alloc: + P9_DPRINTK(P9_DEBUG_ERROR, + "failed to allocate copy of option argument\n"); + return -ENOMEM; } /** @@ -200,6 +228,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, return ERR_PTR(-ENOMEM); } + spin_lock(&v9fs_sessionlist_lock); + list_add(&v9ses->slist, &v9fs_sessionlist); + spin_unlock(&v9fs_sessionlist_lock); + v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER; strcpy(v9ses->uname, V9FS_DEFUSER); strcpy(v9ses->aname, V9FS_DEFANAME); @@ -249,6 +281,11 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, else fid->uid = ~0; +#ifdef CONFIG_9P_FSCACHE + /* register the session for caching */ + v9fs_cache_session_get_cookie(v9ses); +#endif + return fid; error: @@ -268,8 +305,18 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) v9ses->clnt = NULL; } +#ifdef CONFIG_9P_FSCACHE + if (v9ses->fscache) { + v9fs_cache_session_put_cookie(v9ses); + kfree(v9ses->cachetag); + } +#endif __putname(v9ses->uname); __putname(v9ses->aname); + + spin_lock(&v9fs_sessionlist_lock); + list_del(&v9ses->slist); + spin_unlock(&v9fs_sessionlist_lock); } /** @@ -286,25 +333,132 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { extern int v9fs_error_init(void); +static struct kobject *v9fs_kobj; + +#ifdef CONFIG_9P_FSCACHE /** - * v9fs_init - Initialize module + * caches_show - list caches associated with a session + * + * Returns the size of buffer written. + */ + +static ssize_t caches_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t n = 0, count = 0, limit = PAGE_SIZE; + struct v9fs_session_info *v9ses; + + spin_lock(&v9fs_sessionlist_lock); + list_for_each_entry(v9ses, &v9fs_sessionlist, slist) { + if (v9ses->cachetag) { + n = snprintf(buf, limit, "%s\n", v9ses->cachetag); + if (n < 0) { + count = n; + break; + } + + count += n; + limit -= n; + } + } + + spin_unlock(&v9fs_sessionlist_lock); + return count; +} + +static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches); +#endif /* CONFIG_9P_FSCACHE */ + +static struct attribute *v9fs_attrs[] = { +#ifdef CONFIG_9P_FSCACHE + &v9fs_attr_cache.attr, +#endif + NULL, +}; + +static struct attribute_group v9fs_attr_group = { + .attrs = v9fs_attrs, +}; + +/** + * v9fs_sysfs_init - Initialize the v9fs sysfs interface + * + */ + +static int v9fs_sysfs_init(void) +{ + v9fs_kobj = kobject_create_and_add("9p", fs_kobj); + if (!v9fs_kobj) + return -ENOMEM; + + if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) { + kobject_put(v9fs_kobj); + return -ENOMEM; + } + + return 0; +} + +/** + * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface + * + */ + +static void v9fs_sysfs_cleanup(void) +{ + sysfs_remove_group(v9fs_kobj, &v9fs_attr_group); + kobject_put(v9fs_kobj); +} + +/** + * init_v9fs - Initialize module * */ static int __init init_v9fs(void) { + int err; printk(KERN_INFO "Installing v9fs 9p2000 file system support\n"); /* TODO: Setup list of registered trasnport modules */ - return register_filesystem(&v9fs_fs_type); + err = register_filesystem(&v9fs_fs_type); + if (err < 0) { + printk(KERN_ERR "Failed to register filesystem\n"); + return err; + } + + err = v9fs_cache_register(); + if (err < 0) { + printk(KERN_ERR "Failed to register v9fs for caching\n"); + goto out_fs_unreg; + } + + err = v9fs_sysfs_init(); + if (err < 0) { + printk(KERN_ERR "Failed to register with sysfs\n"); + goto out_sysfs_cleanup; + } + + return 0; + +out_sysfs_cleanup: + v9fs_sysfs_cleanup(); + +out_fs_unreg: + unregister_filesystem(&v9fs_fs_type); + + return err; } /** - * v9fs_init - shutdown module + * exit_v9fs - shutdown module * */ static void __exit exit_v9fs(void) { + v9fs_sysfs_cleanup(); + v9fs_cache_unregister(); unregister_filesystem(&v9fs_fs_type); } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 38762bf..019f4cc 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -51,6 +51,7 @@ enum p9_session_flags { enum p9_cache_modes { CACHE_NONE, CACHE_LOOSE, + CACHE_FSCACHE, }; /** @@ -60,6 +61,8 @@ enum p9_cache_modes { * @debug: debug level * @afid: authentication handle * @cache: cache mode of type &p9_cache_modes + * @cachetag: the tag of the cache associated with this session + * @fscache: session cookie associated with FS-Cache * @options: copy of options string given by user * @uname: string user name to mount hierarchy as * @aname: mount specifier for remote hierarchy @@ -68,7 +71,7 @@ enum p9_cache_modes { * @dfltgid: default numeric groupid to mount hierarchy as * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy * @clnt: reference to 9P network client instantiated for this session - * @debugfs_dir: reference to debugfs_dir which can be used for add'l debug + * @slist: reference to list of registered 9p sessions * * This structure holds state for each session instance established during * a sys_mount() . @@ -84,6 +87,10 @@ struct v9fs_session_info { unsigned short debug; unsigned int afid; unsigned int cache; +#ifdef CONFIG_9P_FSCACHE + char *cachetag; + struct fscache_cookie *fscache; +#endif char *uname; /* user name to mount as */ char *aname; /* name of remote hierarchy being mounted */ @@ -92,11 +99,9 @@ struct v9fs_session_info { unsigned int dfltgid; /* default gid for legacy support */ u32 uid; /* if ACCESS_SINGLE, the uid that has access */ struct p9_client *clnt; /* 9p client */ - struct dentry *debugfs_dir; + struct list_head slist; /* list of sessions registered with v9fs */ }; -extern struct dentry *v9fs_debugfs_root; - struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, char *); void v9fs_session_close(struct v9fs_session_info *v9ses); diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index f0c7de7..3a7560e 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -44,7 +44,13 @@ extern const struct file_operations v9fs_dir_operations; extern const struct dentry_operations v9fs_dentry_operations; extern const struct dentry_operations v9fs_cached_dentry_operations; +#ifdef CONFIG_9P_FSCACHE +struct inode *v9fs_alloc_inode(struct super_block *sb); +void v9fs_destroy_inode(struct inode *inode); +#endif + struct inode *v9fs_get_inode(struct super_block *sb, int mode); +void v9fs_clear_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); int v9fs_dir_release(struct inode *inode, struct file *filp); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 9282828..90e3844 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -38,6 +38,7 @@ #include "v9fs.h" #include "v9fs_vfs.h" +#include "cache.h" /** * v9fs_vfs_readpage - read an entire page in from 9P @@ -52,18 +53,31 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page) int retval; loff_t offset; char *buffer; + struct inode *inode; + inode = page->mapping->host; P9_DPRINTK(P9_DEBUG_VFS, "\n"); + + BUG_ON(!PageLocked(page)); + + retval = v9fs_readpage_from_fscache(inode, page); + if (retval == 0) + return retval; + buffer = kmap(page); offset = page_offset(page); retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset); - if (retval < 0) + if (retval < 0) { + v9fs_uncache_page(inode, page); goto done; + } memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval); flush_dcache_page(page); SetPageUptodate(page); + + v9fs_readpage_to_fscache(inode, page); retval = 0; done: @@ -72,6 +86,78 @@ done: return retval; } +/** + * v9fs_vfs_readpages - read a set of pages from 9P + * + * @filp: file being read + * @mapping: the address space + * @pages: list of pages to read + * @nr_pages: count of pages to read + * + */ + +static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + int ret = 0; + struct inode *inode; + + inode = mapping->host; + P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); + + ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages); + if (ret == 0) + return ret; + + ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp); + P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret); + return ret; +} + +/** + * v9fs_release_page - release the private state associated with a page + * + * Returns 1 if the page can be released, false otherwise. + */ + +static int v9fs_release_page(struct page *page, gfp_t gfp) +{ + if (PagePrivate(page)) + return 0; + + return v9fs_fscache_release_page(page, gfp); +} + +/** + * v9fs_invalidate_page - Invalidate a page completely or partially + * + * @page: structure to page + * @offset: offset in the page + */ + +static void v9fs_invalidate_page(struct page *page, unsigned long offset) +{ + if (offset == 0) + v9fs_fscache_invalidate_page(page); +} + +/** + * v9fs_launder_page - Writeback a dirty page + * Since the writes go directly to the server, we simply return a 0 + * here to indicate success. + * + * Returns 0 on success. + */ + +static int v9fs_launder_page(struct page *page) +{ + return 0; +} + const struct address_space_operations v9fs_addr_operations = { .readpage = v9fs_vfs_readpage, + .readpages = v9fs_vfs_readpages, + .releasepage = v9fs_release_page, + .invalidatepage = v9fs_invalidate_page, + .launder_page = v9fs_launder_page, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 68bf2af..3902bf4 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -32,6 +32,7 @@ #include <linux/string.h> #include <linux/inet.h> #include <linux/list.h> +#include <linux/pagemap.h> #include <asm/uaccess.h> #include <linux/idr.h> #include <net/9p/9p.h> @@ -40,6 +41,7 @@ #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" +#include "cache.h" static const struct file_operations v9fs_cached_file_operations; @@ -72,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) return err; } if (omode & P9_OTRUNC) { - inode->i_size = 0; + i_size_write(inode, 0); inode->i_blocks = 0; } if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses))) @@ -85,6 +87,10 @@ int v9fs_file_open(struct inode *inode, struct file *file) /* enable cached file options */ if(file->f_op == &v9fs_file_operations) file->f_op = &v9fs_cached_file_operations; + +#ifdef CONFIG_9P_FSCACHE + v9fs_cache_inode_set_cookie(inode, file); +#endif } return 0; @@ -210,6 +216,7 @@ v9fs_file_write(struct file *filp, const char __user * data, struct p9_client *clnt; struct inode *inode = filp->f_path.dentry->d_inode; int origin = *offset; + unsigned long pg_start, pg_end; P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); @@ -225,7 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data, if (count < rsize) rsize = count; - n = p9_client_write(fid, NULL, data+total, *offset+total, + n = p9_client_write(fid, NULL, data+total, origin+total, rsize); if (n <= 0) break; @@ -234,14 +241,14 @@ v9fs_file_write(struct file *filp, const char __user * data, } while (count > 0); if (total > 0) { - invalidate_inode_pages2_range(inode->i_mapping, origin, - origin+total); + pg_start = origin >> PAGE_CACHE_SHIFT; + pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT; + if (inode->i_mapping && inode->i_mapping->nrpages) + invalidate_inode_pages2_range(inode->i_mapping, + pg_start, pg_end); *offset += total; - } - - if (*offset > inode->i_size) { - inode->i_size = *offset; - inode->i_blocks = (inode->i_size + 512 - 1) >> 9; + i_size_write(inode, i_size_read(inode) + total); + inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; } if (n < 0) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 06a223d..5947628 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -40,6 +40,7 @@ #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" +#include "cache.h" static const struct inode_operations v9fs_dir_inode_operations; static const struct inode_operations v9fs_dir_inode_operations_ext; @@ -197,6 +198,39 @@ v9fs_blank_wstat(struct p9_wstat *wstat) wstat->extension = NULL; } +#ifdef CONFIG_9P_FSCACHE +/** + * v9fs_alloc_inode - helper function to allocate an inode + * This callback is executed before setting up the inode so that we + * can associate a vcookie with each inode. + * + */ + +struct inode *v9fs_alloc_inode(struct super_block *sb) +{ + struct v9fs_cookie *vcookie; + vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache, + GFP_KERNEL); + if (!vcookie) + return NULL; + + vcookie->fscache = NULL; + vcookie->qid = NULL; + spin_lock_init(&vcookie->lock); + return &vcookie->inode; +} + +/** + * v9fs_destroy_inode - destroy an inode + * + */ + +void v9fs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); +} +#endif + /** * v9fs_get_inode - helper function to setup an inode * @sb: superblock @@ -326,6 +360,21 @@ error: } */ + +/** + * v9fs_clear_inode - release an inode + * @inode: inode to release + * + */ +void v9fs_clear_inode(struct inode *inode) +{ + filemap_fdatawrite(inode->i_mapping); + +#ifdef CONFIG_9P_FSCACHE + v9fs_cache_inode_put_cookie(inode); +#endif +} + /** * v9fs_inode_from_fid - populate an inode by issuing a attribute request * @v9ses: session information @@ -356,8 +405,14 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, v9fs_stat2inode(st, ret, sb); ret->i_ino = v9fs_qid2ino(&st->qid); + +#ifdef CONFIG_9P_FSCACHE + v9fs_vcookie_set_qid(ret, &st->qid); + v9fs_cache_inode_get_cookie(ret); +#endif p9stat_free(st); kfree(st); + return ret; error: @@ -751,7 +806,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_inode2v9ses(dentry->d_inode); - if (v9ses->cache == CACHE_LOOSE) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) return simple_getattr(mnt, dentry, stat); fid = v9fs_fid_lookup(dentry); @@ -872,10 +927,10 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, } else inode->i_rdev = 0; - inode->i_size = stat->length; + i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ - inode->i_blocks = (inode->i_size + 512 - 1) >> 9; + inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; } /** diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 8961f1a..14a8644 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -44,21 +44,9 @@ #include "v9fs_vfs.h" #include "fid.h" -static void v9fs_clear_inode(struct inode *); static const struct super_operations v9fs_super_ops; /** - * v9fs_clear_inode - release an inode - * @inode: inode to release - * - */ - -static void v9fs_clear_inode(struct inode *inode) -{ - filemap_fdatawrite(inode->i_mapping); -} - -/** * v9fs_set_super - set the superblock * @s: super block * @data: file system specific data @@ -220,6 +208,10 @@ v9fs_umount_begin(struct super_block *sb) } static const struct super_operations v9fs_super_ops = { +#ifdef CONFIG_9P_FSCACHE + .alloc_inode = v9fs_alloc_inode, + .destroy_inode = v9fs_destroy_inode, +#endif .statfs = simple_statfs, .clear_inode = v9fs_clear_inode, .show_options = generic_show_options, @@ -43,6 +43,7 @@ source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" +source "fs/nilfs2/Kconfig" endif # BLOCK @@ -108,6 +109,7 @@ source "fs/sysfs/Kconfig" config TMPFS bool "Virtual memory file system support (former shm fs)" + depends on SHMEM help Tmpfs is a file system which keeps all files in virtual memory. @@ -186,7 +188,6 @@ source "fs/romfs/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" -source "fs/nilfs2/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 798cb07..3f57ce4 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -19,9 +19,6 @@ static int adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh, int create) { - if (block < 0) - goto abort_negative; - if (!create) { if (block >= inode->i_blocks) goto abort_toobig; @@ -34,10 +31,6 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh, /* don't support allocation of blocks yet */ return -EIO; -abort_negative: - adfs_error(inode->i_sb, "block %d < 0", block); - return -EIO; - abort_toobig: return 0; } diff --git a/fs/afs/cache.h b/fs/afs/cache.h deleted file mode 100644 index 5c4f6b4..0000000 --- a/fs/afs/cache.h +++ /dev/null @@ -1,12 +0,0 @@ -/* AFS local cache management interface - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/fscache.h> diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 3ff8bdd..0931bc1 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -21,7 +21,7 @@ static void afs_fl_release_private(struct file_lock *fl); static struct workqueue_struct *afs_lock_manager; static DEFINE_MUTEX(afs_lock_manager_mutex); -static struct file_lock_operations afs_lock_ops = { +static const struct file_lock_operations afs_lock_ops = { .fl_copy_lock = afs_fl_copy_lock, .fl_release_private = afs_fl_release_private, }; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 106be66..6ece2a1 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -18,10 +18,10 @@ #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> +#include <linux/fscache.h> #include "afs.h" #include "afs_vl.h" -#include "cache.h" #define AFS_CELL_MAX_ADDRS 15 diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 8630615..852739d 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -28,7 +28,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v); static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, size_t size, loff_t *_pos); -static struct seq_operations afs_proc_cells_ops = { +static const struct seq_operations afs_proc_cells_ops = { .start = afs_proc_cells_start, .next = afs_proc_cells_next, .stop = afs_proc_cells_stop, @@ -70,7 +70,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v); static int afs_proc_cell_volumes_show(struct seq_file *m, void *v); -static struct seq_operations afs_proc_cell_volumes_ops = { +static const struct seq_operations afs_proc_cell_volumes_ops = { .start = afs_proc_cell_volumes_start, .next = afs_proc_cell_volumes_next, .stop = afs_proc_cell_volumes_stop, @@ -95,7 +95,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v); static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v); -static struct seq_operations afs_proc_cell_vlservers_ops = { +static const struct seq_operations afs_proc_cell_vlservers_ops = { .start = afs_proc_cell_vlservers_start, .next = afs_proc_cell_vlservers_next, .stop = afs_proc_cell_vlservers_stop, @@ -119,7 +119,7 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, static void afs_proc_cell_servers_stop(struct seq_file *p, void *v); static int afs_proc_cell_servers_show(struct seq_file *m, void *v); -static struct seq_operations afs_proc_cell_servers_ops = { +static const struct seq_operations afs_proc_cell_servers_ops = { .start = afs_proc_cell_servers_start, .next = afs_proc_cell_servers_next, .stop = afs_proc_cell_servers_stop, diff --git a/fs/afs/write.c b/fs/afs/write.c index c2e7a7f..c63a3c8 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode) .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .for_writepages = 1, .range_cyclic = 1, }; int ret; @@ -24,6 +24,7 @@ #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> +#include <linux/mmu_context.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/aio.h> @@ -34,7 +35,6 @@ #include <asm/kmap_types.h> #include <asm/uaccess.h> -#include <asm/mmu_context.h> #if DEBUG > 1 #define dprintk printk @@ -78,6 +78,7 @@ static int __init aio_setup(void) return 0; } +__initcall(aio_setup); static void aio_free_ring(struct kioctx *ctx) { @@ -380,6 +381,7 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) __set_current_state(TASK_RUNNING); return iocb->ki_user_data; } +EXPORT_SYMBOL(wait_on_sync_kiocb); /* exit_aio: called when the last user of mm goes away. At this point, * there is no way for any new requests to be submited or any of the @@ -573,6 +575,7 @@ int aio_put_req(struct kiocb *req) spin_unlock_irq(&ctx->ctx_lock); return ret; } +EXPORT_SYMBOL(aio_put_req); static struct kioctx *lookup_ioctx(unsigned long ctx_id) { @@ -595,51 +598,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) } /* - * use_mm - * Makes the calling kernel thread take on the specified - * mm context. - * Called by the retry thread execute retries within the - * iocb issuer's mm context, so that copy_from/to_user - * operations work seamlessly for aio. - * (Note: this routine is intended to be called only - * from a kernel thread context) - */ -static void use_mm(struct mm_struct *mm) -{ - struct mm_struct *active_mm; - struct task_struct *tsk = current; - - task_lock(tsk); - active_mm = tsk->active_mm; - atomic_inc(&mm->mm_count); - tsk->mm = mm; - tsk->active_mm = mm; - switch_mm(active_mm, mm, tsk); - task_unlock(tsk); - - mmdrop(active_mm); -} - -/* - * unuse_mm - * Reverses the effect of use_mm, i.e. releases the - * specified mm context which was earlier taken on - * by the calling kernel thread - * (Note: this routine is intended to be called only - * from a kernel thread context) - */ -static void unuse_mm(struct mm_struct *mm) -{ - struct task_struct *tsk = current; - - task_lock(tsk); - tsk->mm = NULL; - /* active_mm is still 'mm' */ - enter_lazy_tlb(mm, tsk); - task_unlock(tsk); -} - -/* * Queue up a kiocb to be retried. Assumes that the kiocb * has already been marked as kicked, and places it on * the retry run list for the corresponding ioctx, if it @@ -1037,6 +995,7 @@ put_rq: spin_unlock_irqrestore(&ctx->ctx_lock, flags); return ret; } +EXPORT_SYMBOL(aio_complete); /* aio_read_evt * Pull an event off of the ioctx's event ring. Returns the number of @@ -1825,9 +1784,3 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); return ret; } - -__initcall(aio_setup); - -EXPORT_SYMBOL(aio_complete); -EXPORT_SYMBOL(aio_put_req); -EXPORT_SYMBOL(wait_on_sync_kiocb); diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 47d4a01..2ca7a7c 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -8,8 +8,10 @@ * */ +#include <linux/cred.h> #include <linux/file.h> #include <linux/poll.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/fs.h> @@ -77,28 +79,24 @@ static const struct address_space_operations anon_aops = { * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. - * All the files created with anon_inode_getfd() will share a single inode, + * All the files created with anon_inode_getfile() will share a single inode, * hence saving memory and avoiding code duplication for the file/inode/dentry - * setup. Returns new descriptor or -error. + * setup. Returns the newly created file* or an error pointer. */ -int anon_inode_getfd(const char *name, const struct file_operations *fops, - void *priv, int flags) +struct file *anon_inode_getfile(const char *name, + const struct file_operations *fops, + void *priv, int flags) { struct qstr this; struct dentry *dentry; struct file *file; - int error, fd; + int error; if (IS_ERR(anon_inode_inode)) - return -ENODEV; + return ERR_PTR(-ENODEV); if (fops->owner && !try_module_get(fops->owner)) - return -ENOENT; - - error = get_unused_fd_flags(flags); - if (error < 0) - goto err_module; - fd = error; + return ERR_PTR(-ENOENT); /* * Link the inode to a directory entry by creating a unique name @@ -110,7 +108,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, this.hash = 0; dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this); if (!dentry) - goto err_put_unused_fd; + goto err_module; /* * We know the anon_inode inode count is always greater than zero, @@ -136,16 +134,54 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, file->f_version = 0; file->private_data = priv; + return file; + +err_dput: + dput(dentry); +err_module: + module_put(fops->owner); + return ERR_PTR(error); +} +EXPORT_SYMBOL_GPL(anon_inode_getfile); + +/** + * anon_inode_getfd - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * + * Creates a new file by hooking it on a single inode. This is useful for files + * that do not need to have a full-fledged inode in order to operate correctly. + * All the files created with anon_inode_getfd() will share a single inode, + * hence saving memory and avoiding code duplication for the file/inode/dentry + * setup. Returns new descriptor or an error code. + */ +int anon_inode_getfd(const char *name, const struct file_operations *fops, + void *priv, int flags) +{ + int error, fd; + struct file *file; + + error = get_unused_fd_flags(flags); + if (error < 0) + return error; + fd = error; + + file = anon_inode_getfile(name, fops, priv, flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } fd_install(fd, file); return fd; -err_dput: - dput(dentry); err_put_unused_fd: put_unused_fd(fd); -err_module: - module_put(fops->owner); return error; } EXPORT_SYMBOL_GPL(anon_inode_getfd); @@ -18,7 +18,7 @@ /* Taken over from the old code... */ /* POSIX UID/GID verification for setting inode attributes. */ -int inode_change_ok(struct inode *inode, struct iattr *attr) +int inode_change_ok(const struct inode *inode, struct iattr *attr) { int retval = -EPERM; unsigned int ia_valid = attr->ia_valid; @@ -60,9 +60,51 @@ fine: error: return retval; } - EXPORT_SYMBOL(inode_change_ok); +/** + * inode_newsize_ok - may this inode be truncated to a given size + * @inode: the inode to be truncated + * @offset: the new size to assign to the inode + * @Returns: 0 on success, -ve errno on failure + * + * inode_newsize_ok will check filesystem limits and ulimits to check that the + * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ + * when necessary. Caller must not proceed with inode size change if failure is + * returned. @inode must be a file (not directory), with appropriate + * permissions to allow truncate (inode_newsize_ok does NOT check these + * conditions). + * + * inode_newsize_ok must be called with i_mutex held. + */ +int inode_newsize_ok(const struct inode *inode, loff_t offset) +{ + if (inode->i_size < offset) { + unsigned long limit; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + } else { + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + } + + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +} +EXPORT_SYMBOL(inode_newsize_ok); + int inode_setattr(struct inode * inode, struct iattr * attr) { unsigned int ia_valid = attr->ia_valid; diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c index 2316e94..e947915 100644 --- a/fs/autofs/dirhash.c +++ b/fs/autofs/dirhash.c @@ -90,7 +90,7 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); continue; } - while (d_mountpoint(path.dentry) && follow_down(&path)); + while (d_mountpoint(path.dentry) && follow_down(&path)) ; umount_ok = may_umount(path.mnt); path_put(&path); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 615d549..33baf27 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -737,12 +737,7 @@ befs_put_super(struct super_block *sb) { kfree(BEFS_SB(sb)->mount_opts.iocharset); BEFS_SB(sb)->mount_opts.iocharset = NULL; - - if (BEFS_SB(sb)->nls) { - unload_nls(BEFS_SB(sb)->nls); - BEFS_SB(sb)->nls = NULL; - } - + unload_nls(BEFS_SB(sb)->nls); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } @@ -842,7 +837,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = BEFS_SUPER_MAGIC; /* Set real blocksize of fs */ sb_set_blocksize(sb, (ulong) befs_sb->block_size); - sb->s_op = (struct super_operations *) &befs_sops; + sb->s_op = &befs_sops; root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir))); if (IS_ERR(root)) { ret = PTR_ERR(root); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b7c1603..b9b3bb5 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, } } - /* - * Now fill out the bss section. First pad the last page up - * to the page boundary, and then perform a mmap to make sure - * that there are zero-mapped pages up to and including the - * last bss page. - */ - if (padzero(elf_bss)) { - error = -EFAULT; - goto out_close; - } + if (last_bss > elf_bss) { + /* + * Now fill out the bss section. First pad the last page up + * to the page boundary, and then perform a mmap to make sure + * that there are zero-mapped pages up to and including the + * last bss page. + */ + if (padzero(elf_bss)) { + error = -EFAULT; + goto out_close; + } - /* What we have mapped so far */ - elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); + /* What we have mapped so far */ + elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); - /* Map the last of the bss segment */ - if (last_bss > elf_bss) { + /* Map the last of the bss segment */ down_write(¤t->mm->mmap_sem); error = do_brk(elf_bss, last_bss - elf_bss); up_write(¤t->mm->mmap_sem); @@ -1280,9 +1280,6 @@ static int writenote(struct memelfnote *men, struct file *file, #define DUMP_WRITE(addr, nr) \ if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ goto end_coredump; -#define DUMP_SEEK(off) \ - if (!dump_seek(file, (off))) \ - goto end_coredump; static void fill_elf_header(struct elfhdr *elf, int segs, u16 machine, u32 flags, u8 osabi) @@ -1714,42 +1711,52 @@ struct elf_note_info { int numnote; }; -static int fill_note_info(struct elfhdr *elf, int phdrs, - struct elf_note_info *info, - long signr, struct pt_regs *regs) +static int elf_note_info_init(struct elf_note_info *info) { -#define NUM_NOTES 6 - struct list_head *t; - - info->notes = NULL; - info->prstatus = NULL; - info->psinfo = NULL; - info->fpu = NULL; -#ifdef ELF_CORE_COPY_XFPREGS - info->xfpu = NULL; -#endif + memset(info, 0, sizeof(*info)); INIT_LIST_HEAD(&info->thread_list); - info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), - GFP_KERNEL); + /* Allocate space for six ELF notes */ + info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL); if (!info->notes) return 0; info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); if (!info->psinfo) - return 0; + goto notes_free; info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); if (!info->prstatus) - return 0; + goto psinfo_free; info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); if (!info->fpu) - return 0; + goto prstatus_free; #ifdef ELF_CORE_COPY_XFPREGS info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL); if (!info->xfpu) - return 0; + goto fpu_free; +#endif + return 1; +#ifdef ELF_CORE_COPY_XFPREGS + fpu_free: + kfree(info->fpu); #endif + prstatus_free: + kfree(info->prstatus); + psinfo_free: + kfree(info->psinfo); + notes_free: + kfree(info->notes); + return 0; +} + +static int fill_note_info(struct elfhdr *elf, int phdrs, + struct elf_note_info *info, + long signr, struct pt_regs *regs) +{ + struct list_head *t; + + if (!elf_note_info_init(info)) + return 0; - info->thread_status_size = 0; if (signr) { struct core_thread *ct; struct elf_thread_status *ets; @@ -1809,8 +1816,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, #endif return 1; - -#undef NUM_NOTES } static size_t get_note_info_size(struct elf_note_info *info) @@ -2016,7 +2021,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un goto end_coredump; /* Align to page */ - DUMP_SEEK(dataoff - foffset); + if (!dump_seek(file, dataoff - foffset)) + goto end_coredump; for (vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma)) { @@ -2027,33 +2033,19 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { struct page *page; - struct vm_area_struct *tmp_vma; - - if (get_user_pages(current, current->mm, addr, 1, 0, 1, - &page, &tmp_vma) <= 0) { - DUMP_SEEK(PAGE_SIZE); - } else { - if (page == ZERO_PAGE(0)) { - if (!dump_seek(file, PAGE_SIZE)) { - page_cache_release(page); - goto end_coredump; - } - } else { - void *kaddr; - flush_cache_page(tmp_vma, addr, - page_to_pfn(page)); - kaddr = kmap(page); - if ((size += PAGE_SIZE) > limit || - !dump_write(file, kaddr, - PAGE_SIZE)) { - kunmap(page); - page_cache_release(page); - goto end_coredump; - } - kunmap(page); - } + int stop; + + page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + stop = ((size += PAGE_SIZE) > limit) || + !dump_write(file, kaddr, PAGE_SIZE); + kunmap(page); page_cache_release(page); - } + } else + stop = !dump_seek(file, PAGE_SIZE); + if (stop) + goto end_coredump; } } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 20fbece..38502c6 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -283,20 +283,23 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, } stack_size = exec_params.stack_size; - if (stack_size < interp_params.stack_size) - stack_size = interp_params.stack_size; - if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) executable_stack = EXSTACK_ENABLE_X; else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) executable_stack = EXSTACK_DISABLE_X; - else if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) - executable_stack = EXSTACK_ENABLE_X; - else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) - executable_stack = EXSTACK_DISABLE_X; else executable_stack = EXSTACK_DEFAULT; + if (stack_size == 0) { + stack_size = interp_params.stack_size; + if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) + executable_stack = EXSTACK_ENABLE_X; + else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) + executable_stack = EXSTACK_DISABLE_X; + else + executable_stack = EXSTACK_DEFAULT; + } + retval = -ENOEXEC; if (stack_size == 0) goto error; @@ -1325,9 +1328,6 @@ static int writenote(struct memelfnote *men, struct file *file) #define DUMP_WRITE(addr, nr) \ if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ goto end_coredump; -#define DUMP_SEEK(off) \ - if (!dump_seek(file, (off))) \ - goto end_coredump; static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) { @@ -1518,6 +1518,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size, unsigned long *limit, unsigned long mm_flags) { struct vm_area_struct *vma; + int err = 0; for (vma = current->mm->mmap; vma; vma = vma->vm_next) { unsigned long addr; @@ -1525,43 +1526,26 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size, if (!maydump(vma, mm_flags)) continue; - for (addr = vma->vm_start; - addr < vma->vm_end; - addr += PAGE_SIZE - ) { - struct vm_area_struct *vma; - struct page *page; - - if (get_user_pages(current, current->mm, addr, 1, 0, 1, - &page, &vma) <= 0) { - DUMP_SEEK(file->f_pos + PAGE_SIZE); - } - else if (page == ZERO_PAGE(0)) { - page_cache_release(page); - DUMP_SEEK(file->f_pos + PAGE_SIZE); - } - else { - void *kaddr; - - flush_cache_page(vma, addr, page_to_pfn(page)); - kaddr = kmap(page); - if ((*size += PAGE_SIZE) > *limit || - !dump_write(file, kaddr, PAGE_SIZE) - ) { - kunmap(page); - page_cache_release(page); - return -EIO; - } + for (addr = vma->vm_start; addr < vma->vm_end; + addr += PAGE_SIZE) { + struct page *page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + *size += PAGE_SIZE; + if (*size > *limit) + err = -EFBIG; + else if (!dump_write(file, kaddr, PAGE_SIZE)) + err = -EIO; kunmap(page); page_cache_release(page); - } + } else if (!dump_seek(file, file->f_pos + PAGE_SIZE)) + err = -EFBIG; + if (err) + goto out; } } - - return 0; - -end_coredump: - return -EFBIG; +out: + return err; } #endif @@ -1802,7 +1786,8 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, goto end_coredump; } - DUMP_SEEK(dataoff); + if (!dump_seek(file, dataoff)) + goto end_coredump; if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0) goto end_coredump; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index e92f229..a279665 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -278,8 +278,6 @@ static int decompress_exec( ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); if (ret <= 0) break; - if (ret >= (unsigned long) -4096) - break; len -= ret; strm.next_in = buf; @@ -335,7 +333,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) "(%d != %d)", (unsigned) r, curid, id); goto failed; } else if ( ! p->lib_list[id].loaded && - load_flat_shared_library(id, p) > (unsigned long) -4096) { + IS_ERR_VALUE(load_flat_shared_library(id, p))) { printk("BINFMT_FLAT: failed to load library %d", id); goto failed; } @@ -545,7 +543,7 @@ static int load_flat_file(struct linux_binprm * bprm, textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_EXECUTABLE, 0); up_write(¤t->mm->mmap_sem); - if (!textpos || textpos >= (unsigned long) -4096) { + if (!textpos || IS_ERR_VALUE(textpos)) { if (!textpos) textpos = (unsigned long) -ENOMEM; printk("Unable to mmap process text, errno %d\n", (int)-textpos); @@ -560,7 +558,7 @@ static int load_flat_file(struct linux_binprm * bprm, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); up_write(¤t->mm->mmap_sem); - if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { + if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) { if (!realdatastart) realdatastart = (unsigned long) -ENOMEM; printk("Unable to allocate RAM for process data, errno %d\n", @@ -587,7 +585,7 @@ static int load_flat_file(struct linux_binprm * bprm, result = bprm->file->f_op->read(bprm->file, (char *) datapos, data_len + (relocs * sizeof(unsigned long)), &fpos); } - if (result >= (unsigned long)-4096) { + if (IS_ERR_VALUE(result)) { printk("Unable to read data+bss, errno %d\n", (int)-result); do_munmap(current->mm, textpos, text_len); do_munmap(current->mm, realdatastart, data_len + extra); @@ -607,7 +605,7 @@ static int load_flat_file(struct linux_binprm * bprm, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); up_write(¤t->mm->mmap_sem); - if (!textpos || textpos >= (unsigned long) -4096) { + if (!textpos || IS_ERR_VALUE(textpos)) { if (!textpos) textpos = (unsigned long) -ENOMEM; printk("Unable to allocate RAM for process text/data, errno %d\n", @@ -641,7 +639,7 @@ static int load_flat_file(struct linux_binprm * bprm, fpos = 0; result = bprm->file->f_op->read(bprm->file, (char *) textpos, text_len, &fpos); - if (result < (unsigned long) -4096) + if (!IS_ERR_VALUE(result)) result = decompress_exec(bprm, text_len, (char *) datapos, data_len + (relocs * sizeof(unsigned long)), 0); } @@ -651,13 +649,13 @@ static int load_flat_file(struct linux_binprm * bprm, fpos = 0; result = bprm->file->f_op->read(bprm->file, (char *) textpos, text_len, &fpos); - if (result < (unsigned long) -4096) { + if (!IS_ERR_VALUE(result)) { fpos = ntohl(hdr->data_start); result = bprm->file->f_op->read(bprm->file, (char *) datapos, data_len + (relocs * sizeof(unsigned long)), &fpos); } } - if (result >= (unsigned long)-4096) { + if (IS_ERR_VALUE(result)) { printk("Unable to read code+data+bss, errno %d\n",(int)-result); do_munmap(current->mm, textpos, text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long)); @@ -835,7 +833,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs) res = prepare_binprm(&bprm); - if (res <= (unsigned long)-4096) + if (!IS_ERR_VALUE(res)) res = load_flat_file(&bprm, libs, id, NULL); abort_creds(bprm.cred); @@ -880,7 +878,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */ res = load_flat_file(bprm, &libinfo, 0, &stack_len); - if (res > (unsigned long)-4096) + if (IS_ERR_VALUE(res)) return res; /* Update data segment pointers for all libraries */ @@ -249,6 +249,7 @@ void bio_free(struct bio *bio, struct bio_set *bs) mempool_free(p, bs->bio_pool); } +EXPORT_SYMBOL(bio_free); void bio_init(struct bio *bio) { @@ -257,6 +258,7 @@ void bio_init(struct bio *bio) bio->bi_comp_cpu = -1; atomic_set(&bio->bi_cnt, 1); } +EXPORT_SYMBOL(bio_init); /** * bio_alloc_bioset - allocate a bio for I/O @@ -311,6 +313,7 @@ err_free: mempool_free(p, bs->bio_pool); return NULL; } +EXPORT_SYMBOL(bio_alloc_bioset); static void bio_fs_destructor(struct bio *bio) { @@ -337,6 +340,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) return bio; } +EXPORT_SYMBOL(bio_alloc); static void bio_kmalloc_destructor(struct bio *bio) { @@ -380,6 +384,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) return bio; } +EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio(struct bio *bio) { @@ -416,6 +421,7 @@ void bio_put(struct bio *bio) bio->bi_destructor(bio); } } +EXPORT_SYMBOL(bio_put); inline int bio_phys_segments(struct request_queue *q, struct bio *bio) { @@ -424,6 +430,7 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio) return bio->bi_phys_segments; } +EXPORT_SYMBOL(bio_phys_segments); /** * __bio_clone - clone a bio @@ -451,6 +458,7 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) bio->bi_size = bio_src->bi_size; bio->bi_idx = bio_src->bi_idx; } +EXPORT_SYMBOL(__bio_clone); /** * bio_clone - clone a bio @@ -482,6 +490,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) return b; } +EXPORT_SYMBOL(bio_clone); /** * bio_get_nr_vecs - return approx number of vecs @@ -505,6 +514,7 @@ int bio_get_nr_vecs(struct block_device *bdev) return nr_pages; } +EXPORT_SYMBOL(bio_get_nr_vecs); static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, @@ -635,6 +645,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, return __bio_add_page(q, bio, page, len, offset, queue_max_hw_sectors(q)); } +EXPORT_SYMBOL(bio_add_pc_page); /** * bio_add_page - attempt to add page to bio @@ -655,6 +666,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, struct request_queue *q = bdev_get_queue(bio->bi_bdev); return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); } +EXPORT_SYMBOL(bio_add_page); struct bio_map_data { struct bio_vec *iovecs; @@ -776,6 +788,7 @@ int bio_uncopy_user(struct bio *bio) bio_put(bio); return ret; } +EXPORT_SYMBOL(bio_uncopy_user); /** * bio_copy_user_iov - copy user data to bio @@ -920,6 +933,7 @@ struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); } +EXPORT_SYMBOL(bio_copy_user); static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, @@ -1050,6 +1064,7 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); } +EXPORT_SYMBOL(bio_map_user); /** * bio_map_user_iov - map user sg_iovec table into bio @@ -1117,13 +1132,13 @@ void bio_unmap_user(struct bio *bio) __bio_unmap_user(bio); bio_put(bio); } +EXPORT_SYMBOL(bio_unmap_user); static void bio_map_kern_endio(struct bio *bio, int err) { bio_put(bio); } - static struct bio *__bio_map_kern(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) { @@ -1189,6 +1204,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, bio_put(bio); return ERR_PTR(-EINVAL); } +EXPORT_SYMBOL(bio_map_kern); static void bio_copy_kern_endio(struct bio *bio, int err) { @@ -1250,6 +1266,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, return bio; } +EXPORT_SYMBOL(bio_copy_kern); /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions @@ -1400,6 +1417,7 @@ void bio_endio(struct bio *bio, int error) if (bio->bi_end_io) bio->bi_end_io(bio, error); } +EXPORT_SYMBOL(bio_endio); void bio_pair_release(struct bio_pair *bp) { @@ -1410,6 +1428,7 @@ void bio_pair_release(struct bio_pair *bp) mempool_free(bp, bp->bio2.bi_private); } } +EXPORT_SYMBOL(bio_pair_release); static void bio_pair_end_1(struct bio *bi, int err) { @@ -1477,6 +1496,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) return bp; } +EXPORT_SYMBOL(bio_split); /** * bio_sector_offset - Find hardware sector offset in bio @@ -1547,6 +1567,7 @@ void bioset_free(struct bio_set *bs) kfree(bs); } +EXPORT_SYMBOL(bioset_free); /** * bioset_create - Create a bio_set @@ -1592,6 +1613,7 @@ bad: bioset_free(bs); return NULL; } +EXPORT_SYMBOL(bioset_create); static void __init biovec_init_slabs(void) { @@ -1636,29 +1658,4 @@ static int __init init_bio(void) return 0; } - subsys_initcall(init_bio); - -EXPORT_SYMBOL(bio_alloc); -EXPORT_SYMBOL(bio_kmalloc); -EXPORT_SYMBOL(bio_put); -EXPORT_SYMBOL(bio_free); -EXPORT_SYMBOL(bio_endio); -EXPORT_SYMBOL(bio_init); -EXPORT_SYMBOL(__bio_clone); -EXPORT_SYMBOL(bio_clone); -EXPORT_SYMBOL(bio_phys_segments); -EXPORT_SYMBOL(bio_add_page); -EXPORT_SYMBOL(bio_add_pc_page); -EXPORT_SYMBOL(bio_get_nr_vecs); -EXPORT_SYMBOL(bio_map_user); -EXPORT_SYMBOL(bio_unmap_user); -EXPORT_SYMBOL(bio_map_kern); -EXPORT_SYMBOL(bio_copy_kern); -EXPORT_SYMBOL(bio_pair_release); -EXPORT_SYMBOL(bio_split); -EXPORT_SYMBOL(bio_copy_user); -EXPORT_SYMBOL(bio_uncopy_user); -EXPORT_SYMBOL(bioset_create); -EXPORT_SYMBOL(bioset_free); -EXPORT_SYMBOL(bio_alloc_bioset); diff --git a/fs/block_dev.c b/fs/block_dev.c index 94dfda2..9cf4b92 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -216,8 +216,6 @@ EXPORT_SYMBOL(fsync_bdev); * freeze_bdev -- lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * - * This takes the block device bd_mount_sem to make sure no new mounts - * happen on bdev until thaw_bdev() is called. * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. * The reference counter (bd_fsfreeze_count) guarantees that only the last @@ -232,46 +230,55 @@ struct super_block *freeze_bdev(struct block_device *bdev) int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - bdev->bd_fsfreeze_count++; + if (++bdev->bd_fsfreeze_count > 1) { + /* + * We don't even need to grab a reference - the first call + * to freeze_bdev grab an active reference and only the last + * thaw_bdev drops it. + */ sb = get_super(bdev); + drop_super(sb); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; } - bdev->bd_fsfreeze_count++; - - down(&bdev->bd_mount_sem); - sb = get_super(bdev); - if (sb && !(sb->s_flags & MS_RDONLY)) { - sb->s_frozen = SB_FREEZE_WRITE; - smp_wmb(); - - sync_filesystem(sb); - - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); - - sync_blockdev(sb->s_bdev); - - if (sb->s_op->freeze_fs) { - error = sb->s_op->freeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; - drop_super(sb); - up(&bdev->bd_mount_sem); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } + + sb = get_active_super(bdev); + if (!sb) + goto out; + if (sb->s_flags & MS_RDONLY) { + deactivate_locked_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + + sb->s_frozen = SB_FREEZE_WRITE; + smp_wmb(); + + sync_filesystem(sb); + + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + + sync_blockdev(sb->s_bdev); + + if (sb->s_op->freeze_fs) { + error = sb->s_op->freeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + deactivate_locked_super(sb); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); } } + up_write(&sb->s_umount); + out: sync_blockdev(bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); - - return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ + return sb; /* thaw_bdev releases s->s_umount */ } EXPORT_SYMBOL(freeze_bdev); @@ -284,44 +291,44 @@ EXPORT_SYMBOL(freeze_bdev); */ int thaw_bdev(struct block_device *bdev, struct super_block *sb) { - int error = 0; + int error = -EINVAL; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return -EINVAL; - } - - bdev->bd_fsfreeze_count--; - if (bdev->bd_fsfreeze_count > 0) { - if (sb) - drop_super(sb); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; - } - - if (sb) { - BUG_ON(sb->s_bdev != bdev); - if (!(sb->s_flags & MS_RDONLY)) { - if (sb->s_op->unfreeze_fs) { - error = sb->s_op->unfreeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; - bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } - } - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + if (!bdev->bd_fsfreeze_count) + goto out_unlock; + + error = 0; + if (--bdev->bd_fsfreeze_count > 0) + goto out_unlock; + + if (!sb) + goto out_unlock; + + BUG_ON(sb->s_bdev != bdev); + down_write(&sb->s_umount); + if (sb->s_flags & MS_RDONLY) + goto out_deactivate; + + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; } - drop_super(sb); } - up(&bdev->bd_mount_sem); + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + +out_deactivate: + if (sb) + deactivate_locked_super(sb); +out_unlock: mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } @@ -420,7 +427,6 @@ static void bdev_destroy_inode(struct inode *inode) { struct bdev_inode *bdi = BDEV_I(inode); - bdi->bdev.bd_inode_backing_dev_info = NULL; kmem_cache_free(bdev_cachep, bdi); } @@ -431,7 +437,6 @@ static void init_once(void *foo) memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); - sema_init(&bdev->bd_mount_sem, 1); INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); #ifdef CONFIG_SYSFS @@ -1115,7 +1120,7 @@ EXPORT_SYMBOL(revalidate_disk); int check_disk_change(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; - struct block_device_operations * bdops = disk->fops; + const struct block_device_operations *bdops = disk->fops; if (!bdops->media_changed) return 0; @@ -1405,6 +1410,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) } /* + * Write data to the block device. Only intended for the block device itself + * and the raw driver which basically is a fake block device. + * + * Does not take i_mutex for the write and thus is not for general purpose + * use. + */ +ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + BUG_ON(iocb->ki_pos != pos); + + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + if (ret > 0 || ret == -EIOCBQUEUED) { + ssize_t err; + + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_aio_write); + +/* * Try to release a page associated with block device when the system * is under memory pressure. */ @@ -1436,7 +1468,7 @@ const struct file_operations def_blk_fops = { .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write_nolock, + .aio_write = blkdev_aio_write, .mmap = generic_file_mmap, .fsync = block_fsync, .unlocked_ioctl = block_ioctl, diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index f128427..3616042 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -27,7 +27,7 @@ #include "btrfs_inode.h" #include "xattr.h" -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { @@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { .set = btrfs_xattr_acl_access_set, }; -#else /* CONFIG_FS_POSIX_ACL */ +#else /* CONFIG_BTRFS_FS_POSIX_ACL */ int btrfs_acl_chmod(struct inode *inode) { @@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } -#endif /* CONFIG_FS_POSIX_ACL */ +#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 019e8af..c0861e7 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -48,6 +48,9 @@ struct btrfs_worker_thread { /* number of things on the pending list */ atomic_t num_pending; + /* reference counter for this struct */ + atomic_t refs; + unsigned long sequence; /* protects the pending list. */ @@ -61,6 +64,51 @@ struct btrfs_worker_thread { }; /* + * btrfs_start_workers uses kthread_run, which can block waiting for memory + * for a very long time. It will actually throttle on page writeback, + * and so it may not make progress until after our btrfs worker threads + * process all of the pending work structs in their queue + * + * This means we can't use btrfs_start_workers from inside a btrfs worker + * thread that is used as part of cleaning dirty memory, which pretty much + * involves all of the worker threads. + * + * Instead we have a helper queue who never has more than one thread + * where we scheduler thread start operations. This worker_start struct + * is used to contain the work and hold a pointer to the queue that needs + * another worker. + */ +struct worker_start { + struct btrfs_work work; + struct btrfs_workers *queue; +}; + +static void start_new_worker_func(struct btrfs_work *work) +{ + struct worker_start *start; + start = container_of(work, struct worker_start, work); + btrfs_start_workers(start->queue, 1); + kfree(start); +} + +static int start_new_worker(struct btrfs_workers *queue) +{ + struct worker_start *start; + int ret; + + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return -ENOMEM; + + start->work.func = start_new_worker_func; + start->queue = queue; + ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); + if (ret) + kfree(start); + return ret; +} + +/* * helper function to move a thread onto the idle list after it * has finished some requests. */ @@ -71,7 +119,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker) unsigned long flags; spin_lock_irqsave(&worker->workers->lock, flags); worker->idle = 1; - list_move(&worker->worker_list, &worker->workers->idle_list); + + /* the list may be empty if the worker is just starting */ + if (!list_empty(&worker->worker_list)) { + list_move(&worker->worker_list, + &worker->workers->idle_list); + } spin_unlock_irqrestore(&worker->workers->lock, flags); } } @@ -87,23 +140,51 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) unsigned long flags; spin_lock_irqsave(&worker->workers->lock, flags); worker->idle = 0; - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); + + if (!list_empty(&worker->worker_list)) { + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + } spin_unlock_irqrestore(&worker->workers->lock, flags); } } -static noinline int run_ordered_completions(struct btrfs_workers *workers, - struct btrfs_work *work) +static void check_pending_worker_creates(struct btrfs_worker_thread *worker) { + struct btrfs_workers *workers = worker->workers; unsigned long flags; + rmb(); + if (!workers->atomic_start_pending) + return; + + spin_lock_irqsave(&workers->lock, flags); + if (!workers->atomic_start_pending) + goto out; + + workers->atomic_start_pending = 0; + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) + goto out; + + workers->num_workers_starting += 1; + spin_unlock_irqrestore(&workers->lock, flags); + start_new_worker(workers); + return; + +out: + spin_unlock_irqrestore(&workers->lock, flags); +} + +static noinline int run_ordered_completions(struct btrfs_workers *workers, + struct btrfs_work *work) +{ if (!workers->ordered) return 0; set_bit(WORK_DONE_BIT, &work->flags); - spin_lock_irqsave(&workers->lock, flags); + spin_lock(&workers->order_lock); while (1) { if (!list_empty(&workers->prio_order_list)) { @@ -126,45 +207,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) break; - spin_unlock_irqrestore(&workers->lock, flags); + spin_unlock(&workers->order_lock); work->ordered_func(work); /* now take the lock again and call the freeing code */ - spin_lock_irqsave(&workers->lock, flags); + spin_lock(&workers->order_lock); list_del(&work->order_list); work->ordered_free(work); } - spin_unlock_irqrestore(&workers->lock, flags); + spin_unlock(&workers->order_lock); return 0; } +static void put_worker(struct btrfs_worker_thread *worker) +{ + if (atomic_dec_and_test(&worker->refs)) + kfree(worker); +} + +static int try_worker_shutdown(struct btrfs_worker_thread *worker) +{ + int freeit = 0; + + spin_lock_irq(&worker->lock); + spin_lock(&worker->workers->lock); + if (worker->workers->num_workers > 1 && + worker->idle && + !worker->working && + !list_empty(&worker->worker_list) && + list_empty(&worker->prio_pending) && + list_empty(&worker->pending) && + atomic_read(&worker->num_pending) == 0) { + freeit = 1; + list_del_init(&worker->worker_list); + worker->workers->num_workers--; + } + spin_unlock(&worker->workers->lock); + spin_unlock_irq(&worker->lock); + + if (freeit) + put_worker(worker); + return freeit; +} + +static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, + struct list_head *prio_head, + struct list_head *head) +{ + struct btrfs_work *work = NULL; + struct list_head *cur = NULL; + + if(!list_empty(prio_head)) + cur = prio_head->next; + + smp_mb(); + if (!list_empty(&worker->prio_pending)) + goto refill; + + if (!list_empty(head)) + cur = head->next; + + if (cur) + goto out; + +refill: + spin_lock_irq(&worker->lock); + list_splice_tail_init(&worker->prio_pending, prio_head); + list_splice_tail_init(&worker->pending, head); + + if (!list_empty(prio_head)) + cur = prio_head->next; + else if (!list_empty(head)) + cur = head->next; + spin_unlock_irq(&worker->lock); + + if (!cur) + goto out_fail; + +out: + work = list_entry(cur, struct btrfs_work, list); + +out_fail: + return work; +} + /* * main loop for servicing work items */ static int worker_loop(void *arg) { struct btrfs_worker_thread *worker = arg; - struct list_head *cur; + struct list_head head; + struct list_head prio_head; struct btrfs_work *work; + + INIT_LIST_HEAD(&head); + INIT_LIST_HEAD(&prio_head); + do { - spin_lock_irq(&worker->lock); -again_locked: +again: while (1) { - if (!list_empty(&worker->prio_pending)) - cur = worker->prio_pending.next; - else if (!list_empty(&worker->pending)) - cur = worker->pending.next; - else + + + work = get_next_work(worker, &prio_head, &head); + if (!work) break; - work = list_entry(cur, struct btrfs_work, list); list_del(&work->list); clear_bit(WORK_QUEUED_BIT, &work->flags); work->worker = worker; - spin_unlock_irq(&worker->lock); work->func(work); @@ -175,9 +329,13 @@ again_locked: */ run_ordered_completions(worker->workers, work); - spin_lock_irq(&worker->lock); - check_idle_worker(worker); + check_pending_worker_creates(worker); + } + + spin_lock_irq(&worker->lock); + check_idle_worker(worker); + if (freezing(current)) { worker->working = 0; spin_unlock_irq(&worker->lock); @@ -216,8 +374,10 @@ again_locked: spin_lock_irq(&worker->lock); set_current_state(TASK_INTERRUPTIBLE); if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - goto again_locked; + !list_empty(&worker->prio_pending)) { + spin_unlock_irq(&worker->lock); + goto again; + } /* * this makes sure we get a wakeup when someone @@ -226,8 +386,13 @@ again_locked: worker->working = 0; spin_unlock_irq(&worker->lock); - if (!kthread_should_stop()) - schedule(); + if (!kthread_should_stop()) { + schedule_timeout(HZ * 120); + if (!worker->working && + try_worker_shutdown(worker)) { + return 0; + } + } } __set_current_state(TASK_RUNNING); } @@ -242,41 +407,61 @@ int btrfs_stop_workers(struct btrfs_workers *workers) { struct list_head *cur; struct btrfs_worker_thread *worker; + int can_stop; + spin_lock_irq(&workers->lock); list_splice_init(&workers->idle_list, &workers->worker_list); while (!list_empty(&workers->worker_list)) { cur = workers->worker_list.next; worker = list_entry(cur, struct btrfs_worker_thread, worker_list); - kthread_stop(worker->task); - list_del(&worker->worker_list); - kfree(worker); + + atomic_inc(&worker->refs); + workers->num_workers -= 1; + if (!list_empty(&worker->worker_list)) { + list_del_init(&worker->worker_list); + put_worker(worker); + can_stop = 1; + } else + can_stop = 0; + spin_unlock_irq(&workers->lock); + if (can_stop) + kthread_stop(worker->task); + spin_lock_irq(&workers->lock); + put_worker(worker); } + spin_unlock_irq(&workers->lock); return 0; } /* * simple init on struct btrfs_workers */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_helper) { workers->num_workers = 0; + workers->num_workers_starting = 0; INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); INIT_LIST_HEAD(&workers->prio_order_list); spin_lock_init(&workers->lock); + spin_lock_init(&workers->order_lock); workers->max_workers = max; workers->idle_thresh = 32; workers->name = name; workers->ordered = 0; + workers->atomic_start_pending = 0; + workers->atomic_worker_start = async_helper; } /* * starts new worker threads. This does not enforce the max worker * count in case you need to temporarily go past it. */ -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +static int __btrfs_start_workers(struct btrfs_workers *workers, + int num_workers) { struct btrfs_worker_thread *worker; int ret = 0; @@ -293,7 +478,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) INIT_LIST_HEAD(&worker->prio_pending); INIT_LIST_HEAD(&worker->worker_list); spin_lock_init(&worker->lock); + atomic_set(&worker->num_pending, 0); + atomic_set(&worker->refs, 1); worker->workers = workers; worker->task = kthread_run(worker_loop, worker, "btrfs-%s-%d", workers->name, @@ -303,11 +490,12 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) kfree(worker); goto fail; } - spin_lock_irq(&workers->lock); list_add_tail(&worker->worker_list, &workers->idle_list); worker->idle = 1; workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); spin_unlock_irq(&workers->lock); } return 0; @@ -316,6 +504,14 @@ fail: return ret; } +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + spin_lock_irq(&workers->lock); + workers->num_workers_starting += num_workers; + spin_unlock_irq(&workers->lock); + return __btrfs_start_workers(workers, num_workers); +} + /* * run through the list and find a worker thread that doesn't have a lot * to do right now. This can return null if we aren't yet at the thread @@ -325,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; struct list_head *next; - int enforce_min = workers->num_workers < workers->max_workers; + int enforce_min; + + enforce_min = (workers->num_workers + workers->num_workers_starting) < + workers->max_workers; /* * if we find an idle thread, don't move it to the end of the @@ -350,7 +549,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) */ next = workers->worker_list.next; worker = list_entry(next, struct btrfs_worker_thread, worker_list); - atomic_inc(&worker->num_pending); worker->sequence++; if (worker->sequence % workers->idle_thresh == 0) @@ -367,35 +565,49 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; unsigned long flags; + struct list_head *fallback; again: spin_lock_irqsave(&workers->lock, flags); worker = next_worker(workers); - spin_unlock_irqrestore(&workers->lock, flags); if (!worker) { - spin_lock_irqsave(&workers->lock, flags); - if (workers->num_workers >= workers->max_workers) { - struct list_head *fallback = NULL; - /* - * we have failed to find any workers, just - * return the force one - */ - if (!list_empty(&workers->worker_list)) - fallback = workers->worker_list.next; - if (!list_empty(&workers->idle_list)) - fallback = workers->idle_list.next; - BUG_ON(!fallback); - worker = list_entry(fallback, - struct btrfs_worker_thread, worker_list); - spin_unlock_irqrestore(&workers->lock, flags); + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) { + goto fallback; + } else if (workers->atomic_worker_start) { + workers->atomic_start_pending = 1; + goto fallback; } else { + workers->num_workers_starting++; spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ - btrfs_start_workers(workers, 1); + __btrfs_start_workers(workers, 1); goto again; } } + goto found; + +fallback: + fallback = NULL; + /* + * we have failed to find any workers, just + * return the first one we can find. + */ + if (!list_empty(&workers->worker_list)) + fallback = workers->worker_list.next; + if (!list_empty(&workers->idle_list)) + fallback = workers->idle_list.next; + BUG_ON(!fallback); + worker = list_entry(fallback, + struct btrfs_worker_thread, worker_list); +found: + /* + * this makes sure the worker doesn't exit before it is placed + * onto a busy/idle list + */ + atomic_inc(&worker->num_pending); + spin_unlock_irqrestore(&workers->lock, flags); return worker; } @@ -427,7 +639,7 @@ int btrfs_requeue_work(struct btrfs_work *work) spin_lock(&worker->workers->lock); worker->idle = 0; list_move_tail(&worker->worker_list, - &worker->workers->worker_list); + &worker->workers->worker_list); spin_unlock(&worker->workers->lock); } if (!worker->working) { @@ -435,9 +647,9 @@ int btrfs_requeue_work(struct btrfs_work *work) worker->working = 1; } - spin_unlock_irqrestore(&worker->lock, flags); if (wake) wake_up_process(worker->task); + spin_unlock_irqrestore(&worker->lock, flags); out: return 0; @@ -463,14 +675,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) worker = find_worker(workers); if (workers->ordered) { - spin_lock_irqsave(&workers->lock, flags); + /* + * you're not allowed to do ordered queues from an + * interrupt handler + */ + spin_lock(&workers->order_lock); if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { list_add_tail(&work->order_list, &workers->prio_order_list); } else { list_add_tail(&work->order_list, &workers->order_list); } - spin_unlock_irqrestore(&workers->lock, flags); + spin_unlock(&workers->order_lock); } else { INIT_LIST_HEAD(&work->order_list); } @@ -481,7 +697,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) list_add_tail(&work->list, &worker->prio_pending); else list_add_tail(&work->list, &worker->pending); - atomic_inc(&worker->num_pending); check_busy_worker(worker); /* @@ -492,10 +707,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) wake = 1; worker->working = 1; - spin_unlock_irqrestore(&worker->lock, flags); - if (wake) wake_up_process(worker->task); + spin_unlock_irqrestore(&worker->lock, flags); + out: return 0; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 1b511c1..5077746 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,6 +64,8 @@ struct btrfs_workers { /* current number of running workers */ int num_workers; + int num_workers_starting; + /* max number of workers allowed. changed by btrfs_start_workers */ int max_workers; @@ -73,6 +75,16 @@ struct btrfs_workers { /* force completions in the order they were queued */ int ordered; + /* more workers required, but in an interrupt handler */ + int atomic_start_pending; + + /* + * are we allowed to sleep while starting workers or are we required + * to start them at a later time? If we can't sleep, this indicates + * which queue we need to use to schedule thread creation. + */ + struct btrfs_workers *atomic_worker_start; + /* list with all the work threads. The workers on the idle thread * may be actively servicing jobs, but they haven't yet hit the * idle thresh limit above. @@ -90,6 +102,9 @@ struct btrfs_workers { /* lock for finding the next worker thread to queue on */ spinlock_t lock; + /* lock for the ordered lists */ + spinlock_t order_lock; + /* extra name for this worker, used for current->name */ char *name; }; @@ -97,7 +112,8 @@ struct btrfs_workers { int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_starter); int btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ea1ea0a..f6783a4 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -86,6 +86,12 @@ struct btrfs_inode { * transid of the trans_handle that last modified this inode */ u64 last_trans; + + /* + * log transid when this inode was last modified + */ + u64 last_sub_trans; + /* * transid that last logged this inode */ @@ -128,6 +134,16 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. + */ + spinlock_t accounting_lock; + int reserved_extents; + int outstanding_extents; + + /* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set * the btrfs file release call will add this inode to the @@ -138,6 +154,7 @@ struct btrfs_inode { * of these. */ unsigned ordered_data_close:1; + unsigned dummy_inode:1; struct inode vfs_inode; }; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 9d8ba4d..a11a320 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, */ set_page_extent_mapped(page); lock_extent(tree, last_offset, end, GFP_NOFS); - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, PAGE_CACHE_SIZE); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em || last_offset < em->start || (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || @@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, em_tree = &BTRFS_I(inode)->extent_tree; /* we need the actual starting offset of this extent in the file */ - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, page_offset(bio->bi_io_vec->bv_page), PAGE_CACHE_SIZE); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3fdcc05..ec96f3a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int split; int num_doubles = 0; + l = path->nodes[0]; + slot = path->slots[0]; + if (extend && data_size + btrfs_item_size_nr(l, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) + return -EOVERFLOW; + /* first try to make some room by pushing left and right */ if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { wret = push_leaf_right(trans, root, path, data_size, 0); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 837435c..444b3e9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -114,6 +114,10 @@ struct btrfs_ordered_sum; */ #define BTRFS_DEV_ITEMS_OBJECTID 1ULL +#define BTRFS_BTREE_INODE_OBJECTID 1 + +#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 + /* * we can actually store much bigger names, but lets not confuse the rest * of linux @@ -670,21 +674,29 @@ struct btrfs_space_info { u64 bytes_reserved; /* total bytes the allocator has reserved for current allocations */ u64 bytes_readonly; /* total bytes that are read only */ - - /* delalloc accounting */ - u64 bytes_delalloc; /* number of bytes reserved for allocation, - this space is not necessarily reserved yet - by the allocator */ + u64 bytes_super; /* total bytes reserved for the super blocks */ + u64 bytes_root; /* the number of bytes needed to commit a + transaction */ u64 bytes_may_use; /* number of bytes that may be used for - delalloc */ + delalloc/allocations */ + u64 bytes_delalloc; /* number of bytes currently reserved for + delayed allocation */ int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for this space */ + int force_delalloc; /* make people start doing filemap_flush until + we're under a threshold */ struct list_head list; + /* for controlling how we free up space for allocations */ + wait_queue_head_t allocate_wait; + wait_queue_head_t flush_wait; + int allocating_chunk; + int flushing; + /* for block groups in our same type */ struct list_head block_groups; spinlock_t lock; @@ -726,6 +738,15 @@ enum btrfs_caching_type { BTRFS_CACHE_FINISHED = 2, }; +struct btrfs_caching_control { + struct list_head list; + struct mutex mutex; + wait_queue_head_t wait; + struct btrfs_block_group_cache *block_group; + u64 progress; + atomic_t count; +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; @@ -733,6 +754,7 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; + u64 bytes_super; u64 flags; u64 sectorsize; int extents_thresh; @@ -742,8 +764,9 @@ struct btrfs_block_group_cache { int dirty; /* cache tracking stuff */ - wait_queue_head_t caching_q; int cached; + struct btrfs_caching_control *caching_ctl; + u64 last_byte_to_unpin; struct btrfs_space_info *space_info; @@ -782,13 +805,16 @@ struct btrfs_fs_info { /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; + + spinlock_t fs_roots_radix_lock; struct radix_tree_root fs_roots_radix; /* block group cache stuff */ spinlock_t block_group_cache_lock; struct rb_root block_group_cache_tree; - struct extent_io_tree pinned_extents; + struct extent_io_tree freed_extents[2]; + struct extent_io_tree *pinned_extents; /* logical->physical extent mapping */ struct btrfs_mapping_tree mapping_tree; @@ -822,11 +848,7 @@ struct btrfs_fs_info { struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; struct mutex chunk_mutex; - struct mutex drop_mutex; struct mutex volume_mutex; - struct mutex tree_reloc_mutex; - struct rw_semaphore extent_commit_sem; - /* * this protects the ordered operations list only while we are * processing all of the entries on it. This way we make @@ -835,10 +857,16 @@ struct btrfs_fs_info { * before jumping into the main commit. */ struct mutex ordered_operations_mutex; + struct rw_semaphore extent_commit_sem; + + struct rw_semaphore subvol_sem; + + struct srcu_struct subvol_srcu; struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; + struct list_head caching_block_groups; atomic_t nr_async_submits; atomic_t async_submit_draining; @@ -882,6 +910,7 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ + struct btrfs_workers generic_worker; struct btrfs_workers workers; struct btrfs_workers delalloc_workers; struct btrfs_workers endio_workers; @@ -889,6 +918,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers submit_workers; + struct btrfs_workers enospc_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens @@ -979,7 +1009,10 @@ struct btrfs_root { atomic_t log_writers; atomic_t log_commit[2]; unsigned long log_transid; + unsigned long last_log_commit; unsigned long log_batch; + pid_t log_start_pid; + bool log_multiple_pids; u64 objectid; u64 last_trans; @@ -996,10 +1029,12 @@ struct btrfs_root { u32 stripesize; u32 type; - u64 highest_inode; - u64 last_inode_alloc; + + u64 highest_objectid; int ref_cows; int track_dirty; + int in_radix; + u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; @@ -1118,6 +1153,7 @@ struct btrfs_root { #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) +#define BTRFS_MOUNT_DISCARD (1 << 10) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1920,8 +1956,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_update_pinned_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int pin); +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num, int reserved); int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, @@ -1971,9 +2007,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_io_tree *unpin); + struct btrfs_root *root); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -1984,6 +2021,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_root *root); +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr); int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytes_used, u64 type, u64 chunk_objectid, u64 chunk_offset, @@ -1997,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -int btrfs_check_metadata_free_space(struct btrfs_root *root); +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items); +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items); int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct btrfs_root *root, @@ -2006,7 +2049,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); -void btrfs_free_pinned_extents(struct btrfs_fs_info *info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2100,12 +2142,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *parent); /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id); + struct btrfs_path *path, + u64 root_id, u64 ref_id); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, - u64 root_id, u8 type, u64 ref_id, - u64 dirid, u64 sequence, + u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, const char *name, int name_len); int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key); @@ -2120,6 +2165,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct int btrfs_search_root(struct btrfs_root *root, u64 search_start, u64 *found_objectid); int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +int btrfs_find_orphan_roots(struct btrfs_root *tree_root); int btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); /* dir-item.c */ @@ -2138,6 +2184,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, u64 objectid, const char *name, int name_len, int mod); +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const char *name, int name_len); struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, const char *name, int name_len); @@ -2160,6 +2210,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); /* inode-map.c */ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, @@ -2232,6 +2283,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, int btrfs_add_link(struct btrfs_trans_handle *trans, struct inode *parent_inode, struct inode *inode, const char *name, int name_len, int add_backref, u64 index); +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 new_size, @@ -2242,7 +2297,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, struct dentry *dentry, + struct btrfs_root *new_root, u64 new_dirid, u64 alloc_hint); int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); @@ -2258,6 +2313,7 @@ int btrfs_write_inode(struct inode *inode, int wait); void btrfs_dirty_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); +void btrfs_drop_inode(struct inode *inode); int btrfs_init_cachep(void); void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); @@ -2275,6 +2331,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t size); +int btrfs_invalidate_inodes(struct btrfs_root *root); +extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -2286,11 +2344,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); -extern struct file_operations btrfs_file_operations; +extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, u64 locked_end, - u64 inline_limit, u64 *hint_block); + u64 inline_limit, u64 *hint_block, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end); @@ -2317,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 1d70236..f3a6075 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, return btrfs_match_dir_item_name(root, path, name, name_len); } +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const char *name, int name_len) +{ + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u32 nritems; + int ret; + + key.objectid = dirid; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) + break; + + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return di; + + path->slots[0]++; + } + return NULL; +} + struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e83be2e..02b6afb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -41,6 +41,7 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); +static void free_fs_root(struct btrfs_root *root); static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); @@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode, struct extent_map *em; int ret; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) { em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); goto out; } - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); em = alloc_extent_map(GFP_NOFS); if (!em) { @@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, em->block_start = 0; em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); if (ret == -EEXIST) { u64 failed_start = em->start; @@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, free_extent_map(em); em = NULL; } - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret) em = ERR_PTR(ret); @@ -772,7 +773,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } -static struct address_space_operations btree_aops = { +static const struct address_space_operations btree_aops = { .readpage = btree_readpage, .writepage = btree_writepage, .writepages = btree_writepages, @@ -821,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, int btrfs_write_tree_block(struct extent_buffer *buf) { - return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, - buf->start + buf->len - 1, WB_SYNC_ALL); + return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1); } int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, - buf->start, buf->start + buf->len - 1); + return filemap_fdatawait_range(buf->first_page->mapping, + buf->start, buf->start + buf->len - 1); } struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, @@ -895,8 +896,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->fs_info = fs_info; root->objectid = objectid; root->last_trans = 0; - root->highest_inode = 0; - root->last_inode_alloc = 0; + root->highest_objectid = 0; root->name = NULL; root->in_sysfs = 0; root->inode_tree.rb_node = NULL; @@ -917,6 +917,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_writers, 0); root->log_batch = 0; root->log_transid = 0; + root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -952,14 +953,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root, root, fs_info, objectid); ret = btrfs_find_last_root(tree_root, objectid, &root->root_item, &root->root_key); + if (ret > 0) + return -ENOENT; BUG_ON(ret); generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); - root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); + root->commit_root = btrfs_root_node(root); return 0; } @@ -1085,6 +1088,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->last_log_commit = 0; return 0; } @@ -1095,7 +1099,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_path *path; struct extent_buffer *l; - u64 highest_inode; u64 generation; u32 blocksize; int ret = 0; @@ -1110,7 +1113,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, kfree(root); return ERR_PTR(ret); } - goto insert; + goto out; } __setup_root(tree_root->nodesize, tree_root->leafsize, @@ -1120,39 +1123,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, path = btrfs_alloc_path(); BUG_ON(!path); ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); - if (ret != 0) { - if (ret > 0) - ret = -ENOENT; - goto out; + if (ret == 0) { + l = path->nodes[0]; + read_extent_buffer(l, &root->root_item, + btrfs_item_ptr_offset(l, path->slots[0]), + sizeof(root->root_item)); + memcpy(&root->root_key, location, sizeof(*location)); } - l = path->nodes[0]; - read_extent_buffer(l, &root->root_item, - btrfs_item_ptr_offset(l, path->slots[0]), - sizeof(root->root_item)); - memcpy(&root->root_key, location, sizeof(*location)); - ret = 0; -out: - btrfs_release_path(root, path); btrfs_free_path(path); if (ret) { - kfree(root); + if (ret > 0) + ret = -ENOENT; return ERR_PTR(ret); } + generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); -insert: - if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { +out: + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) root->ref_cows = 1; - ret = btrfs_find_highest_inode(root, &highest_inode); - if (ret == 0) { - root->highest_inode = highest_inode; - root->last_inode_alloc = highest_inode; - } - } + return root; } @@ -1187,39 +1181,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, return fs_info->dev_root; if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) return fs_info->csum_root; - +again: + spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)location->objectid); + spin_unlock(&fs_info->fs_roots_radix_lock); if (root) return root; + ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); + if (ret == 0) + ret = -ENOENT; + if (ret < 0) + return ERR_PTR(ret); + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); if (IS_ERR(root)) return root; + WARN_ON(btrfs_root_refs(&root->root_item) == 0); set_anon_super(&root->anon_super, NULL); + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto fail; + + spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); + if (ret == 0) + root->in_radix = 1; + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); if (ret) { - free_extent_buffer(root->node); - kfree(root); - return ERR_PTR(ret); + if (ret == -EEXIST) { + free_fs_root(root); + goto again; + } + goto fail; } - if (!(fs_info->sb->s_flags & MS_RDONLY)) { - ret = btrfs_find_dead_roots(fs_info->tree_root, - root->root_key.objectid); - BUG_ON(ret); + + ret = btrfs_find_dead_roots(fs_info->tree_root, + root->root_key.objectid); + WARN_ON(ret); + + if (!(fs_info->sb->s_flags & MS_RDONLY)) btrfs_orphan_cleanup(root); - } + return root; +fail: + free_fs_root(root); + return ERR_PTR(ret); } struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *location, const char *name, int namelen) { + return btrfs_read_fs_root_no_name(fs_info, location); +#if 0 struct btrfs_root *root; int ret; @@ -1236,7 +1257,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, kfree(root); return ERR_PTR(ret); } -#if 0 + ret = btrfs_sysfs_add_root(root); if (ret) { free_extent_buffer(root->node); @@ -1244,9 +1265,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, kfree(root); return ERR_PTR(ret); } -#endif root->in_sysfs = 1; return root; +#endif } static int btrfs_congested_fn(void *congested_data, int bdi_bits) @@ -1325,9 +1346,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) offset = page_offset(page); em_tree = &BTRFS_I(inode)->extent_tree; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em) { __unplug_io_fn(bdi, page); return; @@ -1352,6 +1373,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; + bdi->name = "btrfs"; bdi->capabilities = BDI_CAP_MAP_COPY; err = bdi_init(bdi); if (err) @@ -1359,8 +1381,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) err = bdi_register(bdi, NULL, "btrfs-%d", atomic_inc_return(&btrfs_bdi_num)); - if (err) + if (err) { + bdi_destroy(bdi); return err; + } bdi->ra_pages = default_backing_dev_info.ra_pages; bdi->unplug_io_fn = btrfs_unplug_io_fn; @@ -1450,9 +1474,12 @@ static int cleaner_kthread(void *arg) break; vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); - mutex_lock(&root->fs_info->cleaner_mutex); - btrfs_clean_old_snapshots(root); - mutex_unlock(&root->fs_info->cleaner_mutex); + + if (!(root->fs_info->sb->s_flags & MS_RDONLY) && + mutex_trylock(&root->fs_info->cleaner_mutex)) { + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + } if (freezing(current)) { refrigerator(); @@ -1557,15 +1584,36 @@ struct btrfs_root *open_ctree(struct super_block *sb, err = -ENOMEM; goto fail; } - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); + + ret = init_srcu_struct(&fs_info->subvol_srcu); + if (ret) { + err = ret; + goto fail; + } + + ret = setup_bdi(fs_info, &fs_info->bdi); + if (ret) { + err = ret; + goto fail_srcu; + } + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_bdi; + } + + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); INIT_LIST_HEAD(&fs_info->ordered_operations); + INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); spin_lock_init(&fs_info->ref_cache_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); init_completion(&fs_info->kobj_unregister); fs_info->tree_root = tree_root; @@ -1584,12 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->sb = sb; fs_info->max_extent = (u64)-1; fs_info->max_inline = 8192 * 1024; - if (setup_bdi(fs_info, &fs_info->bdi)) - goto fail_bdi; - fs_info->btree_inode = new_inode(sb); - fs_info->btree_inode->i_ino = 1; - fs_info->btree_inode->i_nlink = 1; - fs_info->metadata_ratio = 8; + fs_info->metadata_ratio = 0; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1599,7 +1642,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); + sb->s_bdi = &fs_info->bdi; + fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + fs_info->btree_inode->i_nlink = 1; /* * we set the i_size on the btree inode to the max possible int. * the real end of the address space is determined by all of @@ -1618,28 +1664,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; + insert_inode_hash(fs_info->btree_inode); + spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree.rb_node = NULL; - extent_io_tree_init(&fs_info->pinned_extents, + extent_io_tree_init(&fs_info->freed_extents[0], fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->freed_extents[1], + fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->pinned_extents = &fs_info->freed_extents[0]; fs_info->do_barriers = 1; - BTRFS_I(fs_info->btree_inode)->root = tree_root; - memset(&BTRFS_I(fs_info->btree_inode)->location, 0, - sizeof(struct btrfs_key)); - insert_inode_hash(fs_info->btree_inode); mutex_init(&fs_info->trans_mutex); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); - mutex_init(&fs_info->drop_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); - mutex_init(&fs_info->tree_reloc_mutex); init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->subvol_sem); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -1699,20 +1749,24 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_iput; } - /* - * we need to start all the end_io workers up front because the - * queue work function gets called at interrupt time, and so it - * cannot dynamically grow. - */ + btrfs_init_workers(&fs_info->generic_worker, + "genwork", 1, NULL); + btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size)); + fs_info->thread_pool_size), + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->enospc_workers, "enospc", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -1726,15 +1780,20 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->delalloc_workers.idle_thresh = 2; fs_info->delalloc_workers.ordered = 1; - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size); + "endio-meta-write", fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1743,20 +1802,19 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->endio_workers.idle_thresh = 4; fs_info->endio_meta_workers.idle_thresh = 4; - fs_info->endio_write_workers.idle_thresh = 64; - fs_info->endio_meta_write_workers.idle_thresh = 64; + fs_info->endio_write_workers.idle_thresh = 2; + fs_info->endio_meta_write_workers.idle_thresh = 2; btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->generic_worker, 1); btrfs_start_workers(&fs_info->submit_workers, 1); btrfs_start_workers(&fs_info->delalloc_workers, 1); btrfs_start_workers(&fs_info->fixup_workers, 1); - btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); - btrfs_start_workers(&fs_info->endio_meta_workers, - fs_info->thread_pool_size); - btrfs_start_workers(&fs_info->endio_meta_write_workers, - fs_info->thread_pool_size); - btrfs_start_workers(&fs_info->endio_write_workers, - fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_workers, 1); + btrfs_start_workers(&fs_info->endio_meta_workers, 1); + btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); + btrfs_start_workers(&fs_info->endio_write_workers, 1); + btrfs_start_workers(&fs_info->enospc_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -1916,6 +1974,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, } } + ret = btrfs_find_orphan_roots(tree_root); + BUG_ON(ret); + if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_recover_relocation(tree_root); BUG_ON(ret); @@ -1959,6 +2020,7 @@ fail_chunk_root: free_extent_buffer(chunk_root->node); free_extent_buffer(chunk_root->commit_root); fail_sb_buffer: + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -1967,6 +2029,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -1975,6 +2038,8 @@ fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); fail_bdi: bdi_destroy(&fs_info->bdi); +fail_srcu: + cleanup_srcu_struct(&fs_info->subvol_srcu); fail: kfree(extent_root); kfree(tree_root); @@ -2234,20 +2299,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans, int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { - WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); + spin_unlock(&fs_info->fs_roots_radix_lock); + + if (btrfs_root_refs(&root->root_item) == 0) + synchronize_srcu(&fs_info->subvol_srcu); + + free_fs_root(root); + return 0; +} + +static void free_fs_root(struct btrfs_root *root) +{ + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); if (root->anon_super.s_dev) { down_write(&root->anon_super.s_umount); kill_anon_super(&root->anon_super); } - if (root->node) - free_extent_buffer(root->node); - if (root->commit_root) - free_extent_buffer(root->commit_root); + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); kfree(root->name); kfree(root); - return 0; } static int del_fs_roots(struct btrfs_fs_info *fs_info) @@ -2256,6 +2330,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *gang[8]; int i; + while (!list_empty(&fs_info->dead_roots)) { + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); + + if (gang[0]->in_radix) { + btrfs_free_fs_root(fs_info, gang[0]); + } else { + free_extent_buffer(gang[0]->node); + free_extent_buffer(gang[0]->commit_root); + kfree(gang[0]); + } + } + while (1) { ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, 0, @@ -2285,9 +2373,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) root_objectid = gang[ret - 1]->root_key.objectid + 1; for (i = 0; i < ret; i++) { root_objectid = gang[i]->root_key.objectid; - ret = btrfs_find_dead_roots(fs_info->tree_root, - root_objectid); - BUG_ON(ret); btrfs_orphan_cleanup(gang[i]); } root_objectid++; @@ -2357,12 +2442,12 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(root->fs_info->csum_root->commit_root); btrfs_free_block_groups(root->fs_info); - btrfs_free_pinned_extents(root->fs_info); del_fs_roots(fs_info); iput(fs_info->btree_inode); + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -2371,11 +2456,13 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); bdi_destroy(&fs_info->bdi); + cleanup_srcu_struct(&fs_info->subvol_srcu); kfree(fs_info->extent_root); kfree(fs_info->tree_root); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 9596b40..ba5c3fd 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, len = BTRFS_FID_SIZE_NON_CONNECTABLE; type = FILEID_BTRFS_WITHOUT_PARENT; - fid->objectid = BTRFS_I(inode)->location.objectid; + fid->objectid = inode->i_ino; fid->root_objectid = BTRFS_I(inode)->root->objectid; fid->gen = inode->i_generation; @@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, } static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation) + u64 root_objectid, u32 generation, + int check_generation) { + struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; struct btrfs_root *root; + struct dentry *dentry; struct inode *inode; struct btrfs_key key; + int index; + int err = 0; + + if (objectid < BTRFS_FIRST_FREE_OBJECTID) + return ERR_PTR(-ESTALE); key.objectid = root_objectid; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key); - if (IS_ERR(root)) - return ERR_CAST(root); + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto fail; + } + + if (btrfs_root_refs(&root->root_item) == 0) { + err = -ENOENT; + goto fail; + } key.objectid = objectid; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; inode = btrfs_iget(sb, &key, root); - if (IS_ERR(inode)) - return (void *)inode; + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto fail; + } + + srcu_read_unlock(&fs_info->subvol_srcu, index); - if (generation != inode->i_generation) { + if (check_generation && generation != inode->i_generation) { iput(inode); return ERR_PTR(-ESTALE); } - return d_obtain_alias(inode); + dentry = d_obtain_alias(inode); + if (!IS_ERR(dentry)) + dentry->d_op = &btrfs_dentry_operations; + return dentry; +fail: + srcu_read_unlock(&fs_info->subvol_srcu, index); + return ERR_PTR(err); } static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, @@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, objectid = fid->parent_objectid; generation = fid->parent_gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation); + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); } static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, @@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, root_objectid = fid->root_objectid; generation = fid->gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation); + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); } static struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = child->d_inode; + static struct dentry *dentry; struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_key key; struct btrfs_path *path; struct extent_buffer *leaf; - int slot; - u64 objectid; + struct btrfs_root_ref *ref; + struct btrfs_key key; + struct btrfs_key found_key; int ret; path = btrfs_alloc_path(); - key.objectid = dir->i_ino; - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); - key.offset = (u64)-1; + if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = dir->i_ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - /* Error */ - btrfs_free_path(path); - return ERR_PTR(ret); + if (ret < 0) + goto fail; + + BUG_ON(ret == 0); + if (path->slots[0] == 0) { + ret = -ENOENT; + goto fail; } + + path->slots[0]--; leaf = path->nodes[0]; - slot = path->slots[0]; - if (ret) { - /* btrfs_search_slot() returns the slot where we'd want to - insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. - The _real_ backref, telling us what the parent inode - _actually_ is, will be in the slot _before_ the one - that btrfs_search_slot() returns. */ - if (!slot) { - /* Unless there is _no_ key in the tree before... */ - btrfs_free_path(path); - return ERR_PTR(-EIO); - } - slot--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != key.objectid || found_key.type != key.type) { + ret = -ENOENT; + goto fail; } - btrfs_item_key_to_cpu(leaf, &key, slot); + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + key.objectid = btrfs_root_ref_dirid(leaf, ref); + } else { + key.objectid = found_key.offset; + } btrfs_free_path(path); - if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) - return ERR_PTR(-EINVAL); - - objectid = key.offset; - - /* If we are already at the root of a subvol, return the real root */ - if (objectid == dir->i_ino) - return dget(dir->i_sb->s_root); + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + return btrfs_get_dentry(root->fs_info->sb, key.objectid, + found_key.offset, 0, 0); + } - /* Build a new key for the inode item */ - key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - - return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); + dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); + if (!IS_ERR(dentry)) + dentry->d_op = &btrfs_dentry_operations; + return dentry; +fail: + btrfs_free_path(path); + return ERR_PTR(ret); } const struct export_operations btrfs_export_ops = { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72a2b9c..e238a0c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -32,12 +32,12 @@ #include "locking.h" #include "free-space-cache.h" -static int update_reserved_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int reserve); static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc, int mark_free); +static int update_reserved_extents(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, int level, struct btrfs_key *ins); - static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force); +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, + int is_data, int reserved, + struct extent_buffer **must_clean); +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key); +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, return ret; } -/* - * We always set EXTENT_LOCKED for the super mirror extents so we don't - * overwrite them, so those bits need to be unset. Also, if we are unmounting - * with pinned extents still sitting there because we had a block group caching, - * we need to clear those now, since we are done. - */ -void btrfs_free_pinned_extents(struct btrfs_fs_info *info) +static int add_excluded_extent(struct btrfs_root *root, + u64 start, u64 num_bytes) { - u64 start, end, last = 0; - int ret; + u64 end = start + num_bytes - 1; + set_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + set_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); + return 0; +} - while (1) { - ret = find_first_extent_bit(&info->pinned_extents, last, - &start, &end, - EXTENT_LOCKED|EXTENT_DIRTY); - if (ret) - break; +static void free_excluded_extents(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 start, end; - clear_extent_bits(&info->pinned_extents, start, end, - EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); - last = end+1; - } + start = cache->key.objectid; + end = start + cache->key.offset - 1; + + clear_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + clear_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); } -static int remove_sb_from_cache(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) +static int exclude_super_stripes(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) { - struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 *logical; int stripe_len; @@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); BUG_ON(ret); + while (nr--) { - try_lock_extent(&fs_info->pinned_extents, - logical[nr], - logical[nr] + stripe_len - 1, GFP_NOFS); + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, logical[nr], + stripe_len); + BUG_ON(ret); } + kfree(logical); } - return 0; } +static struct btrfs_caching_control * +get_caching_control(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *ctl; + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_STARTED) { + spin_unlock(&cache->lock); + return NULL; + } + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + spin_unlock(&cache->lock); + return ctl; +} + +static void put_caching_control(struct btrfs_caching_control *ctl) +{ + if (atomic_dec_and_test(&ctl->count)) + kfree(ctl); +} + /* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet @@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, int ret; while (start < end) { - ret = find_first_extent_bit(&info->pinned_extents, start, + ret = find_first_extent_bit(info->pinned_extents, start, &extent_start, &extent_end, - EXTENT_DIRTY|EXTENT_LOCKED); + EXTENT_DIRTY | EXTENT_UPTODATE); if (ret) break; @@ -249,22 +283,27 @@ static int caching_kthread(void *data) { struct btrfs_block_group_cache *block_group = data; struct btrfs_fs_info *fs_info = block_group->fs_info; - u64 last = 0; + struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; + struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_path *path; - int ret = 0; - struct btrfs_key key; struct extent_buffer *leaf; - int slot; + struct btrfs_key key; u64 total_found = 0; - - BUG_ON(!fs_info); + u64 last = 0; + u32 nritems; + int ret = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - atomic_inc(&block_group->space_info->caching_threads); + exclude_super_stripes(extent_root, block_group); + spin_lock(&block_group->space_info->lock); + block_group->space_info->bytes_super += block_group->bytes_super; + spin_unlock(&block_group->space_info->lock); + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + /* * We don't want to deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent @@ -277,74 +316,64 @@ static int caching_kthread(void *data) key.objectid = last; key.offset = 0; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.type = BTRFS_EXTENT_ITEM_KEY; again: + mutex_lock(&caching_ctl->mutex); /* need to make sure the commit_root doesn't disappear */ down_read(&fs_info->extent_commit_sem); - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto err; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + while (1) { smp_mb(); - if (block_group->fs_info->closing > 1) { + if (fs_info->closing > 1) { last = (u64)-1; break; } - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(fs_info->extent_root, path); - if (ret < 0) - goto err; - else if (ret) + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + } else { + ret = find_next_key(path, 0, &key); + if (ret) break; - if (need_resched() || - btrfs_transaction_in_commit(fs_info)) { - leaf = path->nodes[0]; - - /* this shouldn't happen, but if the - * leaf is empty just move on. - */ - if (btrfs_header_nritems(leaf) == 0) - break; - /* - * we need to copy the key out so that - * we are sure the next search advances - * us forward in the btree. - */ - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(fs_info->extent_root, path); - up_read(&fs_info->extent_commit_sem); + caching_ctl->progress = last; + btrfs_release_path(extent_root, path); + up_read(&fs_info->extent_commit_sem); + mutex_unlock(&caching_ctl->mutex); + if (btrfs_transaction_in_commit(fs_info)) schedule_timeout(1); - goto again; - } + else + cond_resched(); + goto again; + } + if (key.objectid < block_group->key.objectid) { + path->slots[0]++; continue; } - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid < block_group->key.objectid) - goto next; if (key.objectid >= block_group->key.objectid + block_group->key.offset) break; - if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { + if (key.type == BTRFS_EXTENT_ITEM_KEY) { total_found += add_new_free_space(block_group, fs_info, last, key.objectid); last = key.objectid + key.offset; - } - if (total_found > (1024 * 1024 * 2)) { - total_found = 0; - wake_up(&block_group->caching_q); + if (total_found > (1024 * 1024 * 2)) { + total_found = 0; + wake_up(&caching_ctl->wait); + } } -next: path->slots[0]++; } ret = 0; @@ -352,33 +381,65 @@ next: total_found += add_new_free_space(block_group, fs_info, last, block_group->key.objectid + block_group->key.offset); + caching_ctl->progress = (u64)-1; spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; block_group->cached = BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); err: btrfs_free_path(path); up_read(&fs_info->extent_commit_sem); - atomic_dec(&block_group->space_info->caching_threads); - wake_up(&block_group->caching_q); + free_excluded_extents(extent_root, block_group); + + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); + + put_caching_control(caching_ctl); + atomic_dec(&block_group->space_info->caching_threads); return 0; } static int cache_block_group(struct btrfs_block_group_cache *cache) { + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_caching_control *caching_ctl; struct task_struct *tsk; int ret = 0; + smp_mb(); + if (cache->cached != BTRFS_CACHE_NO) + return 0; + + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + /* one for caching kthread, one for caching block group list */ + atomic_set(&caching_ctl->count, 2); + spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { spin_unlock(&cache->lock); - return ret; + kfree(caching_ctl); + return 0; } + cache->caching_ctl = caching_ctl; cache->cached = BTRFS_CACHE_STARTED; spin_unlock(&cache->lock); + down_write(&fs_info->extent_commit_sem); + list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); + up_write(&fs_info->extent_commit_sem); + + atomic_inc(&cache->space_info->caching_threads); + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", cache->key.objectid); if (IS_ERR(tsk)) { @@ -1507,22 +1568,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -#ifdef BIO_RW_DISCARD static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { - blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, + DISCARD_FL_BARRIER); } -#endif static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { -#ifdef BIO_RW_DISCARD int ret; u64 map_length = num_bytes; struct btrfs_multi_bio *multi = NULL; + if (!btrfs_test_opt(root, DISCARD)) + return 0; + /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, bytenr, &map_length, &multi, 0); @@ -1542,9 +1604,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } return ret; -#else - return 0; -#endif } int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, @@ -1656,7 +1715,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, parent, ref_root, flags, ref->objectid, ref->offset, &ins, node->ref_mod); - update_reserved_extents(root, ins.objectid, ins.offset, 0); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, node->num_bytes, parent, @@ -1782,7 +1840,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, extent_op->flags_to_set, &extent_op->key, ref->level, &ins); - update_reserved_extents(root, ins.objectid, ins.offset, 0); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, node->num_bytes, parent, ref_root, @@ -1817,16 +1874,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); if (insert_reserved) { + int mark_free = 0; + struct extent_buffer *must_clean = NULL; + + ret = pin_down_bytes(trans, root, NULL, + node->bytenr, node->num_bytes, + head->is_data, 1, &must_clean); + if (ret > 0) + mark_free = 1; + + if (must_clean) { + clean_tree_block(NULL, root, must_clean); + btrfs_tree_unlock(must_clean); + free_extent_buffer(must_clean); + } if (head->is_data) { ret = btrfs_del_csums(trans, root, node->bytenr, node->num_bytes); BUG_ON(ret); } - btrfs_update_pinned_extents(root, node->bytenr, - node->num_bytes, 1); - update_reserved_extents(root, node->bytenr, - node->num_bytes, 0); + if (mark_free) { + ret = btrfs_free_reserved_extent(root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } } mutex_unlock(&head->mutex); return 0; @@ -2691,60 +2764,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) alloc_target); } +static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) +{ + u64 num_bytes; + int level; + + level = BTRFS_MAX_LEVEL - 2; + /* + * NOTE: these calculations are absolutely the worst possible case. + * This assumes that _every_ item we insert will require a new leaf, and + * that the tree has grown to its maximum level size. + */ + + /* + * for every item we insert we could insert both an extent item and a + * extent ref item. Then for ever item we insert, we will need to cow + * both the original leaf, plus the leaf to the left and right of it. + * + * Unless we are talking about the extent root, then we just want the + * number of items * 2, since we just need the extent item plus its ref. + */ + if (root == root->fs_info->extent_root) + num_bytes = num_items * 2; + else + num_bytes = (num_items + (2 * num_items)) * 3; + + /* + * num_bytes is total number of leaves we could need times the leaf + * size, and then for every leaf we could end up cow'ing 2 nodes per + * level, down to the leaf level. + */ + num_bytes = (num_bytes * root->leafsize) + + (num_bytes * (level * 2)) * root->nodesize; + + return num_bytes; +} + /* - * for now this just makes sure we have at least 5% of our metadata space free - * for use. + * Unreserve metadata space for delalloc. If we have less reserved credits than + * we have extents, this function does nothing. */ -int btrfs_check_metadata_free_space(struct btrfs_root *root) +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) { struct btrfs_fs_info *info = root->fs_info; struct btrfs_space_info *meta_sinfo; - u64 alloc_target, thresh; - int committed = 0, ret; + u64 num_bytes; + u64 alloc_target; + bool bug = false; /* get the space info for where the metadata will live */ alloc_target = btrfs_get_alloc_profile(root, 0); meta_sinfo = __find_space_info(info, alloc_target); -again: + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); + spin_lock(&meta_sinfo->lock); - if (!meta_sinfo->full) - thresh = meta_sinfo->total_bytes * 80; - else - thresh = meta_sinfo->total_bytes * 95; + spin_lock(&BTRFS_I(inode)->accounting_lock); + if (BTRFS_I(inode)->reserved_extents <= + BTRFS_I(inode)->outstanding_extents) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); + spin_unlock(&meta_sinfo->lock); + return 0; + } + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + BTRFS_I(inode)->reserved_extents--; + BUG_ON(BTRFS_I(inode)->reserved_extents < 0); + + if (meta_sinfo->bytes_delalloc < num_bytes) { + bug = true; + meta_sinfo->bytes_delalloc = 0; + } else { + meta_sinfo->bytes_delalloc -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + + BUG_ON(bug); + + return 0; +} + +static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) +{ + u64 thresh; + + thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use; + thresh = meta_sinfo->total_bytes - thresh; + thresh *= 80; do_div(thresh, 100); + if (thresh <= meta_sinfo->bytes_delalloc) + meta_sinfo->force_delalloc = 1; + else + meta_sinfo->force_delalloc = 0; +} - if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { - struct btrfs_trans_handle *trans; - if (!meta_sinfo->full) { - meta_sinfo->force_alloc = 1; - spin_unlock(&meta_sinfo->lock); +struct async_flush { + struct btrfs_root *root; + struct btrfs_space_info *info; + struct btrfs_work work; +}; - trans = btrfs_start_transaction(root, 1); - if (!trans) - return -ENOMEM; +static noinline void flush_delalloc_async(struct btrfs_work *work) +{ + struct async_flush *async; + struct btrfs_root *root; + struct btrfs_space_info *info; - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, alloc_target, 0); - btrfs_end_transaction(trans, root); + async = container_of(work, struct async_flush, work); + root = async->root; + info = async->info; + + btrfs_start_delalloc_inodes(root); + wake_up(&info->flush_wait); + btrfs_wait_ordered_extents(root, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); + + kfree(async); +} + +static void wait_on_flush(struct btrfs_space_info *info) +{ + DEFINE_WAIT(wait); + u64 used; + + while (1) { + prepare_to_wait(&info->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + spin_lock(&info->lock); + if (!info->flushing) { + spin_unlock(&info->lock); + break; + } + + used = info->bytes_used + info->bytes_reserved + + info->bytes_pinned + info->bytes_readonly + + info->bytes_super + info->bytes_root + + info->bytes_may_use + info->bytes_delalloc; + if (used < info->total_bytes) { + spin_unlock(&info->lock); + break; + } + spin_unlock(&info->lock); + schedule(); + } + finish_wait(&info->flush_wait, &wait); +} + +static void flush_delalloc(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct async_flush *async; + bool wait = false; + + spin_lock(&info->lock); + + if (!info->flushing) { + info->flushing = 1; + init_waitqueue_head(&info->flush_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_on_flush(info); + return; + } + + async = kzalloc(sizeof(*async), GFP_NOFS); + if (!async) + goto flush; + + async->root = root; + async->info = info; + async->work.func = flush_delalloc_async; + + btrfs_queue_worker(&root->fs_info->enospc_workers, + &async->work); + wait_on_flush(info); + return; + +flush: + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); +} + +static int maybe_allocate_chunk(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_trans_handle *trans; + bool wait = false; + int ret = 0; + u64 min_metadata; + u64 free_space; + + free_space = btrfs_super_total_bytes(disk_super); + /* + * we allow the metadata to grow to a max of either 5gb or 5% of the + * space in the volume. + */ + min_metadata = min((u64)5 * 1024 * 1024 * 1024, + div64_u64(free_space * 5, 100)); + if (info->total_bytes >= min_metadata) { + spin_unlock(&info->lock); + return 0; + } + + if (info->full) { + spin_unlock(&info->lock); + return 0; + } + + if (!info->allocating_chunk) { + info->force_alloc = 1; + info->allocating_chunk = 1; + init_waitqueue_head(&info->allocate_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_event(info->allocate_wait, + !info->allocating_chunk); + return 1; + } + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 4096 + 2 * 1024 * 1024, + info->flags, 0); + btrfs_end_transaction(trans, root); + if (ret) + goto out; +out: + spin_lock(&info->lock); + info->allocating_chunk = 0; + spin_unlock(&info->lock); + wake_up(&info->allocate_wait); + + if (ret) + return 0; + return 1; +} + +/* + * Reserve metadata space for delalloc. + */ +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 used; + u64 alloc_target; + int flushed = 0; + int force_delalloc; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); +again: + spin_lock(&meta_sinfo->lock); + + force_delalloc = meta_sinfo->force_delalloc; + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + + if (!flushed) + meta_sinfo->bytes_delalloc += num_bytes; + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + if (used > meta_sinfo->total_bytes) { + flushed++; + + if (flushed == 1) { + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + flushed++; + } else { + spin_unlock(&meta_sinfo->lock); + } + + if (flushed == 2) { + filemap_flush(inode->i_mapping); + goto again; + } else if (flushed == 3) { + flush_delalloc(root, meta_sinfo); goto again; } + spin_lock(&meta_sinfo->lock); + meta_sinfo->bytes_delalloc -= num_bytes; spin_unlock(&meta_sinfo->lock); + printk(KERN_ERR "enospc, has %d, reserved %d\n", + BTRFS_I(inode)->outstanding_extents, + BTRFS_I(inode)->reserved_extents); + dump_space_info(meta_sinfo, 0, 0); + return -ENOSPC; + } - if (!committed) { - committed = 1; - trans = btrfs_join_transaction(root, 1); - if (!trans) - return -ENOMEM; - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; + BTRFS_I(inode)->reserved_extents++; + check_force_delalloc(meta_sinfo); + spin_unlock(&meta_sinfo->lock); + + if (!flushed && force_delalloc) + filemap_flush(inode->i_mapping); + + return 0; +} + +/* + * unreserve num_items number of items worth of metadata space. This needs to + * be paired with btrfs_reserve_metadata_space. + * + * NOTE: if you have the option, run this _AFTER_ you do a + * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref + * oprations which will result in more used metadata, so we want to make sure we + * can do that without issue. + */ +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 alloc_target; + bool bug = false; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root, num_items); + + spin_lock(&meta_sinfo->lock); + if (meta_sinfo->bytes_may_use < num_bytes) { + bug = true; + meta_sinfo->bytes_may_use = 0; + } else { + meta_sinfo->bytes_may_use -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + + BUG_ON(bug); + + return 0; +} + +/* + * Reserve some metadata space for use. We'll calculate the worste case number + * of bytes that would be needed to modify num_items number of items. If we + * have space, fantastic, if not, you get -ENOSPC. Please call + * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of + * items you reserved, since whatever metadata you needed should have already + * been allocated. + * + * This will commit the transaction to make more space if we don't have enough + * metadata space. THe only time we don't do this is if we're reserving space + * inside of a transaction, then we will just return -ENOSPC and it is the + * callers responsibility to handle it properly. + */ +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 used; + u64 alloc_target; + int retries = 0; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root, num_items); +again: + spin_lock(&meta_sinfo->lock); + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + + if (!retries) + meta_sinfo->bytes_may_use += num_bytes; + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + if (used > meta_sinfo->total_bytes) { + retries++; + if (retries == 1) { + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + retries++; + } else { + spin_unlock(&meta_sinfo->lock); + } + + if (retries == 2) { + flush_delalloc(root, meta_sinfo); goto again; } + spin_lock(&meta_sinfo->lock); + meta_sinfo->bytes_may_use -= num_bytes; + spin_unlock(&meta_sinfo->lock); + + dump_space_info(meta_sinfo, 0, 0); return -ENOSPC; } + + check_force_delalloc(meta_sinfo); spin_unlock(&meta_sinfo->lock); return 0; @@ -2764,13 +3225,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); data_sinfo = BTRFS_I(inode)->space_info; + if (!data_sinfo) + goto alloc; + again: /* make sure we have enough space to handle the data first */ spin_lock(&data_sinfo->lock); if (data_sinfo->total_bytes - data_sinfo->bytes_used - data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - - data_sinfo->bytes_may_use < bytes) { + data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { struct btrfs_trans_handle *trans; /* @@ -2782,7 +3246,7 @@ again: data_sinfo->force_alloc = 1; spin_unlock(&data_sinfo->lock); - +alloc: alloc_target = btrfs_get_alloc_profile(root, 1); trans = btrfs_start_transaction(root, 1); if (!trans) @@ -2794,12 +3258,17 @@ again: btrfs_end_transaction(trans, root); if (ret) return ret; + + if (!data_sinfo) { + btrfs_set_inode_space_info(root, inode); + data_sinfo = BTRFS_I(inode)->space_info; + } goto again; } spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ - if (!committed) { + if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); if (!trans) @@ -2827,7 +3296,7 @@ again: BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); - return btrfs_check_metadata_free_space(root); + return 0; } /* @@ -2926,17 +3395,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(!space_info); spin_lock(&space_info->lock); - if (space_info->force_alloc) { + if (space_info->force_alloc) force = 1; - space_info->force_alloc = 0; - } if (space_info->full) { spin_unlock(&space_info->lock); goto out; } thresh = space_info->total_bytes - space_info->bytes_readonly; - thresh = div_factor(thresh, 6); + thresh = div_factor(thresh, 8); if (!force && (space_info->bytes_used + space_info->bytes_pinned + space_info->bytes_reserved + alloc_bytes) < thresh) { @@ -2950,7 +3417,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, * we keep a reasonable number of metadata chunks allocated in the * FS as well. */ - if (flags & BTRFS_BLOCK_GROUP_DATA) { + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { fs_info->data_chunk_allocations++; if (!(fs_info->data_chunk_allocations % fs_info->metadata_ratio)) @@ -2958,8 +3425,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } ret = btrfs_alloc_chunk(trans, extent_root, flags); + spin_lock(&space_info->lock); if (ret) space_info->full = 1; + space_info->force_alloc = 0; + spin_unlock(&space_info->lock); out: mutex_unlock(&extent_root->fs_info->chunk_mutex); return ret; @@ -3008,10 +3478,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { old_val += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->reserved -= num_bytes; cache->space_info->bytes_used += num_bytes; + cache->space_info->bytes_reserved -= num_bytes; if (cache->ro) cache->space_info->bytes_readonly -= num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } else { @@ -3056,127 +3528,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return bytenr; } -int btrfs_update_pinned_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int pin) +/* + * this function must be called within transaction + */ +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved) { - u64 len; - struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache; - if (pin) - set_extent_dirty(&fs_info->pinned_extents, - bytenr, bytenr + num - 1, GFP_NOFS); - - while (num > 0) { - cache = btrfs_lookup_block_group(fs_info, bytenr); - BUG_ON(!cache); - len = min(num, cache->key.offset - - (bytenr - cache->key.objectid)); - if (pin) { - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += len; - cache->space_info->bytes_pinned += len; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - fs_info->total_pinned += len; - } else { - int unpin = 0; + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); - /* - * in order to not race with the block group caching, we - * only want to unpin the extent if we are cached. If - * we aren't cached, we want to start async caching this - * block group so we can free the extent the next time - * around. - */ - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - unpin = (cache->cached == BTRFS_CACHE_FINISHED); - if (likely(unpin)) { - cache->pinned -= len; - cache->space_info->bytes_pinned -= len; - fs_info->total_pinned -= len; - } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + if (reserved) { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); - if (likely(unpin)) - clear_extent_dirty(&fs_info->pinned_extents, - bytenr, bytenr + len -1, - GFP_NOFS); - else - cache_block_group(cache); + btrfs_put_block_group(cache); - if (unpin) - btrfs_add_free_space(cache, bytenr, len); - } - btrfs_put_block_group(cache); - bytenr += len; - num -= len; + set_extent_dirty(fs_info->pinned_extents, + bytenr, bytenr + num_bytes - 1, GFP_NOFS); + return 0; +} + +static int update_reserved_extents(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve) +{ + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + cache->reserved += num_bytes; + cache->space_info->bytes_reserved += num_bytes; + } else { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); return 0; } -static int update_reserved_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int reserve) +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - u64 len; - struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_caching_control *next; + struct btrfs_caching_control *caching_ctl; + struct btrfs_block_group_cache *cache; - while (num > 0) { - cache = btrfs_lookup_block_group(fs_info, bytenr); - BUG_ON(!cache); - len = min(num, cache->key.offset - - (bytenr - cache->key.objectid)); + down_write(&fs_info->extent_commit_sem); - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - cache->reserved += len; - cache->space_info->bytes_reserved += len; + list_for_each_entry_safe(caching_ctl, next, + &fs_info->caching_block_groups, list) { + cache = caching_ctl->block_group; + if (block_group_cache_done(cache)) { + cache->last_byte_to_unpin = (u64)-1; + list_del_init(&caching_ctl->list); + put_caching_control(caching_ctl); } else { - cache->reserved -= len; - cache->space_info->bytes_reserved -= len; + cache->last_byte_to_unpin = caching_ctl->progress; } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - btrfs_put_block_group(cache); - bytenr += len; - num -= len; } + + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + fs_info->pinned_extents = &fs_info->freed_extents[1]; + else + fs_info->pinned_extents = &fs_info->freed_extents[0]; + + up_write(&fs_info->extent_commit_sem); return 0; } -int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { - u64 last = 0; - u64 start; - u64 end; - struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; - int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 len; - while (1) { - ret = find_first_extent_bit(pinned_extents, last, - &start, &end, EXTENT_DIRTY); - if (ret) - break; + while (start <= end) { + if (!cache || + start >= cache->key.objectid + cache->key.offset) { + if (cache) + btrfs_put_block_group(cache); + cache = btrfs_lookup_block_group(fs_info, start); + BUG_ON(!cache); + } - set_extent_dirty(copy, start, end, GFP_NOFS); - last = end + 1; + len = cache->key.objectid + cache->key.offset - start; + len = min(len, end + 1 - start); + + if (start < cache->last_byte_to_unpin) { + len = min(len, cache->last_byte_to_unpin - start); + btrfs_add_free_space(cache, start, len); + } + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + start += len; } + + if (cache) + btrfs_put_block_group(cache); return 0; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_io_tree *unpin) + struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *unpin; u64 start; u64 end; int ret; + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + unpin = &fs_info->freed_extents[1]; + else + unpin = &fs_info->freed_extents[0]; + while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); @@ -3185,10 +3666,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ret = btrfs_discard_extent(root, start, end + 1 - start); - /* unlocks the pinned mutex */ - btrfs_update_pinned_extents(root, start, end + 1 - start, 0); clear_extent_dirty(unpin, start, end, GFP_NOFS); - + unpin_extent_range(root, start, end); cond_resched(); } @@ -3198,7 +3677,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, static int pin_down_bytes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - u64 bytenr, u64 num_bytes, int is_data, + u64 bytenr, u64 num_bytes, + int is_data, int reserved, struct extent_buffer **must_clean) { int err = 0; @@ -3207,6 +3687,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, if (is_data) goto pinit; + /* + * discard is sloooow, and so triggering discards on + * individual btree blocks isn't a good plan. Just + * pin everything in discard mode. + */ + if (btrfs_test_opt(root, DISCARD)) + goto pinit; + buf = btrfs_find_tree_block(root, bytenr, num_bytes); if (!buf) goto pinit; @@ -3230,15 +3718,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, } free_extent_buffer(buf); pinit: - btrfs_set_path_blocking(path); + if (path) + btrfs_set_path_blocking(path); /* unlocks the pinned mutex */ - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + btrfs_pin_extent(root, bytenr, num_bytes, reserved); BUG_ON(err < 0); return 0; } - static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -3412,7 +3900,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } ret = pin_down_bytes(trans, root, path, bytenr, - num_bytes, is_data, &must_clean); + num_bytes, is_data, 0, &must_clean); if (ret > 0) mark_free = 1; BUG_ON(ret < 0); @@ -3543,8 +4031,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); /* unlocks the pinned mutex */ - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); - update_reserved_extents(root, bytenr, num_bytes, 0); + btrfs_pin_extent(root, bytenr, num_bytes, 1); ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, @@ -3584,19 +4071,33 @@ static noinline int wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, u64 num_bytes) { + struct btrfs_caching_control *caching_ctl; DEFINE_WAIT(wait); - prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); - - if (block_group_cache_done(cache)) { - finish_wait(&cache->caching_q, &wait); + caching_ctl = get_caching_control(cache); + if (!caching_ctl) return 0; - } - schedule(); - finish_wait(&cache->caching_q, &wait); - wait_event(cache->caching_q, block_group_cache_done(cache) || + wait_event(caching_ctl->wait, block_group_cache_done(cache) || (cache->free_space >= num_bytes)); + + put_caching_control(caching_ctl); + return 0; +} + +static noinline int +wait_block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + + caching_ctl = get_caching_control(cache); + if (!caching_ctl) + return 0; + + wait_event(caching_ctl->wait, block_group_cache_done(cache)); + + put_caching_control(caching_ctl); return 0; } @@ -3634,6 +4135,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int last_ptr_loop = 0; int loop = 0; bool found_uncached_bg = false; + bool failed_cluster_refill = false; + bool failed_alloc = false; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -3731,7 +4234,16 @@ have_block_group: if (unlikely(block_group->ro)) goto loop; - if (last_ptr) { + /* + * Ok we want to try and use the cluster allocator, so lets look + * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will + * have tried the cluster allocator plenty of times at this + * point and not have found anything, so we are likely way too + * fragmented for the clustering stuff to find anything, so lets + * just skip it and let the allocator find whatever block it can + * find + */ + if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { /* * the refill lock keeps out other * people trying to start a new cluster @@ -3806,9 +4318,11 @@ refill_cluster: spin_unlock(&last_ptr->refill_lock); goto checks; } - } else if (!cached && loop > LOOP_CACHING_NOWAIT) { + } else if (!cached && loop > LOOP_CACHING_NOWAIT + && !failed_cluster_refill) { spin_unlock(&last_ptr->refill_lock); + failed_cluster_refill = true; wait_block_group_cache_progress(block_group, num_bytes + empty_cluster + empty_size); goto have_block_group; @@ -3820,25 +4334,30 @@ refill_cluster: * cluster. Free the cluster we've been trying * to use, and go to the next block group */ - if (loop < LOOP_NO_EMPTY_SIZE) { - btrfs_return_cluster_to_free_space(NULL, - last_ptr); - spin_unlock(&last_ptr->refill_lock); - goto loop; - } + btrfs_return_cluster_to_free_space(NULL, last_ptr); spin_unlock(&last_ptr->refill_lock); + goto loop; } offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); - if (!offset && (cached || (!cached && - loop == LOOP_CACHING_NOWAIT))) { - goto loop; - } else if (!offset && (!cached && - loop > LOOP_CACHING_NOWAIT)) { + /* + * If we didn't find a chunk, and we haven't failed on this + * block group before, and this block group is in the middle of + * caching and we are ok with waiting, then go ahead and wait + * for progress to be made, and set failed_alloc to true. + * + * If failed_alloc is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !failed_alloc && !cached && + loop > LOOP_CACHING_NOWAIT) { wait_block_group_cache_progress(block_group, - num_bytes + empty_size); + num_bytes + empty_size); + failed_alloc = true; goto have_block_group; + } else if (!offset) { + goto loop; } checks: search_start = stripe_align(root, offset); @@ -3880,9 +4399,13 @@ checks: search_start - offset); BUG_ON(offset > search_start); + update_reserved_extents(block_group, num_bytes, 1); + /* we are all good, lets return */ break; loop: + failed_cluster_refill = false; + failed_alloc = false; btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); @@ -3940,21 +4463,32 @@ loop: return ret; } -static void dump_space_info(struct btrfs_space_info *info, u64 bytes) +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) { struct btrfs_block_group_cache *cache; + spin_lock(&info->lock); printk(KERN_INFO "space_info has %llu free, is %sfull\n", (unsigned long long)(info->total_bytes - info->bytes_used - - info->bytes_pinned - info->bytes_reserved), + info->bytes_pinned - info->bytes_reserved - + info->bytes_super), (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu\n", + " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" + "\n", (unsigned long long)info->total_bytes, (unsigned long long)info->bytes_pinned, (unsigned long long)info->bytes_delalloc, (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_used); + (unsigned long long)info->bytes_used, + (unsigned long long)info->bytes_root, + (unsigned long long)info->bytes_super, + (unsigned long long)info->bytes_reserved); + spin_unlock(&info->lock); + + if (!dump_block_groups) + return; down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { @@ -3972,12 +4506,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) up_read(&info->groups_sem); } -static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 min_alloc_size, - u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data) +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) { int ret; u64 search_start = 0; @@ -4022,7 +4556,7 @@ again: printk(KERN_ERR "btrfs allocation failed flags %llu, " "wanted %llu\n", (unsigned long long)data, (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes); + dump_space_info(sinfo, num_bytes, 1); } return ret; @@ -4043,25 +4577,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len); btrfs_add_free_space(cache, start, len); + update_reserved_extents(cache, len, 0); btrfs_put_block_group(cache); - update_reserved_extents(root, start, len, 0); - - return ret; -} - -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 min_alloc_size, - u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data) -{ - int ret; - ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, - empty_size, hint_byte, search_end, ins, - data); - if (!ret) - update_reserved_extents(root, ins->objectid, ins->offset, 1); return ret; } @@ -4222,15 +4739,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + u64 start = ins->objectid; + u64 num_bytes = ins->offset; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); cache_block_group(block_group); - wait_event(block_group->caching_q, - block_group_cache_done(block_group)); + caching_ctl = get_caching_control(block_group); - ret = btrfs_remove_free_space(block_group, ins->objectid, - ins->offset); - BUG_ON(ret); + if (!caching_ctl) { + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + BUG_ON(ret); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + BUG_ON(ret); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + BUG_ON(ret); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + BUG_ON(ret); + + start = caching_ctl->progress; + num_bytes = ins->objectid + ins->offset - + caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + BUG_ON(ret); + } + + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + + update_reserved_extents(block_group, ins->offset, 1); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); @@ -4254,9 +4802,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, int ret; u64 flags = 0; - ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, - empty_size, hint_byte, search_end, - ins, 0); + ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, + empty_size, hint_byte, search_end, + ins, 0); if (ret) return ret; @@ -4267,7 +4815,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, } else BUG_ON(parent > 0); - update_reserved_extents(root, ins->objectid, ins->offset, 1); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); @@ -4346,452 +4893,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return buf; } -#if 0 -int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *leaf) -{ - u64 disk_bytenr; - u64 num_bytes; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - u32 nritems; - int i; - int ret; - - BUG_ON(!btrfs_is_leaf(leaf)); - nritems = btrfs_header_nritems(leaf); - - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &key, i); - - /* only extents have references, skip everything else */ - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - - /* inline extents live in the btree, they don't have refs */ - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - - /* holes don't have refs */ - if (disk_bytenr == 0) - continue; - - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, - leaf->start, 0, key.objectid, 0); - BUG_ON(ret); - } - return 0; -} - -static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_leaf_ref *ref) -{ - int i; - int ret; - struct btrfs_extent_info *info; - struct refsort *sorted; - - if (ref->nritems == 0) - return 0; - - sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS); - for (i = 0; i < ref->nritems; i++) { - sorted[i].bytenr = ref->extents[i].bytenr; - sorted[i].slot = i; - } - sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL); - - /* - * the items in the ref were sorted when the ref was inserted - * into the ref cache, so this is already in order - */ - for (i = 0; i < ref->nritems; i++) { - info = ref->extents + sorted[i].slot; - ret = btrfs_free_extent(trans, root, info->bytenr, - info->num_bytes, ref->bytenr, - ref->owner, ref->generation, - info->objectid, 0); - - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - - BUG_ON(ret); - info++; - } - - kfree(sorted); - return 0; -} - - -static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 start, - u64 len, u32 *refs) -{ - int ret; - - ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); - BUG_ON(ret); - -#if 0 /* some debugging code in case we see problems here */ - /* if the refs count is one, it won't get increased again. But - * if the ref count is > 1, someone may be decreasing it at - * the same time we are. - */ - if (*refs != 1) { - struct extent_buffer *eb = NULL; - eb = btrfs_find_create_tree_block(root, start, len); - if (eb) - btrfs_tree_lock(eb); - - mutex_lock(&root->fs_info->alloc_mutex); - ret = lookup_extent_ref(NULL, root, start, len, refs); - BUG_ON(ret); - mutex_unlock(&root->fs_info->alloc_mutex); - - if (eb) { - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - if (*refs == 1) { - printk(KERN_ERR "btrfs block %llu went down to one " - "during drop_snap\n", (unsigned long long)start); - } - - } -#endif - - cond_resched(); - return ret; -} +struct walk_control { + u64 refs[BTRFS_MAX_LEVEL]; + u64 flags[BTRFS_MAX_LEVEL]; + struct btrfs_key update_progress; + int stage; + int level; + int shared_level; + int update_ref; + int keep_locks; + int reada_slot; + int reada_count; +}; +#define DROP_REFERENCE 1 +#define UPDATE_BACKREF 2 -/* - * this is used while deleting old snapshots, and it drops the refs - * on a whole subtree starting from a level 1 node. - * - * The idea is to sort all the leaf pointers, and then drop the - * ref on all the leaves in order. Most of the time the leaves - * will have ref cache entries, so no leaf IOs will be required to - * find the extents they have references on. - * - * For each leaf, any references it has are also dropped in order - * - * This ends up dropping the references in something close to optimal - * order for reading and modifying the extent allocation tree. - */ -static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path) +static noinline void reada_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct walk_control *wc, + struct btrfs_path *path) { u64 bytenr; - u64 root_owner; - u64 root_gen; - struct extent_buffer *eb = path->nodes[1]; - struct extent_buffer *leaf; - struct btrfs_leaf_ref *ref; - struct refsort *sorted = NULL; - int nritems = btrfs_header_nritems(eb); + u64 generation; + u64 refs; + u64 flags; + u64 last = 0; + u32 nritems; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *eb; int ret; - int i; - int refi = 0; - int slot = path->slots[1]; - u32 blocksize = btrfs_level_size(root, 0); - u32 refs; - - if (nritems == 0) - goto out; - - root_owner = btrfs_header_owner(eb); - root_gen = btrfs_header_generation(eb); - sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); + int slot; + int nread = 0; - /* - * step one, sort all the leaf pointers so we don't scribble - * randomly into the extent allocation tree - */ - for (i = slot; i < nritems; i++) { - sorted[refi].bytenr = btrfs_node_blockptr(eb, i); - sorted[refi].slot = i; - refi++; + if (path->slots[wc->level] < wc->reada_slot) { + wc->reada_count = wc->reada_count * 2 / 3; + wc->reada_count = max(wc->reada_count, 2); + } else { + wc->reada_count = wc->reada_count * 3 / 2; + wc->reada_count = min_t(int, wc->reada_count, + BTRFS_NODEPTRS_PER_BLOCK(root)); } - /* - * nritems won't be zero, but if we're picking up drop_snapshot - * after a crash, slot might be > 0, so double check things - * just in case. - */ - if (refi == 0) - goto out; + eb = path->nodes[wc->level]; + nritems = btrfs_header_nritems(eb); + blocksize = btrfs_level_size(root, wc->level - 1); - sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + for (slot = path->slots[wc->level]; slot < nritems; slot++) { + if (nread >= wc->reada_count) + break; - /* - * the first loop frees everything the leaves point to - */ - for (i = 0; i < refi; i++) { - u64 ptr_gen; + cond_resched(); + bytenr = btrfs_node_blockptr(eb, slot); + generation = btrfs_node_ptr_generation(eb, slot); - bytenr = sorted[i].bytenr; + if (slot == path->slots[wc->level]) + goto reada; - /* - * check the reference count on this leaf. If it is > 1 - * we just decrement it below and don't update any - * of the refs the leaf points to. - */ - ret = drop_snap_lookup_refcount(trans, root, bytenr, - blocksize, &refs); - BUG_ON(ret); - if (refs != 1) + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) continue; - ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); - - /* - * the leaf only had one reference, which means the - * only thing pointing to this leaf is the snapshot - * we're deleting. It isn't possible for the reference - * count to increase again later - * - * The reference cache is checked for the leaf, - * and if found we'll be able to drop any refs held by - * the leaf without needing to read it in. - */ - ref = btrfs_lookup_leaf_ref(root, bytenr); - if (ref && ref->generation != ptr_gen) { - btrfs_free_leaf_ref(root, ref); - ref = NULL; - } - if (ref) { - ret = cache_drop_leaf_ref(trans, root, ref); - BUG_ON(ret); - btrfs_remove_leaf_ref(root, ref); - btrfs_free_leaf_ref(root, ref); - } else { - /* - * the leaf wasn't in the reference cache, so - * we have to read it. - */ - leaf = read_tree_block(root, bytenr, blocksize, - ptr_gen); - ret = btrfs_drop_leaf_ref(trans, root, leaf); - BUG_ON(ret); - free_extent_buffer(leaf); - } - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - } - - /* - * run through the loop again to free the refs on the leaves. - * This is faster than doing it in the loop above because - * the leaves are likely to be clustered together. We end up - * working in nice chunks on the extent allocation tree. - */ - for (i = 0; i < refi; i++) { - bytenr = sorted[i].bytenr; - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, eb->start, - root_owner, root_gen, 0, 1); + /* We don't lock the tree block, it's OK to be racy here */ + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &refs, &flags); BUG_ON(ret); + BUG_ON(refs == 0); - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - } -out: - kfree(sorted); - - /* - * update the path to show we've processed the entire level 1 - * node. This will get saved into the root's drop_snapshot_progress - * field so these drops are not repeated again if this transaction - * commits. - */ - path->slots[1] = nritems; - return 0; -} - -/* - * helper function for drop_snapshot, this walks down the tree dropping ref - * counts as it goes. - */ -static noinline int walk_down_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level) -{ - u64 root_owner; - u64 root_gen; - u64 bytenr; - u64 ptr_gen; - struct extent_buffer *next; - struct extent_buffer *cur; - struct extent_buffer *parent; - u32 blocksize; - int ret; - u32 refs; - - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, - path->nodes[*level]->len, &refs); - BUG_ON(ret); - if (refs > 1) - goto out; - - /* - * walk down to the last node level and free all the leaves - */ - while (*level >= 0) { - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - cur = path->nodes[*level]; - - if (btrfs_header_level(cur) != *level) - WARN_ON(1); - - if (path->slots[*level] >= - btrfs_header_nritems(cur)) - break; + if (wc->stage == DROP_REFERENCE) { + if (refs == 1) + goto reada; - /* the new code goes down to level 1 and does all the - * leaves pointed to that node in bulk. So, this check - * for level 0 will always be false. - * - * But, the disk format allows the drop_snapshot_progress - * field in the root to leave things in a state where - * a leaf will need cleaning up here. If someone crashes - * with the old code and then boots with the new code, - * we might find a leaf here. - */ - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, cur); - BUG_ON(ret); - break; + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + if (!wc->update_ref || + generation <= root->root_key.offset) + continue; + btrfs_node_key_to_cpu(eb, &key, slot); + ret = btrfs_comp_cpu_keys(&key, + &wc->update_progress); + if (ret < 0) + continue; + } else { + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; } - - /* - * once we get to level one, process the whole node - * at once, including everything below it. - */ - if (*level == 1) { - ret = drop_level_one_refs(trans, root, path); - BUG_ON(ret); +reada: + ret = readahead_tree_block(root, bytenr, blocksize, + generation); + if (ret) break; - } - - bytenr = btrfs_node_blockptr(cur, path->slots[*level]); - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - blocksize = btrfs_level_size(root, *level - 1); - - ret = drop_snap_lookup_refcount(trans, root, bytenr, - blocksize, &refs); - BUG_ON(ret); - - /* - * if there is more than one reference, we don't need - * to read that node to drop any references it has. We - * just drop the ref we hold on that node and move on to the - * next slot in this level. - */ - if (refs != 1) { - parent = path->nodes[*level]; - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - path->slots[*level]++; - - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - root_owner, root_gen, - *level - 1, 1); - BUG_ON(ret); - - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - - continue; - } - - /* - * we need to keep freeing things in the next level down. - * read the block and loop around to process it - */ - next = read_tree_block(root, bytenr, blocksize, ptr_gen); - WARN_ON(*level <= 0); - if (path->nodes[*level-1]) - free_extent_buffer(path->nodes[*level-1]); - path->nodes[*level-1] = next; - *level = btrfs_header_level(next); - path->slots[*level] = 0; - cond_resched(); + last = bytenr + blocksize; + nread++; } -out: - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - - if (path->nodes[*level] == root->node) { - parent = path->nodes[*level]; - bytenr = path->nodes[*level]->start; - } else { - parent = path->nodes[*level + 1]; - bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); - } - - blocksize = btrfs_level_size(root, *level); - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - - /* - * cleanup and free the reference on the last node - * we processed - */ - ret = btrfs_free_extent(trans, root, bytenr, blocksize, - parent->start, root_owner, root_gen, - *level, 1); - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - - *level += 1; - BUG_ON(ret); - - cond_resched(); - return 0; + wc->reada_slot = slot; } -#endif - -struct walk_control { - u64 refs[BTRFS_MAX_LEVEL]; - u64 flags[BTRFS_MAX_LEVEL]; - struct btrfs_key update_progress; - int stage; - int level; - int shared_level; - int update_ref; - int keep_locks; -}; - -#define DROP_REFERENCE 1 -#define UPDATE_BACKREF 2 /* * hepler to process tree block while walking down the tree. * - * when wc->stage == DROP_REFERENCE, this function checks - * reference count of the block. if the block is shared and - * we need update back refs for the subtree rooted at the - * block, this function changes wc->stage to UPDATE_BACKREF - * * when wc->stage == UPDATE_BACKREF, this function updates * back refs for pointers in the block. * @@ -4800,11 +5003,10 @@ struct walk_control { static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc) + struct walk_control *wc, int lookup_info) { int level = wc->level; struct extent_buffer *eb = path->nodes[level]; - struct btrfs_key key; u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; @@ -4816,8 +5018,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, * when reference count of tree block is 1, it won't increase * again. once full backref flag is set, we never clear it. */ - if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || - (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { + if (lookup_info && + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { BUG_ON(!path->locks[level]); ret = btrfs_lookup_extent_info(trans, root, eb->start, eb->len, @@ -4827,21 +5030,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(wc->refs[level] == 0); } - if (wc->stage == DROP_REFERENCE && - wc->update_ref && wc->refs[level] > 1) { - BUG_ON(eb == root->node); - BUG_ON(path->slots[level] > 0); - if (level == 0) - btrfs_item_key_to_cpu(eb, &key, path->slots[level]); - else - btrfs_node_key_to_cpu(eb, &key, path->slots[level]); - if (btrfs_header_owner(eb) == root->root_key.objectid && - btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) { - wc->stage = UPDATE_BACKREF; - wc->shared_level = level; - } - } - if (wc->stage == DROP_REFERENCE) { if (wc->refs[level] > 1) return 1; @@ -4878,6 +5066,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, } /* + * hepler to process tree block pointer. + * + * when wc->stage == DROP_REFERENCE, this function checks + * reference count of the block pointed to. if the block + * is shared and we need update back refs for the subtree + * rooted at the block, this function changes wc->stage to + * UPDATE_BACKREF. if the block is shared and there is no + * need to update back, this function drops the reference + * to the block. + * + * NOTE: return value 1 means we should stop walking down. + */ +static noinline int do_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int *lookup_info) +{ + u64 bytenr; + u64 generation; + u64 parent; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *next; + int level = wc->level; + int reada = 0; + int ret = 0; + + generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + /* + * if the lower level block was created before the snapshot + * was created, we know there is no need to update back refs + * for the subtree + */ + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) { + *lookup_info = 1; + return 1; + } + + bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + blocksize = btrfs_level_size(root, level - 1); + + next = btrfs_find_tree_block(root, bytenr, blocksize); + if (!next) { + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + reada = 1; + } + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &wc->refs[level - 1], + &wc->flags[level - 1]); + BUG_ON(ret); + BUG_ON(wc->refs[level - 1] == 0); + *lookup_info = 0; + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level - 1] > 1) { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + + if (!wc->update_ref || + generation <= root->root_key.offset) + goto skip; + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); + if (ret < 0) + goto skip; + + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; + } + } else { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + } + + if (!btrfs_buffer_uptodate(next, generation)) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + next = NULL; + *lookup_info = 1; + } + + if (!next) { + if (reada && level == 1) + reada_walk_down(trans, root, wc, path); + next = read_tree_block(root, bytenr, blocksize, generation); + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + } + + level--; + BUG_ON(level != btrfs_header_level(next)); + path->nodes[level] = next; + path->slots[level] = 0; + path->locks[level] = 1; + wc->level = level; + if (wc->level == 1) + wc->reada_slot = 0; + return 0; +skip: + wc->refs[level - 1] = 0; + wc->flags[level - 1] = 0; + if (wc->stage == DROP_REFERENCE) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level])); + parent = 0; + } + + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, + root->root_key.objectid, level - 1, 0); + BUG_ON(ret); + } + btrfs_tree_unlock(next); + free_extent_buffer(next); + *lookup_info = 1; + return 1; +} + +/* * hepler to process tree block while walking up the tree. * * when wc->stage == DROP_REFERENCE, this function drops @@ -4904,7 +5222,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (level < wc->shared_level) goto out; - BUG_ON(wc->refs[level] <= 1); ret = find_next_key(path, level + 1, &wc->update_progress); if (ret > 0) wc->update_ref = 0; @@ -4935,8 +5252,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, path->locks[level] = 0; return 1; } - } else { - BUG_ON(level != 0); } } @@ -4989,39 +5304,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct walk_control *wc) { - struct extent_buffer *next; - struct extent_buffer *cur; - u64 bytenr; - u64 ptr_gen; - u32 blocksize; int level = wc->level; + int lookup_info = 1; int ret; while (level >= 0) { - cur = path->nodes[level]; - BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); + if (path->slots[level] >= + btrfs_header_nritems(path->nodes[level])) + break; - ret = walk_down_proc(trans, root, path, wc); + ret = walk_down_proc(trans, root, path, wc, lookup_info); if (ret > 0) break; if (level == 0) break; - bytenr = btrfs_node_blockptr(cur, path->slots[level]); - blocksize = btrfs_level_size(root, level - 1); - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); - - next = read_tree_block(root, bytenr, blocksize, ptr_gen); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - - level--; - BUG_ON(level != btrfs_header_level(next)); - path->nodes[level] = next; - path->slots[level] = 0; - path->locks[level] = 1; - wc->level = level; + ret = do_walk_down(trans, root, path, wc, &lookup_info); + if (ret > 0) { + path->slots[level]++; + continue; + } + level = wc->level; } return 0; } @@ -5111,9 +5415,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) err = ret; goto out; } - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key))); + WARN_ON(ret > 0); /* * unlock our path, this is safe because only this @@ -5148,6 +5450,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { ret = walk_down_tree(trans, root, path, wc); @@ -5200,9 +5503,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) ret = btrfs_del_root(trans, tree_root, &root->root_key); BUG_ON(ret); - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - kfree(root); + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_find_last_root(tree_root, root->root_key.objectid, + NULL, NULL); + BUG_ON(ret < 0); + if (ret > 0) { + ret = btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); + BUG_ON(ret); + } + } + + if (root->in_radix) { + btrfs_free_fs_root(tree_root->fs_info, root); + } else { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); + } out: btrfs_end_transaction(trans, tree_root); kfree(wc); @@ -5254,6 +5572,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { wret = walk_down_tree(trans, root, path, wc); @@ -5396,9 +5715,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode, lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); while (1) { int ret; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -6841,287 +7160,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root, return 0; } -#if 0 -static int __insert_orphan_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 size) -{ - struct btrfs_path *path; - struct btrfs_inode_item *item; - struct extent_buffer *leaf; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->leave_spinning = 1; - ret = btrfs_insert_empty_inode(trans, root, path, objectid); - if (ret) - goto out; - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); - btrfs_set_inode_generation(leaf, item, 1); - btrfs_set_inode_size(leaf, item, size); - btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(root, path); -out: - btrfs_free_path(path); - return ret; -} - -static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *group) +/* + * checks to see if its even possible to relocate this block group. + * + * @return - -1 if it's not a good idea to relocate this block group, 0 if its + * ok to go ahead and try. + */ +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) { - struct inode *inode = NULL; - struct btrfs_trans_handle *trans; - struct btrfs_root *root; - struct btrfs_key root_key; - u64 objectid = BTRFS_FIRST_FREE_OBJECTID; - int err = 0; + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_device *device; + int full = 0; + int ret = 0; - root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(fs_info, &root_key); - if (IS_ERR(root)) - return ERR_CAST(root); + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + /* odd, couldn't find the block group, leave it alone */ + if (!block_group) + return -1; - err = btrfs_find_free_objectid(trans, root, objectid, &objectid); - if (err) + /* no bytes used, we're good */ + if (!btrfs_block_group_used(&block_group->item)) goto out; - err = __insert_orphan_inode(trans, root, objectid, group->key.offset); - BUG_ON(err); - - err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, - group->key.offset, 0, group->key.offset, - 0, 0, 0); - BUG_ON(err); - - inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - BTRFS_I(inode)->location.objectid = objectid; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - BUG_ON(is_bad_inode(inode)); - } else { - BUG_ON(1); - } - BTRFS_I(inode)->index_cnt = group->key.objectid; - - err = btrfs_orphan_add(trans, inode); -out: - btrfs_end_transaction(trans, root); - if (err) { - if (inode) - iput(inode); - inode = ERR_PTR(err); - } - return inode; -} - -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) -{ - - struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; - struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct list_head list; - size_t offset; - int ret; - u64 disk_bytenr; - - INIT_LIST_HEAD(&list); - - ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->len != len); - - disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; - ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, - disk_bytenr + len - 1, &list); - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del_init(&sums->list); - - sector_sum = sums->sums; - sums->bytenr = ordered->start; + space_info = block_group->space_info; + spin_lock(&space_info->lock); - offset = 0; - while (offset < sums->len) { - sector_sum->bytenr += ordered->start - disk_bytenr; - sector_sum++; - offset += root->sectorsize; - } + full = space_info->full; - btrfs_add_ordered_sum(inode, ordered, sums); + /* + * if this is the last block group we have in this space, we can't + * relocate it unless we're able to allocate a new chunk below. + * + * Otherwise, we need to make sure we have room in the space to handle + * all of the extents from this block group. If we can, we're good + */ + if ((space_info->total_bytes != block_group->key.offset) && + (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + btrfs_block_group_used(&block_group->item) < + space_info->total_bytes)) { + spin_unlock(&space_info->lock); + goto out; } - btrfs_put_ordered_extent(ordered); - return 0; -} - -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) -{ - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct btrfs_fs_info *info = root->fs_info; - struct extent_buffer *leaf; - struct inode *reloc_inode; - struct btrfs_block_group_cache *block_group; - struct btrfs_key key; - u64 skipped; - u64 cur_byte; - u64 total_found; - u32 nritems; - int ret; - int progress; - int pass = 0; - - root = root->fs_info->extent_root; - - block_group = btrfs_lookup_block_group(info, group_start); - BUG_ON(!block_group); - - printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", - (unsigned long long)block_group->key.objectid, - (unsigned long long)block_group->flags); - - path = btrfs_alloc_path(); - BUG_ON(!path); - - reloc_inode = create_reloc_inode(info, block_group); - BUG_ON(IS_ERR(reloc_inode)); - - __alloc_chunk_for_shrink(root, block_group, 1); - set_block_group_readonly(block_group); - - btrfs_start_delalloc_inodes(info->tree_root); - btrfs_wait_ordered_extents(info->tree_root, 0); -again: - skipped = 0; - total_found = 0; - progress = 0; - key.objectid = block_group->key.objectid; - key.offset = 0; - key.type = 0; - cur_byte = key.objectid; - - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); + spin_unlock(&space_info->lock); - mutex_lock(&root->fs_info->cleaner_mutex); - btrfs_clean_old_snapshots(info->tree_root); - btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); - mutex_unlock(&root->fs_info->cleaner_mutex); + /* + * ok we don't have enough space, but maybe we have free space on our + * devices to allocate new chunks for relocation, so loop through our + * alloc devices and guess if we have enough space. However, if we + * were marked as full, then we know there aren't enough chunks, and we + * can just return. + */ + ret = -1; + if (full) + goto out; - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); + mutex_lock(&root->fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + u64 min_free = btrfs_block_group_used(&block_group->item); + u64 dev_offset, max_avail; - while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; -next: - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret == 1) { - ret = 0; + /* + * check to make sure we can actually find a chunk with enough + * space to fit our block group in. + */ + if (device->total_bytes > device->bytes_used + min_free) { + ret = find_free_dev_extent(NULL, device, min_free, + &dev_offset, &max_avail); + if (!ret) break; - } - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.objectid >= block_group->key.objectid + - block_group->key.offset) - break; - - if (progress && need_resched()) { - btrfs_release_path(root, path); - cond_resched(); - progress = 0; - continue; - } - progress = 1; - - if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || - key.objectid + key.offset <= cur_byte) { - path->slots[0]++; - goto next; + ret = -1; } - - total_found++; - cur_byte = key.objectid + key.offset; - btrfs_release_path(root, path); - - __alloc_chunk_for_shrink(root, block_group, 0); - ret = relocate_one_extent(root, path, &key, block_group, - reloc_inode, pass); - BUG_ON(ret < 0); - if (ret > 0) - skipped++; - - key.objectid = cur_byte; - key.type = 0; - key.offset = 0; } - - btrfs_release_path(root, path); - - if (pass == 0) { - btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); - invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); - } - - if (total_found > 0) { - printk(KERN_INFO "btrfs found %llu extents in pass %d\n", - (unsigned long long)total_found, pass); - pass++; - if (total_found == skipped && pass > 2) { - iput(reloc_inode); - reloc_inode = create_reloc_inode(info, block_group); - pass = 0; - } - goto again; - } - - /* delete reloc_inode */ - iput(reloc_inode); - - /* unpin extents in this range */ - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); - - spin_lock(&block_group->lock); - WARN_ON(block_group->pinned > 0); - WARN_ON(block_group->reserved > 0); - WARN_ON(btrfs_block_group_used(&block_group->item) > 0); - spin_unlock(&block_group->lock); - btrfs_put_block_group(block_group); - ret = 0; + mutex_unlock(&root->fs_info->chunk_mutex); out: - btrfs_free_path(path); + btrfs_put_block_group(block_group); return ret; } -#endif static int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key) @@ -7164,8 +7282,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group_cache *block_group; struct btrfs_space_info *space_info; + struct btrfs_caching_control *caching_ctl; struct rb_node *n; + down_write(&info->extent_commit_sem); + while (!list_empty(&info->caching_block_groups)) { + caching_ctl = list_entry(info->caching_block_groups.next, + struct btrfs_caching_control, list); + list_del(&caching_ctl->list); + put_caching_control(caching_ctl); + } + up_write(&info->extent_commit_sem); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group_cache, @@ -7179,8 +7307,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) - wait_event(block_group->caching_q, - block_group_cache_done(block_group)); + wait_block_group_cache_done(block_group); btrfs_remove_free_space_cache(block_group); @@ -7250,7 +7377,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); cache->fs_info = info; - init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -7272,8 +7398,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) cache->flags = btrfs_block_group_flags(&cache->item); cache->sectorsize = root->sectorsize; - remove_sb_from_cache(root, cache); - /* * check for two cases, either we are full, and therefore * don't need to bother with the caching work since we won't @@ -7282,13 +7406,19 @@ int btrfs_read_block_groups(struct btrfs_root *root) * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { + exclude_super_stripes(root, cache); + cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + free_excluded_extents(root, cache); } else if (btrfs_block_group_used(&cache->item) == 0) { + exclude_super_stripes(root, cache); + cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; add_new_free_space(cache, root->fs_info, found_key.objectid, found_key.objectid + found_key.offset); + free_excluded_extents(root, cache); } ret = update_space_info(info, cache->flags, found_key.offset, @@ -7296,6 +7426,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) &space_info); BUG_ON(ret); cache->space_info = space_info; + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_super += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + down_write(&space_info->groups_sem); list_add_tail(&cache->list, &space_info->block_groups); up_write(&space_info->groups_sem); @@ -7345,7 +7479,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); - init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -7354,15 +7487,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - remove_sb_from_cache(root, cache); + exclude_super_stripes(root, cache); add_new_free_space(cache, root->fs_info, chunk_offset, chunk_offset + size); + free_excluded_extents(root, cache); + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); + + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_super += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + down_write(&cache->space_info->groups_sem); list_add_tail(&cache->list, &cache->space_info->block_groups); up_write(&cache->space_info->groups_sem); @@ -7428,8 +7569,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) - wait_event(block_group->caching_q, - block_group_cache_done(block_group)); + wait_block_group_cache_done(block_group); btrfs_remove_free_space_cache(block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6826018..96577e8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree, return NULL; } +static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, + struct extent_state *other) +{ + if (tree->ops && tree->ops->merge_extent_hook) + tree->ops->merge_extent_hook(tree->mapping->host, new, + other); +} + /* * utility function to look for merge candidates inside a given range. * Any extents with matching state are merged together into a single @@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->end == state->start - 1 && other->state == state->state) { + merge_cb(tree, state, other); state->start = other->start; other->tree = NULL; rb_erase(&other->rb_node, &tree->state); @@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->start == state->end + 1 && other->state == state->state) { + merge_cb(tree, state, other); other->start = state->start; state->tree = NULL; rb_erase(&state->rb_node, &tree->state); free_extent_state(state); + state = NULL; } } + return 0; } -static void set_state_cb(struct extent_io_tree *tree, +static int set_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned long bits) { if (tree->ops && tree->ops->set_bit_hook) { - tree->ops->set_bit_hook(tree->mapping->host, state->start, - state->end, state->state, bits); + return tree->ops->set_bit_hook(tree->mapping->host, + state->start, state->end, + state->state, bits); } + + return 0; } static void clear_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned long bits) { - if (tree->ops && tree->ops->clear_bit_hook) { - tree->ops->clear_bit_hook(tree->mapping->host, state->start, - state->end, state->state, bits); - } + if (tree->ops && tree->ops->clear_bit_hook) + tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } /* @@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree, int bits) { struct rb_node *node; + int ret; if (end < start) { printk(KERN_ERR "btrfs end < start %llu %llu\n", @@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree, (unsigned long long)start); WARN_ON(1); } + state->start = start; + state->end = end; + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + if (bits & EXTENT_DIRTY) tree->dirty_bytes += end - start + 1; - set_state_cb(tree, state, bits); state->state |= bits; - state->start = start; - state->end = end; node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; @@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree, return 0; } +static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, + u64 split) +{ + if (tree->ops && tree->ops->split_extent_hook) + return tree->ops->split_extent_hook(tree->mapping->host, + orig, split); + return 0; +} + /* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 'split' indicates an @@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct extent_state *prealloc, u64 split) { struct rb_node *node; + + split_cb(tree, orig, split); + prealloc->start = orig->start; prealloc->end = split - 1; prealloc->state = orig->state; @@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, int bits, int wake, int delete) { - int ret = state->state & bits; + int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; + int ret = state->state & bits_to_clear; if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); - state->state &= ~bits; + state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); if (delete || state->state == 0) { @@ -471,10 +501,14 @@ static int clear_state_bit(struct extent_io_tree *tree, * bits were already set, or zero if none of the bits were already set. */ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, gfp_t mask) + int bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask) { struct extent_state *state; + struct extent_state *cached; struct extent_state *prealloc = NULL; + struct rb_node *next_node; struct rb_node *node; u64 last_end; int err; @@ -488,6 +522,17 @@ again: } spin_lock(&tree->lock); + if (cached_state) { + cached = *cached_state; + *cached_state = NULL; + cached_state = NULL; + if (cached && cached->tree && cached->start == start) { + atomic_dec(&cached->refs); + state = cached; + goto hit_next; + } + free_extent_state(cached); + } /* * this search will find the extents that end after * our range starts @@ -496,6 +541,7 @@ again: if (!node) goto out; state = rb_entry(node, struct extent_state, rb_node); +hit_next: if (state->start > end) goto out; WARN_ON(state->end < start); @@ -526,13 +572,11 @@ again: if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, bits, - wake, delete); + set |= clear_state_bit(tree, state, bits, wake, + delete); if (last_end == (u64)-1) goto out; start = last_end + 1; - } else { - start = state->start; } goto search_again; } @@ -547,19 +591,30 @@ again: prealloc = alloc_extent_state(GFP_ATOMIC); err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, - wake, delete); + + set |= clear_state_bit(tree, prealloc, bits, wake, delete); + prealloc = NULL; goto out; } + if (state->end < end && prealloc && !need_resched()) + next_node = rb_next(&state->rb_node); + else + next_node = NULL; + set |= clear_state_bit(tree, state, bits, wake, delete); if (last_end == (u64)-1) goto out; start = last_end + 1; + if (start <= end && next_node) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } goto search_again; out: @@ -641,40 +696,59 @@ out: return 0; } -static void set_state_bits(struct extent_io_tree *tree, +static int set_state_bits(struct extent_io_tree *tree, struct extent_state *state, int bits) { + int ret; + + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - set_state_cb(tree, state, bits); state->state |= bits; + + return 0; +} + +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + if (cached_ptr && !(*cached_ptr)) { + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { + *cached_ptr = state; + atomic_inc(&state->refs); + } + } } /* - * set some bits on a range in the tree. This may require allocations - * or sleeping, so the gfp mask is used to indicate what is allowed. + * set some bits on a range in the tree. This may require allocations or + * sleeping, so the gfp mask is used to indicate what is allowed. * - * If 'exclusive' == 1, this will fail with -EEXIST if some part of the - * range already has the desired bits set. The start of the existing - * range is returned in failed_start in this case. + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set. The start of the + * existing range is returned in failed_start in this case. * - * [start, end] is inclusive - * This takes the tree lock. + * [start, end] is inclusive This takes the tree lock. */ + static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive, u64 *failed_start, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; int err = 0; - int set; u64 last_start; u64 last_end; + again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -683,6 +757,13 @@ again: } spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start == start && state->tree) { + node = &state->rb_node; + goto hit_next; + } + } /* * this search will find all the extents that end after * our range starts. @@ -694,8 +775,8 @@ again: BUG_ON(err == -EEXIST); goto out; } - state = rb_entry(node, struct extent_state, rb_node); +hit_next: last_start = state->start; last_end = state->end; @@ -706,17 +787,32 @@ again: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - set = state->state & bits; - if (set && exclusive) { + struct rb_node *next_node; + if (state->state & exclusive_bits) { *failed_start = state->start; err = -EEXIST; goto out; } - set_state_bits(tree, state, bits); + + err = set_state_bits(tree, state, bits); + if (err) + goto out; + + cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; + start = last_end + 1; + if (start < end && prealloc && !need_resched()) { + next_node = rb_next(node); + if (next_node) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + } goto search_again; } @@ -737,8 +833,7 @@ again: * desired bit on it. */ if (state->start < start) { - set = state->state & bits; - if (exclusive && set) { + if (state->state & exclusive_bits) { *failed_start = start; err = -EEXIST; goto out; @@ -749,13 +844,14 @@ again: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, bits); + if (err) + goto out; + cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; - } else { - start = state->start; } goto search_again; } @@ -774,10 +870,13 @@ again: this_end = last_start - 1; err = insert_state(tree, prealloc, start, this_end, bits); - prealloc = NULL; BUG_ON(err == -EEXIST); - if (err) + if (err) { + prealloc = NULL; goto out; + } + cache_state(prealloc, cached_state); + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -788,8 +887,7 @@ again: * on the first half */ if (state->start <= end && state->end > end) { - set = state->state & bits; - if (exclusive && set) { + if (state->state & exclusive_bits) { *failed_start = start; err = -EEXIST; goto out; @@ -797,7 +895,12 @@ again: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - set_state_bits(tree, prealloc, bits); + err = set_state_bits(tree, prealloc, bits); + if (err) { + prealloc = NULL; + goto out; + } + cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; goto out; @@ -826,86 +929,65 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, - mask); -} - -int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); + NULL, mask); } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { return set_extent_bit(tree, start, end, bits, 0, NULL, - mask); + NULL, mask); } int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { - return clear_extent_bit(tree, start, end, bits, 0, 0, mask); + return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); } int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY, - 0, NULL, mask); + EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + 0, NULL, NULL, mask); } int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); -} - -int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, + NULL, mask); } int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, - mask); + NULL, mask); } static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); + return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, + NULL, mask); } int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, - mask); + NULL, mask); } static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); -} - -static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, - 0, NULL, mask); -} - -static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + NULL, mask); } int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) @@ -917,13 +999,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) * either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, struct extent_state **cached_state, gfp_t mask) { int err; u64 failed_start; while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, - &failed_start, mask); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, mask); if (err == -EEXIST && (mask & __GFP_WAIT)) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -935,27 +1019,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) return err; } +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + return lock_extent_bits(tree, start, end, 0, NULL, mask); +} + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { int err; u64 failed_start; - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, - &failed_start, mask); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, mask); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, mask); + EXTENT_LOCKED, 1, 0, NULL, mask); return 0; } return 1; } +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask); +} + int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, + mask); } /* @@ -974,7 +1071,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) page_cache_release(page); index++; } - set_extent_dirty(tree, start, end, GFP_NOFS); return 0; } @@ -994,7 +1090,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) page_cache_release(page); index++; } - set_extent_writeback(tree, start, end, GFP_NOFS); return 0; } @@ -1232,6 +1327,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode, u64 delalloc_start; u64 delalloc_end; u64 found; + struct extent_state *cached_state = NULL; int ret; int loops = 0; @@ -1269,6 +1365,7 @@ again: /* some of the pages are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ + free_extent_state(cached_state); if (!loops) { unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); max_bytes = PAGE_CACHE_SIZE - offset; @@ -1282,18 +1379,21 @@ again: BUG_ON(ret); /* step three, lock the state bits for the whole range */ - lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + lock_extent_bits(tree, delalloc_start, delalloc_end, + 0, &cached_state, GFP_NOFS); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, 1); + EXTENT_DELALLOC, 1, cached_state); if (!ret) { - unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + unlock_extent_cached(tree, delalloc_start, delalloc_end, + &cached_state, GFP_NOFS); __unlock_for_delalloc(inode, locked_page, delalloc_start, delalloc_end); cond_resched(); goto again; } + free_extent_state(cached_state); *start = delalloc_start; *end = delalloc_end; out_failed: @@ -1303,11 +1403,7 @@ out_failed: int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, - int unlock_pages, - int clear_unlock, - int clear_delalloc, int clear_dirty, - int set_writeback, - int end_writeback) + unsigned long op) { int ret; struct page *pages[16]; @@ -1317,16 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, int i; int clear_bits = 0; - if (clear_unlock) + if (op & EXTENT_CLEAR_UNLOCK) clear_bits |= EXTENT_LOCKED; - if (clear_dirty) + if (op & EXTENT_CLEAR_DIRTY) clear_bits |= EXTENT_DIRTY; - if (clear_delalloc) + if (op & EXTENT_CLEAR_DELALLOC) clear_bits |= EXTENT_DELALLOC; - clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); - if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) + if (op & EXTENT_CLEAR_ACCOUNTING) + clear_bits |= EXTENT_DO_ACCOUNTING; + + clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); + if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | + EXTENT_SET_PRIVATE2))) return 0; while (nr_pages > 0) { @@ -1334,17 +1435,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, min_t(unsigned long, nr_pages, ARRAY_SIZE(pages)), pages); for (i = 0; i < ret; i++) { + + if (op & EXTENT_SET_PRIVATE2) + SetPagePrivate2(pages[i]); + if (pages[i] == locked_page) { page_cache_release(pages[i]); continue; } - if (clear_dirty) + if (op & EXTENT_CLEAR_DIRTY) clear_page_dirty_for_io(pages[i]); - if (set_writeback) + if (op & EXTENT_SET_WRITEBACK) set_page_writeback(pages[i]); - if (end_writeback) + if (op & EXTENT_END_WRITEBACK) end_page_writeback(pages[i]); - if (unlock_pages) + if (op & EXTENT_CLEAR_UNLOCK_PAGE) unlock_page(pages[i]); page_cache_release(pages[i]); } @@ -1476,14 +1581,17 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled) + int bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; int bitset = 0; spin_lock(&tree->lock); - node = tree_search(tree, start); + if (cached && cached->tree && cached->start == start) + node = &cached->rb_node; + else + node = tree_search(tree, start); while (node && start <= end) { state = rb_entry(node, struct extent_state, rb_node); @@ -1503,6 +1611,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, bitset = 0; break; } + + if (state->end == (u64)-1) + break; + start = state->end + 1; if (start > end) break; @@ -1526,7 +1638,7 @@ static int check_page_uptodate(struct extent_io_tree *tree, { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); return 0; } @@ -1540,7 +1652,7 @@ static int check_page_locked(struct extent_io_tree *tree, { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) unlock_page(page); return 0; } @@ -1552,10 +1664,7 @@ static int check_page_locked(struct extent_io_tree *tree, static int check_page_writeback(struct extent_io_tree *tree, struct page *page) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) - end_page_writeback(page); + end_page_writeback(page); return 0; } @@ -1613,13 +1722,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err) } if (!uptodate) { - clear_extent_uptodate(tree, start, end, GFP_ATOMIC); + clear_extent_uptodate(tree, start, end, GFP_NOFS); ClearPageUptodate(page); SetPageError(page); } - clear_extent_writeback(tree, start, end, GFP_ATOMIC); - if (whole_page) end_page_writeback(page); else @@ -1983,7 +2090,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, continue; } /* the get_extent function already copied into the page */ - if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { + if (test_range_bit(tree, cur, cur_end, + EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); cur = cur + iosize; @@ -2078,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 iosize; u64 unlock_start; sector_t sector; + struct extent_state *cached_state = NULL; struct extent_map *em; struct block_device *bdev; int ret; @@ -2124,6 +2233,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_end = 0; page_started = 0; if (!epd->extent_locked) { + u64 delalloc_to_write = 0; /* * make sure the wbc mapping index is at least updated * to this page. @@ -2143,8 +2253,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, tree->ops->fill_delalloc(inode, page, delalloc_start, delalloc_end, &page_started, &nr_written); + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; delalloc_start = delalloc_end + 1; } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } /* did the fill delalloc function already unlock and start * the IO? @@ -2160,15 +2286,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done_unlocked; } } - lock_extent(tree, start, page_end, GFP_NOFS); - - unlock_start = start; - if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); if (ret == -EAGAIN) { - unlock_extent(tree, start, page_end, GFP_NOFS); redirty_page_for_writepage(wbc, page); update_nr_written(page, wbc, nr_written); unlock_page(page); @@ -2184,12 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, update_nr_written(page, wbc, nr_written + 1); end = page_end; - if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) - printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); - if (last_byte <= start) { - clear_extent_dirty(tree, start, page_end, GFP_NOFS); - unlock_extent(tree, start, page_end, GFP_NOFS); if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); @@ -2197,13 +2313,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - set_extent_uptodate(tree, start, page_end, GFP_NOFS); blocksize = inode->i_sb->s_blocksize; while (cur <= end) { if (cur >= last_byte) { - clear_extent_dirty(tree, cur, page_end, GFP_NOFS); - unlock_extent(tree, unlock_start, page_end, GFP_NOFS); if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); @@ -2235,12 +2348,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, */ if (compressed || block_start == EXTENT_MAP_HOLE || block_start == EXTENT_MAP_INLINE) { - clear_extent_dirty(tree, cur, - cur + iosize - 1, GFP_NOFS); - - unlock_extent(tree, unlock_start, cur + iosize - 1, - GFP_NOFS); - /* * end_io notification does not happen here for * compressed extents @@ -2265,13 +2372,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } /* leave this out until we have a page_mkwrite call */ if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0)) { + EXTENT_DIRTY, 0, NULL)) { cur = cur + iosize; pg_offset += iosize; continue; } - clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); if (tree->ops && tree->ops->writepage_io_hook) { ret = tree->ops->writepage_io_hook(page, cur, cur + iosize - 1); @@ -2309,12 +2415,12 @@ done: set_page_writeback(page); end_page_writeback(page); } - if (unlock_start <= page_end) - unlock_extent(tree, unlock_start, page_end, GFP_NOFS); unlock_page(page); done_unlocked: + /* drop our reference on any cached states */ + free_extent_state(cached_state); return 0; } @@ -2339,9 +2445,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, writepage_t writepage, void *data, void (*flush_fn)(void *)) { - struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; int done = 0; + int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; pgoff_t index; @@ -2361,7 +2467,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, scanned = 1; } retry: - while (!done && (index <= end) && + while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { @@ -2412,12 +2518,15 @@ retry: unlock_page(page); ret = 0; } - if (ret || wbc->nr_to_write <= 0) - done = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; + if (ret) done = 1; - } + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; } pagevec_release(&pvec); cond_resched(); @@ -2604,10 +2713,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, return 0; lock_extent(tree, start, end, GFP_NOFS); - wait_on_extent_writeback(tree, start, end); + wait_on_page_writeback(page); clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, - 1, 1, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, + 1, 1, NULL, GFP_NOFS); return 0; } @@ -2687,7 +2797,7 @@ int extent_prepare_write(struct extent_io_tree *tree, !isnew && !PageUptodate(page) && (block_off_end > to || block_off_start < from) && !test_range_bit(tree, block_start, cur_end, - EXTENT_UPTODATE, 1)) { + EXTENT_UPTODATE, 1, NULL)) { u64 sector; u64 extent_offset = block_start - em->start; size_t iosize; @@ -2701,7 +2811,7 @@ int extent_prepare_write(struct extent_io_tree *tree, */ set_extent_bit(tree, block_start, block_start + iosize - 1, - EXTENT_LOCKED, 0, NULL, GFP_NOFS); + EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); ret = submit_extent_page(READ, tree, page, sector, iosize, page_offset, em->bdev, NULL, 1, @@ -2742,13 +2852,18 @@ int try_release_extent_state(struct extent_map_tree *map, int ret = 1; if (test_range_bit(tree, start, end, - EXTENT_IOBITS | EXTENT_ORDERED, 0)) + EXTENT_IOBITS, 0, NULL)) ret = 0; else { if ((mask & GFP_NOFS) == GFP_NOFS) mask = GFP_NOFS; - clear_extent_bit(tree, start, end, EXTENT_UPTODATE, - 1, 1, mask); + /* + * at this point we can safely clear everything except the + * locked bit and the nodatasum bit + */ + clear_extent_bit(tree, start, end, + ~(EXTENT_LOCKED | EXTENT_NODATASUM), + 0, 0, NULL, mask); } return ret; } @@ -2771,29 +2886,28 @@ int try_release_extent_mapping(struct extent_map_tree *map, u64 len; while (start <= end) { len = end - start + 1; - spin_lock(&map->lock); + write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); if (!em || IS_ERR(em)) { - spin_unlock(&map->lock); + write_unlock(&map->lock); break; } if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || em->start != start) { - spin_unlock(&map->lock); + write_unlock(&map->lock); free_extent_map(em); break; } if (!test_range_bit(tree, em->start, extent_map_end(em) - 1, - EXTENT_LOCKED | EXTENT_WRITEBACK | - EXTENT_ORDERED, - 0)) { + EXTENT_LOCKED | EXTENT_WRITEBACK, + 0, NULL)) { remove_extent_mapping(map, em); /* once for the rb tree */ free_extent_map(em); } start = extent_map_end(em); - spin_unlock(&map->lock); + write_unlock(&map->lock); /* once for us */ free_extent_map(em); @@ -3203,7 +3317,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); + ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); if (ret) return 1; while (start <= end) { @@ -3233,7 +3347,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, return 1; ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1); + EXTENT_UPTODATE, 1, NULL); if (ret) return ret; @@ -3269,7 +3383,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, return 0; if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1)) { + EXTENT_UPTODATE, 1, NULL)) { return 0; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5bc20ab..36de250 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -13,10 +13,9 @@ #define EXTENT_DEFRAG (1 << 6) #define EXTENT_DEFRAG_DONE (1 << 7) #define EXTENT_BUFFER_FILLED (1 << 8) -#define EXTENT_ORDERED (1 << 9) -#define EXTENT_ORDERED_METADATA (1 << 10) -#define EXTENT_BOUNDARY (1 << 11) -#define EXTENT_NODATASUM (1 << 12) +#define EXTENT_BOUNDARY (1 << 9) +#define EXTENT_NODATASUM (1 << 10) +#define EXTENT_DO_ACCOUNTING (1 << 11) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) /* flags for bio submission */ @@ -27,6 +26,16 @@ #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 +/* these are flags for extent_clear_unlock_delalloc */ +#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 +#define EXTENT_CLEAR_UNLOCK 0x2 +#define EXTENT_CLEAR_DELALLOC 0x4 +#define EXTENT_CLEAR_DIRTY 0x8 +#define EXTENT_SET_WRITEBACK 0x10 +#define EXTENT_END_WRITEBACK 0x20 +#define EXTENT_SET_PRIVATE2 0x40 +#define EXTENT_CLEAR_ACCOUNTING 0x80 + /* * page->private values. Every page that is controlled by the extent * map has page->private set to one. @@ -62,8 +71,13 @@ struct extent_io_ops { struct extent_state *state, int uptodate); int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, unsigned long old, unsigned long bits); - int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits); + int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, + unsigned long bits); + int (*merge_extent_hook)(struct inode *inode, + struct extent_state *new, + struct extent_state *other); + int (*split_extent_hook)(struct inode *inode, + struct extent_state *orig, u64 split); int (*write_cache_pages_lock_hook)(struct page *page); }; @@ -81,10 +95,14 @@ struct extent_state { u64 start; u64 end; /* inclusive */ struct rb_node rb_node; + + /* ADD NEW ELEMENTS AFTER THIS */ struct extent_io_tree *tree; wait_queue_head_t wq; atomic_t refs; unsigned long state; + u64 split_start; + u64 split_end; /* for use by the FS */ u64 private; @@ -142,6 +160,8 @@ int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, struct extent_state **cached, gfp_t mask); int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); @@ -155,11 +175,12 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 max_bytes, unsigned long bits); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled); + int bits, int filled, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, gfp_t mask); + int bits, int wake, int delete, struct extent_state **cached, + gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, @@ -278,9 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree, int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, - int unlock_page, - int clear_unlock, - int clear_delalloc, int clear_dirty, - int set_writeback, - int end_writeback); + unsigned long op); #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 30c9365..2c726b7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -36,7 +36,7 @@ void extent_map_exit(void) void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) { tree->map.rb_node = NULL; - spin_lock_init(&tree->lock); + rwlock_init(&tree->lock); } /** @@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) return 0; } +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *em; + + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, start, len); + + WARN_ON(em->start != start || !em); + + if (!em) + goto out; + + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } + + free_extent_map(em); +out: + write_unlock(&tree->lock); + return ret; + +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in @@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree, ret = -EEXIST; goto out; } - assert_spin_locked(&tree->lock); rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { ret = -EEXIST; @@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, struct rb_node *next = NULL; u64 end = range_end(start, len); - assert_spin_locked(&tree->lock); rb_node = __tree_search(&tree->map, start, &prev, &next); if (!rb_node && prev) { em = rb_entry(prev, struct extent_map, rb_node); @@ -319,6 +367,54 @@ out: } /** + * search_extent_mapping - find a nearby extent map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. + * + * If one can't be found, any nearby extent may be returned + */ +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_PTR(PTR_ERR(rb_node)); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} + +/** * remove_extent_mapping - removes an extent_map from the extent tree * @tree: extent tree to remove from * @em: extent map beeing removed @@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) int ret = 0; WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - assert_spin_locked(&tree->lock); rb_erase(&em->rb_node, &tree->map); em->in_tree = 0; return ret; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index fb6eeef..ab6d74b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -31,7 +31,7 @@ struct extent_map { struct extent_map_tree { struct rb_root map; - spinlock_t lock; + rwlock_t lock; }; static inline u64 extent_map_end(struct extent_map *em) @@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4b83397..06550af 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, int err = 0; int i; struct inode *inode = fdentry(file)->d_inode; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - u64 hint_byte; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, root->sectorsize - 1) & ~((u64)root->sectorsize - 1); end_of_last_block = start_pos + num_bytes - 1; + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + if (err) + return err; - lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); - trans = btrfs_join_transaction(root, 1); - if (!trans) { - err = -ENOMEM; - goto out_unlock; - } - btrfs_set_trans_block_group(trans, inode); - hint_byte = 0; - - set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); - - /* check for reserved extents on each page, we don't want - * to reset the delalloc bit on things that already have - * extents reserved. - */ - btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; SetPageUptodate(p); @@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, * at this time. */ } - err = btrfs_end_transaction(trans, root); -out_unlock: - unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); return err; } @@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, if (!split2) split2 = alloc_extent_map(GFP_NOFS); - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (!em) { - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); break; } flags = em->flags; if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { - spin_unlock(&em_tree->lock); if (em->start <= start && (!testend || em->start + em->len >= start + len)) { free_extent_map(em); + write_unlock(&em_tree->lock); break; } if (start < em->start) { @@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, start = em->start + em->len; } free_extent_map(em); + write_unlock(&em_tree->lock); continue; } compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, free_extent_map(split); split = NULL; } - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); /* once for us */ free_extent_map(em); @@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, u64 locked_end, - u64 inline_limit, u64 *hint_byte) + u64 inline_limit, u64 *hint_byte, int drop_cache) { u64 extent_end = 0; u64 search_start = start; @@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, int ret; inline_limit = 0; - btrfs_drop_extent_cache(inode, start, end - 1, 0); + if (drop_cache) + btrfs_drop_extent_cache(inode, start, end - 1, 0); path = btrfs_alloc_path(); if (!path) @@ -894,7 +878,8 @@ again: btrfs_put_ordered_extent(ordered); clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, GFP_NOFS); unlock_extent(&BTRFS_I(inode)->io_tree, start_pos, last_pos - 1, GFP_NOFS); @@ -936,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* do the reserve before the mutex lock in case we have to do some + * flushing. We wouldn't deadlock, but this is more polite. + */ + err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (err) + goto out_nolock; + + mutex_lock(&inode->i_mutex); + current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) - goto out_nolock; + goto out; + if (count == 0) - goto out_nolock; + goto out; err = file_remove_suid(file); if (err) - goto out_nolock; + goto out; + file_update_time(file); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); - mutex_lock(&inode->i_mutex); + /* generic_write_checks can change our pos */ + start_pos = pos; + BTRFS_I(inode)->sequence++; first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + count) >> PAGE_CACHE_SHIFT; @@ -1024,9 +1023,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } if (will_write) { - btrfs_fdatawrite_range(inode->i_mapping, pos, - pos + write_bytes - 1, - WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, pos, + pos + write_bytes - 1); } else { balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages); @@ -1047,6 +1045,7 @@ out: mutex_unlock(&inode->i_mutex); if (ret) err = ret; + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); out_nolock: kfree(pages); @@ -1087,8 +1086,10 @@ out_nolock: btrfs_end_transaction(trans, root); else btrfs_commit_transaction(trans, root); - } else { + } else if (ret != BTRFS_NO_LOG_SYNC) { btrfs_commit_transaction(trans, root); + } else { + btrfs_end_transaction(trans, root); } } if (file->f_flags & O_DIRECT) { @@ -1138,6 +1139,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) int ret = 0; struct btrfs_trans_handle *trans; + + /* we wait first, since the writeback may change the inode */ + root->log_batch++; + /* the VFS called filemap_fdatawrite for us */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->log_batch++; + /* * check the transaction that last modified this inode * and see if its already been committed @@ -1145,6 +1153,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (!BTRFS_I(inode)->last_trans) goto out; + /* + * if the last transaction that changed this file was before + * the current transaction, we can bail out now without any + * syncing + */ mutex_lock(&root->fs_info->trans_mutex); if (BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { @@ -1154,13 +1167,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) } mutex_unlock(&root->fs_info->trans_mutex); - root->log_batch++; - filemap_fdatawrite(inode->i_mapping); - btrfs_wait_ordered_range(inode, 0, (u64)-1); - root->log_batch++; - - if (datasync && !(inode->i_state & I_DIRTY_PAGES)) - goto out; /* * ok we haven't committed the transaction yet, lets do a commit */ @@ -1189,21 +1195,25 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) */ mutex_unlock(&dentry->d_inode->i_mutex); - if (ret > 0) { - ret = btrfs_commit_transaction(trans, root); - } else { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - ret = btrfs_end_transaction(trans, root); - else + if (ret != BTRFS_NO_LOG_SYNC) { + if (ret > 0) { ret = btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_sync_log(trans, root); + if (ret == 0) + ret = btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); + } + } else { + ret = btrfs_end_transaction(trans, root); } mutex_lock(&dentry->d_inode->i_mutex); out: return ret > 0 ? EIO : ret; } -static struct vm_operations_struct btrfs_file_vm_ops = { +static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = btrfs_page_mkwrite, }; @@ -1215,7 +1225,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } -struct file_operations btrfs_file_operations = { +const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5edcee3..5c2caad 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) { - u64 max_bytes, possible_bytes; + u64 max_bytes; + u64 bitmap_bytes; + u64 extent_bytes; /* * The goal is to keep the total amount of memory used per 1gb of space @@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) max_bytes = MAX_CACHE_BYTES_PER_GIG * (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); - possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + - (sizeof(struct btrfs_free_space) * - block_group->extents_thresh); + /* + * we want to account for 1 more bitmap than what we have so we can make + * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as + * we add more bitmaps. + */ + bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE; - if (possible_bytes > max_bytes) { - int extent_bytes = max_bytes - - (block_group->total_bitmaps * PAGE_CACHE_SIZE); + if (bitmap_bytes >= max_bytes) { + block_group->extents_thresh = 0; + return; + } - if (extent_bytes <= 0) { - block_group->extents_thresh = 0; - return; - } + /* + * we want the extent entry threshold to always be at most 1/2 the maxw + * bytes we can have, or whatever is less than that. + */ + extent_bytes = max_bytes - bitmap_bytes; + extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); - block_group->extents_thresh = extent_bytes / - (sizeof(struct btrfs_free_space)); - } + block_group->extents_thresh = + div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); } static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, @@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group, BUG_ON(block_group->total_bitmaps >= max_bitmaps); info->offset = offset_to_bitmap(block_group, offset); + info->bytes = 0; link_free_space(block_group, info); block_group->total_bitmaps++; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 6b627c6..72ce3c1 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); ret = 0; } else if (ret < 0) { + if (ret == -EOVERFLOW) + ret = -EMLINK; goto out; } else { ref = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_inode_item)); - if (ret == 0 && objectid > root->highest_inode) - root->highest_inode = objectid; return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 9abbced..c56eb59 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) slot = path->slots[0] - 1; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); - *objectid = found_key.objectid; + *objectid = max_t(u64, found_key.objectid, + BTRFS_FIRST_FREE_OBJECTID - 1); } else { - *objectid = BTRFS_FIRST_FREE_OBJECTID; + *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; } ret = 0; error: @@ -53,91 +54,27 @@ error: return ret; } -/* - * walks the btree of allocated inodes and find a hole. - */ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 dirid, u64 *objectid) { - struct btrfs_path *path; - struct btrfs_key key; int ret; - int slot = 0; - u64 last_ino = 0; - int start_found; - struct extent_buffer *l; - struct btrfs_key search_key; - u64 search_start = dirid; - mutex_lock(&root->objectid_mutex); - if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID && - root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) { - *objectid = ++root->last_inode_alloc; - mutex_unlock(&root->objectid_mutex); - return 0; - } - path = btrfs_alloc_path(); - BUG_ON(!path); - search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); - search_key.objectid = search_start; - search_key.type = 0; - search_key.offset = 0; - - start_found = 0; - ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); - if (ret < 0) - goto error; - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto error; - if (!start_found) { - *objectid = search_start; - start_found = 1; - goto found; - } - *objectid = last_ino > search_start ? - last_ino : search_start; - goto found; - } - btrfs_item_key_to_cpu(l, &key, slot); - if (key.objectid >= search_start) { - if (start_found) { - if (last_ino < search_start) - last_ino = search_start; - if (key.objectid > last_ino) { - *objectid = last_ino; - goto found; - } - } else if (key.objectid > search_start) { - *objectid = search_start; - goto found; - } - } - if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) - break; + if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_find_highest_inode(root, &root->highest_objectid); + if (ret) + goto out; + } - start_found = 1; - last_ino = key.objectid + 1; - path->slots[0]++; + if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + ret = -ENOSPC; + goto out; } - BUG_ON(1); -found: - btrfs_release_path(root, path); - btrfs_free_path(path); - BUG_ON(*objectid < search_start); - mutex_unlock(&root->objectid_mutex); - return 0; -error: - btrfs_release_path(root, path); - btrfs_free_path(path); + + *objectid = ++root->highest_objectid; + ret = 0; +out: mutex_unlock(&root->objectid_mutex); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 59cba18..dae12dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -55,14 +55,14 @@ struct btrfs_iget_args { struct btrfs_root *root; }; -static struct inode_operations btrfs_dir_inode_operations; -static struct inode_operations btrfs_symlink_inode_operations; -static struct inode_operations btrfs_dir_ro_inode_operations; -static struct inode_operations btrfs_special_inode_operations; -static struct inode_operations btrfs_file_inode_operations; -static struct address_space_operations btrfs_aops; -static struct address_space_operations btrfs_symlink_aops; -static struct file_operations btrfs_dir_file_operations; +static const struct inode_operations btrfs_dir_inode_operations; +static const struct inode_operations btrfs_symlink_inode_operations; +static const struct inode_operations btrfs_dir_ro_inode_operations; +static const struct inode_operations btrfs_special_inode_operations; +static const struct inode_operations btrfs_file_inode_operations; +static const struct address_space_operations btrfs_aops; +static const struct address_space_operations btrfs_symlink_aops; +static const struct file_operations btrfs_dir_file_operations; static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; @@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, } ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, aligned_end, start, &hint_byte); + aligned_end, aligned_end, start, + &hint_byte, 1); BUG_ON(ret); if (isize > actual_end) @@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, inline_len, compressed_size, compressed_pages); BUG_ON(ret); - btrfs_drop_extent_cache(inode, start, aligned_end, 0); + btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; } @@ -423,9 +424,12 @@ again: * and free up our temp pages. */ extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, 1, 0, - 0, 1, 1, 1); + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); ret = 0; goto free_pages_out; } @@ -611,9 +615,9 @@ static noinline int submit_compressed_extents(struct inode *inode, set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); while (1) { - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -636,11 +640,14 @@ static noinline int submit_compressed_extents(struct inode *inode, * clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - NULL, 1, 1, 0, 1, 1, 0); + &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); ret = btrfs_submit_compressed_write(inode, async_extent->start, @@ -711,9 +718,15 @@ static noinline int cow_file_range(struct inode *inode, start, end, 0, NULL); if (ret == 0) { extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, 1, 1, - 1, 1, 1, 1); + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; @@ -725,9 +738,20 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(disk_num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy)); + + read_lock(&BTRFS_I(inode)->extent_tree.lock); + em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, + start, num_bytes); + if (em) { + alloc_hint = em->block_start; + free_extent_map(em); + } + read_unlock(&BTRFS_I(inode)->extent_tree.lock); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { + unsigned long op; + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, cur_alloc_size, root->sectorsize, 0, alloc_hint, @@ -737,7 +761,6 @@ static noinline int cow_file_range(struct inode *inode, em = alloc_extent_map(GFP_NOFS); em->start = start; em->orig_start = em->start; - ram_size = ins.offset; em->len = ins.offset; @@ -747,9 +770,9 @@ static noinline int cow_file_range(struct inode *inode, set_bit(EXTENT_FLAG_PINNED, &em->flags); while (1) { - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -776,11 +799,17 @@ static noinline int cow_file_range(struct inode *inode, /* we're not doing compressed IO, don't unlock the first * page (which the caller expects to stay locked), don't * clear any dirty bits and don't set any writeback bits + * + * Do set the Private2 bit so we know this page was properly + * setup for writepage */ + op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; + op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2; + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, start, start + ram_size - 1, - locked_page, unlock, 1, - 1, 0, 0, 0); + locked_page, op); disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; @@ -852,8 +881,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; int limit = 10 * 1024 * 1042; - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | - EXTENT_DELALLOC, 1, 0, GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, + 1, 0, NULL, GFP_NOFS); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); async_cow->inode = inode; @@ -994,6 +1023,7 @@ next_slot: if (found_key.offset > cur_offset) { extent_end = found_key.offset; + extent_type = 0; goto out_check; } @@ -1080,9 +1110,9 @@ out_check: em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); while (1) { - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -1100,8 +1130,10 @@ out_check: BUG_ON(ret); extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - cur_offset, cur_offset + num_bytes - 1, - locked_page, 1, 1, 1, 0, 0, 0); + cur_offset, cur_offset + num_bytes - 1, + locked_page, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2); cur_offset = extent_end; if (cur_offset > end) break; @@ -1147,6 +1179,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, return ret; } +static int btrfs_split_extent_hook(struct inode *inode, + struct extent_state *orig, u64 split) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 size; + + if (!(orig->state & EXTENT_DELALLOC)) + return 0; + + size = orig->end - orig->start + 1; + if (size > root->fs_info->max_extent) { + u64 num_extents; + u64 new_size; + + new_size = orig->end - split + 1; + num_extents = div64_u64(size + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + + /* + * if we break a large extent up then leave oustanding_extents + * be, since we've already accounted for the large extent. + */ + if (div64_u64(new_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent) < num_extents) + return 0; + } + + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + return 0; +} + +/* + * extent_io.c merge_extent_hook, used to track merged delayed allocation + * extents so we can keep track of new extents that are just merged onto old + * extents, such as when we are doing sequential writes, so we can properly + * account for the metadata space we'll need. + */ +static int btrfs_merge_extent_hook(struct inode *inode, + struct extent_state *new, + struct extent_state *other) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 new_size, old_size; + u64 num_extents; + + /* not delalloc, ignore it */ + if (!(other->state & EXTENT_DELALLOC)) + return 0; + + old_size = other->end - other->start + 1; + if (new->start < other->start) + new_size = other->end - new->start + 1; + else + new_size = new->end - other->start + 1; + + /* we're not bigger than the max, unreserve the space and go */ + if (new_size <= root->fs_info->max_extent) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + return 0; + } + + /* + * If we grew by another max_extent, just return, we want to keep that + * reserved amount. + */ + num_extents = div64_u64(old_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + if (div64_u64(new_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent) > num_extents) + return 0; + + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + return 0; +} + /* * extent_io.c set_bit_hook, used to track delayed allocation * bytes in this file, and to maintain the list of inodes that @@ -1155,6 +1270,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, unsigned long old, unsigned long bits) { + /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC @@ -1162,6 +1278,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, */ if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); btrfs_delalloc_reserve_space(root, inode, end - start + 1); spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += end - start + 1; @@ -1178,22 +1298,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, /* * extent_io.c clear_bit_hook, see set_bit_hook for why */ -static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits) +static int btrfs_clear_bit_hook(struct inode *inode, + struct extent_state *state, unsigned long bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + if (bits & EXTENT_DO_ACCOUNTING) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + } + spin_lock(&root->fs_info->delalloc_lock); - if (end - start + 1 > root->fs_info->delalloc_bytes) { + if (state->end - state->start + 1 > + root->fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs warning: delalloc account " "%llu %llu\n", - (unsigned long long)end - start + 1, + (unsigned long long) + state->end - state->start + 1, (unsigned long long) root->fs_info->delalloc_bytes); btrfs_delalloc_free_space(root, inode, (u64)-1); @@ -1201,9 +1330,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, BTRFS_I(inode)->delalloc_bytes = 0; } else { btrfs_delalloc_free_space(root, inode, - end - start + 1); - root->fs_info->delalloc_bytes -= end - start + 1; - BTRFS_I(inode)->delalloc_bytes -= end - start + 1; + state->end - + state->start + 1); + root->fs_info->delalloc_bytes -= state->end - + state->start + 1; + BTRFS_I(inode)->delalloc_bytes -= state->end - + state->start + 1; } if (BTRFS_I(inode)->delalloc_bytes == 0 && !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { @@ -1374,10 +1506,8 @@ again: lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); /* already ordered? We're done */ - if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_ORDERED, 0)) { + if (PagePrivate2(page)) goto out; - } ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { @@ -1413,11 +1543,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) struct inode *inode = page->mapping->host; struct btrfs_writepage_fixup *fixup; struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, - EXTENT_ORDERED, 0); - if (ret) + /* this page is properly in the ordered list */ + if (TestClearPagePrivate2(page)) return 0; if (PageChecked(page)) @@ -1455,9 +1583,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(!path); path->leave_spinning = 1; + + /* + * we may be replacing one extent in the tree with another. + * The new extent is pinned in the extent map, and we don't want + * to drop it from the cache until it is completely in the btree. + * + * So, tell btrfs_drop_extents to leave this extent in the cache. + * the caller is expected to unpin it and allow it to be merged + * with the others. + */ ret = btrfs_drop_extents(trans, root, inode, file_pos, file_pos + num_bytes, locked_end, - file_pos, &hint); + file_pos, &hint, 0); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1485,7 +1623,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, num_bytes); - btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; @@ -1596,6 +1733,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered_extent->file_offset, + ordered_extent->len); BUG_ON(ret); } unlock_extent(io_tree, ordered_extent->file_offset, @@ -1623,6 +1763,7 @@ nocow: static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate) { + ClearPagePrivate2(page); return btrfs_finish_ordered_io(page->mapping->host, start, end); } @@ -1669,13 +1810,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, failrec->last_mirror = 0; failrec->bio_flags = 0; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, failrec->len); if (em->start > start || em->start + em->len < start) { free_extent_map(em); em = NULL; } - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em || IS_ERR(em)) { kfree(failrec); @@ -1794,7 +1935,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && - test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, GFP_NOFS); return 0; @@ -2352,6 +2493,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) return ret; } +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, name_len, -1); + BUG_ON(!di || IS_ERR(di)); + + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(root, path); + + ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, + objectid, root->root_key.objectid, + dir->i_ino, &index, name, name_len); + if (ret < 0) { + BUG_ON(ret != -ENOENT); + di = btrfs_search_dir_index_item(root, path, dir->i_ino, + name, name_len); + BUG_ON(!di || IS_ERR(di)); + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root, path); + index = key.offset; + } + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + index, name, name_len, -1); + BUG_ON(!di || IS_ERR(di)); + + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(root, path); + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + dir->i_sb->s_dirt = 1; + + btrfs_free_path(path); + return 0; +} + static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; @@ -2361,29 +2565,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; unsigned long nr = 0; - /* - * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir - * the root of a subvolume or snapshot - */ if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || - inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -ENOTEMPTY; - } trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); + if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + err = btrfs_unlink_subvol(trans, root, dir, + BTRFS_I(inode)->location.objectid, + dentry->d_name.name, + dentry->d_name.len); + goto out; + } + err = btrfs_orphan_add(trans, inode); if (err) - goto fail_trans; + goto out; /* now the directory is empty */ err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); if (!err) btrfs_i_size_write(inode, 0); - -fail_trans: +out: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root, nr); @@ -2826,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) if ((offset & (blocksize - 1)) == 0) goto out; + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) + goto out; ret = -ENOMEM; again: page = grab_cache_page(mapping, index); - if (!page) + if (!page) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); goto out; + } page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; @@ -2864,7 +3080,16 @@ again: goto again; } - btrfs_set_extent_delalloc(inode, page_start, page_end); + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); + if (ret) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + goto out_unlock; + } + ret = 0; if (offset != PAGE_CACHE_SIZE) { kaddr = kmap(page); @@ -2877,6 +3102,9 @@ again: unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: + if (ret) + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); unlock_page(page); page_cache_release(page); out: @@ -2895,17 +3123,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) u64 last_byte; u64 cur_offset; u64 hole_size; - int err; + int err = 0; if (size <= hole_start) return 0; - err = btrfs_check_metadata_free_space(root); + err = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (err) return err; - btrfs_truncate_page(inode->i_mapping, inode->i_size); - while (1) { struct btrfs_ordered_extent *ordered; btrfs_wait_ordered_range(inode, hole_start, @@ -2935,15 +3161,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) cur_offset, cur_offset + hole_size, block_end, - cur_offset, &hint_byte); + cur_offset, &hint_byte, 1); + if (err) + break; + + err = btrfs_reserve_metadata_space(root, 1); if (err) break; + err = btrfs_insert_file_extent(trans, root, inode->i_ino, cur_offset, 0, 0, hole_size, 0, hole_size, 0, 0, 0); btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); + btrfs_unreserve_metadata_space(root, 1); } free_extent_map(em); cur_offset = last_byte; @@ -3003,6 +3235,11 @@ void btrfs_delete_inode(struct inode *inode) } btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (inode->i_nlink > 0) { + BUG_ON(btrfs_root_refs(&root->root_item) != 0); + goto no_delete; + } + btrfs_i_size_write(inode, 0); trans = btrfs_join_transaction(root, 1); @@ -3070,29 +3307,67 @@ out_err: * is kind of like crossing a mount point. */ static int fixup_tree_root_location(struct btrfs_root *root, - struct btrfs_key *location, - struct btrfs_root **sub_root, - struct dentry *dentry) + struct inode *dir, + struct dentry *dentry, + struct btrfs_key *location, + struct btrfs_root **sub_root) { - struct btrfs_root_item *ri; + struct btrfs_path *path; + struct btrfs_root *new_root; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + int ret; + int err = 0; - if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) - return 0; - if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) - return 0; + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } - *sub_root = btrfs_read_fs_root(root->fs_info, location, - dentry->d_name.name, - dentry->d_name.len); - if (IS_ERR(*sub_root)) - return PTR_ERR(*sub_root); + err = -ENOENT; + ret = btrfs_find_root_ref(root->fs_info->tree_root, path, + BTRFS_I(dir)->root->root_key.objectid, + location->objectid); + if (ret) { + if (ret < 0) + err = ret; + goto out; + } - ri = &(*sub_root)->root_item; - location->objectid = btrfs_root_dirid(ri); - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - location->offset = 0; + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || + btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) + goto out; - return 0; + ret = memcmp_extent_buffer(leaf, dentry->d_name.name, + (unsigned long)(ref + 1), + dentry->d_name.len); + if (ret) + goto out; + + btrfs_release_path(root->fs_info->tree_root, path); + + new_root = btrfs_read_fs_root_no_name(root->fs_info, location); + if (IS_ERR(new_root)) { + err = PTR_ERR(new_root); + goto out; + } + + if (btrfs_root_refs(&new_root->root_item) == 0) { + err = -ENOENT; + goto out; + } + + *sub_root = new_root; + location->objectid = btrfs_root_dirid(&new_root->root_item); + location->type = BTRFS_INODE_ITEM_KEY; + location->offset = 0; + err = 0; +out: + btrfs_free_path(path); + return err; } static void inode_tree_add(struct inode *inode) @@ -3101,11 +3376,13 @@ static void inode_tree_add(struct inode *inode) struct btrfs_inode *entry; struct rb_node **p; struct rb_node *parent; - again: p = &root->inode_tree.rb_node; parent = NULL; + if (hlist_unhashed(&inode->i_hash)) + return; + spin_lock(&root->inode_lock); while (*p) { parent = *p; @@ -3132,13 +3409,87 @@ again: static void inode_tree_del(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + int empty = 0; spin_lock(&root->inode_lock); if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + empty = RB_EMPTY_ROOT(&root->inode_tree); } spin_unlock(&root->inode_lock); + + if (empty && btrfs_root_refs(&root->root_item) == 0) { + synchronize_srcu(&root->fs_info->subvol_srcu); + spin_lock(&root->inode_lock); + empty = RB_EMPTY_ROOT(&root->inode_tree); + spin_unlock(&root->inode_lock); + if (empty) + btrfs_add_dead_root(root); + } +} + +int btrfs_invalidate_inodes(struct btrfs_root *root) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + u64 objectid = 0; + + WARN_ON(btrfs_root_refs(&root->root_item) != 0); + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < entry->vfs_inode.i_ino) + node = node->rb_left; + else if (objectid > entry->vfs_inode.i_ino) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= entry->vfs_inode.i_ino) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + objectid = entry->vfs_inode.i_ino + 1; + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + if (atomic_read(&inode->i_count) > 1) + d_prune_aliases(inode); + /* + * btrfs_drop_inode will remove it from + * the inode cache when its usage count + * hits zero. + */ + iput(inode); + cond_resched(); + spin_lock(&root->inode_lock); + goto again; + } + + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return 0; } static noinline void init_btrfs_i(struct inode *inode) @@ -3148,6 +3499,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->generation = 0; bi->sequence = 0; bi->last_trans = 0; + bi->last_sub_trans = 0; bi->logged_trans = 0; bi->delalloc_bytes = 0; bi->reserved_bytes = 0; @@ -3225,15 +3577,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, return inode; } +static struct inode *new_simple_dir(struct super_block *s, + struct btrfs_key *key, + struct btrfs_root *root) +{ + struct inode *inode = new_inode(s); + + if (!inode) + return ERR_PTR(-ENOMEM); + + init_btrfs_i(inode); + + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); + BTRFS_I(inode)->dummy_inode = 1; + + inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + return inode; +} + struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct inode *inode; - struct btrfs_inode *bi = BTRFS_I(dir); - struct btrfs_root *root = bi->root; + struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location; + int index; int ret; + dentry->d_op = &btrfs_dentry_operations; + if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -3242,29 +3620,52 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret < 0) return ERR_PTR(ret); - inode = NULL; - if (location.objectid) { - ret = fixup_tree_root_location(root, &location, &sub_root, - dentry); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return ERR_PTR(-ENOENT); + if (location.objectid == 0) + return NULL; + + if (location.type == BTRFS_INODE_ITEM_KEY) { + inode = btrfs_iget(dir->i_sb, &location, root); + return inode; + } + + BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); + + index = srcu_read_lock(&root->fs_info->subvol_srcu); + ret = fixup_tree_root_location(root, dir, dentry, + &location, &sub_root); + if (ret < 0) { + if (ret != -ENOENT) + inode = ERR_PTR(ret); + else + inode = new_simple_dir(dir->i_sb, &location, sub_root); + } else { inode = btrfs_iget(dir->i_sb, &location, sub_root); - if (IS_ERR(inode)) - return ERR_CAST(inode); } + srcu_read_unlock(&root->fs_info->subvol_srcu, index); + return inode; } +static int btrfs_dentry_delete(struct dentry *dentry) +{ + struct btrfs_root *root; + + if (!dentry->d_inode && !IS_ROOT(dentry)) + dentry = dentry->d_parent; + + if (dentry->d_inode) { + root = BTRFS_I(dentry->d_inode)->root; + if (btrfs_root_refs(&root->root_item) == 0) + return 1; + } + return 0; +} + static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct inode *inode; - if (dentry->d_name.len > BTRFS_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - inode = btrfs_lookup_dentry(dir, dentry); if (IS_ERR(inode)) return ERR_CAST(inode); @@ -3603,9 +4004,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (ret != 0) goto fail; - if (objectid > root->highest_inode) - root->highest_inode = objectid; - inode->i_uid = current_fsuid(); if (dir && (dir->i_mode & S_ISGID)) { @@ -3673,26 +4071,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, struct inode *parent_inode, struct inode *inode, const char *name, int name_len, int add_backref, u64 index) { - int ret; + int ret = 0; struct btrfs_key key; struct btrfs_root *root = BTRFS_I(parent_inode)->root; - key.objectid = inode->i_ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - key.offset = 0; + if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); + } else { + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + } + + if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + key.objectid, root->root_key.objectid, + parent_inode->i_ino, + index, name, name_len); + } else if (add_backref) { + ret = btrfs_insert_inode_ref(trans, root, + name, name_len, inode->i_ino, + parent_inode->i_ino, index); + } - ret = btrfs_insert_dir_item(trans, root, name, name_len, - parent_inode->i_ino, - &key, btrfs_inode_type(inode), - index); if (ret == 0) { - if (add_backref) { - ret = btrfs_insert_inode_ref(trans, root, - name, name_len, - inode->i_ino, - parent_inode->i_ino, - index); - } + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode->i_ino, &key, + btrfs_inode_type(inode), index); + BUG_ON(ret); + btrfs_i_size_write(parent_inode, parent_inode->i_size + name_len * 2); parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; @@ -3732,11 +4139,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - err = btrfs_check_metadata_free_space(root); + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto fail; + return err; trans = btrfs_start_transaction(root, 1); + if (!trans) + goto fail; btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); @@ -3774,6 +4188,7 @@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: + btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -3794,10 +4209,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, u64 objectid; u64 index = 0; - err = btrfs_check_metadata_free_space(root); + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto fail; + return err; + trans = btrfs_start_transaction(root, 1); + if (!trans) + goto fail; btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); @@ -3838,6 +4261,7 @@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: + btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -3860,10 +4284,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) return -ENOENT; - btrfs_inc_nlink(inode); - err = btrfs_check_metadata_free_space(root); + /* + * 1 item for inode ref + * 2 items for dir items + */ + err = btrfs_reserve_metadata_space(root, 3); if (err) - goto fail; + return err; + + btrfs_inc_nlink(inode); + err = btrfs_set_inode_index(dir, &index); if (err) goto fail; @@ -3875,20 +4305,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, err = btrfs_add_nondir(trans, dentry, inode, 1, index); - if (err) - drop_inode = 1; - - btrfs_update_inode_block_group(trans, dir); - err = btrfs_update_inode(trans, root, inode); - - if (err) + if (err) { drop_inode = 1; + } else { + btrfs_update_inode_block_group(trans, dir); + err = btrfs_update_inode(trans, root, inode); + BUG_ON(err); + btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); + } nr = trans->blocks_used; - - btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); btrfs_end_transaction_throttle(trans, root); fail: + btrfs_unreserve_metadata_space(root, 3); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -3908,17 +4337,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) u64 index = 0; unsigned long nr = 1; - err = btrfs_check_metadata_free_space(root); + /* + * 2 items for inode and ref + * 2 items for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto out_unlock; + return err; trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, dir); - - if (IS_ERR(trans)) { - err = PTR_ERR(trans); + if (!trans) { + err = -ENOMEM; goto out_unlock; } + btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); if (err) { @@ -3967,6 +4400,7 @@ out_fail: btrfs_end_transaction_throttle(trans, root); out_unlock: + btrfs_unreserve_metadata_space(root, 5); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@ -4064,11 +4498,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, int compressed; again: - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) em->bdev = root->fs_info->fs_devices->latest_bdev; - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) @@ -4215,6 +4649,11 @@ again: map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, copy_size); + if (pg_offset + copy_size < PAGE_CACHE_SIZE) { + memset(map + pg_offset + copy_size, 0, + PAGE_CACHE_SIZE - pg_offset - + copy_size); + } kunmap(page); } flush_dcache_page(page); @@ -4259,7 +4698,7 @@ insert: } err = 0; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that @@ -4299,7 +4738,7 @@ insert: err = 0; } } - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); out: if (path) btrfs_free_path(path); @@ -4398,13 +4837,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + + /* + * we have the page locked, so new writeback can't start, + * and the dirty bit won't be cleared while we are here. + * + * Wait for IO on this page so that we can safely clear + * the PagePrivate2 bit and do ordered accounting + */ wait_on_page_writeback(page); + tree = &BTRFS_I(page->mapping->host)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent(tree, page_start, page_end, GFP_NOFS); ordered = btrfs_lookup_ordered_extent(page->mapping->host, page_offset(page)); @@ -4415,16 +4862,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED, 1, 0, GFP_NOFS); - btrfs_finish_ordered_io(page->mapping->host, - page_start, page_end); + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, + NULL, GFP_NOFS); + /* + * whoever cleared the private bit is responsible + * for the finish_ordered_io + */ + if (TestClearPagePrivate2(page)) { + btrfs_finish_ordered_io(page->mapping->host, + page_start, page_end); + } btrfs_put_ordered_extent(ordered); lock_extent(tree, page_start, page_end, GFP_NOFS); } clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_ORDERED, - 1, 1, GFP_NOFS); + EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); @@ -4473,6 +4926,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out; } + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + ret = VM_FAULT_SIGBUS; + goto out; + } + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); @@ -4504,7 +4964,24 @@ again: goto again; } - btrfs_set_extent_delalloc(inode, page_start, page_end); + /* + * XXX - page_mkwrite gets called every time the page is dirtied, even + * if it was already dirty, so for space accounting reasons we need to + * clear any delalloc bits for the range we are fixing to save. There + * is probably a better way to do this, but for now keep consistent with + * prepare_pages in the normal write path. + */ + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); + if (ret) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + ret = VM_FAULT_SIGBUS; + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + goto out_unlock; + } ret = 0; /* page is wholly or partially inside EOF */ @@ -4521,11 +4998,17 @@ again: } ClearPageChecked(page); set_page_dirty(page); + SetPageUptodate(page); + + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + if (!ret) + return VM_FAULT_LOCKED; unlock_page(page); out: return ret; @@ -4544,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - btrfs_truncate_page(inode->i_mapping, inode->i_size); + ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (ret) + return; btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); trans = btrfs_start_transaction(root, 1); @@ -4594,11 +5079,11 @@ out: * create a new subvolume directory/inode (helper for the ioctl). */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, struct dentry *dentry, + struct btrfs_root *new_root, u64 new_dirid, u64 alloc_hint) { struct inode *inode; - int error; + int err; u64 index = 0; inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, @@ -4611,11 +5096,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, inode->i_nlink = 1; btrfs_i_size_write(inode, 0); - error = btrfs_update_inode(trans, new_root, inode); - if (error) - return error; + err = btrfs_update_inode(trans, new_root, inode); + BUG_ON(err); - d_instantiate(dentry, inode); + iput(inode); return 0; } @@ -4640,7 +5124,11 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) if (!ei) return NULL; ei->last_trans = 0; + ei->last_sub_trans = 0; ei->logged_trans = 0; + ei->outstanding_extents = 0; + ei->reserved_extents = 0; + spin_lock_init(&ei->accounting_lock); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->ordered_operations); @@ -4693,6 +5181,16 @@ void btrfs_destroy_inode(struct inode *inode) kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } +void btrfs_drop_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) + generic_delete_inode(inode); + else + generic_drop_inode(inode); +} + static void init_once(void *foo) { struct btrfs_inode *ei = (struct btrfs_inode *) foo; @@ -4761,31 +5259,37 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec ctime = CURRENT_TIME; u64 index = 0; + u64 root_objectid; int ret; - /* we're not allowed to rename between subvolumes */ - if (BTRFS_I(old_inode)->root->root_key.objectid != - BTRFS_I(new_dir)->root->root_key.objectid) + if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return -EPERM; + + /* we only allow rename subvolume link between subvolumes */ + if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; + if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || + (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) + return -ENOTEMPTY; + if (S_ISDIR(old_inode->i_mode) && new_inode && - new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - } - /* to rename a snapshot or subvolume, we need to juggle the - * backrefs. This isn't coded yet + /* + * 2 items for dir items + * 1 item for orphan entry + * 1 item for ref */ - if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) - return -EXDEV; - - ret = btrfs_check_metadata_free_space(root); + ret = btrfs_reserve_metadata_space(root, 4); if (ret) - goto out_unlock; + return ret; /* * we're using rename to replace one file with another. @@ -4796,8 +5300,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(old_inode->i_mapping); + /* close the racy window with snapshot create/destroy ioctl */ + if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&root->fs_info->subvol_sem); + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, new_dir); + + if (dest != root) + btrfs_record_root_in_trans(trans, dest); + ret = btrfs_set_inode_index(new_dir, &index); + if (ret) + goto out_fail; + + if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + /* force full log commit if subvolume involved. */ + root->fs_info->last_trans_log_full_commit = trans->transid; + } else { + ret = btrfs_insert_inode_ref(trans, dest, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_inode->i_ino, + new_dir->i_ino, index); + if (ret) + goto out_fail; + /* + * this is an ugly little race, but the rename is required + * to make sure that if we crash, the inode is either at the + * old name or the new one. pinning the log transaction lets + * us make sure we don't allow a log commit to come in after + * we unlink the name but before we add the new name back in. + */ + btrfs_pin_log_trans(root); + } /* * make sure the inode gets flushed if it is replacing * something. @@ -4807,18 +5343,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_add_ordered_operation(trans, root, old_inode); } - /* - * this is an ugly little race, but the rename is required to make - * sure that if we crash, the inode is either at the old name - * or the new one. pinning the log transaction lets us make sure - * we don't allow a log commit to come in after we unlink the - * name but before we add the new name back in. - */ - btrfs_pin_log_trans(root); - - btrfs_set_trans_block_group(trans, new_dir); - - btrfs_inc_nlink(old_dentry->d_inode); old_dir->i_ctime = old_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; @@ -4826,47 +5350,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); - ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, - old_dentry->d_name.name, - old_dentry->d_name.len); - if (ret) - goto out_fail; + if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, + old_dentry->d_name.name, + old_dentry->d_name.len); + } else { + btrfs_inc_nlink(old_dentry->d_inode); + ret = btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + } + BUG_ON(ret); if (new_inode) { new_inode->i_ctime = CURRENT_TIME; - ret = btrfs_unlink_inode(trans, root, new_dir, - new_dentry->d_inode, - new_dentry->d_name.name, - new_dentry->d_name.len); - if (ret) - goto out_fail; + if (unlikely(new_inode->i_ino == + BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + root_objectid = BTRFS_I(new_inode)->location.objectid; + ret = btrfs_unlink_subvol(trans, dest, new_dir, + root_objectid, + new_dentry->d_name.name, + new_dentry->d_name.len); + BUG_ON(new_inode->i_nlink == 0); + } else { + ret = btrfs_unlink_inode(trans, dest, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + } + BUG_ON(ret); if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, new_dentry->d_inode); - if (ret) - goto out_fail; + BUG_ON(ret); } - } - ret = btrfs_set_inode_index(new_dir, &index); - if (ret) - goto out_fail; - ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, - old_inode, new_dentry->d_name.name, - new_dentry->d_name.len, 1, index); - if (ret) - goto out_fail; + ret = btrfs_add_link(trans, new_dir, old_inode, + new_dentry->d_name.name, + new_dentry->d_name.len, 0, index); + BUG_ON(ret); - btrfs_log_new_name(trans, old_inode, old_dir, - new_dentry->d_parent); + if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_log_new_name(trans, old_inode, old_dir, + new_dentry->d_parent); + btrfs_end_log_trans(root); + } out_fail: - - /* this btrfs_end_log_trans just allows the current - * log-sub transaction to complete - */ - btrfs_end_log_trans(root); btrfs_end_transaction_throttle(trans, root); -out_unlock: + + if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&root->fs_info->subvol_sem); + + btrfs_unreserve_metadata_space(root, 4); return ret; } @@ -4938,11 +5475,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; - err = btrfs_check_metadata_free_space(root); + /* + * 2 items for inode item and ref + * 2 items for dir items + * 1 item for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto out_fail; + return err; trans = btrfs_start_transaction(root, 1); + if (!trans) + goto out_fail; btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); @@ -5023,6 +5567,7 @@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); out_fail: + btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -5044,6 +5589,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); + + ret = btrfs_reserve_metadata_space(root, 1); + if (ret) + goto out; + ret = btrfs_reserve_extent(trans, root, alloc_size, root->sectorsize, 0, alloc_hint, (u64)-1, &ins, 1); @@ -5058,9 +5608,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset -1, 0); num_bytes -= ins.offset; cur_offset += ins.offset; alloc_hint = ins.objectid + ins.offset; + btrfs_unreserve_metadata_space(root, 1); } out: if (cur_offset > start) { @@ -5201,7 +5754,7 @@ static int btrfs_permission(struct inode *inode, int mask) return generic_permission(inode, mask, btrfs_check_acl); } -static struct inode_operations btrfs_dir_inode_operations = { +static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, .create = btrfs_create, @@ -5219,11 +5772,12 @@ static struct inode_operations btrfs_dir_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, }; -static struct inode_operations btrfs_dir_ro_inode_operations = { +static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, }; -static struct file_operations btrfs_dir_file_operations = { + +static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = btrfs_real_readdir, @@ -5245,6 +5799,8 @@ static struct extent_io_ops btrfs_extent_io_ops = { .readpage_io_failed_hook = btrfs_io_failed_hook, .set_bit_hook = btrfs_set_bit_hook, .clear_bit_hook = btrfs_clear_bit_hook, + .merge_extent_hook = btrfs_merge_extent_hook, + .split_extent_hook = btrfs_split_extent_hook, }; /* @@ -5259,7 +5815,7 @@ static struct extent_io_ops btrfs_extent_io_ops = { * * For now we're avoiding this by dropping bmap. */ -static struct address_space_operations btrfs_aops = { +static const struct address_space_operations btrfs_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, @@ -5269,16 +5825,17 @@ static struct address_space_operations btrfs_aops = { .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, .set_page_dirty = btrfs_set_page_dirty, + .error_remove_page = generic_error_remove_page, }; -static struct address_space_operations btrfs_symlink_aops = { +static const struct address_space_operations btrfs_symlink_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, }; -static struct inode_operations btrfs_file_inode_operations = { +static const struct inode_operations btrfs_file_inode_operations = { .truncate = btrfs_truncate, .getattr = btrfs_getattr, .setattr = btrfs_setattr, @@ -5290,7 +5847,7 @@ static struct inode_operations btrfs_file_inode_operations = { .fallocate = btrfs_fallocate, .fiemap = btrfs_fiemap, }; -static struct inode_operations btrfs_special_inode_operations = { +static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, @@ -5299,7 +5856,7 @@ static struct inode_operations btrfs_special_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, }; -static struct inode_operations btrfs_symlink_inode_operations = { +static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, @@ -5309,3 +5866,7 @@ static struct inode_operations btrfs_symlink_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, }; + +const struct dentry_operations btrfs_dentry_operations = { + .d_delete = btrfs_dentry_delete, +}; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bd88f25..cdbb054 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root, struct btrfs_root_item root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; - struct btrfs_root *new_root = root; - struct inode *dir; + struct btrfs_root *new_root; + struct inode *dir = dentry->d_parent->d_inode; int ret; int err; u64 objectid; @@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root, u64 index = 0; unsigned long nr = 1; - ret = btrfs_check_metadata_free_space(root); + /* + * 1 - inode item + * 2 - refs + * 1 - root item + * 2 - dir items + */ + ret = btrfs_reserve_metadata_space(root, 6); if (ret) - goto fail_commit; + return ret; trans = btrfs_start_transaction(root, 1); BUG_ON(!trans); @@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root, if (ret) goto fail; + key.offset = (u64)-1; + new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(IS_ERR(new_root)); + + btrfs_record_root_in_trans(trans, new_root); + + ret = btrfs_create_subvol_root(trans, new_root, new_dirid, + BTRFS_I(dir)->block_group); /* * insert the directory item */ - key.offset = (u64)-1; - dir = dentry->d_parent->d_inode; ret = btrfs_set_inode_index(dir, &index); BUG_ON(ret); @@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root, ret = btrfs_update_inode(trans, root, dir); BUG_ON(ret); - /* add the backref first */ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, - objectid, BTRFS_ROOT_BACKREF_KEY, - root->root_key.objectid, + objectid, root->root_key.objectid, dir->i_ino, index, name, namelen); BUG_ON(ret); - /* now add the forward ref */ - ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, - root->root_key.objectid, BTRFS_ROOT_REF_KEY, - objectid, - dir->i_ino, index, name, namelen); - - BUG_ON(ret); - - ret = btrfs_commit_transaction(trans, root); - if (ret) - goto fail_commit; - - new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); - BUG_ON(!new_root); - - trans = btrfs_start_transaction(new_root, 1); - BUG_ON(!trans); - - ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid, - BTRFS_I(dir)->block_group); - if (ret) - goto fail; - + d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: nr = trans->blocks_used; - err = btrfs_commit_transaction(trans, new_root); + err = btrfs_commit_transaction(trans, root); if (err && !ret) ret = err; -fail_commit: + + btrfs_unreserve_metadata_space(root, 6); btrfs_btree_balance_dirty(root, nr); return ret; } @@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!root->ref_cows) return -EINVAL; - ret = btrfs_check_metadata_free_space(root); + /* + * 1 - inode item + * 2 - refs + * 1 - root item + * 2 - dir items + */ + ret = btrfs_reserve_metadata_space(root, 6); if (ret) goto fail_unlock; pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) { ret = -ENOMEM; + btrfs_unreserve_metadata_space(root, 6); goto fail_unlock; } pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); if (!pending_snapshot->name) { ret = -ENOMEM; kfree(pending_snapshot); + btrfs_unreserve_metadata_space(root, 6); goto fail_unlock; } memcpy(pending_snapshot->name, name, namelen); @@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) * sys_mkdirat and vfs_mkdir, but we only do a single component lookup * inside this filesystem so it's quite a bit simpler. */ -static noinline int btrfs_mksubvol(struct path *parent, char *name, - int mode, int namelen, +static noinline int btrfs_mksubvol(struct path *parent, + char *name, int namelen, struct btrfs_root *snap_src) { + struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; int error; - mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); dentry = lookup_one_len(name, parent->dentry, namelen); error = PTR_ERR(dentry); @@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, if (dentry->d_inode) goto out_dput; - if (!IS_POSIXACL(parent->dentry->d_inode)) - mode &= ~current_umask(); - error = mnt_want_write(parent->mnt); if (error) goto out_dput; - error = btrfs_may_create(parent->dentry->d_inode, dentry); + error = btrfs_may_create(dir, dentry); if (error) goto out_drop_write; - /* - * Actually perform the low-level subvolume creation after all - * this VFS fuzz. - * - * Eventually we want to pass in an inode under which we create this - * subvolume, but for now all are under the filesystem root. - * - * Also we should pass on the mode eventually to allow creating new - * subvolume with specific mode bits. - */ + down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); + + if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) + goto out_up_read; + if (snap_src) { - struct dentry *dir = dentry->d_parent; - struct dentry *test = dir->d_parent; - struct btrfs_path *path = btrfs_alloc_path(); - int ret; - u64 test_oid; - u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid; - - test_oid = snap_src->root_key.objectid; - - ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, - path, parent_oid, test_oid); - if (ret == 0) - goto create; - btrfs_release_path(snap_src->fs_info->tree_root, path); - - /* we need to make sure we aren't creating a directory loop - * by taking a snapshot of something that has our current - * subvol in its directory tree. So, this loops through - * the dentries and checks the forward refs for each subvolume - * to see if is references the subvolume where we are - * placing this new snapshot. - */ - while (1) { - if (!test || - dir == snap_src->fs_info->sb->s_root || - test == snap_src->fs_info->sb->s_root || - test->d_inode->i_sb != snap_src->fs_info->sb) { - break; - } - if (S_ISLNK(test->d_inode->i_mode)) { - printk(KERN_INFO "Btrfs symlink in snapshot " - "path, failed\n"); - error = -EMLINK; - btrfs_free_path(path); - goto out_drop_write; - } - test_oid = - BTRFS_I(test->d_inode)->root->root_key.objectid; - ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, - path, test_oid, parent_oid); - if (ret == 0) { - printk(KERN_INFO "Btrfs snapshot creation " - "failed, looping\n"); - error = -EMLINK; - btrfs_free_path(path); - goto out_drop_write; - } - btrfs_release_path(snap_src->fs_info->tree_root, path); - test = test->d_parent; - } -create: - btrfs_free_path(path); - error = create_snapshot(snap_src, dentry, name, namelen); + error = create_snapshot(snap_src, dentry, + name, namelen); } else { - error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, - dentry, name, namelen); + error = create_subvol(BTRFS_I(dir)->root, dentry, + name, namelen); } - if (error) - goto out_drop_write; - - fsnotify_mkdir(parent->dentry->d_inode, dentry); + if (!error) + fsnotify_mkdir(dir, dentry); +out_up_read: + up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); out_drop_write: mnt_drop_write(parent->mnt); out_dput: dput(dentry); out_unlock: - mutex_unlock(&parent->dentry->d_inode->i_mutex); + mutex_unlock(&dir->i_mutex); return error; } - static int btrfs_defrag_file(struct file *file) { struct inode *inode = fdentry(file)->d_inode; @@ -596,9 +534,8 @@ again: clear_page_dirty_for_io(page); btrfs_set_extent_delalloc(inode, page_start, page_end); - - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); unlock_page(page); page_cache_release(page); balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); @@ -609,7 +546,8 @@ out_unlock: return 0; } -static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) +static noinline int btrfs_ioctl_resize(struct btrfs_root *root, + void __user *arg) { u64 new_size; u64 old_size; @@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, { struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_dir_item *di; - struct btrfs_path *path; struct file *src_file; - u64 root_dirid; int namelen; int ret = 0; @@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, goto out; } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, - di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, - path, root_dirid, - vol_args->name, namelen, 0); - btrfs_free_path(path); - - if (di && !IS_ERR(di)) { - ret = -EEXIST; - goto out; - } - - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } - if (subvol) { - ret = btrfs_mksubvol(&file->f_path, vol_args->name, - file->f_path.dentry->d_inode->i_mode, - namelen, NULL); + ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, + NULL); } else { struct inode *src_inode; src_file = fget(vol_args->fd); @@ -781,17 +693,157 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, fput(src_file); goto out; } - ret = btrfs_mksubvol(&file->f_path, vol_args->name, - file->f_path.dentry->d_inode->i_mode, - namelen, BTRFS_I(src_inode)->root); + ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, + BTRFS_I(src_inode)->root); fput(src_file); } - out: kfree(vol_args); return ret; } +/* + * helper to check if the subvolume references other subvolumes + */ +static noinline int may_destroy_subvol(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, + &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = 0; + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == root->root_key.objectid && + key.type == BTRFS_ROOT_REF_KEY) + ret = -ENOTEMPTY; + } +out: + btrfs_free_path(path); + return ret; +} + +static noinline int btrfs_ioctl_snap_destroy(struct file *file, + void __user *arg) +{ + struct dentry *parent = fdentry(file); + struct dentry *dentry; + struct inode *dir = parent->d_inode; + struct inode *inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *dest = NULL; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + int namelen; + int ret; + int err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + if (strchr(vol_args->name, '/') || + strncmp(vol_args->name, "..", namelen) == 0) { + err = -EINVAL; + goto out; + } + + err = mnt_want_write(file->f_path.mnt); + if (err) + goto out; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + dentry = lookup_one_len(vol_args->name, parent, namelen); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_unlock_dir; + } + + if (!dentry->d_inode) { + err = -ENOENT; + goto out_dput; + } + + inode = dentry->d_inode; + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { + err = -EINVAL; + goto out_dput; + } + + dest = BTRFS_I(inode)->root; + + mutex_lock(&inode->i_mutex); + err = d_invalidate(dentry); + if (err) + goto out_unlock; + + down_write(&root->fs_info->subvol_sem); + + err = may_destroy_subvol(dest); + if (err) + goto out_up_write; + + trans = btrfs_start_transaction(root, 1); + ret = btrfs_unlink_subvol(trans, root, dir, + dest->root_key.objectid, + dentry->d_name.name, + dentry->d_name.len); + BUG_ON(ret); + + btrfs_record_root_in_trans(trans, dest); + + memset(&dest->root_item.drop_progress, 0, + sizeof(dest->root_item.drop_progress)); + dest->root_item.drop_level = 0; + btrfs_set_root_refs(&dest->root_item, 0); + + ret = btrfs_insert_orphan_item(trans, + root->fs_info->tree_root, + dest->root_key.objectid); + BUG_ON(ret); + + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + inode->i_flags |= S_DEAD; +out_up_write: + up_write(&root->fs_info->subvol_sem); +out_unlock: + mutex_unlock(&inode->i_mutex); + if (!err) { + shrink_dcache_sb(root->fs_info->sb); + btrfs_invalidate_inodes(dest); + d_delete(dentry); + } +out_dput: + dput(dentry); +out_unlock_dir: + mutex_unlock(&dir->i_mutex); + mnt_drop_write(file->f_path.mnt); +out: + kfree(vol_args); + return err; +} + static int btrfs_ioctl_defrag(struct file *file) { struct inode *inode = fdentry(file)->d_inode; @@ -865,8 +917,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) return ret; } -static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -976,7 +1028,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, /* punch hole in destination first */ btrfs_drop_extents(trans, root, inode, off, off + len, - off + len, 0, &hint_byte); + off + len, 0, &hint_byte, 1); /* clone data */ key.objectid = src->i_ino; @@ -1071,9 +1123,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, datao += off - key.offset; datal -= off - key.offset; } - if (key.offset + datao + datal + key.offset > - off + len) - datal = off + len - key.offset - datao; + + if (key.offset + datal > off + len) + datal = off + len - key.offset; + /* disko == 0 means it's a hole */ if (!disko) datao = 0; @@ -1182,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file) struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - int ret = 0; + int ret; + ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + goto out; - if (file->private_data) { - ret = -EINPROGRESS; + ret = -EINPROGRESS; + if (file->private_data) goto out; - } ret = mnt_want_write(file->f_path.mnt); if (ret) @@ -1200,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file) root->fs_info->open_ioctl_trans++; mutex_unlock(&root->fs_info->trans_mutex); + ret = -ENOMEM; trans = btrfs_start_ioctl_transaction(root, 0); - if (trans) - file->private_data = trans; - else - ret = -ENOMEM; - /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ + if (!trans) + goto out_drop; + + file->private_data = trans; + return 0; + +out_drop: + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + mnt_drop_write(file->f_path.mnt); out: return ret; } @@ -1221,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file) struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - int ret = 0; trans = file->private_data; - if (!trans) { - ret = -EINVAL; - goto out; - } - btrfs_end_transaction(trans, root); + if (!trans) + return -EINVAL; file->private_data = NULL; + btrfs_end_transaction(trans, root); + mutex_lock(&root->fs_info->trans_mutex); root->fs_info->open_ioctl_trans--; mutex_unlock(&root->fs_info->trans_mutex); mnt_drop_write(file->f_path.mnt); - -out: - return ret; + return 0; } long btrfs_ioctl(struct file *file, unsigned int @@ -1258,6 +1314,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_SNAP_DESTROY: + return btrfs_ioctl_snap_destroy(file, argp); case BTRFS_IOC_DEFRAG: return btrfs_ioctl_defrag(file); case BTRFS_IOC_RESIZE: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index b320b10..bc49914 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args { #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ struct btrfs_ioctl_vol_args) - +#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ + struct btrfs_ioctl_vol_args) #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index d6f0806..5799bc4 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * * len is the length of the extent * - * This also sets the EXTENT_ORDERED bit on the range in the inode. - * * The tree is given a single reference on the ordered extent that was * inserted. */ @@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->start = start; entry->len = len; entry->disk_len = disk_len; + entry->bytes_left = len; entry->inode = inode; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, &entry->rb_node); BUG_ON(node); - set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, - entry_end(entry) - 1, GFP_NOFS); - spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_add_tail(&entry->root_extent_list, &BTRFS_I(inode)->root->fs_info->ordered_extents); @@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; int ret; tree = &BTRFS_I(inode)->ordered_tree; mutex_lock(&tree->mutex); - clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, - GFP_NOFS); node = tree_search(tree, file_offset); if (!node) { ret = 1; @@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, goto out; } - ret = test_range_bit(io_tree, entry->file_offset, - entry->file_offset + entry->len - 1, - EXTENT_ORDERED, 0); - if (ret == 0) + if (io_size > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)io_size); + } + entry->bytes_left -= io_size; + if (entry->bytes_left == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; out: mutex_unlock(&tree->mutex); return ret == 0; @@ -308,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode, tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, + inode, 1); + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); @@ -460,7 +464,7 @@ void btrfs_start_ordered_extent(struct inode *inode, * start IO on any dirty ones so the wait doesn't stall waiting * for pdflush to find them */ - btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -476,6 +480,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) u64 orig_end; u64 wait_end; struct btrfs_ordered_extent *ordered; + int found; if (start + len < start) { orig_end = INT_LIMIT(loff_t); @@ -489,19 +494,18 @@ again: /* start IO across the range first to instantiate any delalloc * extents */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); /* The compression code will leave pages locked but return from * writepage without setting the page writeback. Starting again * with WB_SYNC_ALL will end up waiting for the IO to actually start. */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - btrfs_wait_on_page_writeback_range(inode->i_mapping, - start >> PAGE_CACHE_SHIFT, - orig_end >> PAGE_CACHE_SHIFT); + filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; + found = 0; while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, end); if (!ordered) @@ -514,6 +518,7 @@ again: btrfs_put_ordered_extent(ordered); break; } + found++; btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; btrfs_put_ordered_extent(ordered); @@ -521,8 +526,8 @@ again: break; end--; } - if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, - EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { + if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, + EXTENT_DELALLOC, 0, NULL)) { schedule_timeout(1); goto again; } @@ -613,7 +618,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, */ if (test_range_bit(io_tree, disk_i_size, ordered->file_offset + ordered->len - 1, - EXTENT_DELALLOC, 0)) { + EXTENT_DELALLOC, 0, NULL)) { goto out; } /* @@ -664,7 +669,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, */ if (i_size_test > entry_end(ordered) && !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, - EXTENT_DELALLOC, 0)) { + EXTENT_DELALLOC, 0, NULL)) { new_i_size = min_t(u64, i_size_test, i_size_read(inode)); } BTRFS_I(inode)->disk_i_size = new_i_size; @@ -715,90 +720,6 @@ out: } -/** - * taken from mm/filemap.c because it isn't exported - * - * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range - * @mapping: address space structure to write - * @start: offset in bytes where the range starts - * @end: offset in bytes where the range ends (inclusive) - * @sync_mode: enable synchronous operation - * - * Start writeback against all of a mapping's dirty pages that lie - * within the byte offsets <start, end> inclusive. - * - * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory cleansing writeback. The difference between - * these two operations is that if a dirty page/buffer is encountered, it must - * be waited upon, and not just skipped over. - */ -int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode) -{ - struct writeback_control wbc = { - .sync_mode = sync_mode, - .nr_to_write = mapping->nrpages * 2, - .range_start = start, - .range_end = end, - .for_writepages = 1, - }; - return btrfs_writepages(mapping, &wbc); -} - -/** - * taken from mm/filemap.c because it isn't exported - * - * wait_on_page_writeback_range - wait for writeback to complete - * @mapping: target address_space - * @start: beginning page index - * @end: ending page index - * - * Wait for writeback to complete against pages indexed by start->end - * inclusive - */ -int btrfs_wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end) -{ - struct pagevec pvec; - int nr_pages; - int ret = 0; - pgoff_t index; - - if (end < start) - return 0; - - pagevec_init(&pvec, 0); - index = start; - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { - unsigned i; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* until radix tree lookup accepts end_index */ - if (page->index > end) - continue; - - wait_on_page_writeback(page); - if (PageError(page)) - ret = -EIO; - } - pagevec_release(&pvec); - cond_resched(); - } - - /* Check for outstanding write errors */ - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) - ret = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) - ret = -EIO; - - return ret; -} - /* * add a given inode to the list of inodes that must be fully on * disk before a transaction commit finishes. diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 3d31c88..f82e874 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -85,6 +85,9 @@ struct btrfs_ordered_extent { /* extent length on disk */ u64 disk_len; + /* number of bytes that still need writing */ + u64 bytes_left; + /* flags (described above) */ unsigned long flags; @@ -150,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); int btrfs_ordered_update_i_size(struct inode *inode, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end); -int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 3c0d52a..79cba5f 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -65,3 +65,23 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c04f7f2..cfcc93c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -121,6 +121,15 @@ struct inodevec { int nr; }; +#define MAX_EXTENTS 128 + +struct file_extent_cluster { + u64 start; + u64 end; + u64 boundary[MAX_EXTENTS]; + unsigned int nr; +}; + struct reloc_control { /* block group to relocate */ struct btrfs_block_group_cache *block_group; @@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize, struct reloc_control *rc) { if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, 1)) + bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) return 1; return 0; } @@ -2529,56 +2538,94 @@ out: } static noinline_for_stack -int relocate_inode_pages(struct inode *inode, u64 start, u64 len) +int setup_extent_mapping(struct inode *inode, u64 start, u64 end, + u64 block_start) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret = 0; + + em = alloc_extent_map(GFP_NOFS); + if (!em) + return -ENOMEM; + + em->start = start; + em->len = end + 1 - start; + em->block_len = em->len; + em->block_start = block_start; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, end, 0); + } + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + return ret; +} + +static int relocate_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) { u64 page_start; u64 page_end; - unsigned long i; - unsigned long first_index; + u64 offset = BTRFS_I(inode)->index_cnt; + unsigned long index; unsigned long last_index; - unsigned int total_read = 0; - unsigned int total_dirty = 0; + unsigned int dirty_page = 0; struct page *page; struct file_ra_state *ra; - struct btrfs_ordered_extent *ordered; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int nr = 0; int ret = 0; + if (!cluster->nr) + return 0; + ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) return -ENOMEM; + index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + mutex_lock(&inode->i_mutex); - first_index = start >> PAGE_CACHE_SHIFT; - last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; - /* make sure the dirty trick played by the caller work */ - while (1) { - ret = invalidate_inode_pages2_range(inode->i_mapping, - first_index, last_index); - if (ret != -EBUSY) - break; - schedule_timeout(HZ/10); - } + i_size_write(inode, cluster->end + 1 - offset); + ret = setup_extent_mapping(inode, cluster->start - offset, + cluster->end - offset, cluster->start); if (ret) goto out_unlock; file_ra_state_init(ra, inode->i_mapping); - for (i = first_index ; i <= last_index; i++) { - if (total_read % ra->ra_pages == 0) { - btrfs_force_ra(inode->i_mapping, ra, NULL, i, - min(last_index, ra->ra_pages + i - 1)); - } - total_read++; -again: - if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) - BUG_ON(1); - page = grab_cache_page(inode->i_mapping, i); + WARN_ON(cluster->start != cluster->boundary[0]); + while (index <= last_index) { + page = find_lock_page(inode->i_mapping, index); if (!page) { - ret = -ENOMEM; - goto out_unlock; + page_cache_sync_readahead(inode->i_mapping, + ra, NULL, index, + last_index + 1 - index); + page = grab_cache_page(inode->i_mapping, index); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + } + + if (PageReadahead(page)) { + page_cache_async_readahead(inode->i_mapping, + ra, NULL, page, index, + last_index + 1 - index); } + if (!PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); @@ -2589,75 +2636,79 @@ again: goto out_unlock; } } - wait_on_page_writeback(page); page_start = (u64)page->index << PAGE_CACHE_SHIFT; page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(io_tree, page_start, page_end, GFP_NOFS); - - ordered = btrfs_lookup_ordered_extent(inode, page_start); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - unlock_page(page); - page_cache_release(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - goto again; - } + + lock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); - if (i == first_index) - set_extent_bits(io_tree, page_start, page_end, + if (nr < cluster->nr && + page_start + offset == cluster->boundary[nr]) { + set_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end, EXTENT_BOUNDARY, GFP_NOFS); + nr++; + } btrfs_set_extent_delalloc(inode, page_start, page_end); set_page_dirty(page); - total_dirty++; + dirty_page++; - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end, GFP_NOFS); unlock_page(page); page_cache_release(page); + + index++; + if (nr < cluster->nr && + page_end + 1 + offset == cluster->boundary[nr]) { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + dirty_page); + dirty_page = 0; + } + } + if (dirty_page) { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + dirty_page); } + WARN_ON(nr != cluster->nr); out_unlock: mutex_unlock(&inode->i_mutex); kfree(ra); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); return ret; } static noinline_for_stack -int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, + struct file_extent_cluster *cluster) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_map *em; - u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; - u64 end = start + extent_key->offset - 1; - - em = alloc_extent_map(GFP_NOFS); - em->start = start; - em->len = extent_key->offset; - em->block_len = extent_key->offset; - em->block_start = extent_key->objectid; - em->bdev = root->fs_info->fs_devices->latest_bdev; - set_bit(EXTENT_FLAG_PINNED, &em->flags); + int ret; - /* setup extent map to cheat btrfs_readpage */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - while (1) { - int ret; - spin_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, start, end, 0); + if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; } - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - return relocate_inode_pages(inode, start, extent_key->offset); + if (!cluster->nr) + cluster->start = extent_key->objectid; + else + BUG_ON(cluster->nr >= MAX_EXTENTS); + cluster->end = extent_key->objectid + extent_key->offset - 1; + cluster->boundary[cluster->nr] = extent_key->objectid; + cluster->nr++; + + if (cluster->nr >= MAX_EXTENTS) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + return 0; } #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 @@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags) return 0; } + static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct rb_root blocks = RB_ROOT; struct btrfs_key key; + struct file_extent_cluster *cluster; struct btrfs_trans_handle *trans = NULL; struct btrfs_path *path; struct btrfs_extent_item *ei; @@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) int ret; int err = 0; + cluster = kzalloc(sizeof(*cluster), GFP_NOFS); + if (!cluster) + return -ENOMEM; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; + rc->extents_found = 0; + rc->extents_skipped = 0; + rc->search_start = rc->block_group->key.objectid; clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, GFP_NOFS); @@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_end_transaction(trans, rc->extent_root); trans = NULL; btrfs_btree_balance_dirty(rc->extent_root, nr); if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { rc->found_file_extent = 1; - ret = relocate_data_extent(rc->data_inode, &key); + ret = relocate_data_extent(rc->data_inode, + &key, cluster); if (ret < 0) { err = ret; break; @@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) btrfs_btree_balance_dirty(rc->extent_root, nr); } + if (!err) { + ret = relocate_file_extent_cluster(rc->data_inode, cluster); + if (ret < 0) + err = ret; + } + + kfree(cluster); + rc->create_reloc_root = 0; smp_mb(); @@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } static int __insert_orphan_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 size) + struct btrfs_root *root, u64 objectid) { struct btrfs_path *path; struct btrfs_inode_item *item; @@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); btrfs_set_inode_generation(leaf, item, 1); - btrfs_set_inode_size(leaf, item, size); + btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); btrfs_mark_buffer_dirty(leaf); @@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, if (err) goto out; - err = __insert_orphan_inode(trans, root, objectid, group->key.offset); - BUG_ON(err); - - err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, - group->key.offset, 0, group->key.offset, - 0, 0, 0); + err = __insert_orphan_inode(trans, root, objectid); BUG_ON(err); key.objectid = objectid; @@ -3455,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) BUG_ON(!rc->block_group); btrfs_init_workers(&rc->workers, "relocate", - fs_info->thread_pool_size); + fs_info->thread_pool_size, NULL); rc->extent_root = extent_root; btrfs_prepare_block_group_relocation(extent_root, rc->block_group); @@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_wait_ordered_extents(fs_info->tree_root, 0); while (1) { - mutex_lock(&fs_info->cleaner_mutex); - btrfs_clean_old_snapshots(fs_info->tree_root); - mutex_unlock(&fs_info->cleaner_mutex); - rc->extents_found = 0; rc->extents_skipped = 0; + mutex_lock(&fs_info->cleaner_mutex); + + btrfs_clean_old_snapshots(fs_info->tree_root); ret = relocate_block_group(rc); + + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { err = ret; break; @@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) } } - filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, - rc->block_group->key.objectid, - rc->block_group->key.objectid + - rc->block_group->key.offset - 1); + filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, + rc->block_group->key.objectid, + rc->block_group->key.objectid + + rc->block_group->key.offset - 1); WARN_ON(rc->block_group->pinned > 0); WARN_ON(rc->block_group->reserved > 0); @@ -3530,6 +3594,26 @@ out: return err; } +static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + trans = btrfs_start_transaction(root->fs_info->tree_root, 1); + + memset(&root->root_item.drop_progress, 0, + sizeof(root->root_item.drop_progress)); + root->root_item.drop_level = 0; + btrfs_set_root_refs(&root->root_item, 0); + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + BUG_ON(ret); + + ret = btrfs_end_transaction(trans, root->fs_info->tree_root); + BUG_ON(ret); + return 0; +} + /* * recover relocation interrupted by system crash. * @@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root = read_fs_root(root->fs_info, reloc_root->root_key.offset); if (IS_ERR(fs_root)) { - err = PTR_ERR(fs_root); - goto out; + ret = PTR_ERR(fs_root); + if (ret != -ENOENT) { + err = ret; + goto out; + } + mark_garbage_root(reloc_root); } } @@ -3613,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) mapping_tree_init(&rc->reloc_root_tree); INIT_LIST_HEAD(&rc->reloc_roots); btrfs_init_workers(&rc->workers, "relocate", - root->fs_info->thread_pool_size); + root->fs_info->thread_pool_size, NULL); rc->extent_root = root->fs_info->extent_root; set_reloc_control(rc); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 0ddc6d6..9351428 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, goto out; BUG_ON(ret == 0); + if (path->slots[0] == 0) { + ret = 1; + goto out; + } l = path->nodes[0]; - BUG_ON(path->slots[0] == 0); slot = path->slots[0] - 1; btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != objectid) { + if (found_key.objectid != objectid || + found_key.type != BTRFS_ROOT_ITEM_KEY) { ret = 1; goto out; } - read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), - sizeof(*item)); - memcpy(key, &found_key, sizeof(found_key)); + if (item) + read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), + sizeof(*item)); + if (key) + memcpy(key, &found_key, sizeof(found_key)); ret = 0; out: btrfs_free_path(path); @@ -249,6 +255,59 @@ err: return ret; } +int btrfs_find_orphan_roots(struct btrfs_root *tree_root) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_key key; + int err = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = 0; + + while (1) { + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(tree_root, path); + if (ret < 0) + err = ret; + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(tree_root, path); + + if (key.objectid != BTRFS_ORPHAN_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_find_dead_roots(tree_root, key.offset); + if (ret) { + err = ret; + break; + } + + key.offset++; + } + + btrfs_free_path(path); + return err; +} + /* drop the root item for 'key' from 'root' */ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key) @@ -278,31 +337,57 @@ out: return ret; } -#if 0 /* this will get used when snapshot deletion is implemented */ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, - u64 root_id, u8 type, u64 ref_id) + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len) + { + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; struct btrfs_key key; + unsigned long ptr; + int err = 0; int ret; - struct btrfs_path *path; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; key.objectid = root_id; - key.type = type; + key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = ref_id; - +again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - BUG_ON(ret); - - ret = btrfs_del_item(trans, tree_root, path); - BUG_ON(ret); + BUG_ON(ret < 0); + if (ret == 0) { + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + + WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); + WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); + ptr = (unsigned long)(ref + 1); + WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); + *sequence = btrfs_root_ref_sequence(leaf, ref); + + ret = btrfs_del_item(trans, tree_root, path); + BUG_ON(ret); + } else + err = -ENOENT; + + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(tree_root, path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } btrfs_free_path(path); - return ret; + return err; } -#endif int btrfs_find_root_ref(struct btrfs_root *tree_root, struct btrfs_path *path, @@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, return ret; } - /* * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY * or BTRFS_ROOT_BACKREF_KEY. @@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, - u64 root_id, u8 type, u64 ref_id, - u64 dirid, u64 sequence, + u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const char *name, int name_len) { struct btrfs_key key; @@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; unsigned long ptr; - path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; key.objectid = root_id; - key.type = type; + key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = ref_id; - +again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name_len); BUG_ON(ret); @@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, name, ptr, name_len); btrfs_mark_buffer_dirty(leaf); + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(tree_root, path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } + btrfs_free_path(path); - return ret; + return 0; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6d6d06c..752a546 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -51,7 +51,7 @@ #include "export.h" #include "compression.h" -static struct super_operations btrfs_super_ops; +static const struct super_operations btrfs_super_ops; static void btrfs_put_super(struct super_block *sb) { @@ -66,7 +66,8 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, + Opt_discard, Opt_err, }; static match_table_t tokens = { @@ -88,6 +89,7 @@ static match_table_t tokens = { {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, + {Opt_discard, "discard"}, {Opt_err, NULL}, }; @@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->metadata_ratio); } break; + case Opt_discard: + btrfs_set_opt(info->mount_opt, DISCARD); + break; default: break; } @@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; +#ifdef CONFIG_BTRFS_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; +#endif tree_root = open_ctree(sb, fs_devices, (char *)data); @@ -675,7 +682,8 @@ static int btrfs_unfreeze(struct super_block *sb) return 0; } -static struct super_operations btrfs_super_ops = { +static const struct super_operations btrfs_super_ops = { + .drop_inode = btrfs_drop_inode, .delete_inode = btrfs_delete_inode, .put_super = btrfs_put_super, .sync_fs = btrfs_sync_fs, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index cdbb502..bca82a4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, { if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); - WARN_ON(root->root_item.refs == 0); WARN_ON(root->commit_root != root->node); radix_tree_tag_set(&root->fs_info->fs_roots_radix, @@ -187,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->alloc_exclude_start = 0; h->delayed_ref_updates = 0; + if (!current->journal_info) + current->journal_info = h; + root->fs_info->running_transaction->use_count++; record_root_in_trans(h, root); mutex_unlock(&root->fs_info->trans_mutex); @@ -318,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, wake_up(&cur_trans->writer_wait); put_transaction(cur_trans); mutex_unlock(&info->trans_mutex); + + if (current->journal_info == trans) + current->journal_info = NULL; memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); @@ -339,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. This is used to make sure all of - * those extents are on disk for transaction or log commit + * those extents are sent to disk but does not wait on them */ -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) { int ret; int err = 0; @@ -389,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, page_cache_release(page); } } + if (err) + werr = err; + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit. We wait + * on all the pages and clear them from the dirty pages state tree + */ +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + while (1) { ret = find_first_extent_bit(dirty_pages, 0, &start, &end, EXTENT_DIRTY); @@ -419,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, return werr; } +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int ret2; + + ret = btrfs_write_marked_extents(root, dirty_pages); + ret2 = btrfs_wait_marked_extents(root, dirty_pages); + return ret || ret2; +} + int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -720,7 +764,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); key.objectid = objectid; - key.offset = 0; + /* record when the snapshot was created in key.offset */ + key.offset = trans->transid; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); old = btrfs_lock_root_node(root); @@ -743,6 +788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memcpy(&pending->root_key, &key, sizeof(key)); fail: kfree(new_root_item); + btrfs_unreserve_metadata_space(root, 6); return ret; } @@ -778,24 +824,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); - /* add the backref first */ ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, pending->root_key.objectid, - BTRFS_ROOT_BACKREF_KEY, parent_root->root_key.objectid, parent_inode->i_ino, index, pending->name, namelen); BUG_ON(ret); - /* now add the forward ref */ - ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, - parent_root->root_key.objectid, - BTRFS_ROOT_REF_KEY, - pending->root_key.objectid, - parent_inode->i_ino, index, pending->name, - namelen); - inode = btrfs_lookup_dentry(parent_inode, pending->dentry); d_instantiate(pending->dentry, inode); fail: @@ -874,7 +910,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, unsigned long timeout = 1; struct btrfs_transaction *cur_trans; struct btrfs_transaction *prev_trans = NULL; - struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); int ret; int should_grow = 0; @@ -915,13 +950,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return 0; } - pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS); - if (!pinned_copy) - return -ENOMEM; - - extent_io_tree_init(pinned_copy, - root->fs_info->btree_inode->i_mapping, GFP_NOFS); - trans->transaction->in_commit = 1; trans->transaction->blocked = 1; if (cur_trans->list.prev != &root->fs_info->trans_list) { @@ -1019,6 +1047,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = commit_cowonly_roots(trans, root); BUG_ON(ret); + btrfs_prepare_extent_commit(trans, root); + cur_trans = root->fs_info->running_transaction; spin_lock(&root->fs_info->new_trans_lock); root->fs_info->running_transaction = NULL; @@ -1042,8 +1072,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, sizeof(root->fs_info->super_copy)); - btrfs_copy_pinned(root, pinned_copy); - trans->transaction->blocked = 0; wake_up(&root->fs_info->transaction_wait); @@ -1059,8 +1087,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->fs_info->tree_log_mutex); - btrfs_finish_extent_commit(trans, root, pinned_copy); - kfree(pinned_copy); + btrfs_finish_extent_commit(trans, root); /* do the directory inserts of any pending snapshot creations */ finish_pending_snapshots(trans, root->fs_info); @@ -1078,6 +1105,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); + if (current->journal_info == trans) + current->journal_info = NULL; + kmem_cache_free(btrfs_trans_handle_cachep, trans); return ret; } @@ -1096,8 +1126,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) while (!list_empty(&list)) { root = list_entry(list.next, struct btrfs_root, root_list); - list_del_init(&root->root_list); - btrfs_drop_snapshot(root, 0); + list_del(&root->root_list); + + if (btrfs_header_backref_rev(root->node) < + BTRFS_MIXED_BACKREF_REV) + btrfs_drop_snapshot(root, 0); + else + btrfs_drop_snapshot(root, 1); } return 0; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 663c674..d4e3e7a 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { BTRFS_I(inode)->last_trans = trans->transaction->transid; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d91b0de..741666a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { + if (!root->log_start_pid) { + root->log_start_pid = current->pid; + root->log_multiple_pids = false; + } else if (root->log_start_pid != current->pid) { + root->log_multiple_pids = true; + } + root->log_batch++; atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); return 0; } + root->log_multiple_pids = false; + root->log_start_pid = current->pid; mutex_lock(&root->fs_info->tree_log_mutex); if (!root->fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, root->fs_info); @@ -263,8 +272,8 @@ static int process_one_buffer(struct btrfs_root *log, struct walk_control *wc, u64 gen) { if (wc->pin) - btrfs_update_pinned_extents(log->fs_info->extent_root, - eb->start, eb->len, 1); + btrfs_pin_extent(log->fs_info->extent_root, + eb->start, eb->len, 0); if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -534,7 +543,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, extent_end, start, &alloc_hint); + start, extent_end, extent_end, start, &alloc_hint, 1); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || @@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; + u64 log_transid = 0; mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; @@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { unsigned long batch = root->log_batch; - mutex_unlock(&root->log_mutex); - schedule_timeout_uninterruptible(1); - mutex_lock(&root->log_mutex); - + if (root->log_multiple_pids) { + mutex_unlock(&root->log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&root->log_mutex); + } wait_for_writer(trans, root); if (batch == root->log_batch) break; @@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); + /* we start IO on all the marked extents here, but we don't actually + * wait for them until later. + */ + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); btrfs_set_root_node(&log->root_item, log->node); root->log_batch = 0; + log_transid = root->log_transid; root->log_transid++; log->log_transid = root->log_transid; + root->log_start_pid = 0; smp_mb(); /* * log tree has been flushed to disk, new modifications of @@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * check the full commit flag again */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; @@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages); BUG_ON(ret); + btrfs_wait_marked_extents(log, &log->dirty_log_pages); btrfs_set_super_log_root(&root->fs_info->super_for_commit, log_root_tree->node->start); @@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * the running transaction open, so a full commit can't hop * in and cause problems either. */ - write_ctree_super(trans, root->fs_info->tree_root, 2); + write_ctree_super(trans, root->fs_info->tree_root, 1); ret = 0; + mutex_lock(&root->log_mutex); + if (root->last_log_commit < log_transid) + root->last_log_commit = log_transid; + mutex_unlock(&root->log_mutex); + out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); @@ -2605,7 +2629,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, extent); cs = btrfs_file_extent_offset(src, extent); cl = btrfs_file_extent_num_bytes(src, - extent);; + extent); if (btrfs_file_extent_compression(src, extent)) { cs = 0; @@ -2841,7 +2865,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) break; - if (parent == sb->s_root) + if (IS_ROOT(parent)) break; parent = parent->d_parent; @@ -2852,6 +2876,21 @@ out: return ret; } +static int inode_in_log(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == trans->transid && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -2880,11 +2919,22 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } + if (root != BTRFS_I(inode)->root || + btrfs_root_refs(&root->root_item) == 0) { + ret = 1; + goto end_no_trans; + } + ret = check_parent_dirs_for_sync(trans, inode, parent, sb, last_committed); if (ret) goto end_no_trans; + if (inode_in_log(trans, inode)) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; + } + start_log_trans(trans, root); ret = btrfs_log_inode(trans, root, inode, inode_only); @@ -2907,12 +2957,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, break; inode = parent->d_inode; + if (root != BTRFS_I(inode)->root) + break; + if (BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, inode_only); BUG_ON(ret); } - if (parent == sb->s_root) + if (IS_ROOT(parent)) break; parent = parent->d_parent; @@ -2951,7 +3004,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) struct btrfs_key tmp_key; struct btrfs_root *log; struct btrfs_fs_info *fs_info = log_root_tree->fs_info; - u64 highest_inode; struct walk_control wc = { .process_func = process_one_buffer, .stage = 0, @@ -3010,11 +3062,6 @@ again: path); BUG_ON(ret); } - ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); - if (ret == 0) { - wc.replay_dest->highest_inode = highest_inode; - wc.replay_dest->last_inode_alloc = highest_inode; - } key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index d09c760..0776eac 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,6 +19,9 @@ #ifndef __TREE_LOG_ #define __TREE_LOG_ +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ +#define BTRFS_NO_LOG_SYNC 256 + int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5dbefd1..7eda483 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -260,7 +260,7 @@ loop_lock: num_run++; batch_run++; - if (bio_sync(cur)) + if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) num_sync_run++; if (need_resched()) { @@ -276,7 +276,7 @@ loop_lock: * is now congested. Back off and let other work structs * run instead */ - if (pending && bdi_write_congested(bdi) && batch_run > 32 && + if (pending && bdi_write_congested(bdi) && batch_run > 8 && fs_info->fs_devices->open_devices > 1) { struct io_context *ioc; @@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) goto error; device->name = kstrdup(orig_dev->name, GFP_NOFS); - if (!device->name) + if (!device->name) { + kfree(device); goto error; + } device->devid = orig_dev->devid; device->work.func = pending_bios_fn; @@ -719,10 +721,9 @@ error: * called very infrequently and that a given device has a small number * of extents */ -static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 num_bytes, u64 *start, - u64 *max_avail) +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *max_avail) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; @@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, extent_root = root->fs_info->extent_root; em_tree = &root->fs_info->mapping_tree.map_tree; + ret = btrfs_can_relocate(extent_root, chunk_offset); + if (ret) + return -ENOSPC; + /* step one, relocate all the extents inside this chunk */ ret = btrfs_relocate_block_group(extent_root, chunk_offset); BUG_ON(ret); @@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, * step two, delete the device extents and the * chunk tree entries */ - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, chunk_offset, 1); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); BUG_ON(em->start > chunk_offset || em->start + em->len < chunk_offset); @@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); BUG_ON(ret); - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); kfree(map); em->bdev = NULL; @@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) struct btrfs_key found_key; u64 chunk_tree = chunk_root->root_key.objectid; u64 chunk_type; + bool retried = false; + int failed = 0; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; +again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; @@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) ret = btrfs_relocate_chunk(chunk_root, chunk_tree, found_key.objectid, found_key.offset); - BUG_ON(ret); + if (ret == -ENOSPC) + failed++; + else if (ret) + BUG(); } if (found_key.offset == 0) @@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) key.offset = found_key.offset - 1; } ret = 0; + if (failed && !retried) { + failed = 0; + retried = true; + goto again; + } else if (failed && retried) { + WARN_ON(1); + ret = -ENOSPC; + } error: btrfs_free_path(path); return ret; @@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root) continue; ret = btrfs_shrink_device(device, old_size - size_to_free); + if (ret == -ENOSPC) + break; BUG_ON(ret); trans = btrfs_start_transaction(dev_root, 1); @@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root) chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_chunk); - key.offset = found_key.offset; /* chunk zero is special */ - if (key.offset == 0) + if (found_key.offset == 0) break; btrfs_release_path(chunk_root, path); @@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root) chunk_root->root_key.objectid, found_key.objectid, found_key.offset); - BUG_ON(ret); + BUG_ON(ret && ret != -ENOSPC); + key.offset = found_key.offset - 1; } ret = 0; error: @@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 chunk_offset; int ret; int slot; + int failed = 0; + bool retried = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = &root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); + u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; if (new_size >= device->total_bytes) @@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto done; - } - path->reada = 2; lock_chunks(root); @@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) if (device->writeable) device->fs_devices->total_rw_bytes -= diff; unlock_chunks(root); - btrfs_end_transaction(trans, root); +again: key.objectid = device->devid; key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; @@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; if (ret) { ret = 0; + btrfs_release_path(root, path); break; } @@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) slot = path->slots[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); - if (key.objectid != device->devid) + if (key.objectid != device->devid) { + btrfs_release_path(root, path); break; + } dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); length = btrfs_dev_extent_length(l, dev_extent); - if (key.offset + length <= new_size) + if (key.offset + length <= new_size) { + btrfs_release_path(root, path); break; + } chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, chunk_offset); - if (ret) + if (ret && ret != -ENOSPC) goto done; + if (ret == -ENOSPC) + failed++; + key.offset -= 1; + } + + if (failed && !retried) { + failed = 0; + retried = true; + goto again; + } else if (failed && retried) { + ret = -ENOSPC; + lock_chunks(root); + + device->total_bytes = old_size; + if (device->writeable) + device->fs_devices->total_rw_bytes += diff; + unlock_chunks(root); + goto done; } /* Shrinking succeeded, else we would be at "done". */ @@ -2294,9 +2335,9 @@ again: em->block_len = em->len; em_tree = &extent_root->fs_info->mapping_tree.map_tree; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); BUG_ON(ret); free_extent_map(em); @@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) int readonly = 0; int i; - spin_lock(&map_tree->map_tree.lock); + read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - spin_unlock(&map_tree->map_tree.lock); + read_unlock(&map_tree->map_tree.lock); if (!em) return 1; @@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) struct extent_map *em; while (1) { - spin_lock(&tree->map_tree.lock); + write_lock(&tree->map_tree.lock); em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); if (em) remove_extent_mapping(&tree->map_tree, em); - spin_unlock(&tree->map_tree.lock); + write_unlock(&tree->map_tree.lock); if (!em) break; kfree(em->bdev); @@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) struct extent_map_tree *em_tree = &map_tree->map_tree; int ret; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, len); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); @@ -2604,9 +2645,9 @@ again: atomic_set(&multi->error, 0); } - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em && unplug_page) return 0; @@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 stripe_nr; int i, j, nr = 0; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, chunk_start, 1); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); BUG_ON(!em || em->start != chunk_start); map = (struct map_lookup *)em->bdev; @@ -2903,7 +2944,7 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); - if (bio_sync(bio)) + if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) pending_bios = &device->pending_sync_bios; else pending_bios = &device->pending_bios; @@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); - spin_lock(&map_tree->map_tree.lock); + read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); - spin_unlock(&map_tree->map_tree.lock); + read_unlock(&map_tree->map_tree.lock); /* already mapped? */ if (em && em->start <= logical && em->start + em->len > logical) { @@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, map->stripes[i].dev->in_fs_metadata = 1; } - spin_lock(&map_tree->map_tree.lock); + write_lock(&map_tree->map_tree.lock); ret = add_extent_mapping(&map_tree->map_tree, em); - spin_unlock(&map_tree->map_tree.lock); + write_unlock(&map_tree->map_tree.lock); BUG_ON(ret); free_extent_map(em); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5139a83..31b0fab 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root); void btrfs_unlock_volumes(void); void btrfs_lock_volumes(void); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *max_avail); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index a9d3bf4..b6dd596 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -260,7 +260,7 @@ err: * attributes are handled directly. */ struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, #endif diff --git a/fs/buffer.c b/fs/buffer.c index 28f320f..6fa5302 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -52,6 +52,7 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) bh->b_end_io = handler; bh->b_private = private; } +EXPORT_SYMBOL(init_buffer); static int sync_buffer(void *word) { @@ -80,6 +81,7 @@ void unlock_buffer(struct buffer_head *bh) smp_mb__after_clear_bit(); wake_up_bit(&bh->b_state, BH_Lock); } +EXPORT_SYMBOL(unlock_buffer); /* * Block until a buffer comes unlocked. This doesn't stop it @@ -90,6 +92,7 @@ void __wait_on_buffer(struct buffer_head * bh) { wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); } +EXPORT_SYMBOL(__wait_on_buffer); static void __clear_page_buffers(struct page *page) @@ -144,6 +147,7 @@ void end_buffer_read_sync(struct buffer_head *bh, int uptodate) __end_buffer_read_notouch(bh, uptodate); put_bh(bh); } +EXPORT_SYMBOL(end_buffer_read_sync); void end_buffer_write_sync(struct buffer_head *bh, int uptodate) { @@ -164,6 +168,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) unlock_buffer(bh); put_bh(bh); } +EXPORT_SYMBOL(end_buffer_write_sync); /* * Various filesystems appear to want __find_get_block to be non-blocking. @@ -272,16 +277,17 @@ void invalidate_bdev(struct block_device *bdev) invalidate_bh_lrus(); invalidate_mapping_pages(mapping, 0, -1); } +EXPORT_SYMBOL(invalidate_bdev); /* - * Kick pdflush then try to free up some ZONE_NORMAL memory. + * Kick the writeback threads then try to free up some ZONE_NORMAL memory. */ static void free_more_memory(void) { struct zone *zone; int nid; - wakeup_pdflush(1024); + wakeup_flusher_threads(1024); yield(); for_each_online_node(nid) { @@ -410,6 +416,7 @@ still_busy: local_irq_restore(flags); return; } +EXPORT_SYMBOL(end_buffer_async_write); /* * If a page's buffers are under async readin (end_buffer_async_read @@ -438,8 +445,8 @@ static void mark_buffer_async_read(struct buffer_head *bh) set_buffer_async_read(bh); } -void mark_buffer_async_write_endio(struct buffer_head *bh, - bh_end_io_t *handler) +static void mark_buffer_async_write_endio(struct buffer_head *bh, + bh_end_io_t *handler) { bh->b_end_io = handler; set_buffer_async_write(bh); @@ -553,7 +560,7 @@ repeat: return err; } -void do_thaw_all(struct work_struct *work) +static void do_thaw_all(struct work_struct *work) { struct super_block *sb; char b[BDEVNAME_SIZE]; @@ -1172,6 +1179,7 @@ void mark_buffer_dirty(struct buffer_head *bh) } } } +EXPORT_SYMBOL(mark_buffer_dirty); /* * Decrement a buffer_head's reference count. If all buffers against a page @@ -1188,6 +1196,7 @@ void __brelse(struct buffer_head * buf) } WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); } +EXPORT_SYMBOL(__brelse); /* * bforget() is like brelse(), except it discards any @@ -1206,6 +1215,7 @@ void __bforget(struct buffer_head *bh) } __brelse(bh); } +EXPORT_SYMBOL(__bforget); static struct buffer_head *__bread_slow(struct buffer_head *bh) { @@ -1699,9 +1709,9 @@ static int __block_write_full_page(struct inode *inode, struct page *page, /* * If it's a fully non-blocking write attempt and we cannot * lock the buffer then redirty the page. Note that this can - * potentially cause a busy-wait loop from pdflush and kswapd - * activity, but those code paths have their own higher-level - * throttling. + * potentially cause a busy-wait loop from writeback threads + * and kswapd activity, but those code paths have their own + * higher-level throttling. */ if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); @@ -2218,6 +2228,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block) } return 0; } +EXPORT_SYMBOL(block_read_full_page); /* utility function for filesystems that need to do work on expanding * truncates. Uses filesystem pagecache writes to allow the filesystem to @@ -2228,16 +2239,10 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; - unsigned long limit; int err; - err = -EFBIG; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && size > (loff_t)limit) { - send_sig(SIGXFSZ, current, 0); - goto out; - } - if (size > inode->i_sb->s_maxbytes) + err = inode_newsize_ok(inode, size); + if (err) goto out; err = pagecache_write_begin(NULL, mapping, size, 0, @@ -2252,6 +2257,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) out: return err; } +EXPORT_SYMBOL(generic_cont_expand_simple); static int cont_expand_zero(struct file *file, struct address_space *mapping, loff_t pos, loff_t *bytes) @@ -2352,6 +2358,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping, out: return err; } +EXPORT_SYMBOL(cont_write_begin); int block_prepare_write(struct page *page, unsigned from, unsigned to, get_block_t *get_block) @@ -2362,6 +2369,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to, ClearPageUptodate(page); return err; } +EXPORT_SYMBOL(block_prepare_write); int block_commit_write(struct page *page, unsigned from, unsigned to) { @@ -2369,6 +2377,7 @@ int block_commit_write(struct page *page, unsigned from, unsigned to) __block_commit_write(inode,page,from,to); return 0; } +EXPORT_SYMBOL(block_commit_write); /* * block_page_mkwrite() is not allowed to change the file size as it gets @@ -2426,6 +2435,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, out: return ret; } +EXPORT_SYMBOL(block_page_mkwrite); /* * nobh_write_begin()'s prereads are special: the buffer_heads are freed @@ -2849,6 +2859,7 @@ unlock: out: return err; } +EXPORT_SYMBOL(block_truncate_page); /* * The generic ->writepage function for buffer-backed address_spaces @@ -2890,6 +2901,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, zero_user_segment(page, offset, PAGE_CACHE_SIZE); return __block_write_full_page(inode, page, get_block, wbc, handler); } +EXPORT_SYMBOL(block_write_full_page_endio); /* * The generic ->writepage function for buffer-backed address_spaces @@ -2900,7 +2912,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block, return block_write_full_page_endio(page, get_block, wbc, end_buffer_async_write); } - +EXPORT_SYMBOL(block_write_full_page); sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) @@ -2913,6 +2925,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block(inode, block, &tmp, 0); return tmp.b_blocknr; } +EXPORT_SYMBOL(generic_block_bmap); static void end_bio_bh_io_sync(struct bio *bio, int err) { @@ -2982,6 +2995,7 @@ int submit_bh(int rw, struct buffer_head * bh) bio_put(bio); return ret; } +EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) @@ -3043,6 +3057,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) unlock_buffer(bh); } } +EXPORT_SYMBOL(ll_rw_block); /* * For a data-integrity writeout, we need to wait upon any in-progress I/O @@ -3071,6 +3086,7 @@ int sync_dirty_buffer(struct buffer_head *bh) } return ret; } +EXPORT_SYMBOL(sync_dirty_buffer); /* * try_to_free_buffers() checks if all the buffers on this particular page @@ -3185,13 +3201,14 @@ void block_sync_page(struct page *page) if (mapping) blk_run_backing_dev(mapping->backing_dev_info, page); } +EXPORT_SYMBOL(block_sync_page); /* * There are no bdflush tunables left. But distributions are * still running obsolete flush daemons, so we terminate them here. * * Use of bdflush() is deprecated and will be removed in a future kernel. - * The `pdflush' kernel threads fully replace bdflush daemons and this call. + * The `flush-X' kernel threads fully replace bdflush daemons and this call. */ SYSCALL_DEFINE2(bdflush, int, func, long, data) { @@ -3361,29 +3378,3 @@ void __init buffer_init(void) max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); hotcpu_notifier(buffer_cpu_notify, 0); } - -EXPORT_SYMBOL(__bforget); -EXPORT_SYMBOL(__brelse); -EXPORT_SYMBOL(__wait_on_buffer); -EXPORT_SYMBOL(block_commit_write); -EXPORT_SYMBOL(block_prepare_write); -EXPORT_SYMBOL(block_page_mkwrite); -EXPORT_SYMBOL(block_read_full_page); -EXPORT_SYMBOL(block_sync_page); -EXPORT_SYMBOL(block_truncate_page); -EXPORT_SYMBOL(block_write_full_page); -EXPORT_SYMBOL(block_write_full_page_endio); -EXPORT_SYMBOL(cont_write_begin); -EXPORT_SYMBOL(end_buffer_read_sync); -EXPORT_SYMBOL(end_buffer_write_sync); -EXPORT_SYMBOL(end_buffer_async_write); -EXPORT_SYMBOL(file_fsync); -EXPORT_SYMBOL(generic_block_bmap); -EXPORT_SYMBOL(generic_cont_expand_simple); -EXPORT_SYMBOL(init_buffer); -EXPORT_SYMBOL(invalidate_bdev); -EXPORT_SYMBOL(ll_rw_block); -EXPORT_SYMBOL(mark_buffer_dirty); -EXPORT_SYMBOL(submit_bh); -EXPORT_SYMBOL(sync_dirty_buffer); -EXPORT_SYMBOL(unlock_buffer); diff --git a/fs/char_dev.c b/fs/char_dev.c index a173551..d6db933 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -31,6 +31,7 @@ * - no readahead or I/O queue unplugging required */ struct backing_dev_info directly_mappable_cdev_bdi = { + .name = "char", .capabilities = ( #ifdef CONFIG_MMU /* permit private copies of the data to be taken */ @@ -237,8 +238,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, } /** - * register_chrdev() - Register a major number for character devices. + * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation + * @baseminor: first of the requested range of minor numbers + * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * @@ -254,19 +257,16 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. - * - * This function registers a range of 256 minor numbers. The first minor number - * is 0. */ -int register_chrdev(unsigned int major, const char *name, - const struct file_operations *fops) +int __register_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name, + const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; - char *s; int err = -ENOMEM; - cd = __register_chrdev_region(major, 0, 256, name); + cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); @@ -277,10 +277,8 @@ int register_chrdev(unsigned int major, const char *name, cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); - for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/')) - *s = '!'; - err = cdev_add(cdev, MKDEV(cd->major, 0), 256); + err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; @@ -290,7 +288,7 @@ int register_chrdev(unsigned int major, const char *name, out: kobject_put(&cdev->kobj); out2: - kfree(__unregister_chrdev_region(cd->major, 0, 256)); + kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } @@ -316,10 +314,23 @@ void unregister_chrdev_region(dev_t from, unsigned count) } } -void unregister_chrdev(unsigned int major, const char *name) +/** + * __unregister_chrdev - unregister and destroy a cdev + * @major: major device number + * @baseminor: first of the range of minor numbers + * @count: the number of minor numbers this cdev is occupying + * @name: name of this range of devices + * + * Unregister and destroy the cdev occupying the region described by + * @major, @baseminor and @count. This function undoes what + * __register_chrdev() did. + */ +void __unregister_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name) { struct char_device_struct *cd; - cd = __unregister_chrdev_region(major, 0, 256); + + cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); @@ -568,6 +579,6 @@ EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_index); -EXPORT_SYMBOL(register_chrdev); -EXPORT_SYMBOL(unregister_chrdev); +EXPORT_SYMBOL(__register_chrdev); +EXPORT_SYMBOL(__unregister_chrdev); EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index e85b1e4..145540a 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -3,7 +3,10 @@ Version 1.60 Fix memory leak in reconnect. Fix oops in DFS mount error path. Set s_maxbytes to smaller (the max that vfs can handle) so that sendfile will now work over cifs mounts again. Add noforcegid -and noforceuid mount parameters. +and noforceuid mount parameters. Fix small mem leak when using +ntlmv2. Fix 2nd mount to same server but with different port to +be allowed (rather than reusing the 1st port) - only when the +user explicitly overrides the port on the 2nd mount. Version 1.59 ------------ diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 6994a0f..80f3525 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -2,6 +2,7 @@ config CIFS tristate "CIFS support (advanced network filesystem, SMBFS successor)" depends on INET select NLS + select SLOW_WORK help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 606912d..fea9e89 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -142,7 +142,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, rc = dns_resolve_server_name_to_ip(*devname, &srvIP); if (rc != 0) { cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", - __func__, *devname, rc));; + __func__, *devname, rc)); goto compose_mount_options_err; } /* md_len = strlen(...) + 12 for 'sep+prefixpath=' @@ -385,7 +385,7 @@ out_err: goto out; } -struct inode_operations cifs_dfs_referral_inode_operations = { +const struct inode_operations cifs_dfs_referral_inode_operations = { .follow_link = cifs_dfs_follow_mountpoint, }; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 051caec..8ec7736 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) if (server->addr.sockAddr.sin_family == AF_INET) sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); else if (server->addr.sockAddr.sin_family == AF_INET6) - sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr); + sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr); else goto out; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 6941c22..7dfe084 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, return get_cifs_acl_by_path(cifs_sb, path, pacllen); pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); return pntsd; } @@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); return rc; } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 7c98095..7efe174 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -373,6 +373,7 @@ calc_exit_2: compare with the NTLM example */ hmac_md5_final(ses->server->ntlmv2_hash, pctxt); + kfree(pctxt); return rc; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 84b7525..9a5e4f5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -50,7 +50,7 @@ #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ #ifdef CONFIG_CIFS_QUOTA -static struct quotactl_ops cifs_quotactl_ops; +static const struct quotactl_ops cifs_quotactl_ops; #endif /* QUOTA */ int cifsFYI = 0; @@ -64,9 +64,6 @@ unsigned int multiuser_mount = 0; unsigned int extended_security = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ unsigned int sign_CIFS_PDUs = 1; -extern struct task_struct *oplockThread; /* remove sparse warning */ -struct task_struct *oplockThread = NULL; -/* extern struct task_struct * dnotifyThread; remove sparse warning */ static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; module_param(CIFSMaxBufSize, int, 0); @@ -185,8 +182,7 @@ out_mount_failed: cifs_sb->mountdata = NULL; } #endif - if (cifs_sb->local_nls) - unload_nls(cifs_sb->local_nls); + unload_nls(cifs_sb->local_nls); kfree(cifs_sb); } return rc; @@ -361,13 +357,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) static int cifs_show_options(struct seq_file *s, struct vfsmount *m) { - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *tcon; - - cifs_sb = CIFS_SB(m->mnt_sb); - tcon = cifs_sb->tcon; + struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); + struct cifsTconInfo *tcon = cifs_sb->tcon; - seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName); + seq_printf(s, ",unc=%s", tcon->treeName); if (tcon->ses->userName) seq_printf(s, ",username=%s", tcon->ses->userName); if (tcon->ses->domainName) @@ -520,7 +513,7 @@ int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats) return rc; } -static struct quotactl_ops cifs_quotactl_ops = { +static const struct quotactl_ops cifs_quotactl_ops = { .set_xquota = cifs_xquota_set, .get_xquota = cifs_xquota_get, .set_xstate = cifs_xstate_set, @@ -976,89 +969,12 @@ cifs_destroy_mids(void) kmem_cache_destroy(cifs_oplock_cachep); } -static int cifs_oplock_thread(void *dummyarg) -{ - struct oplock_q_entry *oplock_item; - struct cifsTconInfo *pTcon; - struct inode *inode; - __u16 netfid; - int rc, waitrc = 0; - - set_freezable(); - do { - if (try_to_freeze()) - continue; - - spin_lock(&GlobalMid_Lock); - if (list_empty(&GlobalOplock_Q)) { - spin_unlock(&GlobalMid_Lock); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(39*HZ); - } else { - oplock_item = list_entry(GlobalOplock_Q.next, - struct oplock_q_entry, qhead); - cFYI(1, ("found oplock item to write out")); - pTcon = oplock_item->tcon; - inode = oplock_item->pinode; - netfid = oplock_item->netfid; - spin_unlock(&GlobalMid_Lock); - DeleteOplockQEntry(oplock_item); - /* can not grab inode sem here since it would - deadlock when oplock received on delete - since vfs_unlink holds the i_mutex across - the call */ - /* mutex_lock(&inode->i_mutex);*/ - if (S_ISREG(inode->i_mode)) { -#ifdef CONFIG_CIFS_EXPERIMENTAL - if (CIFS_I(inode)->clientCanCacheAll == 0) - break_lease(inode, FMODE_READ); - else if (CIFS_I(inode)->clientCanCacheRead == 0) - break_lease(inode, FMODE_WRITE); -#endif - rc = filemap_fdatawrite(inode->i_mapping); - if (CIFS_I(inode)->clientCanCacheRead == 0) { - waitrc = filemap_fdatawait( - inode->i_mapping); - invalidate_remote_inode(inode); - } - if (rc == 0) - rc = waitrc; - } else - rc = 0; - /* mutex_unlock(&inode->i_mutex);*/ - if (rc) - CIFS_I(inode)->write_behind_rc = rc; - cFYI(1, ("Oplock flush inode %p rc %d", - inode, rc)); - - /* releasing stale oplock after recent reconnect - of smb session using a now incorrect file - handle is not a data integrity issue but do - not bother sending an oplock release if session - to server still is disconnected since oplock - already released by the server in that case */ - if (!pTcon->need_reconnect) { - rc = CIFSSMBLock(0, pTcon, netfid, - 0 /* len */ , 0 /* offset */, 0, - 0, LOCKING_ANDX_OPLOCK_RELEASE, - false /* wait flag */); - cFYI(1, ("Oplock release rc = %d", rc)); - } - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); /* yield in case q were corrupt */ - } - } while (!kthread_should_stop()); - - return 0; -} - static int __init init_cifs(void) { int rc = 0; cifs_proc_init(); INIT_LIST_HEAD(&cifs_tcp_ses_list); - INIT_LIST_HEAD(&GlobalOplock_Q); #ifdef CONFIG_CIFS_EXPERIMENTAL INIT_LIST_HEAD(&GlobalDnotifyReqList); INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); @@ -1121,16 +1037,13 @@ init_cifs(void) if (rc) goto out_unregister_key_type; #endif - oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd"); - if (IS_ERR(oplockThread)) { - rc = PTR_ERR(oplockThread); - cERROR(1, ("error %d create oplock thread", rc)); - goto out_unregister_dfs_key_type; - } + rc = slow_work_register_user(); + if (rc) + goto out_unregister_resolver_key; return 0; - out_unregister_dfs_key_type: + out_unregister_resolver_key: #ifdef CONFIG_CIFS_DFS_UPCALL unregister_key_type(&key_type_dns_resolver); out_unregister_key_type: @@ -1167,7 +1080,6 @@ exit_cifs(void) cifs_destroy_inodecache(); cifs_destroy_mids(); cifs_destroy_request_bufs(); - kthread_stop(oplockThread); } MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 6c17094..ac2b24c 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -67,7 +67,7 @@ extern int cifs_setattr(struct dentry *, struct iattr *); extern const struct inode_operations cifs_file_inode_ops; extern const struct inode_operations cifs_symlink_inode_ops; -extern struct inode_operations cifs_dfs_referral_inode_operations; +extern const struct inode_operations cifs_dfs_referral_inode_operations; /* Functions related to files and directories */ @@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.60" +#define CIFS_VERSION "1.61" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 6084d63..5d0fde1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -18,6 +18,7 @@ */ #include <linux/in.h> #include <linux/in6.h> +#include <linux/slow-work.h> #include "cifs_fs_sb.h" #include "cifsacl.h" /* @@ -346,16 +347,33 @@ struct cifsFileInfo { /* lock scope id (0 if none) */ struct file *pfile; /* needed for writepage */ struct inode *pInode; /* needed for oplock break */ + struct vfsmount *mnt; struct mutex lock_mutex; struct list_head llist; /* list of byte range locks we have. */ bool closePend:1; /* file is marked to close */ bool invalidHandle:1; /* file closed via session abend */ - bool messageMode:1; /* for pipes: message vs byte mode */ - atomic_t wrtPending; /* handle in use - defer close */ + bool oplock_break_cancelled:1; + atomic_t count; /* reference count */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; + struct slow_work oplock_break; /* slow_work job for oplock breaks */ }; +/* Take a reference on the file private data */ +static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) +{ + atomic_inc(&cifs_file->count); +} + +/* Release a reference on the file private data */ +static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file) +{ + if (atomic_dec_and_test(&cifs_file->count)) { + iput(cifs_file->pInode); + kfree(cifs_file); + } +} + /* * One of these for each file inode */ @@ -369,7 +387,6 @@ struct cifsInodeInfo { unsigned long time; /* jiffies of last update/check of inode */ bool clientCanCacheRead:1; /* read oplock */ bool clientCanCacheAll:1; /* read and writebehind oplock */ - bool oplockPending:1; bool delete_pending:1; /* DELETE_ON_CLOSE is set */ u64 server_eof; /* current file size on server */ u64 uniqueid; /* server inode number */ @@ -572,9 +589,9 @@ require use of the stronger protocol */ #define CIFSSEC_MUST_LANMAN 0x10010 #define CIFSSEC_MUST_PLNTXT 0x20020 #ifdef CONFIG_CIFS_UPCALL -#define CIFSSEC_MASK 0xAF0AF /* allows weak security but also krb5 */ +#define CIFSSEC_MASK 0xBF0BF /* allows weak security but also krb5 */ #else -#define CIFSSEC_MASK 0xA70A7 /* current flags supported if weak */ +#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */ #endif /* UPCALL */ #else /* do not allow weak pw hash */ #ifdef CONFIG_CIFS_UPCALL @@ -656,8 +673,6 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock; */ GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; -GLOBAL_EXTERN struct list_head GlobalOplock_Q; - /* Outstanding dir notify requests */ GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; /* DirNotify response queue */ @@ -708,3 +723,4 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ +extern const struct slow_work_ops cifs_oplock_break_ops; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index da8fbf5..6928c24 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -86,18 +86,17 @@ extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const int stage, const struct nls_table *nls_cp); extern __u16 GetNextMid(struct TCP_Server_Info *server); -extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16, - struct cifsTconInfo *); -extern void DeleteOplockQEntry(struct oplock_q_entry *); -extern void DeleteTconOplockQEntries(struct cifsTconInfo *); extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); +extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode, + __u16 fileHandle, struct file *file, + struct vfsmount *mnt, unsigned int oflags); extern int cifs_posix_open(char *full_path, struct inode **pinode, - struct super_block *sb, int mode, int oflags, - int *poplock, __u16 *pnetfid, int xid); + struct vfsmount *mnt, int mode, int oflags, + __u32 *poplock, __u16 *pnetfid, int xid); extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, struct cifs_sb_info *cifs_sb); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1866bc2..941441d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -94,116 +94,145 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon) list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { open_file = list_entry(tmp, struct cifsFileInfo, tlist); open_file->invalidHandle = true; + open_file->oplock_break_cancelled = true; } write_unlock(&GlobalSMBSeslock); /* BB Add call to invalidate_inodes(sb) for all superblocks mounted to this tcon */ } -/* Allocate and return pointer to an SMB request buffer, and set basic - SMB information in the SMB header. If the return code is zero, this - function must have filled in request_buf pointer */ +/* reconnect the socket, tcon, and smb session if needed */ static int -small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, - void **request_buf) +cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) { int rc = 0; + struct cifsSesInfo *ses; + struct TCP_Server_Info *server; + struct nls_table *nls_codepage; - /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so - check for tcp and smb session status done differently - for those three - in the calling routine */ - if (tcon) { - if (tcon->tidStatus == CifsExiting) { - /* only tree disconnect, open, and write, - (and ulogoff which does not have tcon) - are allowed as we start force umount */ - if ((smb_command != SMB_COM_WRITE_ANDX) && - (smb_command != SMB_COM_OPEN_ANDX) && - (smb_command != SMB_COM_TREE_DISCONNECT)) { - cFYI(1, ("can not send cmd %d while umounting", - smb_command)); - return -ENODEV; - } + /* + * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for + * tcp and smb session status done differently for those three - in the + * calling routine + */ + if (!tcon) + return 0; + + ses = tcon->ses; + server = ses->server; + + /* + * only tree disconnect, open, and write, (and ulogoff which does not + * have tcon) are allowed as we start force umount + */ + if (tcon->tidStatus == CifsExiting) { + if (smb_command != SMB_COM_WRITE_ANDX && + smb_command != SMB_COM_OPEN_ANDX && + smb_command != SMB_COM_TREE_DISCONNECT) { + cFYI(1, ("can not send cmd %d while umounting", + smb_command)); + return -ENODEV; } - if ((tcon->ses) && (tcon->ses->status != CifsExiting) && - (tcon->ses->server)) { - struct nls_table *nls_codepage; - /* Give Demultiplex thread up to 10 seconds to - reconnect, should be greater than cifs socket - timeout which is 7 seconds */ - while (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - wait_event_interruptible_timeout(tcon->ses->server->response_q, - (tcon->ses->server->tcpStatus == - CifsGood), 10 * HZ); - if (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - /* on "soft" mounts we wait once */ - if (!tcon->retry || - (tcon->ses->status == CifsExiting)) { - cFYI(1, ("gave up waiting on " - "reconnect in smb_init")); - return -EHOSTDOWN; - } /* else "hard" mount - keep retrying - until process is killed or server - comes back on-line */ - } else /* TCP session is reestablished now */ - break; - } + } - nls_codepage = load_nls_default(); - /* need to prevent multiple threads trying to - simultaneously reconnect the same SMB session */ - down(&tcon->ses->sesSem); - if (tcon->ses->need_reconnect) - rc = cifs_setup_session(0, tcon->ses, - nls_codepage); - if (!rc && (tcon->need_reconnect)) { - mark_open_files_invalid(tcon); - rc = CIFSTCon(0, tcon->ses, tcon->treeName, - tcon, nls_codepage); - up(&tcon->ses->sesSem); - /* BB FIXME add code to check if wsize needs - update due to negotiated smb buffer size - shrinking */ - if (rc == 0) { - atomic_inc(&tconInfoReconnectCount); - /* tell server Unix caps we support */ - if (tcon->ses->capabilities & CAP_UNIX) - reset_cifs_unix_caps( - 0 /* no xid */, - tcon, - NULL /* we do not know sb */, - NULL /* no vol info */); - } + if (ses->status == CifsExiting) + return -EIO; - cFYI(1, ("reconnect tcon rc = %d", rc)); - /* Removed call to reopen open files here. - It is safer (and faster) to reopen files - one at a time as needed in read and write */ - - /* Check if handle based operation so we - know whether we can continue or not without - returning to caller to reset file handle */ - switch (smb_command) { - case SMB_COM_READ_ANDX: - case SMB_COM_WRITE_ANDX: - case SMB_COM_CLOSE: - case SMB_COM_FIND_CLOSE2: - case SMB_COM_LOCKING_ANDX: { - unload_nls(nls_codepage); - return -EAGAIN; - } - } - } else { - up(&tcon->ses->sesSem); - } - unload_nls(nls_codepage); + /* + * Give demultiplex thread up to 10 seconds to reconnect, should be + * greater than cifs socket timeout which is 7 seconds + */ + while (server->tcpStatus == CifsNeedReconnect) { + wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus == CifsGood), 10 * HZ); - } else { - return -EIO; + /* is TCP session is reestablished now ?*/ + if (server->tcpStatus != CifsNeedReconnect) + break; + + /* + * on "soft" mounts we wait once. Hard mounts keep + * retrying until process is killed or server comes + * back on-line + */ + if (!tcon->retry || ses->status == CifsExiting) { + cFYI(1, ("gave up waiting on reconnect in smb_init")); + return -EHOSTDOWN; } } + + if (!ses->need_reconnect && !tcon->need_reconnect) + return 0; + + nls_codepage = load_nls_default(); + + /* + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session + */ + down(&ses->sesSem); + if (ses->need_reconnect) + rc = cifs_setup_session(0, ses, nls_codepage); + + /* do we need to reconnect tcon? */ + if (rc || !tcon->need_reconnect) { + up(&ses->sesSem); + goto out; + } + + mark_open_files_invalid(tcon); + rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); + up(&ses->sesSem); + cFYI(1, ("reconnect tcon rc = %d", rc)); + + if (rc) + goto out; + + /* + * FIXME: check if wsize needs updated due to negotiated smb buffer + * size shrinking + */ + atomic_inc(&tconInfoReconnectCount); + + /* tell server Unix caps we support */ + if (ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps(0, tcon, NULL, NULL); + + /* + * Removed call to reopen open files here. It is safer (and faster) to + * reopen files one at a time as needed in read and write. + * + * FIXME: what about file locks? don't we need to reclaim them ASAP? + */ + +out: + /* + * Check if handle based operation so we know whether we can continue + * or not without returning to caller to reset file handle + */ + switch (smb_command) { + case SMB_COM_READ_ANDX: + case SMB_COM_WRITE_ANDX: + case SMB_COM_CLOSE: + case SMB_COM_FIND_CLOSE2: + case SMB_COM_LOCKING_ANDX: + rc = -EAGAIN; + } + + unload_nls(nls_codepage); + return rc; +} + +/* Allocate and return pointer to an SMB request buffer, and set basic + SMB information in the SMB header. If the return code is zero, this + function must have filled in request_buf pointer */ +static int +small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf) +{ + int rc = 0; + + rc = cifs_reconnect_tcon(tcon, smb_command); if (rc) return rc; @@ -256,101 +285,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, { int rc = 0; - /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so - check for tcp and smb session status done differently - for those three - in the calling routine */ - if (tcon) { - if (tcon->tidStatus == CifsExiting) { - /* only tree disconnect, open, and write, - (and ulogoff which does not have tcon) - are allowed as we start force umount */ - if ((smb_command != SMB_COM_WRITE_ANDX) && - (smb_command != SMB_COM_OPEN_ANDX) && - (smb_command != SMB_COM_TREE_DISCONNECT)) { - cFYI(1, ("can not send cmd %d while umounting", - smb_command)); - return -ENODEV; - } - } - - if ((tcon->ses) && (tcon->ses->status != CifsExiting) && - (tcon->ses->server)) { - struct nls_table *nls_codepage; - /* Give Demultiplex thread up to 10 seconds to - reconnect, should be greater than cifs socket - timeout which is 7 seconds */ - while (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - wait_event_interruptible_timeout(tcon->ses->server->response_q, - (tcon->ses->server->tcpStatus == - CifsGood), 10 * HZ); - if (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - /* on "soft" mounts we wait once */ - if (!tcon->retry || - (tcon->ses->status == CifsExiting)) { - cFYI(1, ("gave up waiting on " - "reconnect in smb_init")); - return -EHOSTDOWN; - } /* else "hard" mount - keep retrying - until process is killed or server - comes on-line */ - } else /* TCP session is reestablished now */ - break; - } - nls_codepage = load_nls_default(); - /* need to prevent multiple threads trying to - simultaneously reconnect the same SMB session */ - down(&tcon->ses->sesSem); - if (tcon->ses->need_reconnect) - rc = cifs_setup_session(0, tcon->ses, - nls_codepage); - if (!rc && (tcon->need_reconnect)) { - mark_open_files_invalid(tcon); - rc = CIFSTCon(0, tcon->ses, tcon->treeName, - tcon, nls_codepage); - up(&tcon->ses->sesSem); - /* BB FIXME add code to check if wsize needs - update due to negotiated smb buffer size - shrinking */ - if (rc == 0) { - atomic_inc(&tconInfoReconnectCount); - /* tell server Unix caps we support */ - if (tcon->ses->capabilities & CAP_UNIX) - reset_cifs_unix_caps( - 0 /* no xid */, - tcon, - NULL /* do not know sb */, - NULL /* no vol info */); - } - - cFYI(1, ("reconnect tcon rc = %d", rc)); - /* Removed call to reopen open files here. - It is safer (and faster) to reopen files - one at a time as needed in read and write */ - - /* Check if handle based operation so we - know whether we can continue or not without - returning to caller to reset file handle */ - switch (smb_command) { - case SMB_COM_READ_ANDX: - case SMB_COM_WRITE_ANDX: - case SMB_COM_CLOSE: - case SMB_COM_FIND_CLOSE2: - case SMB_COM_LOCKING_ANDX: { - unload_nls(nls_codepage); - return -EAGAIN; - } - } - } else { - up(&tcon->ses->sesSem); - } - unload_nls(nls_codepage); - - } else { - return -EIO; - } - } + rc = cifs_reconnect_tcon(tcon, smb_command); if (rc) return rc; @@ -3961,6 +3896,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, if (is_unicode) { __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, GFP_KERNEL); + if (tmp == NULL) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } cifsConvertToUCS((__le16 *) tmp, searchName, PATH_MAX, nls_codepage, remap); node->path_consumed = cifs_ucs2_bytes(tmp, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1f3345d..43003e0 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1377,7 +1377,7 @@ cifs_parse_mount_options(char *options, const char *devname, } static struct TCP_Server_Info * -cifs_find_tcp_session(struct sockaddr_storage *addr) +cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port) { struct list_head *tmp; struct TCP_Server_Info *server; @@ -1397,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr) if (server->tcpStatus == CifsNew) continue; - if (addr->ss_family == AF_INET && - (addr4->sin_addr.s_addr != - server->addr.sockAddr.sin_addr.s_addr)) - continue; - else if (addr->ss_family == AF_INET6 && - (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr, - &addr6->sin6_addr) || - server->addr.sockAddr6.sin6_scope_id != - addr6->sin6_scope_id)) - continue; + switch (addr->ss_family) { + case AF_INET: + if (addr4->sin_addr.s_addr == + server->addr.sockAddr.sin_addr.s_addr) { + addr4->sin_port = htons(port); + /* user overrode default port? */ + if (addr4->sin_port) { + if (addr4->sin_port != + server->addr.sockAddr.sin_port) + continue; + } + break; + } else + continue; + + case AF_INET6: + if (ipv6_addr_equal(&addr6->sin6_addr, + &server->addr.sockAddr6.sin6_addr) && + (addr6->sin6_scope_id == + server->addr.sockAddr6.sin6_scope_id)) { + addr6->sin6_port = htons(port); + /* user overrode default port? */ + if (addr6->sin6_port) { + if (addr6->sin6_port != + server->addr.sockAddr6.sin6_port) + continue; + } + break; + } else + continue; + } ++server->srv_count; write_unlock(&cifs_tcp_ses_lock); @@ -1475,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) } /* see if we already have a matching tcp_ses */ - tcp_ses = cifs_find_tcp_session(&addr); + tcp_ses = cifs_find_tcp_session(&addr, volume_info->port); if (tcp_ses) return tcp_ses; @@ -1649,7 +1670,6 @@ cifs_put_tcon(struct cifsTconInfo *tcon) CIFSSMBTDis(xid, tcon); _FreeXid(xid); - DeleteTconOplockQEntries(tcon); tconInfoFree(tcon); cifs_put_smb_ses(ses); } @@ -2636,9 +2656,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, return -EIO; smb_buffer = cifs_buf_get(); - if (smb_buffer == NULL) { + if (smb_buffer == NULL) return -ENOMEM; - } + smb_buffer_response = smb_buffer; header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 4326ffd..627a60a 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -24,6 +24,7 @@ #include <linux/stat.h> #include <linux/slab.h> #include <linux/namei.h> +#include <linux/mount.h> #include "cifsfs.h" #include "cifspdu.h" #include "cifsglob.h" @@ -129,44 +130,45 @@ cifs_bp_rename_retry: return full_path; } -static void -cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle, - struct cifsTconInfo *tcon, bool write_only) +struct cifsFileInfo * +cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, + struct file *file, struct vfsmount *mnt, unsigned int oflags) { int oplock = 0; struct cifsFileInfo *pCifsFile; struct cifsInodeInfo *pCifsInode; + struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb); pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); - if (pCifsFile == NULL) - return; + return pCifsFile; if (oplockEnabled) oplock = REQ_OPLOCK; pCifsFile->netfid = fileHandle; pCifsFile->pid = current->tgid; - pCifsFile->pInode = newinode; + pCifsFile->pInode = igrab(newinode); + pCifsFile->mnt = mnt; + pCifsFile->pfile = file; pCifsFile->invalidHandle = false; pCifsFile->closePend = false; mutex_init(&pCifsFile->fh_mutex); mutex_init(&pCifsFile->lock_mutex); INIT_LIST_HEAD(&pCifsFile->llist); - atomic_set(&pCifsFile->wrtPending, 0); + atomic_set(&pCifsFile->count, 1); + slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops); - /* set the following in open now - pCifsFile->pfile = file; */ write_lock(&GlobalSMBSeslock); - list_add(&pCifsFile->tlist, &tcon->openFileList); + list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList); pCifsInode = CIFS_I(newinode); if (pCifsInode) { /* if readable file instance put first in list*/ - if (write_only) + if (oflags & FMODE_READ) + list_add(&pCifsFile->flist, &pCifsInode->openFileList); + else list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); - else - list_add(&pCifsFile->flist, &pCifsInode->openFileList); if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { pCifsInode->clientCanCacheAll = true; @@ -176,18 +178,18 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle, pCifsInode->clientCanCacheRead = true; } write_unlock(&GlobalSMBSeslock); + + return pCifsFile; } int cifs_posix_open(char *full_path, struct inode **pinode, - struct super_block *sb, int mode, int oflags, - int *poplock, __u16 *pnetfid, int xid) + struct vfsmount *mnt, int mode, int oflags, + __u32 *poplock, __u16 *pnetfid, int xid) { int rc; - __u32 oplock; - bool write_only = false; FILE_UNIX_BASIC_INFO *presp_data; __u32 posix_flags = 0; - struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb); struct cifs_fattr fattr; cFYI(1, ("posix open %s", full_path)); @@ -223,12 +225,9 @@ int cifs_posix_open(char *full_path, struct inode **pinode, if (oflags & O_DIRECT) posix_flags |= SMB_O_DIRECT; - if (!(oflags & FMODE_READ)) - write_only = true; - mode &= ~current_umask(); rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode, - pnetfid, presp_data, &oplock, full_path, + pnetfid, presp_data, poplock, full_path, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) @@ -244,7 +243,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, /* get new inode and set it up */ if (*pinode == NULL) { - *pinode = cifs_iget(sb, &fattr); + *pinode = cifs_iget(mnt->mnt_sb, &fattr); if (!*pinode) { rc = -ENOMEM; goto posix_open_ret; @@ -253,7 +252,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, cifs_fattr_to_inode(*pinode, &fattr); } - cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only); + cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags); posix_open_ret: kfree(presp_data); @@ -280,7 +279,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, int rc = -ENOENT; int xid; int create_options = CREATE_NOT_DIR; - int oplock = 0; + __u32 oplock = 0; int oflags; bool posix_create = false; /* @@ -298,7 +297,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; int disposition = FILE_OVERWRITE_IF; - bool write_only = false; xid = GetXid(); @@ -323,7 +321,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { - rc = cifs_posix_open(full_path, &newinode, inode->i_sb, + rc = cifs_posix_open(full_path, &newinode, nd->path.mnt, mode, oflags, &oplock, &fileHandle, xid); /* EIO could indicate that (posix open) operation is not supported, despite what server claimed in capability @@ -351,11 +349,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, desiredAccess = 0; if (oflags & FMODE_READ) desiredAccess |= GENERIC_READ; /* is this too little? */ - if (oflags & FMODE_WRITE) { + if (oflags & FMODE_WRITE) desiredAccess |= GENERIC_WRITE; - if (!(oflags & FMODE_READ)) - write_only = true; - } if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) disposition = FILE_CREATE; @@ -470,8 +465,8 @@ cifs_create_set_dentry: /* mknod case - do not leave file open */ CIFSSMBClose(xid, tcon, fileHandle); } else if (!(posix_create) && (newinode)) { - cifs_fill_fileinfo(newinode, fileHandle, - cifs_sb->tcon, write_only); + cifs_new_fileinfo(newinode, fileHandle, NULL, + nd->path.mnt, oflags); } cifs_create_out: kfree(buf); @@ -611,7 +606,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, { int xid; int rc = 0; /* to get around spurious gcc warning, set to zero here */ - int oplock = 0; + __u32 oplock = 0; __u16 fileHandle = 0; bool posix_open = false; struct cifs_sb_info *cifs_sb; @@ -683,8 +678,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && (nd->intent.open.flags & O_CREAT)) { - rc = cifs_posix_open(full_path, &newInode, - parent_dir_inode->i_sb, + rc = cifs_posix_open(full_path, &newInode, nd->path.mnt, nd->intent.open.create_mode, nd->intent.open.flags, &oplock, &fileHandle, xid); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c34b7f8..429337e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -30,6 +30,7 @@ #include <linux/writeback.h> #include <linux/task_io_accounting_ops.h> #include <linux/delay.h> +#include <linux/mount.h> #include <asm/div64.h> #include "cifsfs.h" #include "cifspdu.h" @@ -39,29 +40,6 @@ #include "cifs_debug.h" #include "cifs_fs_sb.h" -static inline struct cifsFileInfo *cifs_init_private( - struct cifsFileInfo *private_data, struct inode *inode, - struct file *file, __u16 netfid) -{ - memset(private_data, 0, sizeof(struct cifsFileInfo)); - private_data->netfid = netfid; - private_data->pid = current->tgid; - mutex_init(&private_data->fh_mutex); - mutex_init(&private_data->lock_mutex); - INIT_LIST_HEAD(&private_data->llist); - private_data->pfile = file; /* needed for writepage */ - private_data->pInode = inode; - private_data->invalidHandle = false; - private_data->closePend = false; - /* we have to track num writers to the inode, since writepages - does not tell us which handle the write is for so there can - be a close (overlapping with write) of the filehandle that - cifs_writepages chose to use */ - atomic_set(&private_data->wrtPending, 0); - - return private_data; -} - static inline int cifs_convert_flags(unsigned int flags) { if ((flags & O_ACCMODE) == O_RDONLY) @@ -125,9 +103,11 @@ static inline int cifs_get_disposition(unsigned int flags) } /* all arguments to this function must be checked for validity in caller */ -static inline int cifs_posix_open_inode_helper(struct inode *inode, - struct file *file, struct cifsInodeInfo *pCifsInode, - struct cifsFileInfo *pCifsFile, int oplock, u16 netfid) +static inline int +cifs_posix_open_inode_helper(struct inode *inode, struct file *file, + struct cifsInodeInfo *pCifsInode, + struct cifsFileInfo *pCifsFile, __u32 oplock, + u16 netfid) { write_lock(&GlobalSMBSeslock); @@ -221,17 +201,6 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, struct timespec temp; int rc; - /* want handles we can use to read with first - in the list so we do not have to walk the - list to search for one in write_begin */ - if ((file->f_flags & O_ACCMODE) == O_WRONLY) { - list_add_tail(&pCifsFile->flist, - &pCifsInode->openFileList); - } else { - list_add(&pCifsFile->flist, - &pCifsInode->openFileList); - } - write_unlock(&GlobalSMBSeslock); if (pCifsInode->clientCanCacheRead) { /* we have the inode open somewhere else no need to discard cache data */ @@ -281,7 +250,8 @@ client_can_cache: int cifs_open(struct inode *inode, struct file *file) { int rc = -EACCES; - int xid, oplock; + int xid; + __u32 oplock; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *tcon; struct cifsFileInfo *pCifsFile; @@ -326,7 +296,7 @@ int cifs_open(struct inode *inode, struct file *file) le64_to_cpu(tcon->fsUnixInfo.Capability))) { int oflags = (int) cifs_posix_convert_flags(file->f_flags); /* can not refresh inode info since size could be stale */ - rc = cifs_posix_open(full_path, &inode, inode->i_sb, + rc = cifs_posix_open(full_path, &inode, file->f_path.mnt, cifs_sb->mnt_file_mode /* ignored */, oflags, &oplock, &netfid, xid); if (rc == 0) { @@ -416,24 +386,17 @@ int cifs_open(struct inode *inode, struct file *file) cFYI(1, ("cifs_open returned 0x%x", rc)); goto out; } - file->private_data = - kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + + pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt, + file->f_flags); + file->private_data = pCifsFile; if (file->private_data == NULL) { rc = -ENOMEM; goto out; } - pCifsFile = cifs_init_private(file->private_data, inode, file, netfid); - write_lock(&GlobalSMBSeslock); - list_add(&pCifsFile->tlist, &tcon->openFileList); - pCifsInode = CIFS_I(file->f_path.dentry->d_inode); - if (pCifsInode) { - rc = cifs_open_inode_helper(inode, file, pCifsInode, - pCifsFile, tcon, - &oplock, buf, full_path, xid); - } else { - write_unlock(&GlobalSMBSeslock); - } + rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, tcon, + &oplock, buf, full_path, xid); if (oplock & CIFS_CREATE_ACTION) { /* time to set mode which we can not set earlier due to @@ -476,7 +439,8 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile) static int cifs_reopen_file(struct file *file, bool can_flush) { int rc = -EACCES; - int xid, oplock; + int xid; + __u32 oplock; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *tcon; struct cifsFileInfo *pCifsFile; @@ -545,7 +509,7 @@ reopen_error_exit: le64_to_cpu(tcon->fsUnixInfo.Capability))) { int oflags = (int) cifs_posix_convert_flags(file->f_flags); /* can not refresh inode info since size could be stale */ - rc = cifs_posix_open(full_path, NULL, inode->i_sb, + rc = cifs_posix_open(full_path, NULL, file->f_path.mnt, cifs_sb->mnt_file_mode /* ignored */, oflags, &oplock, &netfid, xid); if (rc == 0) { @@ -643,7 +607,7 @@ int cifs_close(struct inode *inode, struct file *file) if (!pTcon->need_reconnect) { write_unlock(&GlobalSMBSeslock); timeout = 2; - while ((atomic_read(&pSMBFile->wrtPending) != 0) + while ((atomic_read(&pSMBFile->count) != 1) && (timeout <= 2048)) { /* Give write a better chance to get to server ahead of the close. We do not @@ -657,8 +621,6 @@ int cifs_close(struct inode *inode, struct file *file) msleep(timeout); timeout *= 4; } - if (atomic_read(&pSMBFile->wrtPending)) - cERROR(1, ("close with pending write")); if (!pTcon->need_reconnect && !pSMBFile->invalidHandle) rc = CIFSSMBClose(xid, pTcon, @@ -681,24 +643,7 @@ int cifs_close(struct inode *inode, struct file *file) list_del(&pSMBFile->flist); list_del(&pSMBFile->tlist); write_unlock(&GlobalSMBSeslock); - timeout = 10; - /* We waited above to give the SMBWrite a chance to issue - on the wire (so we do not get SMBWrite returning EBADF - if writepages is racing with close. Note that writepages - does not specify a file handle, so it is possible for a file - to be opened twice, and the application close the "wrong" - file handle - in these cases we delay long enough to allow - the SMBWrite to get on the wire before the SMB Close. - We allow total wait here over 45 seconds, more than - oplock break time, and more than enough to allow any write - to complete on the server, or to time out on the client */ - while ((atomic_read(&pSMBFile->wrtPending) != 0) - && (timeout <= 50000)) { - cERROR(1, ("writes pending, delay free of handle")); - msleep(timeout); - timeout *= 8; - } - kfree(file->private_data); + cifsFileInfo_put(file->private_data); file->private_data = NULL; } else rc = -EBADF; @@ -1236,7 +1181,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode) if (!open_file->invalidHandle) { /* found a good file */ /* lock it so it will not be closed on us */ - atomic_inc(&open_file->wrtPending); + cifsFileInfo_get(open_file); read_unlock(&GlobalSMBSeslock); return open_file; } /* else might as well continue, and look for @@ -1276,7 +1221,7 @@ refind_writable: if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) || (open_file->pfile->f_flags & O_WRONLY))) { - atomic_inc(&open_file->wrtPending); + cifsFileInfo_get(open_file); if (!open_file->invalidHandle) { /* found a good writable file */ @@ -1293,7 +1238,7 @@ refind_writable: else { /* start over in case this was deleted */ /* since the list could be modified */ read_lock(&GlobalSMBSeslock); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); goto refind_writable; } } @@ -1309,7 +1254,7 @@ refind_writable: read_lock(&GlobalSMBSeslock); /* can not use this handle, no write pending on this one after all */ - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); if (open_file->closePend) /* list could have changed */ goto refind_writable; @@ -1373,7 +1318,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) if (open_file) { bytes_written = cifs_write(open_file->pfile, write_data, to-from, &offset); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); /* Does mm or vfs already set times? */ inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); if ((bytes_written > 0) && (offset)) @@ -1562,7 +1507,7 @@ retry: bytes_to_write, offset, &bytes_written, iov, n_iov, long_op); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); cifs_update_eof(cifsi, offset, bytes_written); if (rc || bytes_written < bytes_to_write) { @@ -2329,6 +2274,73 @@ out: return rc; } +static void +cifs_oplock_break(struct slow_work *work) +{ + struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, + oplock_break); + struct inode *inode = cfile->pInode; + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb); + int rc, waitrc = 0; + + if (inode && S_ISREG(inode->i_mode)) { +#ifdef CONFIG_CIFS_EXPERIMENTAL + if (cinode->clientCanCacheAll == 0) + break_lease(inode, FMODE_READ); + else if (cinode->clientCanCacheRead == 0) + break_lease(inode, FMODE_WRITE); +#endif + rc = filemap_fdatawrite(inode->i_mapping); + if (cinode->clientCanCacheRead == 0) { + waitrc = filemap_fdatawait(inode->i_mapping); + invalidate_remote_inode(inode); + } + if (!rc) + rc = waitrc; + if (rc) + cinode->write_behind_rc = rc; + cFYI(1, ("Oplock flush inode %p rc %d", inode, rc)); + } + + /* + * releasing stale oplock after recent reconnect of smb session using + * a now incorrect file handle is not a data integrity issue but do + * not bother sending an oplock release if session to server still is + * disconnected since oplock already released by the server + */ + if (!cfile->closePend && !cfile->oplock_break_cancelled) { + rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0, + LOCKING_ANDX_OPLOCK_RELEASE, false); + cFYI(1, ("Oplock release rc = %d", rc)); + } +} + +static int +cifs_oplock_break_get(struct slow_work *work) +{ + struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, + oplock_break); + mntget(cfile->mnt); + cifsFileInfo_get(cfile); + return 0; +} + +static void +cifs_oplock_break_put(struct slow_work *work) +{ + struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, + oplock_break); + mntput(cfile->mnt); + cifsFileInfo_put(cfile); +} + +const struct slow_work_ops cifs_oplock_break_ops = { + .get_ref = cifs_oplock_break_get, + .put_ref = cifs_oplock_break_put, + .execute = cifs_oplock_break, +}; + const struct address_space_operations cifs_addr_ops = { .readpage = cifs_readpage, .readpages = cifs_readpages, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 82d83839..5e24925 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -800,7 +800,7 @@ set_via_filehandle: if (open_file == NULL) CIFSSMBClose(xid, pTcon, netfid); else - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); out: return rc; } @@ -1557,57 +1557,24 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) static int cifs_vmtruncate(struct inode *inode, loff_t offset) { - struct address_space *mapping = inode->i_mapping; - unsigned long limit; + loff_t oldsize; + int err; spin_lock(&inode->i_lock); - if (inode->i_size < offset) - goto do_expand; - /* - * truncation of in-use swapfiles is disallowed - it would cause - * subsequent swapout to scribble on the now-freed blocks. - */ - if (IS_SWAPFILE(inode)) { - spin_unlock(&inode->i_lock); - goto out_busy; - } - i_size_write(inode, offset); - spin_unlock(&inode->i_lock); - /* - * unmap_mapping_range is called twice, first simply for efficiency - * so that truncate_inode_pages does fewer single-page unmaps. However - * after this first call, and before truncate_inode_pages finishes, - * it is possible for private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second unmap_mapping_range - * call must be made for correctness. - */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) { + err = inode_newsize_ok(inode, offset); + if (err) { spin_unlock(&inode->i_lock); - goto out_sig; - } - if (offset > inode->i_sb->s_maxbytes) { - spin_unlock(&inode->i_lock); - goto out_big; + goto out; } + + oldsize = inode->i_size; i_size_write(inode, offset); spin_unlock(&inode->i_lock); -out_truncate: + truncate_pagecache(inode, oldsize, offset); if (inode->i_op->truncate) inode->i_op->truncate(inode); - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out_big: - return -EFBIG; -out_busy: - return -ETXTBSY; +out: + return err; } static int @@ -1635,7 +1602,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, __u32 npid = open_file->pid; rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, npid, false); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); cFYI(1, ("SetFSize for attrs rc = %d", rc)); if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { unsigned int bytes_written; @@ -1790,7 +1757,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) u16 nfid = open_file->netfid; u32 npid = open_file->pid; rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); } else { rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, cifs_sb->local_nls, diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index e079a91..0241b25 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -32,7 +32,6 @@ extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; -extern struct task_struct *oplockThread; /* The xid serves as a useful identifier for each incoming vfs request, in a similar way to the mid which is useful to track each sent smb, @@ -500,6 +499,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) struct cifsTconInfo *tcon; struct cifsInodeInfo *pCifsInode; struct cifsFileInfo *netfile; + int rc; cFYI(1, ("Checking for oplock break or dnotify response")); if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && @@ -562,30 +562,40 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) continue; cifs_stats_inc(&tcon->num_oplock_brks); - write_lock(&GlobalSMBSeslock); + read_lock(&GlobalSMBSeslock); list_for_each(tmp2, &tcon->openFileList) { netfile = list_entry(tmp2, struct cifsFileInfo, tlist); if (pSMB->Fid != netfile->netfid) continue; - write_unlock(&GlobalSMBSeslock); - read_unlock(&cifs_tcp_ses_lock); + /* + * don't do anything if file is about to be + * closed anyway. + */ + if (netfile->closePend) { + read_unlock(&GlobalSMBSeslock); + read_unlock(&cifs_tcp_ses_lock); + return true; + } + cFYI(1, ("file id match, oplock break")); pCifsInode = CIFS_I(netfile->pInode); pCifsInode->clientCanCacheAll = false; if (pSMB->OplockLevel == 0) pCifsInode->clientCanCacheRead = false; - pCifsInode->oplockPending = true; - AllocOplockQEntry(netfile->pInode, - netfile->netfid, tcon); - cFYI(1, ("about to wake up oplock thread")); - if (oplockThread) - wake_up_process(oplockThread); - + rc = slow_work_enqueue(&netfile->oplock_break); + if (rc) { + cERROR(1, ("failed to enqueue oplock " + "break: %d\n", rc)); + } else { + netfile->oplock_break_cancelled = false; + } + read_unlock(&GlobalSMBSeslock); + read_unlock(&cifs_tcp_ses_lock); return true; } - write_unlock(&GlobalSMBSeslock); + read_unlock(&GlobalSMBSeslock); read_unlock(&cifs_tcp_ses_lock); cFYI(1, ("No matching file for oplock break")); return true; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index f823a4a..1f098ca 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -146,7 +146,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) } } -void +static void cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, struct cifs_sb_info *cifs_sb) { @@ -161,7 +161,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, cifs_fill_common_info(fattr, cifs_sb); } -void +static void cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, struct cifs_sb_info *cifs_sb) { diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 0ad3e2d..07b8e71 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -103,57 +103,6 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) mempool_free(midEntry, cifs_mid_poolp); } -struct oplock_q_entry * -AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon) -{ - struct oplock_q_entry *temp; - if ((pinode == NULL) || (tcon == NULL)) { - cERROR(1, ("Null parms passed to AllocOplockQEntry")); - return NULL; - } - temp = (struct oplock_q_entry *) kmem_cache_alloc(cifs_oplock_cachep, - GFP_KERNEL); - if (temp == NULL) - return temp; - else { - temp->pinode = pinode; - temp->tcon = tcon; - temp->netfid = fid; - spin_lock(&GlobalMid_Lock); - list_add_tail(&temp->qhead, &GlobalOplock_Q); - spin_unlock(&GlobalMid_Lock); - } - return temp; - -} - -void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry) -{ - spin_lock(&GlobalMid_Lock); - /* should we check if list empty first? */ - list_del(&oplockEntry->qhead); - spin_unlock(&GlobalMid_Lock); - kmem_cache_free(cifs_oplock_cachep, oplockEntry); -} - - -void DeleteTconOplockQEntries(struct cifsTconInfo *tcon) -{ - struct oplock_q_entry *temp; - - if (tcon == NULL) - return; - - spin_lock(&GlobalMid_Lock); - list_for_each_entry(temp, &GlobalOplock_Q, qhead) { - if ((temp->tcon) && (temp->tcon == tcon)) { - list_del(&temp->qhead); - kmem_cache_free(cifs_oplock_cachep, temp); - } - } - spin_unlock(&GlobalMid_Lock); -} - static int smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) { diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h index 8ccd5ed..d99860a 100644 --- a/fs/coda/coda_int.h +++ b/fs/coda/coda_int.h @@ -2,6 +2,7 @@ #define _CODA_INT_ struct dentry; +struct file; extern struct file_system_type coda_fs_type; extern unsigned long coda_timeout; diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 0376ac6..be4392c 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -22,6 +22,7 @@ #include <linux/kernel.h> #include <linux/major.h> #include <linux/time.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/ioport.h> #include <linux/fcntl.h> diff --git a/fs/compat.c b/fs/compat.c index 6d6f98f..d576b55 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -100,13 +100,6 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, st get_compat_timespec(&tv[1], &t[1])) return -EFAULT; - if ((tv[0].tv_nsec == UTIME_OMIT || tv[0].tv_nsec == UTIME_NOW) - && tv[0].tv_sec != 0) - return -EINVAL; - if ((tv[1].tv_nsec == UTIME_OMIT || tv[1].tv_nsec == UTIME_NOW) - && tv[1].tv_sec != 0) - return -EINVAL; - if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) return 0; } @@ -775,13 +768,13 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name, char __user * type, unsigned long flags, void __user * data) { - unsigned long type_page; + char *kernel_type; unsigned long data_page; - unsigned long dev_page; + char *kernel_dev; char *dir_page; int retval; - retval = copy_mount_options (type, &type_page); + retval = copy_mount_string(type, &kernel_type); if (retval < 0) goto out; @@ -790,38 +783,38 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name, if (IS_ERR(dir_page)) goto out1; - retval = copy_mount_options (dev_name, &dev_page); + retval = copy_mount_string(dev_name, &kernel_dev); if (retval < 0) goto out2; - retval = copy_mount_options (data, &data_page); + retval = copy_mount_options(data, &data_page); if (retval < 0) goto out3; retval = -EINVAL; - if (type_page && data_page) { - if (!strcmp((char *)type_page, SMBFS_NAME)) { + if (kernel_type && data_page) { + if (!strcmp(kernel_type, SMBFS_NAME)) { do_smb_super_data_conv((void *)data_page); - } else if (!strcmp((char *)type_page, NCPFS_NAME)) { + } else if (!strcmp(kernel_type, NCPFS_NAME)) { do_ncp_super_data_conv((void *)data_page); - } else if (!strcmp((char *)type_page, NFS4_NAME)) { + } else if (!strcmp(kernel_type, NFS4_NAME)) { if (do_nfs4_super_data_conv((void *) data_page)) goto out4; } } - retval = do_mount((char*)dev_page, dir_page, (char*)type_page, + retval = do_mount(kernel_dev, dir_page, kernel_type, flags, (void*)data_page); out4: free_page(data_page); out3: - free_page(dev_page); + kfree(kernel_dev); out2: putname(dir_page); out1: - free_page(type_page); + kfree(kernel_type); out: return retval; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 4921e742..a2f7460 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = { }; static struct backing_dev_info configfs_backing_dev_info = { + .name = "configfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; diff --git a/fs/dcache.c b/fs/dcache.c index 9e5cd3c..a100fa3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include <linux/swap.h> #include <linux/bootmem.h> #include <linux/fs_struct.h> +#include <linux/hardirq.h> #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 75efb02..d5f8c96 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -18,14 +18,13 @@ #include <linux/mount.h> #include <linux/tty.h> #include <linux/mutex.h> +#include <linux/magic.h> #include <linux/idr.h> #include <linux/devpts_fs.h> #include <linux/parser.h> #include <linux/fsnotify.h> #include <linux/seq_file.h> -#define DEVPTS_SUPER_MAGIC 0x1cd1 - #define DEVPTS_DEFAULT_MODE 0600 /* * ptmx is a new node in /dev/pts and will be unused in legacy (single- diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 1d1d274..1c8bb8c 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -386,9 +386,9 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr) return rv; } -static struct seq_operations format1_seq_ops; -static struct seq_operations format2_seq_ops; -static struct seq_operations format3_seq_ops; +static const struct seq_operations format1_seq_ops; +static const struct seq_operations format2_seq_ops; +static const struct seq_operations format3_seq_ops; static void *table_seq_start(struct seq_file *seq, loff_t *pos) { @@ -534,21 +534,21 @@ static void table_seq_stop(struct seq_file *seq, void *iter_ptr) } } -static struct seq_operations format1_seq_ops = { +static const struct seq_operations format1_seq_ops = { .start = table_seq_start, .next = table_seq_next, .stop = table_seq_stop, .show = table_seq_show, }; -static struct seq_operations format2_seq_ops = { +static const struct seq_operations format2_seq_ops = { .start = table_seq_start, .next = table_seq_next, .stop = table_seq_stop, .show = table_seq_show, }; -static struct seq_operations format3_seq_ops = { +static const struct seq_operations format3_seq_ops = { .start = table_seq_start, .next = table_seq_next, .stop = table_seq_stop, diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 618a60f..70736eb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -106,6 +106,7 @@ struct connection { #define CF_CONNECT_PENDING 3 #define CF_INIT_PENDING 4 #define CF_IS_OTHERCON 5 +#define CF_CLOSE 6 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int (*rx_action) (struct connection *); /* What to do when active */ @@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk) static inline void lowcomms_connect_sock(struct connection *con) { + if (test_bit(CF_CLOSE, &con->flags)) + return; if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) queue_work(send_workqueue, &con->swork); } @@ -313,6 +316,10 @@ int dlm_lowcomms_connect_node(int nodeid) { struct connection *con; + /* with sctp there's no connecting without sending */ + if (dlm_config.ci_protocol != 0) + return 0; + if (nodeid == dlm_our_nodeid()) return 0; @@ -452,9 +459,9 @@ static void process_sctp_notification(struct connection *con, int prim_len, ret; int addr_len; struct connection *new_con; - struct file *file; sctp_peeloff_arg_t parg; int parglen = sizeof(parg); + int err; /* * We get this before any data for an association. @@ -509,19 +516,22 @@ static void process_sctp_notification(struct connection *con, ret = kernel_getsockopt(con->sock, IPPROTO_SCTP, SCTP_SOCKOPT_PEELOFF, (void *)&parg, &parglen); - if (ret) { + if (ret < 0) { log_print("Can't peel off a socket for " - "connection %d to node %d: err=%d\n", + "connection %d to node %d: err=%d", parg.associd, nodeid, ret); + return; + } + new_con->sock = sockfd_lookup(parg.sd, &err); + if (!new_con->sock) { + log_print("sockfd_lookup error %d", err); + return; } - file = fget(parg.sd); - new_con->sock = SOCKET_I(file->f_dentry->d_inode); add_sock(new_con->sock, new_con); - fput(file); - put_unused_fd(parg.sd); + sockfd_put(new_con->sock); - log_print("got new/restarted association %d nodeid %d", - (int)sn->sn_assoc_change.sac_assoc_id, nodeid); + log_print("connecting to %d sctp association %d", + nodeid, (int)sn->sn_assoc_change.sac_assoc_id); /* Send any pending writes */ clear_bit(CF_CONNECT_PENDING, &new_con->flags); @@ -834,8 +844,6 @@ static void sctp_init_assoc(struct connection *con) if (con->retries++ > MAX_CONNECT_RETRIES) return; - log_print("Initiating association with node %d", con->nodeid); - if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) { log_print("no address for nodeid %d", con->nodeid); return; @@ -852,11 +860,14 @@ static void sctp_init_assoc(struct connection *con) outmessage.msg_flags = MSG_EOR; spin_lock(&con->writequeue_lock); - e = list_entry(con->writequeue.next, struct writequeue_entry, - list); - BUG_ON((struct list_head *) e == &con->writequeue); + if (list_empty(&con->writequeue)) { + spin_unlock(&con->writequeue_lock); + log_print("writequeue empty for nodeid %d", con->nodeid); + return; + } + e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; spin_unlock(&con->writequeue_lock); @@ -926,10 +937,8 @@ static void tcp_connect_to_sock(struct connection *con) goto out_err; memset(&saddr, 0, sizeof(saddr)); - if (dlm_nodeid_to_addr(con->nodeid, &saddr)) { - sock_release(sock); + if (dlm_nodeid_to_addr(con->nodeid, &saddr)) goto out_err; - } sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; @@ -1284,7 +1293,6 @@ out: static void send_to_sock(struct connection *con) { int ret = 0; - ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int); const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; int len, offset; @@ -1293,8 +1301,6 @@ static void send_to_sock(struct connection *con) if (con->sock == NULL) goto out_connect; - sendpage = con->sock->ops->sendpage; - spin_lock(&con->writequeue_lock); for (;;) { e = list_entry(con->writequeue.next, struct writequeue_entry, @@ -1309,8 +1315,8 @@ static void send_to_sock(struct connection *con) ret = 0; if (len) { - ret = sendpage(con->sock, e->page, offset, len, - msg_flags); + ret = kernel_sendpage(con->sock, e->page, offset, len, + msg_flags); if (ret == -EAGAIN || ret == 0) { cond_resched(); goto out; @@ -1370,6 +1376,13 @@ int dlm_lowcomms_close(int nodeid) log_print("closing connection to node %d", nodeid); con = nodeid2con(nodeid, 0); if (con) { + clear_bit(CF_CONNECT_PENDING, &con->flags); + clear_bit(CF_WRITE_PENDING, &con->flags); + set_bit(CF_CLOSE, &con->flags); + if (cancel_work_sync(&con->swork)) + log_print("canceled swork for node %d", nodeid); + if (cancel_work_sync(&con->rwork)) + log_print("canceled rwork for node %d", nodeid); clean_one_writequeue(con); close_connection(con, true); } @@ -1395,9 +1408,10 @@ static void process_send_sockets(struct work_struct *work) if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { con->connect_action(con); + set_bit(CF_WRITE_PENDING, &con->flags); } - clear_bit(CF_WRITE_PENDING, &con->flags); - send_to_sock(con); + if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags)) + send_to_sock(con); } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index ccc9d62..55ea369 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb) return rv; } - return genlmsg_unicast(skb, listener_nlpid); + return genlmsg_unicast(&init_net, skb, listener_nlpid); } static int user_cmd(struct sk_buff *skb, struct genl_info *info) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index a2edb79..31f4b0e 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -63,9 +63,9 @@ static void drop_slab(void) } int drop_caches_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_minmax(table, write, file, buffer, length, ppos); + proc_dointvec_minmax(table, write, buffer, length, ppos); if (write) { if (sysctl_drop_caches & 1) drop_pagecache(); diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 0c754e6..1cd6d9d 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,6 +1,9 @@ config ECRYPT_FS tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && CRYPTO && NET + depends on EXPERIMENTAL && KEYS && CRYPTO + select CRYPTO_ECB + select CRYPTO_CBC + select CRYPTO_MD5 help Encrypted filesystem that operates on the VFS layer. See <file:Documentation/filesystems/ecryptfs.txt> to learn more about diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index b91851f..fbb6e5e 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -245,13 +245,11 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) crypto_free_blkcipher(crypt_stat->tfm); if (crypt_stat->hash_tfm) crypto_free_hash(crypt_stat->hash_tfm); - mutex_lock(&crypt_stat->keysig_list_mutex); list_for_each_entry_safe(key_sig, key_sig_tmp, &crypt_stat->keysig_list, crypt_stat_list) { list_del(&key_sig->crypt_stat_list); kmem_cache_free(ecryptfs_key_sig_cache, key_sig); } - mutex_unlock(&crypt_stat->keysig_list_mutex); memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); } @@ -511,13 +509,14 @@ int ecryptfs_encrypt_page(struct page *page) + extent_offset), crypt_stat); rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, offset, crypt_stat->extent_size); - if (rc) { + if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting " "to write lower page; rc = [%d]" "\n", rc); goto out; } } + rc = 0; out: if (enc_extent_page) { kunmap(enc_extent_page); @@ -633,7 +632,7 @@ int ecryptfs_decrypt_page(struct page *page) rc = ecryptfs_read_lower(enc_extent_virt, offset, crypt_stat->extent_size, ecryptfs_inode); - if (rc) { + if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting " "to read lower page; rc = [%d]" "\n", rc); @@ -797,6 +796,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) kfree(full_alg_name); if (IS_ERR(crypt_stat->tfm)) { rc = PTR_ERR(crypt_stat->tfm); + crypt_stat->tfm = NULL; ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): " "Error initializing cipher [%s]\n", crypt_stat->cipher); @@ -925,7 +925,9 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( struct ecryptfs_global_auth_tok *global_auth_tok; int rc = 0; + mutex_lock(&crypt_stat->keysig_list_mutex); mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry(global_auth_tok, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { @@ -934,13 +936,13 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); if (rc) { printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); - mutex_unlock( - &mount_crypt_stat->global_auth_tok_list_mutex); goto out; } } - mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + out: + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + mutex_unlock(&crypt_stat->keysig_list_mutex); return rc; } @@ -1212,14 +1214,15 @@ int ecryptfs_read_and_validate_header_region(char *data, crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, ecryptfs_inode); - if (rc) { + if (rc < 0) { printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n", __func__, rc); goto out; } if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { rc = -EINVAL; - } + } else + rc = 0; out: return rc; } @@ -1314,10 +1317,11 @@ ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, 0, virt_len); - if (rc) + if (rc < 0) printk(KERN_ERR "%s: Error attempting to write header " - "information to lower file; rc = [%d]\n", __func__, - rc); + "information to lower file; rc = [%d]\n", __func__, rc); + else + rc = 0; return rc; } @@ -1597,7 +1601,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) } rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size, ecryptfs_inode); - if (!rc) + if (rc >= 0) rc = ecryptfs_read_headers_virt(page_virt, crypt_stat, ecryptfs_dentry, ECRYPTFS_VALIDATE_HEADER_SIZE); @@ -1702,7 +1706,7 @@ ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, } else { printk(KERN_ERR "%s: No support for requested filename " "encryption method in this release\n", __func__); - rc = -ENOTSUPP; + rc = -EOPNOTSUPP; goto out; } out: @@ -1763,7 +1767,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, if (IS_ERR(*key_tfm)) { rc = PTR_ERR(*key_tfm); printk(KERN_ERR "Unable to allocate crypto cipher with name " - "[%s]; rc = [%d]\n", cipher_name, rc); + "[%s]; rc = [%d]\n", full_alg_name, rc); goto out; } crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY); @@ -1776,7 +1780,8 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); if (rc) { printk(KERN_ERR "Error attempting to set key of size [%zd] for " - "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); + "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name, + rc); rc = -EINVAL; goto out; } @@ -2166,7 +2171,7 @@ int ecryptfs_encrypt_and_encode_filename( (*encoded_name)[(*encoded_name_size)] = '\0'; (*encoded_name_size)++; } else { - rc = -ENOTSUPP; + rc = -EOPNOTSUPP; } if (rc) { printk(KERN_ERR "%s: Error attempting to encode " diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 00b30a2..542f625 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -582,7 +582,7 @@ extern const struct inode_operations ecryptfs_dir_iops; extern const struct inode_operations ecryptfs_symlink_iops; extern const struct super_operations ecryptfs_sops; extern const struct dentry_operations ecryptfs_dops; -extern struct address_space_operations ecryptfs_aops; +extern const struct address_space_operations ecryptfs_aops; extern int ecryptfs_verbosity; extern unsigned int ecryptfs_message_buf_len; extern signed long ecryptfs_message_wait_timeout; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 2f0945d..056fed62 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -476,6 +476,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); struct dentry *lower_dir_dentry; + dget(lower_dentry); lower_dir_dentry = lock_parent(lower_dentry); rc = vfs_unlink(lower_dir_inode, lower_dentry); if (rc) { @@ -489,6 +490,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) d_drop(dentry); out_unlock: unlock_dir(lower_dir_dentry); + dput(lower_dentry); return rc; } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 259525c9..a0a7847 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -416,7 +416,9 @@ ecryptfs_find_global_auth_tok_for_sig( &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { - (*global_auth_tok) = walker; + rc = key_validate(walker->global_auth_tok_key); + if (!rc) + (*global_auth_tok) = walker; goto out; } } @@ -612,7 +614,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, } /* TODO: Support other key modules than passphrase for * filename encryption */ - BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); + if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { + rc = -EOPNOTSUPP; + printk(KERN_INFO "%s: Filename encryption only supports " + "password tokens\n", __func__); + goto out_free_unlock; + } sg_init_one( &s->hash_sg, (u8 *)s->auth_tok->token.password.session_key_encryption_key, @@ -910,7 +917,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, } /* TODO: Support other key modules than passphrase for * filename encryption */ - BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); + if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { + rc = -EOPNOTSUPP; + printk(KERN_INFO "%s: Filename encryption only supports " + "password tokens\n", __func__); + goto out_free_unlock; + } rc = crypto_blkcipher_setkey( s->desc.tfm, s->auth_tok->token.password.session_key_encryption_key, @@ -1316,8 +1328,10 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, rc = -EINVAL; goto out_free; } - ecryptfs_cipher_code_to_string(crypt_stat->cipher, - (u16)data[(*packet_size)]); + rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, + (u16)data[(*packet_size)]); + if (rc) + goto out_free; /* A little extra work to differentiate among the AES key * sizes; see RFC2440 */ switch(data[(*packet_size)++]) { @@ -1328,7 +1342,9 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, crypt_stat->key_size = (*new_auth_tok)->session_key.encrypted_key_size; } - ecryptfs_init_crypt_ctx(crypt_stat); + rc = ecryptfs_init_crypt_ctx(crypt_stat); + if (rc) + goto out_free; if (unlikely(data[(*packet_size)++] != 0x03)) { printk(KERN_WARNING "Only S2K ID 3 is currently supported\n"); rc = -ENOSYS; @@ -2366,21 +2382,18 @@ struct kmem_cache *ecryptfs_key_sig_cache; int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig) { struct ecryptfs_key_sig *new_key_sig; - int rc = 0; new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL); if (!new_key_sig) { - rc = -ENOMEM; printk(KERN_ERR "Error allocating from ecryptfs_key_sig_cache\n"); - goto out; + return -ENOMEM; } memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX); - mutex_lock(&crypt_stat->keysig_list_mutex); + /* Caller must hold keysig_list_mutex */ list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list); - mutex_unlock(&crypt_stat->keysig_list_mutex); -out: - return rc; + + return 0; } struct kmem_cache *ecryptfs_global_auth_tok_cache; diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index c6d7a4d..e14cf7e 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -136,6 +136,7 @@ int ecryptfs_privileged_open(struct file **lower_file, const struct cred *cred) { struct ecryptfs_open_req *req; + int flags = O_LARGEFILE; int rc = 0; /* Corresponding dput() and mntput() are done when the @@ -143,10 +144,14 @@ int ecryptfs_privileged_open(struct file **lower_file, * destroyed. */ dget(lower_dentry); mntget(lower_mnt); - (*lower_file) = dentry_open(lower_dentry, lower_mnt, - (O_RDWR | O_LARGEFILE), cred); + flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR; + (*lower_file) = dentry_open(lower_dentry, lower_mnt, flags, cred); if (!IS_ERR(*lower_file)) goto out; + if (flags & O_RDONLY) { + rc = PTR_ERR((*lower_file)); + goto out; + } req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL); if (!req) { rc = -ENOMEM; @@ -180,21 +185,8 @@ int ecryptfs_privileged_open(struct file **lower_file, __func__); goto out_unlock; } - if (IS_ERR(*req->lower_file)) { + if (IS_ERR(*req->lower_file)) rc = PTR_ERR(*req->lower_file); - dget(lower_dentry); - mntget(lower_mnt); - (*lower_file) = dentry_open(lower_dentry, lower_mnt, - (O_RDONLY | O_LARGEFILE), cred); - if (IS_ERR(*lower_file)) { - rc = PTR_ERR(*req->lower_file); - (*lower_file) = NULL; - printk(KERN_WARNING "%s: Error attempting privileged " - "open of lower file with either RW or RO " - "perms; rc = [%d]. Giving up.\n", - __func__, rc); - } - } out_unlock: mutex_unlock(&req->mux); out_free: diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 9f0aa98..c6ac85d 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -35,6 +35,7 @@ #include <linux/key.h> #include <linux/parser.h> #include <linux/fs_stack.h> +#include <linux/ima.h> #include "ecryptfs_kernel.h" /** @@ -118,6 +119,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) const struct cred *cred = current_cred(); struct ecryptfs_inode_info *inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); + int opened_lower_file = 0; int rc = 0; mutex_lock(&inode_info->lower_file_mutex); @@ -129,15 +131,17 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); rc = ecryptfs_privileged_open(&inode_info->lower_file, lower_dentry, lower_mnt, cred); - if (rc || IS_ERR(inode_info->lower_file)) { + if (rc) { printk(KERN_ERR "Error opening lower persistent file " "for lower_dentry [0x%p] and lower_mnt [0x%p]; " "rc = [%d]\n", lower_dentry, lower_mnt, rc); - rc = PTR_ERR(inode_info->lower_file); inode_info->lower_file = NULL; - } + } else + opened_lower_file = 1; } mutex_unlock(&inode_info->lower_file_mutex); + if (opened_lower_file) + ima_counts_get(inode_info->lower_file); return rc; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 5c6bab9..df4ce99d 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -396,9 +396,11 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0, sizeof(u64)); kfree(file_size_virt); - if (rc) + if (rc < 0) printk(KERN_ERR "%s: Error writing file size to header; " "rc = [%d]\n", __func__, rc); + else + rc = 0; out: return rc; } @@ -545,7 +547,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block) return rc; } -struct address_space_operations ecryptfs_aops = { +const struct address_space_operations ecryptfs_aops = { .writepage = ecryptfs_writepage, .readpage = ecryptfs_readpage, .write_begin = ecryptfs_write_begin, diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index a137c6e..0cc4faf 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -34,15 +34,14 @@ * * Write data to the lower file. * - * Returns zero on success; non-zero on error + * Returns bytes written on success; less than zero on error */ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size) { struct ecryptfs_inode_info *inode_info; - ssize_t octets_written; mm_segment_t fs_save; - int rc = 0; + ssize_t rc; inode_info = ecryptfs_inode_to_private(ecryptfs_inode); mutex_lock(&inode_info->lower_file_mutex); @@ -50,14 +49,9 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, inode_info->lower_file->f_pos = offset; fs_save = get_fs(); set_fs(get_ds()); - octets_written = vfs_write(inode_info->lower_file, data, size, - &inode_info->lower_file->f_pos); + rc = vfs_write(inode_info->lower_file, data, size, + &inode_info->lower_file->f_pos); set_fs(fs_save); - if (octets_written < 0) { - printk(KERN_ERR "%s: octets_written = [%td]; " - "expected [%td]\n", __func__, octets_written, size); - rc = -EINVAL; - } mutex_unlock(&inode_info->lower_file_mutex); mark_inode_dirty_sync(ecryptfs_inode); return rc; @@ -91,6 +85,8 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, + offset_in_page); virt = kmap(page_for_lower); rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); + if (rc > 0) + rc = 0; kunmap(page_for_lower); return rc; } @@ -229,30 +225,24 @@ out: * Read @size bytes of data at byte offset @offset from the lower * inode into memory location @data. * - * Returns zero on success; non-zero on error + * Returns bytes read on success; 0 on EOF; less than zero on error */ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, struct inode *ecryptfs_inode) { struct ecryptfs_inode_info *inode_info = ecryptfs_inode_to_private(ecryptfs_inode); - ssize_t octets_read; mm_segment_t fs_save; - int rc = 0; + ssize_t rc; mutex_lock(&inode_info->lower_file_mutex); BUG_ON(!inode_info->lower_file); inode_info->lower_file->f_pos = offset; fs_save = get_fs(); set_fs(get_ds()); - octets_read = vfs_read(inode_info->lower_file, data, size, - &inode_info->lower_file->f_pos); + rc = vfs_read(inode_info->lower_file, data, size, + &inode_info->lower_file->f_pos); set_fs(fs_save); - if (octets_read < 0) { - printk(KERN_ERR "%s: octets_read = [%td]; " - "expected [%td]\n", __func__, octets_read, size); - rc = -EINVAL; - } mutex_unlock(&inode_info->lower_file_mutex); return rc; } @@ -284,6 +274,8 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page); virt = kmap(page_for_ecryptfs); rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); + if (rc > 0) + rc = 0; kunmap(page_for_ecryptfs); flush_dcache_page(page_for_ecryptfs); return rc; diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 12d6496..b15a43a 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -77,7 +77,6 @@ static void ecryptfs_destroy_inode(struct inode *inode) struct ecryptfs_inode_info *inode_info; inode_info = ecryptfs_inode_to_private(inode); - mutex_lock(&inode_info->lower_file_mutex); if (inode_info->lower_file) { struct dentry *lower_dentry = inode_info->lower_file->f_dentry; @@ -89,7 +88,6 @@ static void ecryptfs_destroy_inode(struct inode *inode) d_drop(lower_dentry); } } - mutex_unlock(&inode_info->lower_file_mutex); ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); kmem_cache_free(ecryptfs_inode_info_cache, inode_info); } diff --git a/fs/eventfd.c b/fs/eventfd.c index 31d12de8..8b47e42 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -68,11 +68,16 @@ int eventfd_signal(struct eventfd_ctx *ctx, int n) } EXPORT_SYMBOL_GPL(eventfd_signal); +static void eventfd_free_ctx(struct eventfd_ctx *ctx) +{ + kfree(ctx); +} + static void eventfd_free(struct kref *kref) { struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); - kfree(ctx); + eventfd_free_ctx(ctx); } /** @@ -298,9 +303,23 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) } EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); -SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) +/** + * eventfd_file_create - Creates an eventfd file pointer. + * @count: Initial eventfd counter value. + * @flags: Flags for the eventfd file. + * + * This function creates an eventfd file pointer, w/out installing it into + * the fd table. This is useful when the eventfd file is used during the + * initialization of data structures that require extra setup after the eventfd + * creation. So the eventfd creation is split into the file pointer creation + * phase, and the file descriptor installation phase. + * In this way races with userspace closing the newly installed file descriptor + * can be avoided. + * Returns an eventfd file pointer, or a proper error pointer. + */ +struct file *eventfd_file_create(unsigned int count, int flags) { - int fd; + struct file *file; struct eventfd_ctx *ctx; /* Check the EFD_* constants for consistency. */ @@ -308,26 +327,48 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); if (flags & ~EFD_FLAGS_SET) - return -EINVAL; + return ERR_PTR(-EINVAL); ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) - return -ENOMEM; + return ERR_PTR(-ENOMEM); kref_init(&ctx->kref); init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; - /* - * When we call this, the initialization must be complete, since - * anon_inode_getfd() will install the fd. - */ - fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, - flags & EFD_SHARED_FCNTL_FLAGS); - if (fd < 0) - kfree(ctx); + file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, + flags & EFD_SHARED_FCNTL_FLAGS); + if (IS_ERR(file)) + eventfd_free_ctx(ctx); + + return file; +} + +SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) +{ + int fd, error; + struct file *file; + + error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS); + if (error < 0) + return error; + fd = error; + + file = eventfd_file_create(count, flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } + fd_install(fd, file); + return fd; + +err_put_unused_fd: + put_unused_fd(fd); + + return error; } SYSCALL_DEFINE1(eventfd, unsigned int, count) @@ -33,7 +33,7 @@ #include <linux/string.h> #include <linux/init.h> #include <linux/pagemap.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/key.h> @@ -55,6 +55,7 @@ #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> +#include <linux/pipe_fs_i.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -63,6 +64,7 @@ int core_uses_pid; char core_pattern[CORENAME_MAX_SIZE] = "core"; +unsigned int core_pipe_limit; int suid_dumpable = 0; /* The maximal length of core_pattern is also specified in sysctl.c */ @@ -845,6 +847,9 @@ static int de_thread(struct task_struct *tsk) sig->notify_count = 0; no_thread_group: + if (current->mm) + setmax_mm_hiwater_rss(&sig->maxrss, current->mm); + exit_itimers(sig); flush_itimer_signals(); @@ -923,7 +928,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) task_lock(tsk); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); - perf_counter_comm(tsk); + perf_event_comm(tsk); } int flush_old_exec(struct linux_binprm * bprm) @@ -997,7 +1002,7 @@ int flush_old_exec(struct linux_binprm * bprm) * security domain: */ if (!get_dumpable(current->mm)) - perf_counter_exit_task(current); + perf_event_exit_task(current); /* An exec changes our domain. We are no longer part of the thread group */ @@ -1354,6 +1359,8 @@ int do_execve(char * filename, if (retval < 0) goto out; + current->stack_start = current->mm->start_stack; + /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; @@ -1388,18 +1395,16 @@ out_ret: return retval; } -int set_binfmt(struct linux_binfmt *new) +void set_binfmt(struct linux_binfmt *new) { - struct linux_binfmt *old = current->binfmt; + struct mm_struct *mm = current->mm; - if (new) { - if (!try_module_get(new->module)) - return -1; - } - current->binfmt = new; - if (old) - module_put(old->module); - return 0; + if (mm->binfmt) + module_put(mm->binfmt->module); + + mm->binfmt = new; + if (new) + __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); @@ -1723,6 +1728,29 @@ int get_dumpable(struct mm_struct *mm) return (ret >= 2) ? 2 : ret; } +static void wait_for_dump_helpers(struct file *file) +{ + struct pipe_inode_info *pipe; + + pipe = file->f_path.dentry->d_inode->i_pipe; + + pipe_lock(pipe); + pipe->readers++; + pipe->writers--; + + while ((pipe->readers > 1) && (!signal_pending(current))) { + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + pipe_wait(pipe); + } + + pipe->readers--; + pipe->writers++; + pipe_unlock(pipe); + +} + + void do_coredump(long signr, int exit_code, struct pt_regs *regs) { struct core_state core_state; @@ -1739,11 +1767,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; char **helper_argv = NULL; int helper_argc = 0; - char *delimit; + int dump_count = 0; + static atomic_t core_dump_count = ATOMIC_INIT(0); audit_core_dumps(signr); - binfmt = current->binfmt; + binfmt = mm->binfmt; if (!binfmt || !binfmt->core_dump) goto fail; @@ -1794,54 +1823,63 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) lock_kernel(); ispipe = format_corename(corename, signr); unlock_kernel(); - /* - * Don't bother to check the RLIMIT_CORE value if core_pattern points - * to a pipe. Since we're not writing directly to the filesystem - * RLIMIT_CORE doesn't really apply, as no actual core file will be - * created unless the pipe reader choses to write out the core file - * at which point file size limits and permissions will be imposed - * as it does with any other process - */ + if ((!ispipe) && (core_limit < binfmt->min_coredump)) goto fail_unlock; if (ispipe) { + if (core_limit == 0) { + /* + * Normally core limits are irrelevant to pipes, since + * we're not writing to the file system, but we use + * core_limit of 0 here as a speacial value. Any + * non-zero limit gets set to RLIM_INFINITY below, but + * a limit of 0 skips the dump. This is a consistent + * way to catch recursive crashes. We can still crash + * if the core_pattern binary sets RLIM_CORE = !0 + * but it runs as root, and can do lots of stupid things + * Note that we use task_tgid_vnr here to grab the pid + * of the process group leader. That way we get the + * right pid if a thread in a multi-threaded + * core_pattern process dies. + */ + printk(KERN_WARNING + "Process %d(%s) has RLIMIT_CORE set to 0\n", + task_tgid_vnr(current), current->comm); + printk(KERN_WARNING "Aborting core\n"); + goto fail_unlock; + } + + dump_count = atomic_inc_return(&core_dump_count); + if (core_pipe_limit && (core_pipe_limit < dump_count)) { + printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n", + task_tgid_vnr(current), current->comm); + printk(KERN_WARNING "Skipping core dump\n"); + goto fail_dropcount; + } + helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); if (!helper_argv) { printk(KERN_WARNING "%s failed to allocate memory\n", __func__); - goto fail_unlock; - } - /* Terminate the string before the first option */ - delimit = strchr(corename, ' '); - if (delimit) - *delimit = '\0'; - delimit = strrchr(helper_argv[0], '/'); - if (delimit) - delimit++; - else - delimit = helper_argv[0]; - if (!strcmp(delimit, current->comm)) { - printk(KERN_NOTICE "Recursive core dump detected, " - "aborting\n"); - goto fail_unlock; + goto fail_dropcount; } core_limit = RLIM_INFINITY; /* SIGPIPE can happen, but it's just never processed */ - if (call_usermodehelper_pipe(corename+1, helper_argv, NULL, + if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, &file)) { printk(KERN_INFO "Core dump to %s pipe failed\n", corename); - goto fail_unlock; + goto fail_dropcount; } } else file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); if (IS_ERR(file)) - goto fail_unlock; + goto fail_dropcount; inode = file->f_path.dentry->d_inode; if (inode->i_nlink > 1) goto close_fail; /* multiple links - don't dump */ @@ -1870,7 +1908,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) if (retval) current->signal->group_exit_code |= 0x80; close_fail: + if (ispipe && core_pipe_limit) + wait_for_dump_helpers(file); filp_close(file, NULL); +fail_dropcount: + if (dump_count) + atomic_dec(&core_dump_count); fail_unlock: if (helper_argv) argv_free(helper_argv); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 5ab10c3..9f500de 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -214,7 +214,6 @@ int exofs_sync_fs(struct super_block *sb, int wait) } lock_super(sb); - lock_kernel(); sbi = sb->s_fs_info; fscb->s_nextid = cpu_to_le64(sbi->s_nextid); fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); @@ -245,7 +244,6 @@ int exofs_sync_fs(struct super_block *sb, int wait) out: if (or) osd_end_request(or); - unlock_kernel(); unlock_super(sb); kfree(fscb); return ret; @@ -268,8 +266,6 @@ static void exofs_put_super(struct super_block *sb) int num_pend; struct exofs_sb_info *sbi = sb->s_fs_info; - lock_kernel(); - if (sb->s_dirt) exofs_write_super(sb); @@ -286,8 +282,6 @@ static void exofs_put_super(struct super_block *sb) osduld_put_device(sbi->s_dev); kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); } /* diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index d636e12..a63d442 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -static int +int ext2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); @@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext2_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext2_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index ecefe47..3ff6cbb 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern int ext2_permission (struct inode *, int); +extern int ext2_check_acl (struct inode *, int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); #else #include <linux/sched.h> -#define ext2_permission NULL +#define ext2_check_acl NULL #define ext2_get_acl NULL #define ext2_set_acl NULL diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 2b9e47d..a2f3afd 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e271303..ade6340 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode, unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); /* We used to sync bh here if IS_SYNC(inode). - * But we now rely upon generic_osync_inode() + * But we now rely upon generic_write_sync() * and b_inode_buffers. But not for directories. */ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) @@ -819,6 +819,7 @@ const struct address_space_operations ext2_aops = { .writepages = ext2_writepages, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; const struct address_space_operations ext2_aops_xip = { @@ -837,6 +838,7 @@ const struct address_space_operations ext2_nobh_aops = { .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, .migratepage = buffer_migrate_page, + .error_remove_page = generic_error_remove_page, }; /* diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 78d9b92..dd7175c 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -70,7 +70,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str if (PTR_ERR(inode) == -ESTALE) { ext2_error(dir->i_sb, __func__, "deleted inode referenced: %lu", - ino); + (unsigned long) ino); return ERR_PTR(-EIO); } else { return ERR_CAST(inode); @@ -400,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, }; const struct inode_operations ext2_special_inode_operations = { @@ -411,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, }; diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index b72b858..c18fbf3 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -20,7 +20,7 @@ __inode_direct_access(struct inode *inode, sector_t block, void **kaddr, unsigned long *pfn) { struct block_device *bdev = inode->i_sb->s_bdev; - struct block_device_operations *ops = bdev->bd_disk->fops; + const struct block_device_operations *ops = bdev->bd_disk->fops; sector_t sector; sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index e167bae..c9b0df3 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -static int +int ext3_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); @@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext3_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext3_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext3_new_inode. * diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 07d15a3..5973346 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_permission (struct inode *, int); +extern int ext3_check_acl (struct inode *, int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext3_permission NULL +#define ext3_check_acl NULL static inline int ext3_acl_chmod(struct inode *inode) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 5b49704..388bbdf 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp) return 0; } -static ssize_t -ext3_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; - ssize_t ret; - int err; - - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - - /* - * Skip flushing if there was an error, or if nothing was written. - */ - if (ret <= 0) - return ret; - - /* - * If the inode is IS_SYNC, or is O_SYNC and we are doing data - * journalling then we need to make sure that we force the transaction - * to disk to keep all metadata uptodate synchronously. - */ - if (file->f_flags & O_SYNC) { - /* - * If we are non-data-journaled, then the dirty data has - * already been flushed to backing store by generic_osync_inode, - * and the inode has been flushed too if there have been any - * modifications other than mere timestamp updates. - * - * Open question --- do we care about flushing timestamps too - * if the inode is IS_SYNC? - */ - if (!ext3_should_journal_data(inode)) - return ret; - - goto force_commit; - } - - /* - * So we know that there has been no forced data flush. If the inode - * is marked IS_SYNC, we need to force one ourselves. - */ - if (!IS_SYNC(inode)) - return ret; - - /* - * Open question #2 --- should we force data to disk here too? If we - * don't, the only impact is that data=writeback filesystems won't - * flush data to disk automatically on IS_SYNC, only metadata (but - * historically, that is what ext2 has done.) - */ - -force_commit: - err = ext3_force_commit(inode->i_sb); - if (err) - return err; - return ret; -} - const struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, - .aio_write = ext3_file_write, + .aio_write = generic_file_aio_write, .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, @@ -137,7 +78,7 @@ const struct inode_operations ext3_file_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index d336341..451d166 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -23,6 +23,7 @@ */ #include <linux/time.h> +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/writeback.h> @@ -73,7 +74,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) } if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; + goto flush; /* * The VFS has written the file data. If the inode is unaltered @@ -85,7 +86,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) .nr_to_write = 0, /* sys_fsync did this */ }; ret = sync_inode(inode, &wbc); + goto out; } +flush: + /* + * In case we didn't commit a transaction, we have to flush + * disk caches manually so that data really is on persistent + * storage + */ + if (test_opt(inode->i_sb, BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); out: return ret; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index b49908a..acf1b14 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) * so before we call here everything must be consistently dirtied against * this transaction. */ -static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) +static int truncate_restart_transaction(handle_t *handle, struct inode *inode) { + int ret; + jbd_debug(2, "restarting handle %p\n", handle); - return ext3_journal_restart(handle, blocks_for_truncate(inode)); + /* + * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle + * At this moment, get_block can be called only for blocks inside + * i_size since page cache has been already dropped and writes are + * blocked by i_mutex. So we can safely drop the truncate_mutex. + */ + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + ret = ext3_journal_restart(handle, blocks_for_truncate(inode)); + mutex_lock(&EXT3_I(inode)->truncate_mutex); + return ret; } /* @@ -1819,6 +1830,7 @@ static const struct address_space_operations ext3_ordered_aops = { .direct_IO = ext3_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations ext3_writeback_aops = { @@ -1834,6 +1846,7 @@ static const struct address_space_operations ext3_writeback_aops = { .direct_IO = ext3_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations ext3_journalled_aops = { @@ -1848,6 +1861,7 @@ static const struct address_space_operations ext3_journalled_aops = { .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; void ext3_set_aops(struct inode *inode) @@ -2072,7 +2086,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode, ext3_journal_dirty_metadata(handle, bh); } ext3_mark_inode_dirty(handle, inode); - ext3_journal_test_restart(handle, inode); + truncate_restart_transaction(handle, inode); if (bh) { BUFFER_TRACE(bh, "retaking write access"); ext3_journal_get_write_access(handle, bh); @@ -2282,7 +2296,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, return; if (try_to_extend_transaction(handle, inode)) { ext3_mark_inode_dirty(handle, inode); - ext3_journal_test_restart(handle, inode); + truncate_restart_transaction(handle, inode); } ext3_free_blocks(handle, inode, nr, 1); @@ -2892,6 +2906,10 @@ static int ext3_do_update_inode(handle_t *handle, struct buffer_head *bh = iloc->bh; int err = 0, rc, block; +again: + /* we can't allow multiple procs in here at once, its a bit racey */ + lock_buffer(bh); + /* For fields not not tracking in the in-memory inode, * initialise them to zero for new inodes. */ if (ei->i_state & EXT3_STATE_NEW) @@ -2951,16 +2969,20 @@ static int ext3_do_update_inode(handle_t *handle, /* If this is the first large file * created, add a flag to the superblock. */ + unlock_buffer(bh); err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); if (err) goto out_brelse; + ext3_update_dynamic_rev(sb); EXT3_SET_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_LARGE_FILE); handle->h_sync = 1; err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + /* get our lock and start over */ + goto again; } } } @@ -2983,6 +3005,7 @@ static int ext3_do_update_inode(handle_t *handle, raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + unlock_buffer(bh); rc = ext3_journal_dirty_metadata(handle, bh); if (!err) err = rc; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 6ff7b97..aad6400 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, }; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index a8d80a7..7a520a8 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -720,7 +720,7 @@ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, static ssize_t ext3_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); -static struct dquot_operations ext3_quota_operations = { +static const struct dquot_operations ext3_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, .alloc_space = dquot_alloc_space, @@ -737,7 +737,7 @@ static struct dquot_operations ext3_quota_operations = { .destroy_dquot = dquot_destroy, }; -static struct quotactl_ops ext3_qctl_operations = { +static const struct quotactl_ops ext3_qctl_operations = { .quota_on = ext3_quota_on, .quota_off = vfs_quota_off, .quota_sync = vfs_quota_sync, @@ -2321,7 +2321,18 @@ static int ext3_commit_super(struct super_block *sb, if (!sbh) return error; - es->s_wtime = cpu_to_le32(get_seconds()); + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 418b6f3..9f2d45d 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -26,20 +26,6 @@ config EXT4_FS If unsure, say N. -config EXT4DEV_COMPAT - bool "Enable ext4dev compatibility" - depends on EXT4_FS - help - Starting with 2.6.28, the name of the ext4 filesystem was - renamed from ext4dev to ext4. Unfortunately there are some - legacy userspace programs (such as klibc's fstype) have - "ext4dev" hardcoded. - - To enable backwards compatibility so that systems that are - still expecting to mount ext4 filesystems using ext4dev, - chose Y here. This feature will go away by 2.6.31, so - please arrange to get your userspace programs fixed! - config EXT4_FS_XATTR bool "Ext4 extended attributes" depends on EXT4_FS @@ -77,3 +63,12 @@ config EXT4_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. + +config EXT4_DEBUG + bool "EXT4 debugging support" + depends on EXT4_FS + help + Enables run-time debugging support for the ext4 filesystem. + + If you select Y here, then you will be able to turn on debugging + with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index f6d8967..0df88b2 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -static int +int ext4_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); @@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext4_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext4_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 949789d..9d843d5 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_permission(struct inode *, int); +extern int ext4_check_acl(struct inode *, int); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext4_permission NULL +#define ext4_check_acl NULL static inline int ext4_acl_chmod(struct inode *inode) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index e2126d7..1d04189 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, * new bitmap information */ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); - ext4_mb_update_group_info(grp, blocks_freed); + grp->bb_free += blocks_freed; up_write(&grp->alloc_sem); /* We dirtied the bitmap block */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9714db3..984ca0c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -65,29 +65,37 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ /* prefer goal again. length */ -#define EXT4_MB_HINT_MERGE 1 +#define EXT4_MB_HINT_MERGE 0x0001 /* blocks already reserved */ -#define EXT4_MB_HINT_RESERVED 2 +#define EXT4_MB_HINT_RESERVED 0x0002 /* metadata is being allocated */ -#define EXT4_MB_HINT_METADATA 4 +#define EXT4_MB_HINT_METADATA 0x0004 /* first blocks in the file */ -#define EXT4_MB_HINT_FIRST 8 +#define EXT4_MB_HINT_FIRST 0x0008 /* search for the best chunk */ -#define EXT4_MB_HINT_BEST 16 +#define EXT4_MB_HINT_BEST 0x0010 /* data is being allocated */ -#define EXT4_MB_HINT_DATA 32 +#define EXT4_MB_HINT_DATA 0x0020 /* don't preallocate (for tails) */ -#define EXT4_MB_HINT_NOPREALLOC 64 +#define EXT4_MB_HINT_NOPREALLOC 0x0040 /* allocate for locality group */ -#define EXT4_MB_HINT_GROUP_ALLOC 128 +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 /* allocate goal blocks or none */ -#define EXT4_MB_HINT_GOAL_ONLY 256 +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 /* goal is meaningful */ -#define EXT4_MB_HINT_TRY_GOAL 512 +#define EXT4_MB_HINT_TRY_GOAL 0x0200 /* blocks already pre-reserved by delayed allocation */ -#define EXT4_MB_DELALLOC_RESERVED 1024 +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 struct ext4_allocation_request { @@ -112,6 +120,31 @@ struct ext4_allocation_request { }; /* + * For delayed allocation tracking + */ +struct mpage_da_data { + struct inode *inode; + sector_t b_blocknr; /* start block number of extent */ + size_t b_size; /* size of extent */ + unsigned long b_state; /* state of the extent */ + unsigned long first_page, next_page; /* extent of pages */ + struct writeback_control *wbc; + int io_done; + int pages_written; + int retval; +}; +#define DIO_AIO_UNWRITTEN 0x1 +typedef struct ext4_io_end { + struct list_head list; /* per-file finished AIO list */ + struct inode *inode; /* file being written to */ + unsigned int flag; /* unwritten or not */ + int error; /* I/O error code */ + ext4_lblk_t offset; /* offset in the file */ + size_t size; /* size of the extent */ + struct work_struct work; /* data work queue */ +} ext4_io_end_t; + +/* * Special inodes numbers */ #define EXT4_BAD_INO 1 /* Bad blocks inode */ @@ -251,7 +284,6 @@ struct flex_groups { #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ -#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ @@ -289,6 +321,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ +#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { @@ -330,7 +363,16 @@ struct ext4_new_group_data { /* Call ext4_da_update_reserve_space() after successfully allocating the blocks */ #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 - + /* caller is from the direct IO path, request to creation of an + unitialized extents if not allocated, split the uninitialized + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_DIO 0x0010 +#define EXT4_GET_BLOCKS_CONVERT 0x0020 +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Convert extent to initialized after direct IO complete */ +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_DIO_CREATE_EXT) /* * ioctl commands @@ -386,6 +428,9 @@ struct ext4_mount_options { #endif }; +/* Max physical block we can addres w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + /* * Structure of an inode on the disk */ @@ -456,7 +501,6 @@ struct move_extent { __u64 len; /* block length to be moved */ __u64 moved_len; /* moved block length */ }; -#define MAX_DEFRAG_SIZE ((1UL<<31) - 1) #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) @@ -481,8 +525,8 @@ struct move_extent { static inline __le32 ext4_encode_extra_time(struct timespec *time) { return cpu_to_le32((sizeof(time->tv_sec) > 4 ? - time->tv_sec >> 32 : 0) | - ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); + (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | + ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); } static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) @@ -490,7 +534,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) if (sizeof(time->tv_sec) > 4) time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; } #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ @@ -653,6 +697,11 @@ struct ext4_inode_info { __u16 i_extra_isize; spinlock_t i_block_reservation_lock; + + /* completed async DIOs that might need unwritten extents handling */ + struct list_head i_aio_dio_complete_list; + /* current io_end structure for async DIO write*/ + ext4_io_end_t *cur_aio_dio; }; /* @@ -694,7 +743,6 @@ struct ext4_inode_info { #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ -#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ @@ -841,6 +889,7 @@ struct ext4_sb_info { unsigned long s_gdb_count; /* Number of group descriptor blocks */ unsigned long s_desc_per_block; /* Number of group descriptors per block */ ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ unsigned long s_overhead_last; /* Last calculated overhead */ unsigned long s_blocks_last; /* Last seen block count */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ @@ -923,18 +972,11 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; + unsigned int s_max_writeback_mb_bump; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; - /* history to debug policy */ - struct ext4_mb_history *s_mb_history; - int s_mb_history_cur; - int s_mb_history_max; - int s_mb_history_num; - spinlock_t s_mb_history_lock; - int s_mb_history_filter; - /* stats for buddy allocator */ spinlock_t s_mb_pa_lock; atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -950,6 +992,7 @@ struct ext4_sb_info { atomic_t s_mb_lost_chunks; atomic_t s_mb_preallocated; atomic_t s_mb_discarded; + atomic_t s_lock_busy; /* locality groups */ struct ext4_locality_group *s_locality_groups; @@ -960,6 +1003,9 @@ struct ext4_sb_info { unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; + + /* workqueue for dio unwritten */ + struct workqueue_struct *dio_unwritten_wq; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1340,8 +1386,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *, ext4_fsblk_t, unsigned long, int, unsigned long *); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); -extern void ext4_mb_update_group_info(struct ext4_group_info *grp, - ext4_grpblk_t add); extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); extern void ext4_mb_put_buddy_cache_lock(struct super_block *, ext4_group_t, int); @@ -1367,6 +1411,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); extern int ext4_alloc_da_blocks(struct inode *inode); @@ -1378,7 +1423,7 @@ extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t ext4_get_reserved_space(struct inode *inode); - +extern int flush_aio_dio_completed_IO(struct inode *inode); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); @@ -1575,15 +1620,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; - unsigned short bb_first_free; - unsigned short bb_free; - unsigned short bb_fragments; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ struct list_head bb_prealloc_list; #ifdef DOUBLE_CHECK void *bb_bitmap; #endif struct rw_semaphore alloc_sem; - unsigned short bb_counters[]; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ }; #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 @@ -1591,15 +1639,42 @@ struct ext4_group_info { #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, ext4_group_t group) { return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); } +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { - spin_lock(ext4_group_lock_ptr(sb, group)); + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } } static inline void ext4_unlock_group(struct super_block *sb, @@ -1650,6 +1725,8 @@ extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + loff_t len); extern int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, struct buffer_head *bh, int flags); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 20a8410..2ca6864 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -43,8 +43,7 @@ #define CHECK_BINSEARCH__ /* - * If EXT_DEBUG is defined you can use the 'extdebug' mount option - * to get lots of info about what's going on. + * Turn on EXT_DEBUG to get lots of info about extents operations. */ #define EXT_DEBUG__ #ifdef EXT_DEBUG @@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, #define EXT_BREAK 1 #define EXT_REPEAT 2 +/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */ #define EXT_MAX_BLOCK 0xffffffff /* @@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) +{ + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); +} + extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); @@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *); extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, ext_prepare_callback, void *); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index eb27fd0..6a94099 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle, handle, err); } else - brelse(bh); + bforget(bh); return err; } @@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, handle, err); } else - brelse(bh); + bforget(bh); return err; } @@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, ext4_journal_abort_handle(where, __func__, bh, handle, err); } else { - mark_buffer_dirty(bh); + if (inode && bh) + mark_buffer_dirty_inode(bh, inode); + else + mark_buffer_dirty(bh); if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 139fb8c..a286598 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, handle_t *handle); -#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) +/* Note: Do not use this for NULL handles. This is only to determine if + * a properly allocated handle is using a journal or not. */ static inline int ext4_handle_valid(handle_t *handle) { - if (handle == EXT4_NOJOURNAL_HANDLE) + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) return 0; return 1; } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 73ebfb4..10539e3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } -static int ext4_ext_journal_restart(handle_t *handle, int needed) +static int ext4_ext_truncate_extend_restart(handle_t *handle, + struct inode *inode, + int needed) { int err; @@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) err = ext4_journal_extend(handle, needed); if (err <= 0) return err; - return ext4_journal_restart(handle, needed); + err = ext4_truncate_restart_trans(handle, inode, needed); + /* + * We have dropped i_data_sem so someone might have cached again + * an extent we are going to truncate. + */ + ext4_ext_invalidate_cache(inode); + + return err; } /* @@ -220,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, return newblock; } -static int ext4_ext_space_block(struct inode *inode) +static inline int ext4_ext_space_block(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); + if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 6) - size = 6; + if (size > 6) + size = 6; #endif + } return size; } -static int ext4_ext_space_block_idx(struct inode *inode) +static inline int ext4_ext_space_block_idx(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); + if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 5) - size = 5; + if (size > 5) + size = 5; #endif + } return size; } -static int ext4_ext_space_root(struct inode *inode) +static inline int ext4_ext_space_root(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); + if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 3) - size = 3; + if (size > 3) + size = 3; #endif + } return size; } -static int ext4_ext_space_root_idx(struct inode *inode) +static inline int ext4_ext_space_root_idx(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); + if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 4) - size = 4; + if (size > 4) + size = 4; #endif + } return size; } @@ -284,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) int lcap, icap, rcap, leafs, idxs, num; int newextents = blocks; - rcap = ext4_ext_space_root_idx(inode); - lcap = ext4_ext_space_block(inode); - icap = ext4_ext_space_block_idx(inode); + rcap = ext4_ext_space_root_idx(inode, 0); + lcap = ext4_ext_space_block(inode, 0); + icap = ext4_ext_space_block_idx(inode, 0); /* number of new leaf blocks needed */ num = leafs = (newextents + lcap - 1) / lcap; @@ -311,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth) if (depth == ext_depth(inode)) { if (depth == 0) - max = ext4_ext_space_root(inode); + max = ext4_ext_space_root(inode, 1); else - max = ext4_ext_space_root_idx(inode); + max = ext4_ext_space_root_idx(inode, 1); } else { if (depth == 0) - max = ext4_ext_space_block(inode); + max = ext4_ext_space_block(inode, 1); else - max = ext4_ext_space_block_idx(inode); + max = ext4_ext_space_block_idx(inode, 1); } return max; @@ -437,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), idx_pblock(path->p_idx)); } else if (path->p_ext) { - ext_debug(" %d:%d:%llu ", + ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), + ext4_ext_is_uninitialized(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext_pblock(path->p_ext)); } else @@ -460,8 +478,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); + ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { - ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), + ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), ext4_ext_get_actual_len(ex), ext_pblock(ex)); } ext_debug("\n"); @@ -580,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode, } path->p_ext = l - 1; - ext_debug(" -> %d:%llu:%d ", + ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext_pblock(path->p_ext), + ext4_ext_is_uninitialized(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH @@ -612,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) eh->eh_depth = 0; eh->eh_entries = 0; eh->eh_magic = EXT4_EXT_MAGIC; - eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode)); + eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); ext4_mark_inode_dirty(handle, inode); ext4_ext_invalidate_cache(inode); return 0; @@ -701,7 +723,7 @@ err: * insert new index [@logical;@ptr] into the block at @curp; * check where to insert: before @curp or after @curp */ -static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, +int ext4_ext_insert_index(handle_t *handle, struct inode *inode, struct ext4_ext_path *curp, int logical, ext4_fsblk_t ptr) { @@ -837,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh = ext_block_hdr(bh); neh->eh_entries = 0; - neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; ex = EXT_FIRST_EXTENT(neh); @@ -850,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, path[depth].p_ext++; while (path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)) { - ext_debug("move %d:%llu:%d in new leaf %llu\n", + ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(path[depth].p_ext->ee_block), ext_pblock(path[depth].p_ext), + ext4_ext_is_uninitialized(path[depth].p_ext), ext4_ext_get_actual_len(path[depth].p_ext), newblock); /*memmove(ex++, path[depth].p_ext++, @@ -912,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh = ext_block_hdr(bh); neh->eh_entries = cpu_to_le16(1); neh->eh_magic = EXT4_EXT_MAGIC; - neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); neh->eh_depth = cpu_to_le16(depth - i); fidx = EXT_FIRST_INDEX(neh); fidx->ei_block = border; @@ -1037,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, /* old root could have indexes or leaves * so calculate e_max right way */ if (ext_depth(inode)) - neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); else - neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; set_buffer_uptodate(bh); unlock_buffer(bh); @@ -1054,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, goto out; curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; - curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); + curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); curp->p_hdr->eh_entries = cpu_to_le16(1); curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); @@ -1563,7 +1586,7 @@ out: */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *newext) + struct ext4_extent *newext, int flag) { struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; @@ -1579,10 +1602,13 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, BUG_ON(path[depth].p_hdr == NULL); /* try to insert block into found extent and return */ - if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append %d block to %d:%d (from %llu)\n", + if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + && ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", + ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), ext4_ext_get_actual_len(ex), ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -1651,9 +1677,10 @@ has_space: if (!nearex) { /* there is no extent in this leaf, create first one */ - ext_debug("first extent in the leaf: %d:%llu:%d\n", + ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), + ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext)); path[depth].p_ext = EXT_FIRST_EXTENT(eh); } else if (le32_to_cpu(newext->ee_block) @@ -1663,10 +1690,11 @@ has_space: len = EXT_MAX_EXTENT(eh) - nearex; len = (len - 1) * sizeof(struct ext4_extent); len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:%d after: nearest 0x%p, " + ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), + ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 2, nearex + 1, len); @@ -1676,10 +1704,11 @@ has_space: BUG_ON(newext->ee_block == nearex->ee_block); len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:%d before: nearest 0x%p, " + ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), + ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 1, nearex, len); @@ -1694,7 +1723,8 @@ has_space: merge: /* try to merge extents to the right */ - ext4_ext_try_to_merge(inode, path, nearex); + if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + ext4_ext_try_to_merge(inode, path, nearex); /* try to merge extents to the left */ @@ -2094,7 +2124,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, else uninitialized = 0; - ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); + ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, + uninitialized, ex_ee_len); path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; @@ -2138,7 +2169,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, } credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); - err = ext4_ext_journal_restart(handle, credits); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); if (err) goto out; @@ -2327,7 +2358,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) if (err == 0) { ext_inode_hdr(inode)->eh_depth = 0; ext_inode_hdr(inode)->eh_max = - cpu_to_le16(ext4_ext_space_root(inode)); + cpu_to_le16(ext4_ext_space_root(inode, 0)); err = ext4_ext_dirty(handle, inode, path); } } @@ -2349,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb) */ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { +#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) printk(KERN_INFO "EXT4-fs: file extents enabled"); #ifdef AGGRESSIVE_TEST printk(", aggressive tests"); @@ -2360,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb) printk(", stats"); #endif printk("\n"); +#endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); EXT4_SB(sb)->s_ext_min = 1 << 30; @@ -2461,7 +2494,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) } #define EXT4_EXT_ZERO_LEN 7 - /* * This function is called by ext4_ext_get_blocks() if someone tries to write * to an uninitialized extent. It may result in splitting the uninitialized @@ -2554,7 +2586,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex3->ee_block = cpu_to_le32(iblock); ext4_ext_store_pblock(ex3, newblock); ex3->ee_len = cpu_to_le16(allocated); - err = ext4_ext_insert_extent(handle, inode, path, ex3); + err = ext4_ext_insert_extent(handle, inode, path, + ex3, 0); if (err == -ENOSPC) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) @@ -2610,7 +2643,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex3, newblock + max_blocks); ex3->ee_len = cpu_to_le16(allocated - max_blocks); ext4_ext_mark_uninitialized(ex3); - err = ext4_ext_insert_extent(handle, inode, path, ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); if (err == -ENOSPC) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) @@ -2728,7 +2761,191 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = ext4_ext_dirty(handle, inode, path + depth); goto out; insert: - err = ext4_ext_insert_extent(handle, inode, path, &newex); + err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } else if (err) + goto fix_extent_len; +out: + ext4_ext_show_leaf(inode, path); + return err ? err : allocated; + +fix_extent_len: + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; +} + +/* + * This function is called by ext4_ext_get_blocks() from + * ext4_get_blocks_dio_write() when DIO to write + * to an uninitialized extent. + * + * Writing to an uninitized extent may result in splitting the uninitialized + * extent into multiple /intialized unintialized extents (up to three) + * There are three possibilities: + * a> There is no split required: Entire extent should be uninitialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + * + * One of more index blocks maybe needed if the extent tree grow after + * the unintialized extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the uninitialized extent before DIO submit + * the IO. The uninitilized extent called at this time will be split + * into three uninitialized extent(at most). After IO complete, the part + * being filled will be convert to initialized by the end_io callback function + * via ext4_convert_unwritten_extents(). + */ +static int ext4_split_unwritten_extents(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t iblock, + unsigned int max_blocks, + int flags) +{ + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex1 = NULL; + struct ext4_extent *ex2 = NULL; + struct ext4_extent *ex3 = NULL; + struct ext4_extent_header *eh; + ext4_lblk_t ee_block; + unsigned int allocated, ee_len, depth; + ext4_fsblk_t newblock; + int err = 0; + int ret = 0; + + ext_debug("ext4_split_unwritten_extents: inode %lu," + "iblock %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)iblock, max_blocks); + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (iblock - ee_block); + newblock = iblock - ee_block + ext_pblock(ex); + ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + + /* + * if the entire unintialized extent length less than + * the size of extent to write, there is no need to split + * uninitialized extent + */ + if (allocated <= max_blocks) + return ret; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* ex1: ee_block to iblock - 1 : uninitialized */ + if (iblock > ee_block) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * for sanity, update the length of the ex2 extent before + * we insert ex3, if ex1 is NULL. This is to avoid temporary + * overlap of blocks. + */ + if (!ex1 && allocated > max_blocks) + ex2->ee_len = cpu_to_le16(max_blocks); + /* ex3: to ee_block + ee_len : uninitialised */ + if (allocated > max_blocks) { + unsigned int newdepth; + ex3 = &newex; + ex3->ee_block = cpu_to_le32(iblock + max_blocks); + ext4_ext_store_pblock(ex3, newblock + max_blocks); + ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + /* blocks available from iblock */ + return allocated; + + } else if (err) + goto fix_extent_len; + /* + * The depth, and hence eh & ex might change + * as part of the insert above. + */ + newdepth = ext_depth(inode); + /* + * update the extent length after successful insert of the + * split extent + */ + orig_ex.ee_len = cpu_to_le16(ee_len - + ext4_ext_get_actual_len(ex3)); + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + allocated = max_blocks; + } + /* + * If there was a change of depth as part of the + * insertion of ex3 above, we need to update the length + * of the ex1 extent again here + */ + if (ex1 && ex1 != ex) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, + * uninitialised still. + */ + ex2->ee_block = cpu_to_le32(iblock); + ext4_ext_store_pblock(ex2, newblock); + ex2->ee_len = cpu_to_le16(allocated); + ext4_ext_mark_uninitialized(ex2); + if (ex2 != ex) + goto insert; + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); + ext_debug("out here\n"); + goto out; +insert: + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) @@ -2743,6 +2960,7 @@ insert: } else if (err) goto fix_extent_len; out: + ext4_ext_show_leaf(inode, path); return err ? err : allocated; fix_extent_len: @@ -2753,7 +2971,141 @@ fix_extent_len: ext4_ext_dirty(handle, inode, path + depth); return err; } +static int ext4_convert_unwritten_extents_dio(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + struct ext4_extent_header *eh; + int depth; + int err = 0; + int ret = 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as initialized */ + ext4_ext_mark_initialized(ex); + + /* + * We have to see if it can be merged with the extent + * on the left. + */ + if (ex > EXT_FIRST_EXTENT(eh)) { + /* + * To merge left, pass "ex - 1" to try_to_merge(), + * since it merges towards right _only_. + */ + ret = ext4_ext_try_to_merge(inode, path, ex - 1); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + ex--; + } + } + /* + * Try to Merge towards right. + */ + ret = ext4_ext_try_to_merge(inode, path, ex); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + } + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + +static int +ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, unsigned int max_blocks, + struct ext4_ext_path *path, int flags, + unsigned int allocated, struct buffer_head *bh_result, + ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" + "block %llu, max_blocks %u, flags %d, allocated %u", + inode->i_ino, (unsigned long long)iblock, max_blocks, + flags, allocated); + ext4_ext_show_leaf(inode, path); + + /* DIO get_block() before submit the IO, split the extent */ + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + ret = ext4_split_unwritten_extents(handle, + inode, path, iblock, + max_blocks, flags); + /* flag the io_end struct that we need convert when IO done */ + if (io) + io->flag = DIO_AIO_UNWRITTEN; + goto out; + } + /* DIO end_io complete, convert the filled extent to written */ + if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { + ret = ext4_convert_unwritten_extents_dio(handle, inode, + path); + goto out2; + } + /* buffered IO case */ + /* + * repeat fallocate creation request + * we already have an unwritten extent + */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) + goto map_out; + + /* buffered READ or buffered write_begin() lookup */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * We have blocks reserved already. We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. + */ + set_buffer_unwritten(bh_result); + goto out1; + } + /* buffered write, writepage time, convert*/ + ret = ext4_ext_convert_to_initialized(handle, inode, + path, iblock, + max_blocks); +out: + if (ret <= 0) { + err = ret; + goto out2; + } else + allocated = ret; + set_buffer_new(bh_result); +map_out: + set_buffer_mapped(bh_result); +out1: + if (allocated > max_blocks) + allocated = max_blocks; + ext4_ext_show_leaf(inode, path); + bh_result->b_bdev = inode->i_sb->s_bdev; + bh_result->b_blocknr = newblock; +out2: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return err ? err : allocated; +} /* * Block allocation/map/preallocation routine for extents based files * @@ -2784,9 +3136,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, int err = 0, depth, ret, cache_type; unsigned int allocated = 0; struct ext4_allocation_request ar; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; __clear_bit(BH_New, &bh_result->b_state); - ext_debug("blocks %u/%u requested for inode %u\n", + ext_debug("blocks %u/%u requested for inode %lu\n", iblock, max_blocks, inode->i_ino); /* check in cache */ @@ -2849,7 +3202,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = iblock - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (iblock - ee_block); - ext_debug("%u fit into %lu:%d -> %llu\n", iblock, + ext_debug("%u fit into %u:%d -> %llu\n", iblock, ee_block, ee_len, newblock); /* Do not put uninitialized extent in the cache */ @@ -2859,33 +3212,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, EXT4_EXT_CACHE_EXTENT); goto out; } - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) - goto out; - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - if (allocated > max_blocks) - allocated = max_blocks; - /* - * We have blocks reserved already. We - * return allocated blocks so that delalloc - * won't do block reservation for us. But - * the buffer head will be unmapped so that - * a read from the block returns 0s. - */ - set_buffer_unwritten(bh_result); - bh_result->b_bdev = inode->i_sb->s_bdev; - bh_result->b_blocknr = newblock; - goto out2; - } - - ret = ext4_ext_convert_to_initialized(handle, inode, - path, iblock, - max_blocks); - if (ret <= 0) { - err = ret; - goto out2; - } else - allocated = ret; - goto outnew; + ret = ext4_ext_handle_uninitialized_extents(handle, + inode, iblock, max_blocks, path, + flags, allocated, bh_result, newblock); + return ret; } } @@ -2950,15 +3280,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; - ext_debug("allocate new block: goal %llu, found %llu/%lu\n", + ext_debug("allocate new block: goal %llu, found %llu/%u\n", ar.goal, newblock, allocated); /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(ar.len); - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ + /* Mark uninitialized */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ ext4_ext_mark_uninitialized(&newex); - err = ext4_ext_insert_extent(handle, inode, path, &newex); + /* + * io_end structure was created for every async + * direct IO write to the middle of the file. + * To avoid unecessary convertion for every aio dio rewrite + * to the mid of file, here we flag the IO that is really + * need the convertion. + * + */ + if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) + io->flag = DIO_AIO_UNWRITTEN; + } + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { /* free data blocks we just allocated */ /* not a good idea to call discard here directly, @@ -2972,7 +3314,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); -outnew: set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ @@ -3171,6 +3512,63 @@ retry: } /* + * This function convert a range of blocks to written extents + * The caller of this function will pass the start offset and the size. + * all unwritten extents within this range will be converted to + * written extents. + * + * This function is called from the direct IO end io call back + * function, to convert the fallocated extents after IO is completed. + */ +int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + loff_t len) +{ + handle_t *handle; + ext4_lblk_t block; + unsigned int max_blocks; + int ret = 0; + int ret2 = 0; + struct buffer_head map_bh; + unsigned int credits, blkbits = inode->i_blkbits; + + block = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) + - block; + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + while (ret >= 0 && ret < max_blocks) { + block = block + ret; + max_blocks = max_blocks - ret; + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + map_bh.b_state = 0; + ret = ext4_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_GET_BLOCKS_DIO_CONVERT_EXT); + if (ret <= 0) { + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_get_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%u", __func__, + inode->i_ino, block, max_blocks); + } + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2 ) + break; + } + return ret > 0 ? ret2 : ret; +} +/* * Callback function called for each extent to gather FIEMAP information. */ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3f1873f..9630583 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -58,10 +58,7 @@ static ssize_t ext4_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; - ssize_t ret; - int err; + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; /* * If we have encountered a bitmap-format file, the size limit @@ -81,56 +78,10 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, } } - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - /* - * Skip flushing if there was an error, or if nothing was written. - */ - if (ret <= 0) - return ret; - - /* - * If the inode is IS_SYNC, or is O_SYNC and we are doing data - * journalling then we need to make sure that we force the transaction - * to disk to keep all metadata uptodate synchronously. - */ - if (file->f_flags & O_SYNC) { - /* - * If we are non-data-journaled, then the dirty data has - * already been flushed to backing store by generic_osync_inode, - * and the inode has been flushed too if there have been any - * modifications other than mere timestamp updates. - * - * Open question --- do we care about flushing timestamps too - * if the inode is IS_SYNC? - */ - if (!ext4_should_journal_data(inode)) - return ret; - - goto force_commit; - } - - /* - * So we know that there has been no forced data flush. If the inode - * is marked IS_SYNC, we need to force one ourselves. - */ - if (!IS_SYNC(inode)) - return ret; - - /* - * Open question #2 --- should we force data to disk here too? If we - * don't, the only impact is that data=writeback filesystems won't - * flush data to disk automatically on IS_SYNC, only metadata (but - * historically, that is what ext2 has done.) - */ - -force_commit: - err = ext4_force_commit(inode->i_sb); - if (err) - return err; - return ret; + return generic_file_aio_write(iocb, iov, nr_segs, pos); } -static struct vm_operations_struct ext4_file_vm_ops = { +static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ext4_page_mkwrite, }; @@ -207,7 +158,7 @@ const struct inode_operations ext4_file_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, .fallocate = ext4_fallocate, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 83cf641..2b15312 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -44,18 +44,23 @@ * * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. + * + * i_mutex lock is held when entering and exiting this function */ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int ret = 0; + int err, ret = 0; J_ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file(file, dentry, datasync); + ret = flush_aio_dio_completed_IO(inode); + if (ret < 0) + goto out; /* * data=writeback: * The caller's filemap_fdatawrite()/wait will sync the data. @@ -79,6 +84,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) goto out; } + if (!journal) + ret = sync_mapping_buffers(inode->i_mapping); + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; @@ -91,10 +99,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, /* sys_fsync did this */ }; - ret = sync_inode(inode, &wbc); - if (journal && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + err = sync_inode(inode, &wbc); + if (ret == 0) + ret = err; } out: + if (journal && (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 29e6dc7..f3624ea 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", - i, ext4_free_inodes_count(sb, gdp), x); + (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9c642b..5c5bc5d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> +#include <linux/workqueue.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -192,11 +193,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) * so before we call here everything must be consistently dirtied against * this transaction. */ -static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) + int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, + int nblocks) { + int ret; + + /* + * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. + */ BUG_ON(EXT4_JOURNAL(inode) == NULL); jbd_debug(2, "restarting handle %p\n", handle); - return ext4_journal_restart(handle, blocks_for_truncate(inode)); + up_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); + down_write(&EXT4_I(inode)->i_data_sem); + + return ret; } /* @@ -341,9 +355,7 @@ static int ext4_block_to_path(struct inode *inode, int n = 0; int final = 0; - if (i_block < 0) { - ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0"); - } else if (i_block < direct_blocks) { + if (i_block < direct_blocks) { offsets[n++] = i_block; final = direct_blocks; } else if ((i_block -= direct_blocks) < indirect_blocks) { @@ -551,15 +563,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * * Normally this function find the preferred place for block allocation, * returns it. + * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, Indirect *partial) { + ext4_fsblk_t goal; + /* * XXX need to get goal block from mballoc's data structures */ - return ext4_find_near(inode, partial); + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; } /** @@ -640,6 +658,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, if (*err) goto failed_out; + BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); + target -= count; /* allocate blocks for indirect blocks */ while (index < indirect_blks && count) { @@ -674,6 +694,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, ar.flags = EXT4_MB_HINT_DATA; current_block = ext4_mb_new_blocks(handle, &ar, err); + BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); if (*err && (target == blks)) { /* @@ -762,8 +783,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, bh); if (err) { + /* Don't brelse(bh) here; it's done in + * ext4_journal_forget() below */ unlock_buffer(bh); - brelse(bh); goto failed; } @@ -1109,22 +1131,79 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) ext4_discard_preallocations(inode); } -static int check_block_validity(struct inode *inode, sector_t logical, - sector_t phys, int len) +static int check_block_validity(struct inode *inode, const char *msg, + sector_t logical, sector_t phys, int len) { if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { - ext4_error(inode->i_sb, "check_block_validity", + ext4_error(inode->i_sb, msg, "inode #%lu logical block %llu mapped to %llu " "(size %d)", inode->i_ino, (unsigned long long) logical, (unsigned long long) phys, len); - WARN_ON(1); return -EIO; } return 0; } /* + * Return the number of contiguous dirty pages in a given inode + * starting at page frame idx. + */ +static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, + unsigned int max_pages) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + struct pagevec pvec; + pgoff_t num = 0; + int i, nr_pages, done = 0; + + if (max_pages == 0) + return 0; + pagevec_init(&pvec, 0); + while (!done) { + index = idx; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + (pgoff_t)PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + lock_page(page); + if (unlikely(page->mapping != mapping) || + !PageDirty(page) || + PageWriteback(page) || + page->index != idx) { + done = 1; + unlock_page(page); + break; + } + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + if (!buffer_delay(bh) && + !buffer_unwritten(bh)) + done = 1; + bh = bh->b_this_page; + } while (!done && (bh != head)); + } + unlock_page(page); + if (done) + break; + idx++; + num++; + if (num >= max_pages) + break; + } + pagevec_release(&pvec); + } + return num; +} + +/* * The ext4_get_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * @@ -1155,6 +1234,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, clear_buffer_mapped(bh); clear_buffer_unwritten(bh); + ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," + "logical block %lu\n", inode->i_ino, flags, max_blocks, + (unsigned long)block); /* * Try to see if we can get the block without requesting a new * file system block. @@ -1170,8 +1252,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, block, - bh->b_blocknr, retval); + int ret = check_block_validity(inode, "file system corruption", + block, bh->b_blocknr, retval); if (ret != 0) return ret; } @@ -1235,8 +1317,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * i_data's format changing. Force the migrate * to fail by clearing migrate flags */ - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & - ~EXT4_EXT_MIGRATE; + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; } } @@ -1252,8 +1333,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, block, - bh->b_blocknr, retval); + int ret = check_block_validity(inode, "file system " + "corruption after allocation", + block, bh->b_blocknr, retval); if (ret != 0) return ret; } @@ -1776,11 +1858,11 @@ repeat: if (ext4_claim_free_blocks(sbi, total)) { spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + vfs_dq_release_reservation_block(inode, total); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; } - vfs_dq_release_reservation_block(inode, total); return -ENOSPC; } EXT4_I(inode)->i_reserved_data_blocks += nrblocks; @@ -1863,18 +1945,6 @@ static void ext4_da_page_release_reservation(struct page *page, * Delayed allocation stuff */ -struct mpage_da_data { - struct inode *inode; - sector_t b_blocknr; /* start block number of extent */ - size_t b_size; /* size of extent */ - unsigned long b_state; /* state of the extent */ - unsigned long first_page, next_page; /* extent of pages */ - struct writeback_control *wbc; - int io_done; - int pages_written; - int retval; -}; - /* * mpage_da_submit_io - walks through extent of pages and try to write * them with writepage() call back @@ -2084,18 +2154,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - printk(KERN_EMERG "Total free blocks count %lld\n", - ext4_count_free_blocks(inode->i_sb)); - printk(KERN_EMERG "Free/Dirty block details\n"); - printk(KERN_EMERG "free_blocks=%lld\n", - (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); - printk(KERN_EMERG "dirty_blocks=%lld\n", - (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); - printk(KERN_EMERG "Block reservation details\n"); - printk(KERN_EMERG "i_reserved_data_blocks=%u\n", - EXT4_I(inode)->i_reserved_data_blocks); - printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", - EXT4_I(inode)->i_reserved_meta_blocks); + printk(KERN_CRIT "Total free blocks count %lld\n", + ext4_count_free_blocks(inode->i_sb)); + printk(KERN_CRIT "Free/Dirty block details\n"); + printk(KERN_CRIT "free_blocks=%lld\n", + (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); + printk(KERN_CRIT "dirty_blocks=%lld\n", + (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + printk(KERN_CRIT "Block reservation details\n"); + printk(KERN_CRIT "i_reserved_data_blocks=%u\n", + EXT4_I(inode)->i_reserved_data_blocks); + printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", + EXT4_I(inode)->i_reserved_meta_blocks); return; } @@ -2181,14 +2251,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * writepage and writepages will again try to write * the same. */ - printk(KERN_EMERG "%s block allocation failed for inode %lu " - "at logical offset %llu with max blocks " - "%zd with error %d\n", - __func__, mpd->inode->i_ino, - (unsigned long long)next, - mpd->b_size >> mpd->inode->i_blkbits, err); - printk(KERN_EMERG "This should not happen.!! " - "Data will be lost\n"); + ext4_msg(mpd->inode->i_sb, KERN_CRIT, + "delayed block allocation failed for inode %lu at " + "logical offset %llu with max blocks %zd with " + "error %d\n", mpd->inode->i_ino, + (unsigned long long) next, + mpd->b_size >> mpd->inode->i_blkbits, err); + printk(KERN_CRIT "This should not happen!! " + "Data will be lost\n"); if (err == -ENOSPC) { ext4_print_free_blocks(mpd->inode); } @@ -2329,7 +2399,7 @@ static int __mpage_da_writepage(struct page *page, /* * Rest of the page in the page_vec * redirty then and skip then. We will - * try to to write them again after + * try to write them again after * starting a new transaction */ redirty_page_for_writepage(wbc, page); @@ -2735,8 +2805,11 @@ static int ext4_da_writepages(struct address_space *mapping, int no_nrwrite_index_update; int pages_written = 0; long pages_skipped; + unsigned int max_pages; int range_cyclic, cycled = 1, io_done = 0; - int needed_blocks, ret = 0, nr_to_writebump = 0; + int needed_blocks, ret = 0; + long desired_nr_to_write, nr_to_writebump = 0; + loff_t range_start = wbc->range_start; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); trace_ext4_da_writepages(inode, wbc); @@ -2762,16 +2835,6 @@ static int ext4_da_writepages(struct address_space *mapping, if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) return -EROFS; - /* - * Make sure nr_to_write is >= sbi->s_mb_stream_request - * This make sure small files blocks are allocated in - * single attempt. This ensure that small files - * get less fragmented. - */ - if (wbc->nr_to_write < sbi->s_mb_stream_request) { - nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; - wbc->nr_to_write = sbi->s_mb_stream_request; - } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; @@ -2786,6 +2849,36 @@ static int ext4_da_writepages(struct address_space *mapping, } else index = wbc->range_start >> PAGE_CACHE_SHIFT; + /* + * This works around two forms of stupidity. The first is in + * the writeback code, which caps the maximum number of pages + * written to be 1024 pages. This is wrong on multiple + * levels; different architectues have a different page size, + * which changes the maximum amount of data which gets + * written. Secondly, 4 megabytes is way too small. XFS + * forces this value to be 16 megabytes by multiplying + * nr_to_write parameter by four, and then relies on its + * allocator to allocate larger extents to make them + * contiguous. Unfortunately this brings us to the second + * stupidity, which is that ext4's mballoc code only allocates + * at most 2048 blocks. So we force contiguous writes up to + * the number of dirty blocks in the inode, or + * sbi->max_writeback_mb_bump whichever is smaller. + */ + max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); + if (!range_cyclic && range_whole) + desired_nr_to_write = wbc->nr_to_write * 8; + else + desired_nr_to_write = ext4_num_dirty_pages(inode, index, + max_pages); + if (desired_nr_to_write > max_pages) + desired_nr_to_write = max_pages; + + if (wbc->nr_to_write < desired_nr_to_write) { + nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; + wbc->nr_to_write = desired_nr_to_write; + } + mpd.wbc = wbc; mpd.inode = mapping->host; @@ -2813,10 +2906,9 @@ retry: handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - printk(KERN_CRIT "%s: jbd2_start: " + ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d\n", __func__, wbc->nr_to_write, inode->i_ino, ret); - dump_stack(); goto out_writepages; } @@ -2850,6 +2942,7 @@ retry: mpd.io_done = 1; ret = MPAGE_DA_EXTENT_TAIL; } + trace_ext4_da_write_pages(inode, &mpd); wbc->nr_to_write -= mpd.pages_written; ext4_journal_stop(handle); @@ -2887,9 +2980,10 @@ retry: goto retry; } if (pages_skipped != wbc->pages_skipped) - printk(KERN_EMERG "This should not happen leaving %s " - "with nr_to_write = %ld ret = %d\n", - __func__, wbc->nr_to_write, ret); + ext4_msg(inode->i_sb, KERN_CRIT, + "This should not happen leaving %s " + "with nr_to_write = %ld ret = %d\n", + __func__, wbc->nr_to_write, ret); /* Update index */ index += pages_written; @@ -2904,7 +2998,9 @@ retry: out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; - wbc->nr_to_write -= nr_to_writebump; + if (wbc->nr_to_write > nr_to_writebump) + wbc->nr_to_write -= nr_to_writebump; + wbc->range_start = range_start; trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; } @@ -3117,6 +3213,8 @@ out: */ int ext4_alloc_da_blocks(struct inode *inode) { + trace_ext4_alloc_da_blocks(inode); + if (!EXT4_I(inode)->i_reserved_data_blocks && !EXT4_I(inode)->i_reserved_meta_blocks) return 0; @@ -3259,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } /* + * O_DIRECT for ext3 (or indirect map) based files + * * If the O_DIRECT write will extend the file then add this inode to the * orphan list. So recovery will truncate it back to the original size * if the machine crashes during the write. @@ -3267,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) * crashes then stale disk data _may_ be exposed inside the file. But current * VFS code falls back into buffered path in that case so we are safe. */ -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { @@ -3278,6 +3378,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, ssize_t ret; int orphan = 0; size_t count = iov_length(iov, nr_segs); + int retries = 0; if (rw == WRITE) { loff_t final_size = offset + count; @@ -3300,9 +3401,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, } } +retry: ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (orphan) { int err; @@ -3341,6 +3445,359 @@ out: return ret; } +/* Maximum number of blocks we map for direct IO at once. */ + +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + handle_t *handle = NULL; + int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int dio_credits; + + ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", + inode->i_ino, create); + /* + * DIO VFS code passes create = 0 flag for write to + * the middle of file. It does this to avoid block + * allocation for holes, to prevent expose stale data + * out when there is parallel buffered read (which does + * not hold the i_mutex lock) while direct IO write has + * not completed. DIO request on holes finally falls back + * to buffered IO for this reason. + * + * For ext4 extent based file, since we support fallocate, + * new allocated extent as uninitialized, for holes, we + * could fallocate blocks for holes, thus parallel + * buffered IO read will zero out the page when read on + * a hole while parallel DIO write to the hole has not completed. + * + * when we come here, we know it's a direct IO write to + * to the middle of file (<i_size) + * so it's safe to override the create flag from VFS. + */ + create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; + + if (max_blocks > DIO_MAX_BLOCKS) + max_blocks = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, + create); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + ext4_journal_stop(handle); +out: + return ret; +} + +static void ext4_free_io_end(ext4_io_end_t *io) +{ + BUG_ON(!io); + iput(io->inode); + kfree(io); +} +static void dump_aio_dio_list(struct inode * inode) +{ +#ifdef EXT4_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ + ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); + return; + } + + ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); + list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } +#endif +} + +/* + * check a range of space and convert unwritten extents to written. + */ +static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) +{ + struct inode *inode = io->inode; + loff_t offset = io->offset; + size_t size = io->size; + int ret = 0; + + ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," + "list->prev 0x%p\n", + io, inode->i_ino, io->list.next, io->list.prev); + + if (list_empty(&io->list)) + return ret; + + if (io->flag != DIO_AIO_UNWRITTEN) + return ret; + + if (offset + size <= i_size_read(inode)) + ret = ext4_convert_unwritten_extents(inode, offset, size); + + if (ret < 0) { + printk(KERN_EMERG "%s: failed to convert unwritten" + "extents to written extents, error is %d" + " io is still on inode %lu aio dio list\n", + __func__, ret, inode->i_ino); + return ret; + } + + /* clear the DIO AIO unwritten flag */ + io->flag = 0; + return ret; +} +/* + * work on completed aio dio IO, to convert unwritten extents to extents + */ +static void ext4_end_aio_dio_work(struct work_struct *work) +{ + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + int ret = 0; + + mutex_lock(&inode->i_mutex); + ret = ext4_end_aio_dio_nolock(io); + if (ret >= 0) { + if (!list_empty(&io->list)) + list_del_init(&io->list); + ext4_free_io_end(io); + } + mutex_unlock(&inode->i_mutex); +} +/* + * This function is called from ext4_sync_file(). + * + * When AIO DIO IO is completed, the work to convert unwritten + * extents to written is queued on workqueue but may not get immediately + * scheduled. When fsync is called, we need to ensure the + * conversion is complete before fsync returns. + * The inode keeps track of a list of completed AIO from DIO path + * that might needs to do the conversion. This function walks through + * the list and convert the related unwritten extents to written. + */ +int flush_aio_dio_completed_IO(struct inode *inode) +{ + ext4_io_end_t *io; + int ret = 0; + int ret2 = 0; + + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) + return ret; + + dump_aio_dio_list(inode); + while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ + io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, + ext4_io_end_t, list); + /* + * Calling ext4_end_aio_dio_nolock() to convert completed + * IO to written. + * + * When ext4_sync_file() is called, run_queue() may already + * about to flush the work corresponding to this io structure. + * It will be upset if it founds the io structure related + * to the work-to-be schedule is freed. + * + * Thus we need to keep the io structure still valid here after + * convertion finished. The io structure has a flag to + * avoid double converting from both fsync and background work + * queue work. + */ + ret = ext4_end_aio_dio_nolock(io); + if (ret < 0) + ret2 = ret; + else + list_del_init(&io->list); + } + return (ret2 < 0) ? ret2 : 0; +} + +static ext4_io_end_t *ext4_init_io_end (struct inode *inode) +{ + ext4_io_end_t *io = NULL; + + io = kmalloc(sizeof(*io), GFP_NOFS); + + if (io) { + igrab(inode); + io->inode = inode; + io->flag = 0; + io->offset = 0; + io->size = 0; + io->error = 0; + INIT_WORK(&io->work, ext4_end_aio_dio_work); + INIT_LIST_HEAD(&io->list); + } + + return io; +} + +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private) +{ + ext4_io_end_t *io_end = iocb->private; + struct workqueue_struct *wq; + + ext_debug("ext4_end_io_dio(): io_end 0x%p" + "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", + iocb->private, io_end->inode->i_ino, iocb, offset, + size); + /* if not async direct IO or dio with 0 bytes write, just return */ + if (!io_end || !size) + return; + + /* if not aio dio with unwritten extents, just free io and return */ + if (io_end->flag != DIO_AIO_UNWRITTEN){ + ext4_free_io_end(io_end); + iocb->private = NULL; + return; + } + + io_end->offset = offset; + io_end->size = size; + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; + + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); + + /* Add the io_end to per-inode completed aio dio list*/ + list_add_tail(&io_end->list, + &EXT4_I(io_end->inode)->i_aio_dio_complete_list); + iocb->private = NULL; +} +/* + * For ext4 extent files, ext4 will do direct-io write to holes, + * preallocated extents, and those write extend the file, no need to + * fall back to buffered IO. + * + * For holes, we fallocate those blocks, mark them as unintialized + * If those blocks were preallocated, we mark sure they are splited, but + * still keep the range to write as unintialized. + * + * The unwrritten extents will be converted to written when DIO is completed. + * For async direct IO, since the IO may still pending when return, we + * set up an end_io call back function, which will do the convertion + * when async direct IO completed. + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + */ +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + size_t count = iov_length(iov, nr_segs); + + loff_t final_size = offset + count; + if (rw == WRITE && final_size <= inode->i_size) { + /* + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as uninitialized + * to prevent paralel buffered read to expose the stale data + * before DIO complete the data IO. + * + * As to previously fallocated extents, ext4 get_block + * will just simply mark the buffer mapped but still + * keep the extents uninitialized. + * + * for non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * for async DIO, the conversion needs to be defered when + * the IO is completed. The ext4 end_io callback function + * will be called to take care of the conversion work. + * Here for async case, we allocate an io_end structure to + * hook to the iocb. + */ + iocb->private = NULL; + EXT4_I(inode)->cur_aio_dio = NULL; + if (!is_sync_kiocb(iocb)) { + iocb->private = ext4_init_io_end(inode); + if (!iocb->private) + return -ENOMEM; + /* + * we save the io structure for current async + * direct IO, so that later ext4_get_blocks() + * could flag the io structure whether there + * is a unwritten extents needs to be converted + * when IO is completed. + */ + EXT4_I(inode)->cur_aio_dio = iocb->private; + } + + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block_dio_write, + ext4_end_io_dio); + if (iocb->private) + EXT4_I(inode)->cur_aio_dio = NULL; + /* + * The io_end structure takes a reference to the inode, + * that structure needs to be destroyed and the + * reference to the inode need to be dropped, when IO is + * complete, even with 0 byte write, or failed. + * + * In the successful AIO DIO case, the io_end structure will be + * desctroyed and the reference to the inode will be dropped + * after the end_io call back function is called. + * + * In the case there is 0 byte write, or error case, since + * VFS direct IO won't invoke the end_io call back function, + * we need to free the end_io structure here. + */ + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; + } else if (ret > 0) + /* + * for non AIO case, since the IO is already + * completed, we could do the convertion right here + */ + ret = ext4_convert_unwritten_extents(inode, + offset, ret); + return ret; + } + + /* for write the the end of file case, we fall back to old way */ + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); +} + +static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); +} + /* * Pages can be marked dirty completely asynchronously from ext4's journalling * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do @@ -3373,6 +3830,7 @@ static const struct address_space_operations ext4_ordered_aops = { .direct_IO = ext4_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations ext4_writeback_aops = { @@ -3388,6 +3846,7 @@ static const struct address_space_operations ext4_writeback_aops = { .direct_IO = ext4_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations ext4_journalled_aops = { @@ -3402,6 +3861,7 @@ static const struct address_space_operations ext4_journalled_aops = { .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations ext4_da_aops = { @@ -3418,6 +3878,7 @@ static const struct address_space_operations ext4_da_aops = { .direct_IO = ext4_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; void ext4_set_aops(struct inode *inode) @@ -3659,7 +4120,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, ext4_handle_dirty_metadata(handle, inode, bh); } ext4_mark_inode_dirty(handle, inode); - ext4_journal_test_restart(handle, inode); + ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); if (bh) { BUFFER_TRACE(bh, "retaking write access"); ext4_journal_get_write_access(handle, bh); @@ -3870,7 +4332,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, return; if (try_to_extend_transaction(handle, inode)) { ext4_mark_inode_dirty(handle, inode); - ext4_journal_test_restart(handle, inode); + ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); } ext4_free_blocks(handle, inode, nr, 1, 1); @@ -3958,8 +4421,7 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; - if (ei->i_disksize && inode->i_size == 0 && - !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { @@ -4581,8 +5043,7 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_inode_blocks_set(handle, raw_inode, ei)) goto out_brelse; raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - /* clear the migrate flag in the raw_inode */ - raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); + raw_inode->i_flags = cpu_to_le32(ei->i_flags); if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_HURD)) raw_inode->i_file_acl_high = @@ -4684,19 +5145,40 @@ out_brelse: */ int ext4_write_inode(struct inode *inode, int wait) { + int err; + if (current->flags & PF_MEMALLOC) return 0; - if (ext4_journal_current_handle()) { - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); - dump_stack(); - return -EIO; - } + if (EXT4_SB(inode->i_sb)->s_journal) { + if (ext4_journal_current_handle()) { + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } - if (!wait) - return 0; + if (!wait) + return 0; - return ext4_force_commit(inode->i_sb); + err = ext4_force_commit(inode->i_sb); + } else { + struct ext4_iloc iloc; + + err = ext4_get_inode_loc(inode, &iloc); + if (err) + return err; + if (wait) + sync_dirty_buffer(iloc.bh); + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + ext4_error(inode->i_sb, __func__, + "IO error syncing inode, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)iloc.bh->b_blocknr); + err = -EIO; + } + } + return err; } /* @@ -5134,27 +5616,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) */ void ext4_dirty_inode(struct inode *inode) { - handle_t *current_handle = ext4_journal_current_handle(); handle_t *handle; - if (!ext4_handle_valid(current_handle)) { - ext4_mark_inode_dirty(current_handle, inode); - return; - } - handle = ext4_journal_start(inode, 2); if (IS_ERR(handle)) goto out; - if (current_handle && - current_handle->h_transaction != handle->h_transaction) { - /* This task has a transaction open against a different fs */ - printk(KERN_EMERG "%s: transactions do not match!\n", - __func__); - } else { - jbd_debug(5, "marking dirty. outer handle=%p\n", - current_handle); - ext4_mark_inode_dirty(handle, inode); - } + + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); out: return; @@ -5281,12 +5750,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) else len = PAGE_CACHE_SIZE; + lock_page(page); + /* + * return if we have all the buffers mapped. This avoid + * the need to call write_begin/write_end which does a + * journal_start/journal_stop which can block and take + * long time + */ if (page_has_buffers(page)) { - /* return if we have all the buffers mapped */ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped)) + ext4_bh_unmapped)) { + unlock_page(page); goto out_unlock; + } } + unlock_page(page); /* * OK, we need to fill the hole... Do write_begin write_end * to do block allocation/reservation.We are not holding diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7050a9c..c1cdf61 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -243,10 +243,9 @@ setversion_out: me.donor_start, me.len, &me.moved_len); fput(donor_filp); - if (!err) - if (copy_to_user((struct move_extent *)arg, - &me, sizeof(me))) - return -EFAULT; + if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) + return -EFAULT; + return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cd25846..bba1282 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -22,6 +22,7 @@ */ #include "mballoc.h" +#include <linux/debugfs.h> #include <trace/events/ext4.h> /* @@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, /* FIXME!! need more doc */ static void ext4_mb_mark_free_simple(struct super_block *sb, - void *buddy, unsigned first, int len, + void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned short min; - unsigned short max; - unsigned short chunk; + ext4_grpblk_t min; + ext4_grpblk_t max; + ext4_grpblk_t chunk; unsigned short border; BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); @@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); - unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); - unsigned short i = 0; - unsigned short first; - unsigned short len; + ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_grpblk_t first; + ext4_grpblk_t len; unsigned free = 0; unsigned fragments = 0; unsigned long long period = get_cycles(); @@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) char *data; char *bitmap; - mb_debug("init page %lu\n", page->index); + mb_debug(1, "init page %lu\n", page->index); inode = page->mapping->host; sb = inode->i_sb; @@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) set_bitmap_uptodate(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); - mb_debug("read bitmap for group %u\n", first_group + i); + mb_debug(1, "read bitmap for group %u\n", first_group + i); } /* wait for I/O completion */ @@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug("put buddy for group %u in page %lu/%x\n", + mb_debug(1, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); grinfo = ext4_get_group_info(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, - sizeof(unsigned short)*(sb->s_blocksize_bits+2)); + sizeof(*grinfo->bb_counters) * + (sb->s_blocksize_bits+2)); /* * incore got set to the group block bitmap below */ @@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) } else { /* this is block of bitmap */ BUG_ON(incore != NULL); - mb_debug("put bitmap for group %u in page %lu/%x\n", + mb_debug(1, "put bitmap for group %u in page %lu/%x\n", group, page->index, i * blocksize); /* see comments in ext4_mb_put_pa() */ @@ -908,6 +910,100 @@ out: return err; } +static noinline_for_stack +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + int ret = 0; + void *bitmap; + int blocks_per_page; + int block, pnum, poff; + int num_grp_locked = 0; + struct ext4_group_info *this_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + struct page *page = NULL, *bitmap_page = NULL; + + mb_debug(1, "init group %u\n", group); + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures that we don't reinit the buddy cache + * page which map to the group from which we are already + * allocating. If we are looking at the buddy cache we would + * have taken a reference using ext4_mb_load_buddy and that + * would have taken the alloc_sem lock. + */ + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + ret = 0; + goto err; + } + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); + bitmap_page = page; + bitmap = page_address(page) + (poff * sb->s_blocksize); + + /* init buddy cache */ + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page == bitmap_page) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + unlock_page(page); + } else if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, bitmap); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); +err: + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); + if (bitmap_page) + page_cache_release(bitmap_page); + if (page) + page_cache_release(page); + return ret; +} + static noinline_for_stack int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) @@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; - mb_debug("load group %u\n", group); + mb_debug(1, "load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); @@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, * groups mapped by the page is blocked * till we are done with allocation */ +repeat_load_buddy: down_read(e4b->alloc_semp); + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + /* we need to check for group need init flag + * with alloc_semp held so that we can be sure + * that new blocks didn't get added to the group + * when we are loading the buddy cache + */ + up_read(e4b->alloc_semp); + /* + * we need full data about the group + * to make a good selection + */ + ret = ext4_mb_init_group(sb, group); + if (ret) + return ret; + goto repeat_load_buddy; + } + /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. @@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, ac->alloc_semp = e4b->alloc_semp; e4b->alloc_semp = NULL; /* store last allocated for subsequent stream allocation */ - if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); sbi->s_mb_last_group = ac->ac_f_ex.fe_group; sbi->s_mb_last_start = ac->ac_f_ex.fe_start; @@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb, } -static noinline_for_stack -int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) -{ - - int ret; - void *bitmap; - int blocks_per_page; - int block, pnum, poff; - int num_grp_locked = 0; - struct ext4_group_info *this_grp; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - struct page *page = NULL, *bitmap_page = NULL; - - mb_debug("init group %lu\n", group); - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - this_grp = ext4_get_group_info(sb, group); - /* - * This ensures we don't add group - * to this buddy cache via resize - */ - num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); - if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { - /* - * somebody initialized the group - * return without doing anything - */ - ret = 0; - goto err; - } - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page) { - BUG_ON(page->mapping != inode->i_mapping); - ret = ext4_mb_init_cache(page, NULL); - if (ret) { - unlock_page(page); - goto err; - } - unlock_page(page); - } - if (page == NULL || !PageUptodate(page)) { - ret = -EIO; - goto err; - } - mark_page_accessed(page); - bitmap_page = page; - bitmap = page_address(page) + (poff * sb->s_blocksize); - - /* init buddy cache */ - block++; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page == bitmap_page) { - /* - * If both the bitmap and buddy are in - * the same page we don't need to force - * init the buddy - */ - unlock_page(page); - } else if (page) { - BUG_ON(page->mapping != inode->i_mapping); - ret = ext4_mb_init_cache(page, bitmap); - if (ret) { - unlock_page(page); - goto err; - } - unlock_page(page); - } - if (page == NULL || !PageUptodate(page)) { - ret = -EIO; - goto err; - } - mark_page_accessed(page); -err: - ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); - if (bitmap_page) - page_cache_release(bitmap_page); - if (page) - page_cache_release(page); - return ret; -} - static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { @@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; - loff_t size, isize; sb = ac->ac_sb; sbi = EXT4_SB(sb); ngroups = ext4_get_groups_count(sb); + /* non-extent files are limited to low blocks/groups */ + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) + ngroups = sbi->s_blockfile_groups; + BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ @@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) } bsbits = ac->ac_sb->s_blocksize_bits; - /* if stream allocation is enabled, use global goal */ - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; - isize = i_size_read(ac->ac_inode) >> bsbits; - if (size < isize) - size = isize; - if (size < sbi->s_mb_stream_request && - (ac->ac_flags & EXT4_MB_HINT_DATA)) { + /* if stream allocation is enabled, use global goal */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ spin_lock(&sbi->s_md_lock); ac->ac_g_ex.fe_group = sbi->s_mb_last_group; ac->ac_g_ex.fe_start = sbi->s_mb_last_start; spin_unlock(&sbi->s_md_lock); } + /* Let's just scan groups to find more-less suitable blocks */ cr = ac->ac_2order ? 0 : 1; /* @@ -2015,27 +2037,6 @@ repeat: if (grp->bb_free == 0) continue; - /* - * if the group is already init we check whether it is - * a good group and if not we don't load the buddy - */ - if (EXT4_MB_GRP_NEED_INIT(grp)) { - /* - * we need full data about the group - * to make a good selection - */ - err = ext4_mb_init_group(sb, group); - if (err) - goto out; - } - - /* - * If the particular group doesn't satisfy our - * criteria we continue with the next group - */ - if (!ext4_mb_good_group(ac, group, cr)) - continue; - err = ext4_mb_load_buddy(sb, group, &e4b); if (err) goto out; @@ -2095,207 +2096,6 @@ out: return err; } -#ifdef EXT4_MB_HISTORY -struct ext4_mb_proc_session { - struct ext4_mb_history *history; - struct super_block *sb; - int start; - int max; -}; - -static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s, - struct ext4_mb_history *hs, - int first) -{ - if (hs == s->history + s->max) - hs = s->history; - if (!first && hs == s->history + s->start) - return NULL; - while (hs->orig.fe_len == 0) { - hs++; - if (hs == s->history + s->max) - hs = s->history; - if (hs == s->history + s->start) - return NULL; - } - return hs; -} - -static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos) -{ - struct ext4_mb_proc_session *s = seq->private; - struct ext4_mb_history *hs; - int l = *pos; - - if (l == 0) - return SEQ_START_TOKEN; - hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1); - if (!hs) - return NULL; - while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL); - return hs; -} - -static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v, - loff_t *pos) -{ - struct ext4_mb_proc_session *s = seq->private; - struct ext4_mb_history *hs = v; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ext4_mb_history_skip_empty(s, s->history + s->start, 1); - else - return ext4_mb_history_skip_empty(s, ++hs, 0); -} - -static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) -{ - char buf[25], buf2[25], buf3[25], *fmt; - struct ext4_mb_history *hs = v; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " - "%-5s %-2s %-5s %-5s %-5s %-6s\n", - "pid", "inode", "original", "goal", "result", "found", - "grps", "cr", "flags", "merge", "tail", "broken"); - return 0; - } - - if (hs->op == EXT4_MB_HISTORY_ALLOC) { - fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " - "%-5u %-5s %-5u %-6u\n"; - sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len, - hs->result.fe_logical); - sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, - hs->orig.fe_start, hs->orig.fe_len, - hs->orig.fe_logical); - sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group, - hs->goal.fe_start, hs->goal.fe_len, - hs->goal.fe_logical); - seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, - hs->found, hs->groups, hs->cr, hs->flags, - hs->merged ? "M" : "", hs->tail, - hs->buddy ? 1 << hs->buddy : 0); - } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { - fmt = "%-5u %-8u %-23s %-23s %-23s\n"; - sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len, - hs->result.fe_logical); - sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, - hs->orig.fe_start, hs->orig.fe_len, - hs->orig.fe_logical); - seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); - } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { - sprintf(buf2, "%u/%d/%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len); - seq_printf(seq, "%-5u %-8u %-23s discard\n", - hs->pid, hs->ino, buf2); - } else if (hs->op == EXT4_MB_HISTORY_FREE) { - sprintf(buf2, "%u/%d/%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len); - seq_printf(seq, "%-5u %-8u %-23s free\n", - hs->pid, hs->ino, buf2); - } - return 0; -} - -static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) -{ -} - -static struct seq_operations ext4_mb_seq_history_ops = { - .start = ext4_mb_seq_history_start, - .next = ext4_mb_seq_history_next, - .stop = ext4_mb_seq_history_stop, - .show = ext4_mb_seq_history_show, -}; - -static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) -{ - struct super_block *sb = PDE(inode)->data; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_mb_proc_session *s; - int rc; - int size; - - if (unlikely(sbi->s_mb_history == NULL)) - return -ENOMEM; - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s == NULL) - return -ENOMEM; - s->sb = sb; - size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max; - s->history = kmalloc(size, GFP_KERNEL); - if (s->history == NULL) { - kfree(s); - return -ENOMEM; - } - - spin_lock(&sbi->s_mb_history_lock); - memcpy(s->history, sbi->s_mb_history, size); - s->max = sbi->s_mb_history_max; - s->start = sbi->s_mb_history_cur % s->max; - spin_unlock(&sbi->s_mb_history_lock); - - rc = seq_open(file, &ext4_mb_seq_history_ops); - if (rc == 0) { - struct seq_file *m = (struct seq_file *)file->private_data; - m->private = s; - } else { - kfree(s->history); - kfree(s); - } - return rc; - -} - -static int ext4_mb_seq_history_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = (struct seq_file *)file->private_data; - struct ext4_mb_proc_session *s = seq->private; - kfree(s->history); - kfree(s); - return seq_release(inode, file); -} - -static ssize_t ext4_mb_seq_history_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct seq_file *seq = (struct seq_file *)file->private_data; - struct ext4_mb_proc_session *s = seq->private; - struct super_block *sb = s->sb; - char str[32]; - int value; - - if (count >= sizeof(str)) { - printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n", - "mb_history", (int)sizeof(str)); - return -EOVERFLOW; - } - - if (copy_from_user(str, buffer, count)) - return -EFAULT; - - value = simple_strtol(str, NULL, 0); - if (value < 0) - return -ERANGE; - EXT4_SB(sb)->s_mb_history_filter = value; - - return count; -} - -static struct file_operations ext4_mb_seq_history_fops = { - .owner = THIS_MODULE, - .open = ext4_mb_seq_history_open, - .read = seq_read, - .write = ext4_mb_seq_history_write, - .llseek = seq_lseek, - .release = ext4_mb_seq_history_release, -}; - static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = seq->private; @@ -2328,7 +2128,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) struct ext4_buddy e4b; struct sg { struct ext4_group_info info; - unsigned short counters[16]; + ext4_grpblk_t counters[16]; } sg; group--; @@ -2366,7 +2166,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } -static struct seq_operations ext4_mb_seq_groups_ops = { +static const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, @@ -2387,7 +2187,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) } -static struct file_operations ext4_mb_seq_groups_fops = { +static const struct file_operations ext4_mb_seq_groups_fops = { .owner = THIS_MODULE, .open = ext4_mb_seq_groups_open, .read = seq_read, @@ -2395,82 +2195,6 @@ static struct file_operations ext4_mb_seq_groups_fops = { .release = seq_release, }; -static void ext4_mb_history_release(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_proc != NULL) { - remove_proc_entry("mb_groups", sbi->s_proc); - if (sbi->s_mb_history_max) - remove_proc_entry("mb_history", sbi->s_proc); - } - kfree(sbi->s_mb_history); -} - -static void ext4_mb_history_init(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - int i; - - if (sbi->s_proc != NULL) { - if (sbi->s_mb_history_max) - proc_create_data("mb_history", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_history_fops, sb); - proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_groups_fops, sb); - } - - sbi->s_mb_history_cur = 0; - spin_lock_init(&sbi->s_mb_history_lock); - i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL; - /* if we can't allocate history, then we simple won't use it */ -} - -static noinline_for_stack void -ext4_mb_store_history(struct ext4_allocation_context *ac) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_mb_history h; - - if (sbi->s_mb_history == NULL) - return; - - if (!(ac->ac_op & sbi->s_mb_history_filter)) - return; - - h.op = ac->ac_op; - h.pid = current->pid; - h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0; - h.orig = ac->ac_o_ex; - h.result = ac->ac_b_ex; - h.flags = ac->ac_flags; - h.found = ac->ac_found; - h.groups = ac->ac_groups_scanned; - h.cr = ac->ac_criteria; - h.tail = ac->ac_tail; - h.buddy = ac->ac_buddy; - h.merged = 0; - if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { - if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && - ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) - h.merged = 1; - h.goal = ac->ac_g_ex; - h.result = ac->ac_f_ex; - } - - spin_lock(&sbi->s_mb_history_lock); - memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); - if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) - sbi->s_mb_history_cur = 0; - spin_unlock(&sbi->s_mb_history_lock); -} - -#else -#define ext4_mb_history_release(sb) -#define ext4_mb_history_init(sb) -#endif - /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, @@ -2532,7 +2256,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); - meta_group_info[i]->bb_free_root.rb_node = NULL;; + meta_group_info[i]->bb_free_root.rb_node = NULL; #ifdef DOUBLE_CHECK { @@ -2558,26 +2282,15 @@ exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */ -/* - * Update an existing group. - * This function is used for online resize - */ -void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) -{ - grp->bb_free += add; -} - static int ext4_mb_init_backend(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; - int metalen; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int num_meta_group_infos; int num_meta_group_infos_max; int array_size; - struct ext4_group_info **meta_group_info; struct ext4_group_desc *desc; /* This is the number of blocks used by GDT */ @@ -2622,22 +2335,6 @@ static int ext4_mb_init_backend(struct super_block *sb) goto err_freesgi; } EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; - - metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); - for (i = 0; i < num_meta_group_infos; i++) { - if ((i + 1) == num_meta_group_infos) - metalen = sizeof(*meta_group_info) * - (ngroups - - (i << EXT4_DESC_PER_BLOCK_BITS(sb))); - meta_group_info = kmalloc(metalen, GFP_KERNEL); - if (meta_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate mem for a " - "buddy group\n"); - goto err_freemeta; - } - sbi->s_group_info[i] = meta_group_info; - } - for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { @@ -2655,7 +2352,6 @@ err_freebuddy: while (i-- > 0) kfree(ext4_get_group_info(sb, i)); i = num_meta_group_infos; -err_freemeta: while (i-- > 0) kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); @@ -2672,14 +2368,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) unsigned max; int ret; - i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { return -ENOMEM; } - i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { kfree(sbi->s_mb_offsets); @@ -2717,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); @@ -2735,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } - ext4_mb_history_init(sb); + if (sbi->s_proc) + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); if (sbi->s_journal) sbi->s_journal->j_commit_callback = release_blocks_on_commit; - - printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; } @@ -2758,7 +2453,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) kmem_cache_free(ext4_pspace_cachep, pa); } if (count) - mb_debug("mballoc: %u PAs left\n", count); + mb_debug(1, "mballoc: %u PAs left\n", count); } @@ -2817,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - ext4_mb_history_release(sb); + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); return 0; } @@ -2839,7 +2535,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) list_for_each_safe(l, ltmp, &txn->t_private_list) { entry = list_entry(l, struct ext4_free_data, list); - mb_debug("gonna free %u blocks in group %u (0x%p):", + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); err = ext4_mb_load_buddy(sb, entry->group, &e4b); @@ -2874,9 +2570,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) ext4_mb_release_desc(&e4b); } - mb_debug("freed %u blocks in %u structures\n", count, count2); + mb_debug(1, "freed %u blocks in %u structures\n", count, count2); +} + +#ifdef CONFIG_EXT4_DEBUG +u8 mb_enable_debug __read_mostly; + +static struct dentry *debugfs_dir; +static struct dentry *debugfs_debug; + +static void __init ext4_create_debugfs_entry(void) +{ + debugfs_dir = debugfs_create_dir("ext4", NULL); + if (debugfs_dir) + debugfs_debug = debugfs_create_u8("mballoc-debug", + S_IRUGO | S_IWUSR, + debugfs_dir, + &mb_enable_debug); +} + +static void ext4_remove_debugfs_entry(void) +{ + debugfs_remove(debugfs_debug); + debugfs_remove(debugfs_dir); } +#else + +static void __init ext4_create_debugfs_entry(void) +{ +} + +static void ext4_remove_debugfs_entry(void) +{ +} + +#endif + int __init init_ext4_mballoc(void) { ext4_pspace_cachep = @@ -2904,6 +2634,7 @@ int __init init_ext4_mballoc(void) kmem_cache_destroy(ext4_ac_cachep); return -ENOMEM; } + ext4_create_debugfs_entry(); return 0; } @@ -2917,6 +2648,7 @@ void exit_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_ext_cachep); + ext4_remove_debugfs_entry(); } @@ -3061,7 +2793,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; else ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; - mb_debug("#%u: goal %u blocks for locality group\n", + mb_debug(1, "#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } @@ -3180,23 +2912,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || ac->ac_o_ex.fe_logical < pa->pa_lstart)); - /* skip PA normalized request doesn't overlap with */ - if (pa->pa_lstart >= end) { - spin_unlock(&pa->pa_lock); - continue; - } - if (pa_end <= start) { + /* skip PAs this normalized request doesn't overlap with */ + if (pa->pa_lstart >= end || pa_end <= start) { spin_unlock(&pa->pa_lock); continue; } BUG_ON(pa->pa_lstart <= start && pa_end >= end); + /* adjust start or end to be adjacent to this pa */ if (pa_end <= ac->ac_o_ex.fe_logical) { BUG_ON(pa_end < start); start = pa_end; - } - - if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { + } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { BUG_ON(pa->pa_lstart > end); end = pa->pa_lstart; } @@ -3251,7 +2978,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } - mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, + mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, (unsigned) orig_size, (unsigned) start); } @@ -3272,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) atomic_inc(&sbi->s_bal_breaks); } - ext4_mb_store_history(ac); + if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) + trace_ext4_mballoc_alloc(ac); + else + trace_ext4_mballoc_prealloc(ac); } /* @@ -3300,7 +3030,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, BUG_ON(pa->pa_free < len); pa->pa_free -= len; - mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); + mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); } /* @@ -3324,7 +3054,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ - mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); + mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); } /* @@ -3382,6 +3112,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) continue; + /* non-extent files can't have physical blocks past 2^32 */ + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && + pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) + continue; + /* found preallocated blocks, use them */ spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0 && pa->pa_free) { @@ -3503,7 +3238,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, preallocated += len; count++; } - mb_debug("prellocated %u for group %u\n", preallocated, group); + mb_debug(1, "prellocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -3638,7 +3373,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; - mb_debug("new inode pa %p: %llu/%u for %u\n", pa, + mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); @@ -3698,7 +3433,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; - mb_debug("new group pa %p: %llu/%u for %u\n", pa, + mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); @@ -3767,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (ac) { ac->ac_sb = sb; ac->ac_inode = pa->pa_inode; - ac->ac_op = EXT4_MB_HISTORY_DISCARD; } while (bit < end) { @@ -3777,7 +3511,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, next = mb_find_next_bit(bitmap_bh->b_data, end, bit); start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + le32_to_cpu(sbi->s_es->s_first_data_block); - mb_debug(" free preallocated %u/%u in group %u\n", + mb_debug(1, " free preallocated %u/%u in group %u\n", (unsigned) start, (unsigned) next - bit, (unsigned) group); free += next - bit; @@ -3787,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ac->ac_b_ex.fe_start = bit; ac->ac_b_ex.fe_len = next - bit; ac->ac_b_ex.fe_logical = 0; - ext4_mb_store_history(ac); + trace_ext4_mballoc_discard(ac); } trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, @@ -3822,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - if (ac) - ac->ac_op = EXT4_MB_HISTORY_DISCARD; - trace_ext4_mb_release_group_pa(ac, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); @@ -3839,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ac->ac_b_ex.fe_start = bit; ac->ac_b_ex.fe_len = pa->pa_len; ac->ac_b_ex.fe_logical = 0; - ext4_mb_store_history(ac); + trace_ext4_mballoc_discard(ac); } return 0; @@ -3868,7 +3599,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, int busy = 0; int free = 0; - mb_debug("discard preallocation for group %u\n", group); + mb_debug(1, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) return 0; @@ -3992,7 +3723,7 @@ void ext4_discard_preallocations(struct inode *inode) return; } - mb_debug("discard preallocation for inode %lu\n", inode->i_ino); + mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode); INIT_LIST_HEAD(&list); @@ -4097,7 +3828,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode, { BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); } -#ifdef MB_DEBUG +#ifdef CONFIG_EXT4_DEBUG static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; @@ -4139,14 +3870,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); - printk(KERN_ERR "PA:%lu:%d:%u \n", i, - start, pa->pa_len); + printk(KERN_ERR "PA:%u:%d:%u \n", i, + start, pa->pa_len); } ext4_unlock_group(sb, i); if (grp->bb_free == 0) continue; - printk(KERN_ERR "%lu: %d/%d \n", + printk(KERN_ERR "%u: %d/%d \n", i, grp->bb_free, grp->bb_fragments); } printk(KERN_ERR "\n"); @@ -4174,16 +3905,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; - isize = i_size_read(ac->ac_inode) >> bsbits; - size = max(size, isize); + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; - /* don't use group allocation for large files */ - if (size >= sbi->s_mb_stream_request) + if ((size == isize) && + !ext4_fs_is_busy(sbi) && + (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; + } - if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + /* don't use group allocation for large files */ + size = max(size, isize); + if (size >= sbi->s_mb_stream_request) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; + } BUG_ON(ac->ac_lg != NULL); /* @@ -4246,7 +3987,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); - mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " + mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, @@ -4268,7 +4009,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_prealloc_space *pa, *tmp; struct ext4_allocation_context *ac; - mb_debug("discard locality group preallocation\n"); + mb_debug(1, "discard locality group preallocation\n"); INIT_LIST_HEAD(&discard_list); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); @@ -4720,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (ac) { - ac->ac_op = EXT4_MB_HISTORY_FREE; ac->ac_inode = inode; ac->ac_sb = sb; } @@ -4787,7 +4527,7 @@ do_more: ac->ac_b_ex.fe_group = block_group; ac->ac_b_ex.fe_start = bit; ac->ac_b_ex.fe_len = count; - ext4_mb_store_history(ac); + trace_ext4_mballoc_free(ac); } err = ext4_mb_load_buddy(sb, block_group, &e4b); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index c96bb19..0ca8110 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -37,25 +37,23 @@ /* */ -#define MB_DEBUG__ -#ifdef MB_DEBUG -#define mb_debug(fmt, a...) printk(fmt, ##a) +#ifdef CONFIG_EXT4_DEBUG +extern u8 mb_enable_debug; + +#define mb_debug(n, fmt, a...) \ + do { \ + if ((n) <= mb_enable_debug) { \ + printk(KERN_DEBUG "(%s, %d): %s: ", \ + __FILE__, __LINE__, __func__); \ + printk(fmt, ## a); \ + } \ + } while (0) #else -#define mb_debug(fmt, a...) +#define mb_debug(n, fmt, a...) #endif -/* - * with EXT4_MB_HISTORY mballoc stores last N allocations in memory - * and you can monitor it in /proc/fs/ext4/<dev>/mb_history - */ -#define EXT4_MB_HISTORY #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ #define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ -#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ -#define EXT4_MB_HISTORY_FREE 8 /* free */ - -#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ - EXT4_MB_HISTORY_PREALLOC) /* * How long mballoc can look for a best extent (in found extents) @@ -76,7 +74,7 @@ * with 'ext4_mb_stats' allocator will collect stats that will be * shown at umount. The collecting costs though! */ -#define MB_DEFAULT_STATS 1 +#define MB_DEFAULT_STATS 0 /* * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served @@ -128,8 +126,8 @@ struct ext4_prealloc_space { unsigned pa_deleted; ext4_fsblk_t pa_pstart; /* phys. block */ ext4_lblk_t pa_lstart; /* log. block */ - unsigned short pa_len; /* len of preallocated chunk */ - unsigned short pa_free; /* how many blocks are free */ + ext4_grpblk_t pa_len; /* len of preallocated chunk */ + ext4_grpblk_t pa_free; /* how many blocks are free */ unsigned short pa_type; /* pa type. inode or group */ spinlock_t *pa_obj_lock; struct inode *pa_inode; /* hack, for history only */ @@ -144,7 +142,7 @@ struct ext4_free_extent { ext4_lblk_t fe_logical; ext4_grpblk_t fe_start; ext4_group_t fe_group; - int fe_len; + ext4_grpblk_t fe_len; }; /* @@ -209,22 +207,6 @@ struct ext4_allocation_context { #define AC_STATUS_FOUND 2 #define AC_STATUS_BREAK 3 -struct ext4_mb_history { - struct ext4_free_extent orig; /* orig allocation */ - struct ext4_free_extent goal; /* goal allocation */ - struct ext4_free_extent result; /* result allocation */ - unsigned pid; - unsigned ino; - __u16 found; /* how many extents have been found */ - __u16 groups; /* how many groups have been scanned */ - __u16 tail; /* what tail broke some buddy */ - __u16 buddy; /* buddy the tail ^^^ broke */ - __u16 flags; - __u8 cr:3; /* which phase the result extent was found at */ - __u8 op:4; - __u8 merged:1; -}; - struct ext4_buddy { struct page *bd_buddy_page; void *bd_buddy; @@ -239,13 +221,6 @@ struct ext4_buddy { #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) -#ifndef EXT4_MB_HISTORY -static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) -{ - return; -} -#endif - #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 313a50b..a93d5b8 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode, goto err_out; } } - retval = ext4_ext_insert_extent(handle, inode, path, &newext); + retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); err_out: if (path) { ext4_ext_drop_refs(path); @@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, down_write(&EXT4_I(inode)->i_data_sem); /* - * if EXT4_EXT_MIGRATE is cleared a block allocation + * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation * happened after we started the migrate. We need to * fail the migrate */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { + if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { retval = -EAGAIN; up_write(&EXT4_I(inode)->i_data_sem); goto err_out; } else - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & - ~EXT4_EXT_MIGRATE; + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; /* * We have the extent map build with the tmp inode. * Now copy the i_data across @@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode) * when we add extents we extent the journal */ /* - * Even though we take i_mutex we can still cause block allocation - * via mmap write to holes. If we have allocated new blocks we fail - * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. - * The flag is updated with i_data_sem held to prevent racing with - * block allocation. + * Even though we take i_mutex we can still cause block + * allocation via mmap write to holes. If we have allocated + * new blocks we fail migrate. New block allocation will + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated + * with i_data_sem held to prevent racing with block + * allocation. */ down_read((&EXT4_I(inode)->i_data_sem)); - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; + EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, 1); @@ -618,7 +618,7 @@ err_out: tmp_inode->i_nlink = 0; ext4_journal_stop(handle); - + unlock_new_inode(tmp_inode); iput(tmp_inode); return retval; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index bbf2dd9..25b6b14 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -19,14 +19,31 @@ #include "ext4_extents.h" #include "ext4.h" -#define get_ext_path(path, inode, block, ret) \ - do { \ - path = ext4_ext_find_extent(inode, block, path); \ - if (IS_ERR(path)) { \ - ret = PTR_ERR(path); \ - path = NULL; \ - } \ - } while (0) +/** + * get_ext_path - Find an extent path for designated logical block number. + * + * @inode: an inode which is searched + * @lblock: logical block number to find an extent path + * @path: pointer to an extent path pointer (for output) + * + * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value + * on failure. + */ +static inline int +get_ext_path(struct inode *inode, ext4_lblk_t lblock, + struct ext4_ext_path **path) +{ + int ret = 0; + + *path = ext4_ext_find_extent(inode, lblock, *path); + if (IS_ERR(*path)) { + ret = PTR_ERR(*path); + *path = NULL; + } else if ((*path)[ext_depth(inode)].p_ext == NULL) + ret = -ENODATA; + + return ret; +} /** * copy_extent_status - Copy the extent's initialization status @@ -113,6 +130,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, } /** + * mext_check_null_inode - NULL check for two inodes + * + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. + */ +static int +mext_check_null_inode(struct inode *inode1, struct inode *inode2, + const char *function) +{ + int ret = 0; + + if (inode1 == NULL) { + ext4_error(inode2->i_sb, function, + "Both inodes should not be NULL: " + "inode1 NULL inode2 %lu", inode2->i_ino); + ret = -EIO; + } else if (inode2 == NULL) { + ext4_error(inode1->i_sb, function, + "Both inodes should not be NULL: " + "inode1 %lu inode2 NULL", inode1->i_ino); + ret = -EIO; + } + return ret; +} + +/** * mext_double_down_read - Acquire two inodes' read semaphore * * @orig_inode: original inode structure @@ -124,8 +166,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) { struct inode *first = orig_inode, *second = donor_inode; - BUG_ON(orig_inode == NULL || donor_inode == NULL); - /* * Use the inode number to provide the stable locking order instead * of its address, because the C language doesn't guarantee you can @@ -152,8 +192,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) { struct inode *first = orig_inode, *second = donor_inode; - BUG_ON(orig_inode == NULL || donor_inode == NULL); - /* * Use the inode number to provide the stable locking order instead * of its address, because the C language doesn't guarantee you can @@ -178,8 +216,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) static void mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) { - BUG_ON(orig_inode == NULL || donor_inode == NULL); - up_read(&EXT4_I(orig_inode)->i_data_sem); up_read(&EXT4_I(donor_inode)->i_data_sem); } @@ -194,8 +230,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) static void mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) { - BUG_ON(orig_inode == NULL || donor_inode == NULL); - up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); } @@ -283,23 +317,23 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, } if (new_flag) { - get_ext_path(orig_path, orig_inode, eblock, err); - if (orig_path == NULL) + err = get_ext_path(orig_inode, eblock, &orig_path); + if (err) goto out; if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, new_ext)) + orig_path, new_ext, 0)) goto out; } if (end_flag) { - get_ext_path(orig_path, orig_inode, - le32_to_cpu(end_ext->ee_block) - 1, err); - if (orig_path == NULL) + err = get_ext_path(orig_inode, + le32_to_cpu(end_ext->ee_block) - 1, &orig_path); + if (err) goto out; if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, end_ext)) + orig_path, end_ext, 0)) goto out; } out: @@ -519,7 +553,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, * oext |-----------| * new_ext |-------| */ - BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); + if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { + ext4_error(orig_inode->i_sb, __func__, + "new_ext_end(%u) should be less than or equal to " + "oext->ee_block(%u) + oext_alen(%d) - 1", + new_ext_end, le32_to_cpu(oext->ee_block), + oext_alen); + ret = -EIO; + goto out; + } /* * Case: new_ext is smaller than original extent @@ -543,6 +585,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, o_end, &start_ext, &new_ext, &end_ext); +out: return ret; } @@ -554,8 +597,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, * @orig_off: block offset of original inode * @donor_off: block offset of donor inode * @max_count: the maximun length of extents + * + * Return 0 on success, or a negative error value on failure. */ -static void +static int mext_calc_swap_extents(struct ext4_extent *tmp_dext, struct ext4_extent *tmp_oext, ext4_lblk_t orig_off, ext4_lblk_t donor_off, @@ -564,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, ext4_lblk_t diff, orig_diff; struct ext4_extent dext_old, oext_old; + BUG_ON(orig_off != donor_off); + + /* original and donor extents have to cover the same block offset */ + if (orig_off < le32_to_cpu(tmp_oext->ee_block) || + le32_to_cpu(tmp_oext->ee_block) + + ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) + return -ENODATA; + + if (orig_off < le32_to_cpu(tmp_dext->ee_block) || + le32_to_cpu(tmp_dext->ee_block) + + ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) + return -ENODATA; + dext_old = *tmp_dext; oext_old = *tmp_oext; @@ -591,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, copy_extent_status(&oext_old, tmp_dext); copy_extent_status(&dext_old, tmp_oext); + + return 0; } /** @@ -631,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, mext_double_down_write(orig_inode, donor_inode); /* Get the original extent for the block "orig_off" */ - get_ext_path(orig_path, orig_inode, orig_off, err); - if (orig_path == NULL) + err = get_ext_path(orig_inode, orig_off, &orig_path); + if (err) goto out; /* Get the donor extent for the head */ - get_ext_path(donor_path, donor_inode, donor_off, err); - if (donor_path == NULL) + err = get_ext_path(donor_inode, donor_off, &donor_path); + if (err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; @@ -647,13 +707,28 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, dext = donor_path[depth].p_ext; tmp_dext = *dext; - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, donor_off, count); + if (err) + goto out; /* Loop for the donor extents */ while (1) { /* The extent for donor must be found. */ - BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); + if (!dext) { + ext4_error(donor_inode->i_sb, __func__, + "The extent for donor must be found"); + err = -EIO; + goto out; + } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { + ext4_error(donor_inode->i_sb, __func__, + "Donor offset(%u) and the first block of donor " + "extent(%u) should be equal", + donor_off, + le32_to_cpu(tmp_dext.ee_block)); + err = -EIO; + goto out; + } /* Set donor extent to orig extent */ err = mext_leaf_block(handle, orig_inode, @@ -678,8 +753,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (orig_path) ext4_ext_drop_refs(orig_path); - get_ext_path(orig_path, orig_inode, orig_off, err); - if (orig_path == NULL) + err = get_ext_path(orig_inode, orig_off, &orig_path); + if (err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; @@ -692,9 +767,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (donor_path) ext4_ext_drop_refs(donor_path); - get_ext_path(donor_path, donor_inode, - donor_off, err); - if (donor_path == NULL) + err = get_ext_path(donor_inode, donor_off, &donor_path); + if (err) goto out; depth = ext_depth(donor_inode); dext = donor_path[depth].p_ext; @@ -705,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, } tmp_dext = *dext; - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, - donor_off, - count - replaced_count); + err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count - replaced_count); + if (err) + goto out; } out: @@ -740,7 +815,7 @@ out: * on success, or a negative error value on failure. */ static int -move_extent_par_page(struct file *o_filp, struct inode *donor_inode, +move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, int data_offset_in_page, int block_len_in_page, int uninit) { @@ -871,6 +946,7 @@ out: if (PageLocked(page)) unlock_page(page); page_cache_release(page); + ext4_journal_stop(handle); } out2: ext4_journal_stop(handle); @@ -897,6 +973,10 @@ mext_check_arguments(struct inode *orig_inode, struct inode *donor_inode, __u64 orig_start, __u64 donor_start, __u64 *len, __u64 moved_len) { + ext4_lblk_t orig_blocks, donor_blocks; + unsigned int blkbits = orig_inode->i_blkbits; + unsigned int blocksize = 1 << blkbits; + /* Regular file check */ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { ext4_debug("ext4 move extent: The argument files should be " @@ -921,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - /* orig and donor should be different file */ - if (orig_inode->i_ino == donor_inode->i_ino) { - ext4_debug("ext4 move extent: The argument files should not " - "be same file [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - /* Ext4 move extent supports only extent based file */ if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { ext4_debug("ext4 move extent: orig file is not extents " @@ -960,54 +1032,58 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - if ((orig_start > MAX_DEFRAG_SIZE) || - (donor_start > MAX_DEFRAG_SIZE) || - (*len > MAX_DEFRAG_SIZE) || - (orig_start + *len > MAX_DEFRAG_SIZE)) { - ext4_debug("ext4 move extent: Can't handle over [%lu] blocks " - "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE, + if ((orig_start > EXT_MAX_BLOCK) || + (donor_start > EXT_MAX_BLOCK) || + (*len > EXT_MAX_BLOCK) || + (orig_start + *len > EXT_MAX_BLOCK)) { + ext4_debug("ext4 move extent: Can't handle over [%u] blocks " + "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if (orig_inode->i_size > donor_inode->i_size) { - if (orig_start >= donor_inode->i_size) { + donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; + /* TODO: eliminate this artificial restriction */ + if (orig_start >= donor_blocks) { ext4_debug("ext4 move extent: orig start offset " - "[%llu] should be less than donor file size " - "[%lld] [ino:orig %lu, donor_inode %lu]\n", - orig_start, donor_inode->i_size, + "[%llu] should be less than donor file blocks " + "[%u] [ino:orig %lu, donor %lu]\n", + orig_start, donor_blocks, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (orig_start + *len > donor_inode->i_size) { + /* TODO: eliminate this artificial restriction */ + if (orig_start + *len > donor_blocks) { ext4_debug("ext4 move extent: End offset [%llu] should " - "be less than donor file size [%lld]." - "So adjust length from %llu to %lld " + "be less than donor file blocks [%u]." + "So adjust length from %llu to %llu " "[ino:orig %lu, donor %lu]\n", - orig_start + *len, donor_inode->i_size, - *len, donor_inode->i_size - orig_start, + orig_start + *len, donor_blocks, + *len, donor_blocks - orig_start, orig_inode->i_ino, donor_inode->i_ino); - *len = donor_inode->i_size - orig_start; + *len = donor_blocks - orig_start; } } else { - if (orig_start >= orig_inode->i_size) { + orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; + if (orig_start >= orig_blocks) { ext4_debug("ext4 move extent: start offset [%llu] " - "should be less than original file size " - "[%lld] [inode:orig %lu, donor %lu]\n", - orig_start, orig_inode->i_size, + "should be less than original file blocks " + "[%u] [ino:orig %lu, donor %lu]\n", + orig_start, orig_blocks, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (orig_start + *len > orig_inode->i_size) { + if (orig_start + *len > orig_blocks) { ext4_debug("ext4 move extent: Adjust length " - "from %llu to %lld. Because it should be " - "less than original file size " + "from %llu to %llu. Because it should be " + "less than original file blocks " "[ino:orig %lu, donor %lu]\n", - *len, orig_inode->i_size - orig_start, + *len, orig_blocks - orig_start, orig_inode->i_ino, donor_inode->i_ino); - *len = orig_inode->i_size - orig_start; + *len = orig_blocks - orig_start; } } @@ -1027,18 +1103,23 @@ mext_check_arguments(struct inode *orig_inode, * @inode1: the inode structure * @inode2: the inode structure * - * Lock two inodes' i_mutex by i_ino order. This function is moved from - * fs/inode.c. + * Lock two inodes' i_mutex by i_ino order. + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. */ -static void +static int mext_inode_double_lock(struct inode *inode1, struct inode *inode2) { - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { - if (inode1) - mutex_lock(&inode1->i_mutex); - else if (inode2) - mutex_lock(&inode2->i_mutex); - return; + int ret = 0; + + BUG_ON(inode1 == NULL && inode2 == NULL); + + ret = mext_check_null_inode(inode1, inode2, __func__); + if (ret < 0) + goto out; + + if (inode1 == inode2) { + mutex_lock(&inode1->i_mutex); + goto out; } if (inode1->i_ino < inode2->i_ino) { @@ -1048,6 +1129,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); } + +out: + return ret; } /** @@ -1056,17 +1140,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) * @inode1: the inode that is released first * @inode2: the inode that is released second * - * This function is moved from fs/inode.c. + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. */ -static void +static int mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) { + int ret = 0; + + BUG_ON(inode1 == NULL && inode2 == NULL); + + ret = mext_check_null_inode(inode1, inode2, __func__); + if (ret < 0) + goto out; + if (inode1) mutex_unlock(&inode1->i_mutex); if (inode2 && inode2 != inode1) mutex_unlock(&inode2->i_mutex); + +out: + return ret; } /** @@ -1123,70 +1218,84 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; ext4_lblk_t rest_blocks; pgoff_t orig_page_offset = 0, seq_end_page; - int ret, depth, last_extent = 0; + int ret1, ret2, depth, last_extent = 0; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; int data_offset_in_page; int block_len_in_page; int uninit; + /* orig and donor should be different file */ + if (orig_inode->i_ino == donor_inode->i_ino) { + ext4_debug("ext4 move extent: The argument files should not " + "be same file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + /* protect orig and donor against a truncate */ - mext_inode_double_lock(orig_inode, donor_inode); + ret1 = mext_inode_double_lock(orig_inode, donor_inode); + if (ret1 < 0) + return ret1; mext_double_down_read(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ - ret = mext_check_arguments(orig_inode, donor_inode, orig_start, + ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, donor_start, &len, *moved_len); mext_double_up_read(orig_inode, donor_inode); - if (ret) - goto out2; + if (ret1) + goto out; file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; block_end = block_start + len - 1; if (file_end < block_end) len -= block_end - file_end; - get_ext_path(orig_path, orig_inode, block_start, ret); - if (orig_path == NULL) - goto out2; + ret1 = get_ext_path(orig_inode, block_start, &orig_path); + if (ret1) + goto out; /* Get path structure to check the hole */ - get_ext_path(holecheck_path, orig_inode, block_start, ret); - if (holecheck_path == NULL) + ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); + if (ret1) goto out; depth = ext_depth(orig_inode); ext_cur = holecheck_path[depth].p_ext; - if (ext_cur == NULL) { - ret = -EINVAL; - goto out; - } /* - * Get proper extent whose ee_block is beyond block_start - * if block_start was within the hole. + * Get proper starting location of block replacement if block_start was + * within the hole. */ if (le32_to_cpu(ext_cur->ee_block) + ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { + /* + * The hole exists between extents or the tail of + * original file. + */ last_extent = mext_next_extent(orig_inode, holecheck_path, &ext_cur); if (last_extent < 0) { - ret = last_extent; + ret1 = last_extent; goto out; } last_extent = mext_next_extent(orig_inode, orig_path, &ext_dummy); if (last_extent < 0) { - ret = last_extent; + ret1 = last_extent; goto out; } - } - seq_start = block_start; + seq_start = le32_to_cpu(ext_cur->ee_block); + } else if (le32_to_cpu(ext_cur->ee_block) > block_start) + /* The hole exists at the beginning of original file. */ + seq_start = le32_to_cpu(ext_cur->ee_block); + else + seq_start = block_start; /* No blocks within the specified range. */ if (le32_to_cpu(ext_cur->ee_block) > block_end) { ext4_debug("ext4 move extent: The specified range of file " "may be the hole\n"); - ret = -EINVAL; + ret1 = -EINVAL; goto out; } @@ -1206,7 +1315,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, last_extent = mext_next_extent(orig_inode, holecheck_path, &ext_cur); if (last_extent < 0) { - ret = last_extent; + ret1 = last_extent; break; } add_blocks = ext4_ext_get_actual_len(ext_cur); @@ -1258,16 +1367,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, while (orig_page_offset <= seq_end_page) { /* Swap original branches with new branches */ - ret = move_extent_par_page(o_filp, donor_inode, + ret1 = move_extent_per_page(o_filp, donor_inode, orig_page_offset, data_offset_in_page, block_len_in_page, uninit); - if (ret < 0) + if (ret1 < 0) goto out; orig_page_offset++; /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; - BUG_ON(*moved_len > len); + if (*moved_len > len) { + ext4_error(orig_inode->i_sb, __func__, + "We replaced blocks too much! " + "sum of replaced: %llu requested: %llu", + *moved_len, len); + ret1 = -EIO; + goto out; + } data_offset_in_page = 0; rest_blocks -= block_len_in_page; @@ -1280,17 +1396,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, /* Decrease buffer counter */ if (holecheck_path) ext4_ext_drop_refs(holecheck_path); - get_ext_path(holecheck_path, orig_inode, - seq_start, ret); - if (holecheck_path == NULL) + ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); + if (ret1) break; depth = holecheck_path->p_depth; /* Decrease buffer counter */ if (orig_path) ext4_ext_drop_refs(orig_path); - get_ext_path(orig_path, orig_inode, seq_start, ret); - if (orig_path == NULL) + ret1 = get_ext_path(orig_inode, seq_start, &orig_path); + if (ret1) break; ext_cur = holecheck_path[depth].p_ext; @@ -1307,14 +1422,13 @@ out: ext4_ext_drop_refs(holecheck_path); kfree(holecheck_path); } -out2: - mext_inode_double_unlock(orig_inode, donor_inode); - if (ret) - return ret; + ret2 = mext_inode_double_unlock(orig_inode, donor_inode); - /* All of the specified blocks must be exchanged in succeed */ - BUG_ON(*moved_len != len); + if (ret1) + return ret1; + else if (ret2) + return ret2; return 0; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index de04013..7c8fe80 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; if (blocks == 1 && !dx_fallback && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) - return make_indexed_dir(handle, dentry, inode, bh); + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + retval = make_indexed_dir(handle, dentry, inode, bh); + if (retval == -ENOSPC) + brelse(bh); + return retval; + } brelse(bh); } bh = ext4_append(handle, dir, &block, &retval); @@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + if (retval == -ENOSPC) + brelse(bh); + return retval; } /* @@ -1590,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, sb->s_blocksize); - node2->fake.inode = 0; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, frame->bh); if (err) @@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!de) goto cleanup; err = add_dirent_to_buf(handle, dentry, inode, de, bh); - bh = NULL; + if (err != -ENOSPC) + bh = NULL; goto cleanup; journal_error: @@ -2068,7 +2076,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0; - if (!ext4_handle_valid(handle)) + /* ext4_handle_valid() assumes a valid handle_t pointer */ + if (handle && !ext4_handle_valid(handle)) return 0; mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); @@ -2310,7 +2319,7 @@ static int ext4_link(struct dentry *old_dentry, struct inode *inode = old_dentry->d_inode; int err, retries = 0; - if (EXT4_DIR_LINK_MAX(inode)) + if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; /* @@ -2413,7 +2422,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; retval = -EMLINK; if (!new_inode && new_dir != old_dir && - new_dir->i_nlink >= EXT4_LINK_MAX) + EXT4_DIR_LINK_MAX(new_dir)) goto end_rename; } if (!new_bh) { @@ -2536,7 +2545,7 @@ const struct inode_operations ext4_dir_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, .fiemap = ext4_fiemap, }; @@ -2548,5 +2557,5 @@ const struct inode_operations ext4_special_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, }; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 68b0351..3cfc343 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) struct inode *inode = NULL; handle_t *handle; int gdb_off, gdb_num; - int num_grp_locked = 0; int err, err2; gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); @@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * using the new disk blocks. */ - num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group); /* Update group descriptor block for new group */ gdp = (struct ext4_group_desc *)((char *)primary->b_data + gdb_off * EXT4_DESC_SIZE(sb)); @@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * descriptor */ err = ext4_mb_add_groupinfo(sb, input->group, gdp); - if (err) { - ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); + if (err) goto exit_journal; - } /* * Make the new blocks and inodes valid next. We do this before @@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; - ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); ext4_handle_dirty_metadata(handle, NULL, primary); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8f4f079..312211e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -45,17 +45,11 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "mballoc.h" #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> -static int default_mb_history_length = 1000; - -module_param_named(default_mb_history_length, default_mb_history_length, - int, 0644); -MODULE_PARM_DESC(default_mb_history_length, - "Default number of entries saved for mb_history"); - struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; @@ -188,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb, bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } + +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) +{ + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; +} + + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) +{ + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; +} + /* * Wrappers for jbd2_journal_start/end. * @@ -214,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) } return jbd2_journal_start(journal, nblocks); } - /* - * We're not journaling, return the appropriate indication. - */ - current->journal_info = EXT4_NOJOURNAL_HANDLE; - return current->journal_info; + return ext4_get_nojournal(); } /* @@ -234,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle) int rc; if (!ext4_handle_valid(handle)) { - /* - * Do this here since we don't call jbd2_journal_stop() in - * no-journal mode. - */ - current->journal_info = NULL; + ext4_put_nojournal(handle); return 0; } sb = handle->h_transaction->t_journal->j_private; @@ -344,7 +360,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, errstr = "Out of memory"; break; case -EROFS: - if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) + if (!sb || (EXT4_SB(sb)->s_journal && + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) errstr = "Journal has aborted"; else errstr = "Readonly filesystem"; @@ -578,6 +595,9 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + flush_workqueue(sbi->dio_unwritten_wq); + destroy_workqueue(sbi->dio_unwritten_wq); + lock_super(sb); lock_kernel(); if (sb->s_dirt) @@ -682,6 +702,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_allocated_meta_blocks = 0; ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); + INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); + ei->cur_aio_dio = NULL; return &ei->vfs_inode; } @@ -962,7 +984,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); -static struct dquot_operations ext4_quota_operations = { +static const struct dquot_operations ext4_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, .alloc_space = dquot_alloc_space, @@ -983,7 +1005,7 @@ static struct dquot_operations ext4_quota_operations = { .destroy_dquot = dquot_destroy, }; -static struct quotactl_ops ext4_qctl_operations = { +static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, .quota_off = vfs_quota_off, .quota_sync = vfs_quota_sync, @@ -1050,7 +1072,7 @@ enum { Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, @@ -1097,7 +1119,6 @@ static const match_table_t tokens = { {Opt_data_writeback, "data=writeback"}, {Opt_data_err_abort, "data_err=abort"}, {Opt_data_err_ignore, "data_err=ignore"}, - {Opt_mb_history_length, "mb_history_length=%u"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1279,11 +1300,9 @@ static int parse_options(char *options, struct super_block *sb, *journal_devnum = option; break; case Opt_journal_checksum: - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); - break; + break; /* Kept for backwards compatibility */ case Opt_journal_async_commit: set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); break; case Opt_noload: set_opt(sbi->s_mount_opt, NOLOAD); @@ -1340,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb, case Opt_data_err_ignore: clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); break; - case Opt_mb_history_length: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_mb_history_max = option; - break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -1646,13 +1658,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt); - if (EXT4_SB(sb)->s_journal) { - ext4_msg(sb, KERN_INFO, "%s journal on %s", - EXT4_SB(sb)->s_journal->j_inode ? "internal" : - "external", EXT4_SB(sb)->s_journal->j_devname); - } else { - ext4_msg(sb, KERN_INFO, "no journal"); - } return res; } @@ -1695,12 +1700,12 @@ static int ext4_fill_flex_info(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); - atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, - ext4_free_inodes_count(sb, gdp)); - atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, - ext4_free_blks_count(sb, gdp)); - atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, - ext4_used_dirs_count(sb, gdp)); + atomic_add(ext4_free_inodes_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_inodes); + atomic_add(ext4_free_blks_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(ext4_used_dirs_count(sb, gdp), + &sbi->s_flex_groups[flex_group].used_dirs); } return 1; @@ -2197,6 +2202,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), @@ -2210,6 +2216,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_order2_req), ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), NULL, }; @@ -2253,6 +2260,49 @@ static struct kobj_type ext4_ktype = { .release = ext4_sb_release, }; +/* + * Check whether this filesystem can be mounted based on + * the features present and the RDONLY/RDWR mount requested. + * Returns 1 if this filesystem can be mounted as requested, + * 0 if it cannot be. + */ +static int ext4_feature_set_ok(struct super_block *sb, int readonly) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, + "Couldn't mount because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & + ~EXT4_FEATURE_INCOMPAT_SUPP)); + return 0; + } + + if (readonly) + return 1; + + /* Check that feature set is OK for a read-write mount */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); + return 0; + } + /* + * Large file size enabled file system can only be mounted + * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF + */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + if (sizeof(blkcnt_t) < sizeof(u64)) { + ext4_msg(sb, KERN_ERR, "Filesystem with huge files " + "cannot be mounted RDWR without " + "CONFIG_LBDAF"); + return 0; + } + } + return 1; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2274,7 +2324,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int db_count; unsigned int i; int needs_recovery, has_huge_files; - int features; __u64 blocks_count; int err; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -2371,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - sbi->s_mb_history_max = default_mb_history_length; set_opt(sbi->s_mount_opt, BARRIER); @@ -2401,39 +2449,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. */ - features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); - if (features) { - ext4_msg(sb, KERN_ERR, - "Couldn't mount because of " - "unsupported optional features (%x)", - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & - ~EXT4_FEATURE_INCOMPAT_SUPP)); + if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) goto failed_mount; - } - features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); - if (!(sb->s_flags & MS_RDONLY) && features) { - ext4_msg(sb, KERN_ERR, - "Couldn't mount RDWR because of " - "unsupported optional features (%x)", - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & - ~EXT4_FEATURE_RO_COMPAT_SUPP)); - goto failed_mount; - } - has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (has_huge_files) { - /* - * Large file size enabled file system can only be - * mount if kernel is build with CONFIG_LBDAF - */ - if (sizeof(root->i_blocks) < sizeof(u64) && - !(sb->s_flags & MS_RDONLY)) { - ext4_msg(sb, KERN_ERR, "Filesystem with huge " - "files cannot be mounted read-write " - "without CONFIG_LBDAF"); - goto failed_mount; - } - } + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); if (blocksize < EXT4_MIN_BLOCK_SIZE || @@ -2469,6 +2487,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); @@ -2549,12 +2569,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (ext4_blocks_count(es) > - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + /* + * Test whether we have more sectors than will fit in sector_t, + * and whether the max offset is addressable by the page cache. + */ + if ((ext4_blocks_count(es) > + (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || + (ext4_blocks_count(es) > + (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) { ext4_msg(sb, KERN_ERR, "filesystem" - " too large to mount safely"); + " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + ret = -EFBIG; goto failed_mount; } @@ -2595,6 +2622,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } sbi->s_groups_count = blocks_count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), @@ -2656,6 +2685,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_stripe = ext4_get_stripe_size(sbi); + sbi->s_max_writeback_mb_bump = 128; /* * set up enough so that it can read an inode @@ -2729,20 +2759,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + jbd2_journal_set_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else if (test_opt(sb, JOURNAL_CHECKSUM)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); + else jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else { - jbd2_journal_clear_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ @@ -2781,6 +2805,12 @@ no_journal: clear_opt(sbi->s_mount_opt, NOBH); } } + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); + if (!EXT4_SB(sb)->dio_unwritten_wq) { + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); + goto failed_mount_wq; + } + /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. @@ -2832,12 +2862,12 @@ no_journal: "available"); } - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + if (test_opt(sb, DELALLOC) && + (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " "requested data journaling mode"); clear_opt(sbi->s_mount_opt, DELALLOC); - } else if (test_opt(sb, DELALLOC)) - ext4_msg(sb, KERN_INFO, "delayed allocation enabled"); + } err = ext4_setup_system_zone(sb); if (err) { @@ -2893,6 +2923,8 @@ cantfind_ext4: failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); +failed_mount_wq: ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); @@ -3147,9 +3179,7 @@ static int ext4_load_journal(struct super_block *sb, return -EINVAL; } - if (journal->j_flags & JBD2_BARRIER) - ext4_msg(sb, KERN_INFO, "barriers enabled"); - else + if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { @@ -3208,7 +3238,18 @@ static int ext4_commit_super(struct super_block *sb, int sync) clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } - es->s_wtime = cpu_to_le32(get_seconds()); + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - @@ -3333,11 +3374,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; + struct ext4_sb_info *sbi = EXT4_SB(sb); trace_ext4_sync_fs(sb, wait); - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { + flush_workqueue(sbi->dio_unwritten_wq); + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); + jbd2_log_wait_commit(sbi->s_journal, target); } return ret; } @@ -3477,18 +3520,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) ext4_mark_recovery_complete(sb, es); } else { - int ret; - if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, - ~EXT4_FEATURE_RO_COMPAT_SUPP))) { - ext4_msg(sb, KERN_WARNING, "couldn't " - "remount RDWR because of unsupported " - "optional features (%x)", - (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & - ~EXT4_FEATURE_RO_COMPAT_SUPP)); + /* Make sure we can mount this feature set readwrite */ + if (!ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } - /* * Make sure the group descriptor checksums * are sane. If they aren't, refuse to remount r/w. @@ -3930,27 +3966,6 @@ static struct file_system_type ext4_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -#ifdef CONFIG_EXT4DEV_COMPAT -static int ext4dev_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data,struct vfsmount *mnt) -{ - printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs " - "to mount using ext4\n", dev_name); - printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility " - "will go away by 2.6.31\n", dev_name); - return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); -} - -static struct file_system_type ext4dev_fs_type = { - .owner = THIS_MODULE, - .name = "ext4dev", - .get_sb = ext4dev_get_sb, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS("ext4dev"); -#endif - static int __init init_ext4_fs(void) { int err; @@ -3975,13 +3990,6 @@ static int __init init_ext4_fs(void) err = register_filesystem(&ext4_fs_type); if (err) goto out; -#ifdef CONFIG_EXT4DEV_COMPAT - err = register_filesystem(&ext4dev_fs_type); - if (err) { - unregister_filesystem(&ext4_fs_type); - goto out; - } -#endif return 0; out: destroy_inodecache(); @@ -4000,9 +4008,6 @@ out4: static void __exit exit_ext4_fs(void) { unregister_filesystem(&ext4_fs_type); -#ifdef CONFIG_EXT4DEV_COMPAT - unregister_filesystem(&ext4dev_fs_type); -#endif destroy_inodecache(); exit_ext4_xattr(); exit_ext4_mballoc(); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 62b31c2..fed5b01 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -810,12 +810,23 @@ inserted: get_bh(new_bh); } else { /* We need to allocate a new block */ - ext4_fsblk_t goal = ext4_group_first_block_no(sb, + ext4_fsblk_t goal, block; + + goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + + block = ext4_new_meta_blocks(handle, inode, goal, NULL, &error); if (error) goto cleanup; + + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); + ea_idebug(inode, "creating block %d", block); new_bh = sb_getblk(sb, block); diff --git a/fs/fat/fat.h b/fs/fat/fat.h index adb0e72..7db0979 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -323,7 +323,7 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, /* fat/misc.c */ extern void fat_fs_error(struct super_block *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))) __cold; -extern void fat_clusters_flush(struct super_block *sb); +extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, __le16 __time, __le16 __date, u8 time_cs); diff --git a/fs/fat/file.c b/fs/fat/file.c index f042b96..e8c159d 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size) inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - if (IS_SYNC(inode)) - err = sync_page_range_nolock(inode, mapping, start, count); + if (IS_SYNC(inode)) { + int err2; + + /* + * Opencode syncing since we don't have a file open to use + * standard fsync path. + */ + err = filemap_fdatawrite_range(mapping, start, + start + count - 1); + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + err2 = write_inode_now(inode, 1); + if (!err) + err = err2; + if (!err) { + err = filemap_fdatawait_range(mapping, start, + start + count - 1); + } + } out: return err; } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 8970d8c..76b7961 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -451,12 +451,16 @@ static void fat_write_super(struct super_block *sb) static int fat_sync_fs(struct super_block *sb, int wait) { - lock_super(sb); - fat_clusters_flush(sb); - sb->s_dirt = 0; - unlock_super(sb); + int err = 0; - return 0; + if (sb->s_dirt) { + lock_super(sb); + sb->s_dirt = 0; + err = fat_clusters_flush(sb); + unlock_super(sb); + } + + return err; } static void fat_put_super(struct super_block *sb) @@ -470,19 +474,11 @@ static void fat_put_super(struct super_block *sb) iput(sbi->fat_inode); - if (sbi->nls_disk) { - unload_nls(sbi->nls_disk); - sbi->nls_disk = NULL; - sbi->options.codepage = fat_default_codepage; - } - if (sbi->nls_io) { - unload_nls(sbi->nls_io); - sbi->nls_io = NULL; - } - if (sbi->options.iocharset != fat_default_iocharset) { + unload_nls(sbi->nls_disk); + unload_nls(sbi->nls_io); + + if (sbi->options.iocharset != fat_default_iocharset) kfree(sbi->options.iocharset); - sbi->options.iocharset = fat_default_iocharset; - } sb->s_fs_info = NULL; kfree(sbi); @@ -820,7 +816,7 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",shortname=mixed"); break; case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95: - /* seq_puts(m, ",shortname=lower"); */ + seq_puts(m, ",shortname=lower"); break; default: seq_puts(m, ",shortname=unknown"); @@ -971,7 +967,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, opts->codepage = fat_default_codepage; opts->iocharset = fat_default_iocharset; if (is_vfat) { - opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95; + opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95; opts->rodir = 0; } else { opts->shortname = 0; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index a6c2047..0f55f5c 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -43,19 +43,19 @@ EXPORT_SYMBOL_GPL(fat_fs_error); /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. Currently only writes 1 */ -void fat_clusters_flush(struct super_block *sb) +int fat_clusters_flush(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct fat_boot_fsinfo *fsinfo; if (sbi->fat_bits != 32) - return; + return 0; bh = sb_bread(sb, sbi->fsinfo_sector); if (bh == NULL) { printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n"); - return; + return -EIO; } fsinfo = (struct fat_boot_fsinfo *)bh->b_data; @@ -74,6 +74,8 @@ void fat_clusters_flush(struct super_block *sb) mark_buffer_dirty(bh); } brelse(bh); + + return 0; } /* @@ -119,8 +121,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) MSDOS_I(inode)->i_start = new_dclus; MSDOS_I(inode)->i_logstart = new_dclus; /* - * Since generic_osync_inode() synchronize later if - * this is not directory, we don't here. + * Since generic_write_sync() synchronizes regular files later, + * we sync here only directories. */ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { ret = fat_sync_inode(inode); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index cb6e835..f565f24 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -499,17 +499,10 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, int charlen; if (utf8) { - int name_len = strlen(name); - - *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname); - - /* - * We stripped '.'s before and set len appropriately, - * but utf8s_to_utf16s doesn't care about len - */ - *outlen -= (name_len - len); - - if (*outlen > 255) + *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); + if (*outlen < 0) + return *outlen; + else if (*outlen > 255) return -ENAMETOOLONG; op = &outname[*outlen * sizeof(wchar_t)]; @@ -263,6 +263,79 @@ pid_t f_getown(struct file *filp) return pid; } +static int f_setown_ex(struct file *filp, unsigned long arg) +{ + struct f_owner_ex * __user owner_p = (void * __user)arg; + struct f_owner_ex owner; + struct pid *pid; + int type; + int ret; + + ret = copy_from_user(&owner, owner_p, sizeof(owner)); + if (ret) + return ret; + + switch (owner.type) { + case F_OWNER_TID: + type = PIDTYPE_MAX; + break; + + case F_OWNER_PID: + type = PIDTYPE_PID; + break; + + case F_OWNER_GID: + type = PIDTYPE_PGID; + break; + + default: + return -EINVAL; + } + + rcu_read_lock(); + pid = find_vpid(owner.pid); + if (owner.pid && !pid) + ret = -ESRCH; + else + ret = __f_setown(filp, pid, type, 1); + rcu_read_unlock(); + + return ret; +} + +static int f_getown_ex(struct file *filp, unsigned long arg) +{ + struct f_owner_ex * __user owner_p = (void * __user)arg; + struct f_owner_ex owner; + int ret = 0; + + read_lock(&filp->f_owner.lock); + owner.pid = pid_vnr(filp->f_owner.pid); + switch (filp->f_owner.pid_type) { + case PIDTYPE_MAX: + owner.type = F_OWNER_TID; + break; + + case PIDTYPE_PID: + owner.type = F_OWNER_PID; + break; + + case PIDTYPE_PGID: + owner.type = F_OWNER_GID; + break; + + default: + WARN_ON(1); + ret = -EINVAL; + break; + } + read_unlock(&filp->f_owner.lock); + + if (!ret) + ret = copy_to_user(owner_p, &owner, sizeof(owner)); + return ret; +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { @@ -313,6 +386,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_SETOWN: err = f_setown(filp, arg, 1); break; + case F_GETOWN_EX: + err = f_getown_ex(filp, arg); + break; + case F_SETOWN_EX: + err = f_setown_ex(filp, arg); + break; case F_GETSIG: err = filp->f_owner.signum; break; @@ -428,8 +507,7 @@ static inline int sigio_perm(struct task_struct *p, static void send_sigio_to_task(struct task_struct *p, struct fown_struct *fown, - int fd, - int reason) + int fd, int reason, int group) { /* * F_SETSIG can change ->signum lockless in parallel, make @@ -461,11 +539,11 @@ static void send_sigio_to_task(struct task_struct *p, else si.si_band = band_table[reason - POLL_IN]; si.si_fd = fd; - if (!group_send_sig_info(signum, &si, p)) + if (!do_send_sig_info(signum, &si, p, group)) break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: - group_send_sig_info(SIGIO, SEND_SIG_PRIV, p); + do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group); } } @@ -474,16 +552,23 @@ void send_sigio(struct fown_struct *fown, int fd, int band) struct task_struct *p; enum pid_type type; struct pid *pid; + int group = 1; read_lock(&fown->lock); + type = fown->pid_type; + if (type == PIDTYPE_MAX) { + group = 0; + type = PIDTYPE_PID; + } + pid = fown->pid; if (!pid) goto out_unlock_fown; read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { - send_sigio_to_task(p, fown, fd, band); + send_sigio_to_task(p, fown, fd, band, group); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); out_unlock_fown: @@ -491,10 +576,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band) } static void send_sigurg_to_task(struct task_struct *p, - struct fown_struct *fown) + struct fown_struct *fown, int group) { if (sigio_perm(p, fown, SIGURG)) - group_send_sig_info(SIGURG, SEND_SIG_PRIV, p); + do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group); } int send_sigurg(struct fown_struct *fown) @@ -502,10 +587,17 @@ int send_sigurg(struct fown_struct *fown) struct task_struct *p; enum pid_type type; struct pid *pid; + int group = 1; int ret = 0; read_lock(&fown->lock); + type = fown->pid_type; + if (type == PIDTYPE_MAX) { + group = 0; + type = PIDTYPE_PID; + } + pid = fown->pid; if (!pid) goto out_unlock_fown; @@ -514,7 +606,7 @@ int send_sigurg(struct fown_struct *fown) read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { - send_sigurg_to_task(p, fown); + send_sigurg_to_task(p, fown, group); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); out_unlock_fown: @@ -10,6 +10,7 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/time.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/file.h> diff --git a/fs/file_table.c b/fs/file_table.c index 334ce39..8eb4404 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -74,14 +74,14 @@ EXPORT_SYMBOL_GPL(get_max_files); * Handle nr_files sysctl */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -int proc_nr_files(ctl_table *table, int write, struct file *filp, +int proc_nr_files(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); - return proc_dointvec(table, write, filp, buffer, lenp, ppos); + return proc_dointvec(table, write, buffer, lenp, ppos); } #else -int proc_nr_files(ctl_table *table, int write, struct file *filp, +int proc_nr_files(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c54226b..9d5360c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -19,171 +19,257 @@ #include <linux/sched.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> #include "internal.h" +#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) -/** - * writeback_acquire - attempt to get exclusive writeback access to a device - * @bdi: the device's backing_dev_info structure - * - * It is a waste of resources to have more than one pdflush thread blocked on - * a single request queue. Exclusion at the request_queue level is obtained - * via a flag in the request_queue's backing_dev_info.state. - * - * Non-request_queue-backed address_spaces will share default_backing_dev_info, - * unless they implement their own. Which is somewhat inefficient, as this - * may prevent concurrent writeback against multiple devices. +/* + * We don't actually have pdflush, but this one is exported though /proc... */ -static int writeback_acquire(struct backing_dev_info *bdi) +int nr_pdflush_threads; + +/* + * Passed into wb_writeback(), essentially a subset of writeback_control + */ +struct wb_writeback_args { + long nr_pages; + struct super_block *sb; + enum writeback_sync_modes sync_mode; + int for_kupdate:1; + int range_cyclic:1; + int for_background:1; +}; + +/* + * Work items for the bdi_writeback threads + */ +struct bdi_work { + struct list_head list; /* pending work list */ + struct rcu_head rcu_head; /* for RCU free/clear of work */ + + unsigned long seen; /* threads that have seen this work */ + atomic_t pending; /* number of threads still to do work */ + + struct wb_writeback_args args; /* writeback arguments */ + + unsigned long state; /* flag bits, see WS_* */ +}; + +enum { + WS_USED_B = 0, + WS_ONSTACK_B, +}; + +#define WS_USED (1 << WS_USED_B) +#define WS_ONSTACK (1 << WS_ONSTACK_B) + +static inline bool bdi_work_on_stack(struct bdi_work *work) { - return !test_and_set_bit(BDI_pdflush, &bdi->state); + return test_bit(WS_ONSTACK_B, &work->state); +} + +static inline void bdi_work_init(struct bdi_work *work, + struct wb_writeback_args *args) +{ + INIT_RCU_HEAD(&work->rcu_head); + work->args = *args; + work->state = WS_USED; } /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. * - * Determine whether there is writeback in progress against a backing device. + * Determine whether there is writeback waiting to be handled against a + * backing device. */ int writeback_in_progress(struct backing_dev_info *bdi) { - return test_bit(BDI_pdflush, &bdi->state); + return !list_empty(&bdi->work_list); } -/** - * writeback_release - relinquish exclusive writeback access against a device. - * @bdi: the device's backing_dev_info structure - */ -static void writeback_release(struct backing_dev_info *bdi) +static void bdi_work_clear(struct bdi_work *work) { - BUG_ON(!writeback_in_progress(bdi)); - clear_bit(BDI_pdflush, &bdi->state); + clear_bit(WS_USED_B, &work->state); + smp_mb__after_clear_bit(); + /* + * work can have disappeared at this point. bit waitq functions + * should be able to tolerate this, provided bdi_sched_wait does + * not dereference it's pointer argument. + */ + wake_up_bit(&work->state, WS_USED_B); } -static noinline void block_dump___mark_inode_dirty(struct inode *inode) +static void bdi_work_free(struct rcu_head *head) { - if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { - struct dentry *dentry; - const char *name = "?"; + struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); - dentry = d_find_alias(inode); - if (dentry) { - spin_lock(&dentry->d_lock); - name = (const char *) dentry->d_name.name; - } - printk(KERN_DEBUG - "%s(%d): dirtied inode %lu (%s) on %s\n", - current->comm, task_pid_nr(current), inode->i_ino, - name, inode->i_sb->s_id); - if (dentry) { - spin_unlock(&dentry->d_lock); - dput(dentry); - } - } + if (!bdi_work_on_stack(work)) + kfree(work); + else + bdi_work_clear(work); } -/** - * __mark_inode_dirty - internal function - * @inode: inode to mark - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) - * Mark an inode as dirty. Callers should use mark_inode_dirty or - * mark_inode_dirty_sync. - * - * Put the inode on the super block's dirty list. - * - * CAREFUL! We mark it dirty unconditionally, but move it onto the - * dirty list only if it is hashed or if it refers to a blockdev. - * If it was not hashed, it will never be added to the dirty list - * even if it is later hashed, as it will have been marked dirty already. - * - * In short, make sure you hash any inodes _before_ you start marking - * them dirty. - * - * This function *must* be atomic for the I_DIRTY_PAGES case - - * set_page_dirty() is called under spinlock in several places. - * - * Note that for blockdevs, inode->dirtied_when represents the dirtying time of - * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of - * the kernel-internal blockdev inode represents the dirtying time of the - * blockdev's pages. This is why for I_DIRTY_PAGES we always use - * page->mapping->host, so the page-dirtying time is recorded in the internal - * blockdev inode. - */ -void __mark_inode_dirty(struct inode *inode, int flags) +static void wb_work_complete(struct bdi_work *work) { - struct super_block *sb = inode->i_sb; + const enum writeback_sync_modes sync_mode = work->args.sync_mode; + int onstack = bdi_work_on_stack(work); /* - * Don't do this for I_DIRTY_PAGES - that doesn't actually - * dirty the inode itself + * For allocated work, we can clear the done/seen bit right here. + * For on-stack work, we need to postpone both the clear and free + * to after the RCU grace period, since the stack could be invalidated + * as soon as bdi_work_clear() has done the wakeup. */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { - if (sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode); - } + if (!onstack) + bdi_work_clear(work); + if (sync_mode == WB_SYNC_NONE || onstack) + call_rcu(&work->rcu_head, bdi_work_free); +} +static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) +{ /* - * make sure that changes are seen by all cpus before we test i_state - * -- mikulas + * The caller has retrieved the work arguments from this work, + * drop our reference. If this is the last ref, delete and free it */ - smp_mb(); + if (atomic_dec_and_test(&work->pending)) { + struct backing_dev_info *bdi = wb->bdi; - /* avoid the locking if we can */ - if ((inode->i_state & flags) == flags) - return; + spin_lock(&bdi->wb_lock); + list_del_rcu(&work->list); + spin_unlock(&bdi->wb_lock); - if (unlikely(block_dump)) - block_dump___mark_inode_dirty(inode); + wb_work_complete(work); + } +} - spin_lock(&inode_lock); - if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; +static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) +{ + work->seen = bdi->wb_mask; + BUG_ON(!work->seen); + atomic_set(&work->pending, bdi->wb_cnt); + BUG_ON(!bdi->wb_cnt); - inode->i_state |= flags; + /* + * list_add_tail_rcu() contains the necessary barriers to + * make sure the above stores are seen before the item is + * noticed on the list + */ + spin_lock(&bdi->wb_lock); + list_add_tail_rcu(&work->list, &bdi->work_list); + spin_unlock(&bdi->wb_lock); - /* - * If the inode is being synced, just update its dirty state. - * The unlocker will place the inode on the appropriate - * superblock list, based upon its state. - */ - if (inode->i_state & I_SYNC) - goto out; + /* + * If the default thread isn't there, make sure we add it. When + * it gets created and wakes up, we'll run this work. + */ + if (unlikely(list_empty_careful(&bdi->wb_list))) + wake_up_process(default_backing_dev_info.wb.task); + else { + struct bdi_writeback *wb = &bdi->wb; - /* - * Only add valid (hashed) inodes to the superblock's - * dirty list. Add blockdev inodes as well. - */ - if (!S_ISBLK(inode->i_mode)) { - if (hlist_unhashed(&inode->i_hash)) - goto out; - } - if (inode->i_state & (I_FREEING|I_CLEAR)) - goto out; + if (wb->task) + wake_up_process(wb->task); + } +} - /* - * If the inode was already on s_dirty/s_io/s_more_io, don't - * reposition it (that would break s_dirty time-ordering). - */ - if (!was_dirty) { - inode->dirtied_when = jiffies; - list_move(&inode->i_list, &sb->s_dirty); - } +/* + * Used for on-stack allocated work items. The caller needs to wait until + * the wb threads have acked the work before it's safe to continue. + */ +static void bdi_wait_on_work_clear(struct bdi_work *work) +{ + wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); +} + +static void bdi_alloc_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_args *args) +{ + struct bdi_work *work; + + /* + * This is WB_SYNC_NONE writeback, so if allocation fails just + * wakeup the thread for old dirty data writeback + */ + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + bdi_work_init(work, args); + bdi_queue_work(bdi, work); + } else { + struct bdi_writeback *wb = &bdi->wb; + + if (wb->task) + wake_up_process(wb->task); } -out: - spin_unlock(&inode_lock); } -EXPORT_SYMBOL(__mark_inode_dirty); +/** + * bdi_sync_writeback - start and wait for writeback + * @bdi: the backing device to write from + * @sb: write inodes from this super_block + * + * Description: + * This does WB_SYNC_ALL data integrity writeback and waits for the + * IO to complete. Callers must hold the sb s_umount semaphore for + * reading, to avoid having the super disappear before we are done. + */ +static void bdi_sync_writeback(struct backing_dev_info *bdi, + struct super_block *sb) +{ + struct wb_writeback_args args = { + .sb = sb, + .sync_mode = WB_SYNC_ALL, + .nr_pages = LONG_MAX, + .range_cyclic = 0, + }; + struct bdi_work work; -static int write_inode(struct inode *inode, int sync) + bdi_work_init(&work, &args); + work.state |= WS_ONSTACK; + + bdi_queue_work(bdi, &work); + bdi_wait_on_work_clear(&work); +} + +/** + * bdi_start_writeback - start writeback + * @bdi: the backing device to write from + * @nr_pages: the number of pages to write + * + * Description: + * This does WB_SYNC_NONE opportunistic writeback. The IO is only + * started when this function returns, we make no guarentees on + * completion. Caller need not hold sb s_umount semaphore. + * + */ +void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, + long nr_pages) { - if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) - return inode->i_sb->s_op->write_inode(inode, sync); - return 0; + struct wb_writeback_args args = { + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .nr_pages = nr_pages, + .range_cyclic = 1, + }; + + /* + * We treat @nr_pages=0 as the special case to do background writeback, + * ie. to sync pages until the background dirty threshold is reached. + */ + if (!nr_pages) { + args.nr_pages = LONG_MAX; + args.for_background = 1; + } + + bdi_alloc_queue_work(bdi, &args); } /* @@ -191,31 +277,32 @@ static int write_inode(struct inode *inode, int sync) * furthest end of its superblock's dirty-inode list. * * Before stamping the inode's ->dirtied_when, we check to see whether it is - * already the most-recently-dirtied inode on the s_dirty list. If that is + * already the most-recently-dirtied inode on the b_dirty list. If that is * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ static void redirty_tail(struct inode *inode) { - struct super_block *sb = inode->i_sb; + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - if (!list_empty(&sb->s_dirty)) { - struct inode *tail_inode; + if (!list_empty(&wb->b_dirty)) { + struct inode *tail; - tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); - if (time_before(inode->dirtied_when, - tail_inode->dirtied_when)) + tail = list_entry(wb->b_dirty.next, struct inode, i_list); + if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - list_move(&inode->i_list, &sb->s_dirty); + list_move(&inode->i_list, &wb->b_dirty); } /* - * requeue inode for re-scanning after sb->s_io list is exhausted. + * requeue inode for re-scanning after bdi->b_io list is exhausted. */ static void requeue_io(struct inode *inode) { - list_move(&inode->i_list, &inode->i_sb->s_more_io); + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + + list_move(&inode->i_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -235,7 +322,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) * For inodes being constantly redirtied, dirtied_when can get stuck. * It _appears_ to be in the future, but is actually in distant past. * This test is necessary to prevent such wrapped-around relative times - * from permanently stopping the whole pdflush writeback. + * from permanently stopping the whole bdi writeback. */ ret = ret && time_before_eq(inode->dirtied_when, jiffies); #endif @@ -249,33 +336,56 @@ static void move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, unsigned long *older_than_this) { + LIST_HEAD(tmp); + struct list_head *pos, *node; + struct super_block *sb = NULL; + struct inode *inode; + int do_sb_sort = 0; + while (!list_empty(delaying_queue)) { - struct inode *inode = list_entry(delaying_queue->prev, - struct inode, i_list); + inode = list_entry(delaying_queue->prev, struct inode, i_list); if (older_than_this && inode_dirtied_after(inode, *older_than_this)) break; - list_move(&inode->i_list, dispatch_queue); + if (sb && sb != inode->i_sb) + do_sb_sort = 1; + sb = inode->i_sb; + list_move(&inode->i_list, &tmp); + } + + /* just one sb in list, splice to dispatch_queue and we're done */ + if (!do_sb_sort) { + list_splice(&tmp, dispatch_queue); + return; + } + + /* Move inodes from one superblock together */ + while (!list_empty(&tmp)) { + inode = list_entry(tmp.prev, struct inode, i_list); + sb = inode->i_sb; + list_for_each_prev_safe(pos, node, &tmp) { + inode = list_entry(pos, struct inode, i_list); + if (inode->i_sb == sb) + list_move(&inode->i_list, dispatch_queue); + } } } /* * Queue all expired dirty inodes for io, eldest first. */ -static void queue_io(struct super_block *sb, - unsigned long *older_than_this) +static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { - list_splice_init(&sb->s_more_io, sb->s_io.prev); - move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); + list_splice_init(&wb->b_more_io, wb->b_io.prev); + move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } -int sb_has_dirty_inodes(struct super_block *sb) +static int write_inode(struct inode *inode, int sync) { - return !list_empty(&sb->s_dirty) || - !list_empty(&sb->s_io) || - !list_empty(&sb->s_more_io); + if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) + return inode->i_sb->s_op->write_inode(inode, sync); + return 0; } -EXPORT_SYMBOL(sb_has_dirty_inodes); /* * Wait for writeback on an inode to complete. @@ -322,11 +432,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (inode->i_state & I_SYNC) { /* * If this inode is locked for writeback and we are not doing - * writeback-for-data-integrity, move it to s_more_io so that + * writeback-for-data-integrity, move it to b_more_io so that * writeback can proceed with the other inodes on s_io. * * We'll have another go at writing back this inode when we - * completed a full scan of s_io. + * completed a full scan of b_io. */ if (!wait) { requeue_io(inode); @@ -366,16 +476,26 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & (I_FREEING | I_CLEAR))) { - if (!(inode->i_state & I_DIRTY) && - mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { + /* + * More pages get dirtied by a fast dirtier. + */ + goto select_queue; + } else if (inode->i_state & I_DIRTY) { + /* + * At least XFS will redirty the inode during the + * writeback (delalloc) and on io completion (isize). + */ + redirty_tail(inode); + } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() * sometimes bales out without doing anything. Redirty - * the inode; Move it from s_io onto s_more_io/s_dirty. + * the inode; Move it from b_io onto b_more_io/b_dirty. */ /* * akpm: if the caller was the kupdate function we put - * this inode at the head of s_dirty so it gets first + * this inode at the head of b_dirty so it gets first * consideration. Otherwise, move it to the tail, for * the reasons described there. I'm not really sure * how much sense this makes. Presumably I had a good @@ -385,10 +505,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->for_kupdate) { /* * For the kupdate function we move the inode - * to s_more_io so it will get more writeout as + * to b_more_io so it will get more writeout as * soon as the queue becomes uncongested. */ inode->i_state |= I_DIRTY_PAGES; +select_queue: if (wbc->nr_to_write <= 0) { /* * slice used up: queue for next turn @@ -411,12 +532,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) inode->i_state |= I_DIRTY_PAGES; redirty_tail(inode); } - } else if (inode->i_state & I_DIRTY) { - /* - * Someone redirtied the inode while were writing back - * the pages. - */ - redirty_tail(inode); } else if (atomic_read(&inode->i_count)) { /* * The inode is clean, inuse @@ -433,51 +548,96 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) return ret; } +static void unpin_sb_for_writeback(struct super_block **psb) +{ + struct super_block *sb = *psb; + + if (sb) { + up_read(&sb->s_umount); + put_super(sb); + *psb = NULL; + } +} + /* - * Write out a superblock's list of dirty inodes. A wait will be performed - * upon no inodes, all inodes or the final one, depending upon sync_mode. - * - * If older_than_this is non-NULL, then only write out inodes which - * had their first dirtying at a time earlier than *older_than_this. - * - * If we're a pdflush thread, then implement pdflush collision avoidance - * against the entire list. - * - * If `bdi' is non-zero then we're being asked to writeback a specific queue. - * This function assumes that the blockdev superblock's inodes are backed by - * a variety of queues, so all inodes are searched. For other superblocks, - * assume that all inodes are backed by the same queue. + * For WB_SYNC_NONE writeback, the caller does not have the sb pinned + * before calling writeback. So make sure that we do pin it, so it doesn't + * go away while we are writing inodes from it. * - * FIXME: this linear search could get expensive with many fileystems. But - * how to fix? We need to go from an address_space to all inodes which share - * a queue with that address_space. (Easy: have a global "dirty superblocks" - * list). - * - * The inodes to be written are parked on sb->s_io. They are moved back onto - * sb->s_dirty as they are selected for writing. This way, none can be missed - * on the writer throttling path, and we get decent balancing between many - * throttled threads: we don't want them all piling up on inode_sync_wait. + * Returns 0 if the super was successfully pinned (or pinning wasn't needed), + * 1 if we failed. */ -void generic_sync_sb_inodes(struct super_block *sb, +static int pin_sb_for_writeback(struct writeback_control *wbc, + struct inode *inode, struct super_block **psb) +{ + struct super_block *sb = inode->i_sb; + + /* + * If this sb is already pinned, nothing more to do. If not and + * *psb is non-NULL, unpin the old one first + */ + if (sb == *psb) + return 0; + else if (*psb) + unpin_sb_for_writeback(psb); + + /* + * Caller must already hold the ref for this + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + return 0; + } + + spin_lock(&sb_lock); + sb->s_count++; + if (down_read_trylock(&sb->s_umount)) { + if (sb->s_root) { + spin_unlock(&sb_lock); + goto pinned; + } + /* + * umounted, drop rwsem again and fall through to failure + */ + up_read(&sb->s_umount); + } + + sb->s_count--; + spin_unlock(&sb_lock); + return 1; +pinned: + *psb = sb; + return 0; +} + +static void writeback_inodes_wb(struct bdi_writeback *wb, struct writeback_control *wbc) { + struct super_block *sb = wbc->sb, *pin_sb = NULL; + const int is_blkdev_sb = sb_is_blkdev_sb(sb); const unsigned long start = jiffies; /* livelock avoidance */ - int sync = wbc->sync_mode == WB_SYNC_ALL; spin_lock(&inode_lock); - if (!wbc->for_kupdate || list_empty(&sb->s_io)) - queue_io(sb, wbc->older_than_this); - while (!list_empty(&sb->s_io)) { - struct inode *inode = list_entry(sb->s_io.prev, + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + + while (!list_empty(&wb->b_io)) { + struct inode *inode = list_entry(wb->b_io.prev, struct inode, i_list); - struct address_space *mapping = inode->i_mapping; - struct backing_dev_info *bdi = mapping->backing_dev_info; long pages_skipped; - if (!bdi_cap_writeback_dirty(bdi)) { + /* + * super block given and doesn't match, skip this inode + */ + if (sb && sb != inode->i_sb) { redirty_tail(inode); - if (sb_is_blkdev_sb(sb)) { + continue; + } + + if (!bdi_cap_writeback_dirty(wb->bdi)) { + redirty_tail(inode); + if (is_blkdev_sb) { /* * Dirty memory-backed blockdev: the ramdisk * driver does this. Skip just this inode @@ -497,21 +657,14 @@ void generic_sync_sb_inodes(struct super_block *sb, continue; } - if (wbc->nonblocking && bdi_write_congested(bdi)) { + if (wbc->nonblocking && bdi_write_congested(wb->bdi)) { wbc->encountered_congestion = 1; - if (!sb_is_blkdev_sb(sb)) + if (!is_blkdev_sb) break; /* Skip a congested fs */ requeue_io(inode); continue; /* Skip a congested blockdev */ } - if (wbc->bdi && bdi != wbc->bdi) { - if (!sb_is_blkdev_sb(sb)) - break; /* fs has the wrong queue */ - requeue_io(inode); - continue; /* blockdev has wrong queue */ - } - /* * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. @@ -519,16 +672,15 @@ void generic_sync_sb_inodes(struct super_block *sb, if (inode_dirtied_after(inode, start)) break; - /* Is another pdflush already flushing this queue? */ - if (current_is_pdflush() && !writeback_acquire(bdi)) - break; + if (pin_sb_for_writeback(wbc, inode, &pin_sb)) { + requeue_io(inode); + continue; + } BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); __iget(inode); pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); - if (current_is_pdflush()) - writeback_release(bdi); if (wbc->pages_skipped != pages_skipped) { /* * writeback is not making progress due to locked @@ -544,144 +696,535 @@ void generic_sync_sb_inodes(struct super_block *sb, wbc->more_io = 1; break; } - if (!list_empty(&sb->s_more_io)) + if (!list_empty(&wb->b_more_io)) wbc->more_io = 1; } - if (sync) { - struct inode *inode, *old_inode = NULL; + unpin_sb_for_writeback(&pin_sb); + spin_unlock(&inode_lock); + /* Leave any unwritten inodes on b_io */ +} + +void writeback_inodes_wbc(struct writeback_control *wbc) +{ + struct backing_dev_info *bdi = wbc->bdi; + + writeback_inodes_wb(&bdi->wb, wbc); +} + +/* + * The maximum number of pages to writeout in a single bdi flush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024 + +static inline bool over_bground_thresh(void) +{ + unsigned long background_thresh, dirty_thresh; + + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + + return (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) >= background_thresh); +} + +/* + * Explicit flushing or periodic writeback of "old" data. + * + * Define "old": the first time one of an inode's pages is dirtied, we mark the + * dirtying-time in the inode's address_space. So this periodic writeback code + * just walks the superblock inode list, writing back any inodes which are + * older than a specific point in time. + * + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a + * one-second gap. + * + * older_than_this takes precedence over nr_to_write. So we'll only write back + * all dirty pages if they are all attached to "old" mappings. + */ +static long wb_writeback(struct bdi_writeback *wb, + struct wb_writeback_args *args) +{ + struct writeback_control wbc = { + .bdi = wb->bdi, + .sb = args->sb, + .sync_mode = args->sync_mode, + .older_than_this = NULL, + .for_kupdate = args->for_kupdate, + .range_cyclic = args->range_cyclic, + }; + unsigned long oldest_jif; + long wrote = 0; + struct inode *inode; + + if (wbc.for_kupdate) { + wbc.older_than_this = &oldest_jif; + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + } + if (!wbc.range_cyclic) { + wbc.range_start = 0; + wbc.range_end = LLONG_MAX; + } + + for (;;) { /* - * Data integrity sync. Must wait for all pages under writeback, - * because there may have been pages dirtied before our sync - * call, but which had writeout started before we write it out. - * In which case, the inode may not be on the dirty list, but - * we still have to wait for that writeout. + * Stop writeback when nr_pages has been consumed */ - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - struct address_space *mapping; - - if (inode->i_state & - (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) - continue; - mapping = inode->i_mapping; - if (mapping->nrpages == 0) - continue; - __iget(inode); - spin_unlock(&inode_lock); - /* - * We hold a reference to 'inode' so it couldn't have - * been removed from s_inodes list while we dropped the - * inode_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it - * under inode_lock. So we keep the reference and iput - * it later. - */ - iput(old_inode); - old_inode = inode; + if (args->nr_pages <= 0) + break; - filemap_fdatawait(mapping); + /* + * For background writeout, stop when we are below the + * background dirty threshold + */ + if (args->for_background && !over_bground_thresh()) + break; - cond_resched(); + wbc.more_io = 0; + wbc.encountered_congestion = 0; + wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.pages_skipped = 0; + writeback_inodes_wb(wb, &wbc); + args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; - spin_lock(&inode_lock); + /* + * If we consumed everything, see if we have more + */ + if (wbc.nr_to_write <= 0) + continue; + /* + * Didn't write everything and we don't have more IO, bail + */ + if (!wbc.more_io) + break; + /* + * Did we write something? Try for more + */ + if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) + continue; + /* + * Nothing written. Wait for some inode to + * become available for writeback. Otherwise + * we'll just busyloop. + */ + spin_lock(&inode_lock); + if (!list_empty(&wb->b_more_io)) { + inode = list_entry(wb->b_more_io.prev, + struct inode, i_list); + inode_wait_for_writeback(inode); } spin_unlock(&inode_lock); - iput(old_inode); - } else - spin_unlock(&inode_lock); + } - return; /* Leave any unwritten inodes on s_io */ + return wrote; } -EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); -static void sync_sb_inodes(struct super_block *sb, - struct writeback_control *wbc) +/* + * Return the next bdi_work struct that hasn't been processed by this + * wb thread yet. ->seen is initially set for each thread that exists + * for this device, when a thread first notices a piece of work it + * clears its bit. Depending on writeback type, the thread will notify + * completion on either receiving the work (WB_SYNC_NONE) or after + * it is done (WB_SYNC_ALL). + */ +static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, + struct bdi_writeback *wb) +{ + struct bdi_work *work, *ret = NULL; + + rcu_read_lock(); + + list_for_each_entry_rcu(work, &bdi->work_list, list) { + if (!test_bit(wb->nr, &work->seen)) + continue; + clear_bit(wb->nr, &work->seen); + + ret = work; + break; + } + + rcu_read_unlock(); + return ret; +} + +static long wb_check_old_data_flush(struct bdi_writeback *wb) +{ + unsigned long expired; + long nr_pages; + + expired = wb->last_old_flush + + msecs_to_jiffies(dirty_writeback_interval * 10); + if (time_before(jiffies, expired)) + return 0; + + wb->last_old_flush = jiffies; + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + if (nr_pages) { + struct wb_writeback_args args = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .for_kupdate = 1, + .range_cyclic = 1, + }; + + return wb_writeback(wb, &args); + } + + return 0; +} + +/* + * Retrieve work items and do the writeback they describe + */ +long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +{ + struct backing_dev_info *bdi = wb->bdi; + struct bdi_work *work; + long wrote = 0; + + while ((work = get_next_work_item(bdi, wb)) != NULL) { + struct wb_writeback_args args = work->args; + + /* + * Override sync mode, in case we must wait for completion + */ + if (force_wait) + work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; + + /* + * If this isn't a data integrity operation, just notify + * that we have seen this work and we are now starting it. + */ + if (args.sync_mode == WB_SYNC_NONE) + wb_clear_pending(wb, work); + + wrote += wb_writeback(wb, &args); + + /* + * This is a data integrity writeback, so only do the + * notification when we have completed the work. + */ + if (args.sync_mode == WB_SYNC_ALL) + wb_clear_pending(wb, work); + } + + /* + * Check for periodic writeback, kupdated() style + */ + wrote += wb_check_old_data_flush(wb); + + return wrote; +} + +/* + * Handle writeback of dirty data for the device backed by this bdi. Also + * wakes up periodically and does kupdated style flushing. + */ +int bdi_writeback_task(struct bdi_writeback *wb) +{ + unsigned long last_active = jiffies; + unsigned long wait_jiffies = -1UL; + long pages_written; + + while (!kthread_should_stop()) { + pages_written = wb_do_writeback(wb, 0); + + if (pages_written) + last_active = jiffies; + else if (wait_jiffies != -1UL) { + unsigned long max_idle; + + /* + * Longest period of inactivity that we tolerate. If we + * see dirty data again later, the task will get + * recreated automatically. + */ + max_idle = max(5UL * 60 * HZ, wait_jiffies); + if (time_after(jiffies, max_idle + last_active)) + break; + } + + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout_interruptible(wait_jiffies); + try_to_freeze(); + } + + return 0; +} + +/* + * Schedule writeback for all backing devices. This does WB_SYNC_NONE + * writeback, for integrity writeback see bdi_sync_writeback(). + */ +static void bdi_writeback_all(struct super_block *sb, long nr_pages) { - generic_sync_sb_inodes(sb, wbc); + struct wb_writeback_args args = { + .sb = sb, + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + }; + struct backing_dev_info *bdi; + + rcu_read_lock(); + + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { + if (!bdi_has_dirty_io(bdi)) + continue; + + bdi_alloc_queue_work(bdi, &args); + } + + rcu_read_unlock(); } /* - * Start writeback of dirty pagecache data against all unlocked inodes. + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back + * the whole world. + */ +void wakeup_flusher_threads(long nr_pages) +{ + if (nr_pages == 0) + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + bdi_writeback_all(NULL, nr_pages); +} + +static noinline void block_dump___mark_inode_dirty(struct inode *inode) +{ + if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { + struct dentry *dentry; + const char *name = "?"; + + dentry = d_find_alias(inode); + if (dentry) { + spin_lock(&dentry->d_lock); + name = (const char *) dentry->d_name.name; + } + printk(KERN_DEBUG + "%s(%d): dirtied inode %lu (%s) on %s\n", + current->comm, task_pid_nr(current), inode->i_ino, + name, inode->i_sb->s_id); + if (dentry) { + spin_unlock(&dentry->d_lock); + dput(dentry); + } + } +} + +/** + * __mark_inode_dirty - internal function + * @inode: inode to mark + * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * Mark an inode as dirty. Callers should use mark_inode_dirty or + * mark_inode_dirty_sync. + * + * Put the inode on the super block's dirty list. + * + * CAREFUL! We mark it dirty unconditionally, but move it onto the + * dirty list only if it is hashed or if it refers to a blockdev. + * If it was not hashed, it will never be added to the dirty list + * even if it is later hashed, as it will have been marked dirty already. * - * Note: - * We don't need to grab a reference to superblock here. If it has non-empty - * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed - * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all - * empty. Since __sync_single_inode() regains inode_lock before it finally moves - * inode from superblock lists we are OK. + * In short, make sure you hash any inodes _before_ you start marking + * them dirty. * - * If `older_than_this' is non-zero then only flush inodes which have a - * flushtime older than *older_than_this. + * This function *must* be atomic for the I_DIRTY_PAGES case - + * set_page_dirty() is called under spinlock in several places. * - * If `bdi' is non-zero then we will scan the first inode against each - * superblock until we find the matching ones. One group will be the dirty - * inodes against a filesystem. Then when we hit the dummy blockdev superblock, - * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not - * super-efficient but we're about to do a ton of I/O... + * Note that for blockdevs, inode->dirtied_when represents the dirtying time of + * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of + * the kernel-internal blockdev inode represents the dirtying time of the + * blockdev's pages. This is why for I_DIRTY_PAGES we always use + * page->mapping->host, so the page-dirtying time is recorded in the internal + * blockdev inode. */ -void -writeback_inodes(struct writeback_control *wbc) +void __mark_inode_dirty(struct inode *inode, int flags) { - struct super_block *sb; + struct super_block *sb = inode->i_sb; - might_sleep(); - spin_lock(&sb_lock); -restart: - list_for_each_entry_reverse(sb, &super_blocks, s_list) { - if (sb_has_dirty_inodes(sb)) { - /* we're making our own get_super here */ - sb->s_count++; - spin_unlock(&sb_lock); - /* - * If we can't get the readlock, there's no sense in - * waiting around, most of the time the FS is going to - * be unmounted by the time it is released. - */ - if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) - sync_sb_inodes(sb, wbc); - up_read(&sb->s_umount); + /* + * Don't do this for I_DIRTY_PAGES - that doesn't actually + * dirty the inode itself + */ + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode); + } + + /* + * make sure that changes are seen by all cpus before we test i_state + * -- mikulas + */ + smp_mb(); + + /* avoid the locking if we can */ + if ((inode->i_state & flags) == flags) + return; + + if (unlikely(block_dump)) + block_dump___mark_inode_dirty(inode); + + spin_lock(&inode_lock); + if ((inode->i_state & flags) != flags) { + const int was_dirty = inode->i_state & I_DIRTY; + + inode->i_state |= flags; + + /* + * If the inode is being synced, just update its dirty state. + * The unlocker will place the inode on the appropriate + * superblock list, based upon its state. + */ + if (inode->i_state & I_SYNC) + goto out; + + /* + * Only add valid (hashed) inodes to the superblock's + * dirty list. Add blockdev inodes as well. + */ + if (!S_ISBLK(inode->i_mode)) { + if (hlist_unhashed(&inode->i_hash)) + goto out; + } + if (inode->i_state & (I_FREEING|I_CLEAR)) + goto out; + + /* + * If the inode was already on b_dirty/b_io/b_more_io, don't + * reposition it (that would break b_dirty time-ordering). + */ + if (!was_dirty) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + struct backing_dev_info *bdi = wb->bdi; + + if (bdi_cap_writeback_dirty(bdi) && + !test_bit(BDI_registered, &bdi->state)) { + WARN_ON(1); + printk(KERN_ERR "bdi-%s not registered\n", + bdi->name); } - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; + + inode->dirtied_when = jiffies; + list_move(&inode->i_list, &wb->b_dirty); } - if (wbc->nr_to_write <= 0) - break; } - spin_unlock(&sb_lock); +out: + spin_unlock(&inode_lock); } +EXPORT_SYMBOL(__mark_inode_dirty); /* - * writeback and wait upon the filesystem's dirty inodes. The caller will - * do this in two passes - one to write, and one to wait. + * Write out a superblock's list of dirty inodes. A wait will be performed + * upon no inodes, all inodes or the final one, depending upon sync_mode. + * + * If older_than_this is non-NULL, then only write out inodes which + * had their first dirtying at a time earlier than *older_than_this. * - * A finite limit is set on the number of pages which will be written. - * To prevent infinite livelock of sys_sync(). + * If `bdi' is non-zero then we're being asked to writeback a specific queue. + * This function assumes that the blockdev superblock's inodes are backed by + * a variety of queues, so all inodes are searched. For other superblocks, + * assume that all inodes are backed by the same queue. * - * We add in the number of potentially dirty inodes, because each inode write - * can dirty pagecache in the underlying blockdev. + * The inodes to be written are parked on bdi->b_io. They are moved back onto + * bdi->b_dirty as they are selected for writing. This way, none can be missed + * on the writer throttling path, and we get decent balancing between many + * throttled threads: we don't want them all piling up on inode_sync_wait. */ -void sync_inodes_sb(struct super_block *sb, int wait) +static void wait_sb_inodes(struct super_block *sb) { - struct writeback_control wbc = { - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; + struct inode *inode, *old_inode = NULL; + + /* + * We need to be protected against the filesystem going from + * r/o to r/w or vice versa. + */ + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + spin_lock(&inode_lock); - if (!wait) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + /* + * Data integrity sync. Must wait for all pages under writeback, + * because there may have been pages dirtied before our sync + * call, but which had writeout started before we write it out. + * In which case, the inode may not be on the dirty list, but + * we still have to wait for that writeout. + */ + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + struct address_space *mapping; - wbc.nr_to_write = nr_dirty + nr_unstable + + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) + continue; + mapping = inode->i_mapping; + if (mapping->nrpages == 0) + continue; + __iget(inode); + spin_unlock(&inode_lock); + /* + * We hold a reference to 'inode' so it couldn't have + * been removed from s_inodes list while we dropped the + * inode_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it + * under inode_lock. So we keep the reference and iput + * it later. + */ + iput(old_inode); + old_inode = inode; + + filemap_fdatawait(mapping); + + cond_resched(); + + spin_lock(&inode_lock); + } + spin_unlock(&inode_lock); + iput(old_inode); +} + +/** + * writeback_inodes_sb - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Start writeback on some inodes on this super_block. No guarantees are made + * on how many (if any) will be written, and this function does not wait + * for IO completion of submitted IO. The number of pages submitted is + * returned. + */ +void writeback_inodes_sb(struct super_block *sb) +{ + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + long nr_to_write; + + nr_to_write = nr_dirty + nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); - } else - wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ - sync_sb_inodes(sb, &wbc); + bdi_start_writeback(sb->s_bdi, sb, nr_to_write); } +EXPORT_SYMBOL(writeback_inodes_sb); + +/** + * sync_inodes_sb - sync sb inode pages + * @sb: the superblock + * + * This function writes and waits on any dirty inode belonging to this + * super_block. The number of pages synced is returned. + */ +void sync_inodes_sb(struct super_block *sb) +{ + bdi_sync_writeback(sb->s_bdi, sb); + wait_sb_inodes(sb); +} +EXPORT_SYMBOL(sync_inodes_sb); /** * write_inode_now - write an inode to disk @@ -737,57 +1280,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) return ret; } EXPORT_SYMBOL(sync_inode); - -/** - * generic_osync_inode - flush all dirty data for a given inode to disk - * @inode: inode to write - * @mapping: the address_space that should be flushed - * @what: what to write and wait upon - * - * This can be called by file_write functions for files which have the - * O_SYNC flag set, to flush dirty writes to disk. - * - * @what is a bitmask, specifying which part of the inode's data should be - * written and waited upon. - * - * OSYNC_DATA: i_mapping's dirty data - * OSYNC_METADATA: the buffers at i_mapping->private_list - * OSYNC_INODE: the inode itself - */ - -int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what) -{ - int err = 0; - int need_write_inode_now = 0; - int err2; - - if (what & OSYNC_DATA) - err = filemap_fdatawrite(mapping); - if (what & (OSYNC_METADATA|OSYNC_DATA)) { - err2 = sync_mapping_buffers(mapping); - if (!err) - err = err2; - } - if (what & OSYNC_DATA) { - err2 = filemap_fdatawait(mapping); - if (!err) - err = err2; - } - - spin_lock(&inode_lock); - if ((inode->i_state & I_DIRTY) && - ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) - need_write_inode_now = 1; - spin_unlock(&inode_lock); - - if (need_write_inode_now) { - err2 = write_inode_now(inode, 1); - if (!err) - err = err2; - } - else - inode_sync_wait(inode); - - return err; -} -EXPORT_SYMBOL(generic_osync_inode); diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 99c99df..3773fd6 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -61,6 +61,121 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, return simple_read_from_buffer(buf, len, ppos, tmp, size); } +static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos, unsigned val) +{ + char tmp[32]; + size_t size = sprintf(tmp, "%u\n", val); + + return simple_read_from_buffer(buf, len, ppos, tmp, size); +} + +static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, unsigned *val, + unsigned global_limit) +{ + unsigned long t; + char tmp[32]; + unsigned limit = (1 << 16) - 1; + int err; + + if (*ppos || count >= sizeof(tmp) - 1) + return -EINVAL; + + if (copy_from_user(tmp, buf, count)) + return -EINVAL; + + tmp[count] = '\0'; + + err = strict_strtoul(tmp, 0, &t); + if (err) + return err; + + if (!capable(CAP_SYS_ADMIN)) + limit = min(limit, global_limit); + + if (t > limit) + return -EINVAL; + + *val = t; + + return count; +} + +static ssize_t fuse_conn_max_background_read(struct file *file, + char __user *buf, size_t len, + loff_t *ppos) +{ + struct fuse_conn *fc; + unsigned val; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + val = fc->max_background; + fuse_conn_put(fc); + + return fuse_conn_limit_read(file, buf, len, ppos, val); +} + +static ssize_t fuse_conn_max_background_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned val; + ssize_t ret; + + ret = fuse_conn_limit_write(file, buf, count, ppos, &val, + max_user_bgreq); + if (ret > 0) { + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fc->max_background = val; + fuse_conn_put(fc); + } + } + + return ret; +} + +static ssize_t fuse_conn_congestion_threshold_read(struct file *file, + char __user *buf, size_t len, + loff_t *ppos) +{ + struct fuse_conn *fc; + unsigned val; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + val = fc->congestion_threshold; + fuse_conn_put(fc); + + return fuse_conn_limit_read(file, buf, len, ppos, val); +} + +static ssize_t fuse_conn_congestion_threshold_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned val; + ssize_t ret; + + ret = fuse_conn_limit_write(file, buf, count, ppos, &val, + max_user_congthresh); + if (ret > 0) { + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fc->congestion_threshold = val; + fuse_conn_put(fc); + } + } + + return ret; +} + static const struct file_operations fuse_ctl_abort_ops = { .open = nonseekable_open, .write = fuse_conn_abort_write, @@ -71,6 +186,18 @@ static const struct file_operations fuse_ctl_waiting_ops = { .read = fuse_conn_waiting_read, }; +static const struct file_operations fuse_conn_max_background_ops = { + .open = nonseekable_open, + .read = fuse_conn_max_background_read, + .write = fuse_conn_max_background_write, +}; + +static const struct file_operations fuse_conn_congestion_threshold_ops = { + .open = nonseekable_open, + .read = fuse_conn_congestion_threshold_read, + .write = fuse_conn_congestion_threshold_write, +}; + static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, struct fuse_conn *fc, const char *name, @@ -127,9 +254,14 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) goto err; if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, - NULL, &fuse_ctl_waiting_ops) || + NULL, &fuse_ctl_waiting_ops) || !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, - NULL, &fuse_ctl_abort_ops)) + NULL, &fuse_ctl_abort_ops) || + !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600, + 1, NULL, &fuse_conn_max_background_ops) || + !fuse_ctl_add_dentry(parent, fc, "congestion_threshold", + S_IFREG | 0600, 1, NULL, + &fuse_conn_congestion_threshold_ops)) goto err; return 0; @@ -156,7 +288,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) d_drop(dentry); dput(dentry); } - fuse_control_sb->s_root->d_inode->i_nlink--; + drop_nlink(fuse_control_sb->s_root->d_inode); } static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6484eb7..51d9e33 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) static void flush_bg_queue(struct fuse_conn *fc) { - while (fc->active_background < FUSE_MAX_BACKGROUND && + while (fc->active_background < fc->max_background && !list_empty(&fc->bg_queue)) { struct fuse_req *req; @@ -280,11 +280,11 @@ __releases(&fc->lock) list_del(&req->intr_entry); req->state = FUSE_REQ_FINISHED; if (req->background) { - if (fc->num_background == FUSE_MAX_BACKGROUND) { + if (fc->num_background == fc->max_background) { fc->blocked = 0; wake_up_all(&fc->blocked_waitq); } - if (fc->num_background == FUSE_CONGESTION_THRESHOLD && + if (fc->num_background == fc->congestion_threshold && fc->connected && fc->bdi_initialized) { clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); @@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc, { req->background = 1; fc->num_background++; - if (fc->num_background == FUSE_MAX_BACKGROUND) + if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == FUSE_CONGESTION_THRESHOLD && + if (fc->num_background == fc->congestion_threshold && fc->bdi_initialized) { set_bdi_congested(&fc->bdi, BLK_RW_SYNC); set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e703654..992f6c9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1276,14 +1276,9 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, return 0; if (attr->ia_valid & ATTR_SIZE) { - unsigned long limit; - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && attr->ia_size > (loff_t) limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } + err = inode_newsize_ok(inode, attr->ia_size); + if (err) + return err; is_truncate = true; } @@ -1350,8 +1345,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. */ if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { - if (outarg.attr.size < oldsize) - fuse_truncate(inode->i_mapping, outarg.attr.size); + truncate_pagecache(inode, oldsize, outarg.attr.size); invalidate_inode_pages2(inode->i_mapping); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index cbc4640..a3492f7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1313,7 +1313,7 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -static struct vm_operations_struct fuse_file_vm_ops = { +static const struct vm_operations_struct fuse_file_vm_ops = { .close = fuse_vma_close, .fault = filemap_fault, .page_mkwrite = fuse_page_mkwrite, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 52b641f..01cc462 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -25,12 +25,6 @@ /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 -/** Maximum number of outstanding background requests */ -#define FUSE_MAX_BACKGROUND 12 - -/** Congestion starts at 75% of maximum */ -#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100) - /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN @@ -38,7 +32,7 @@ #define FUSE_NAME_MAX 1024 /** Number of dentries for each connection in the control filesystem */ -#define FUSE_CTL_NUM_DENTRIES 3 +#define FUSE_CTL_NUM_DENTRIES 5 /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem module will check permissions based on the file mode. Otherwise no @@ -55,6 +49,10 @@ extern struct list_head fuse_conn_list; /** Global mutex protecting fuse_conn_list and the control filesystem */ extern struct mutex fuse_mutex; +/** Module parameters */ +extern unsigned max_user_bgreq; +extern unsigned max_user_congthresh; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -349,6 +347,12 @@ struct fuse_conn { /** rbtree of fuse_files waiting for poll events indexed by ph */ struct rb_root polled_files; + /** Maximum number of outstanding background requests */ + unsigned max_background; + + /** Number of background requests at which congestion starts */ + unsigned congestion_threshold; + /** Number of requests currently in the background */ unsigned num_background; @@ -602,8 +606,6 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, u64 attr_valid); -void fuse_truncate(struct address_space *mapping, loff_t offset); - /** * Initialize the client device */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f91ccc4..1a822ce 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -14,6 +14,7 @@ #include <linux/seq_file.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/parser.h> #include <linux/statfs.h> #include <linux/random.h> @@ -28,10 +29,34 @@ static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); +static int set_global_limit(const char *val, struct kernel_param *kp); + +unsigned max_user_bgreq; +module_param_call(max_user_bgreq, set_global_limit, param_get_uint, + &max_user_bgreq, 0644); +__MODULE_PARM_TYPE(max_user_bgreq, "uint"); +MODULE_PARM_DESC(max_user_bgreq, + "Global limit for the maximum number of backgrounded requests an " + "unprivileged user can set"); + +unsigned max_user_congthresh; +module_param_call(max_user_congthresh, set_global_limit, param_get_uint, + &max_user_congthresh, 0644); +__MODULE_PARM_TYPE(max_user_congthresh, "uint"); +MODULE_PARM_DESC(max_user_congthresh, + "Global limit for the maximum congestion threshold an " + "unprivileged user can set"); + #define FUSE_SUPER_MAGIC 0x65735546 #define FUSE_DEFAULT_BLKSIZE 512 +/** Maximum number of outstanding background requests */ +#define FUSE_DEFAULT_MAX_BACKGROUND 12 + +/** Congestion starts at 75% of maximum */ +#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) + struct fuse_mount_data { int fd; unsigned rootmode; @@ -115,14 +140,6 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) return 0; } -void fuse_truncate(struct address_space *mapping, loff_t offset) -{ - /* See vmtruncate() */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); -} - void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, u64 attr_valid) { @@ -180,8 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, spin_unlock(&fc->lock); if (S_ISREG(inode->i_mode) && oldsize != attr->size) { - if (attr->size < oldsize) - fuse_truncate(inode->i_mapping, attr->size); + truncate_pagecache(inode, oldsize, attr->size); invalidate_inode_pages2(inode->i_mapping); } } @@ -517,6 +533,8 @@ void fuse_conn_init(struct fuse_conn *fc) INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); atomic_set(&fc->num_waiting, 0); + fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; + fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; fc->khctr = 0; fc->polled_files = RB_ROOT; fc->reqctr = 0; @@ -727,6 +745,54 @@ static const struct super_operations fuse_super_operations = { .show_options = fuse_show_options, }; +static void sanitize_global_limit(unsigned *limit) +{ + if (*limit == 0) + *limit = ((num_physpages << PAGE_SHIFT) >> 13) / + sizeof(struct fuse_req); + + if (*limit >= 1 << 16) + *limit = (1 << 16) - 1; +} + +static int set_global_limit(const char *val, struct kernel_param *kp) +{ + int rv; + + rv = param_set_uint(val, kp); + if (rv) + return rv; + + sanitize_global_limit((unsigned *)kp->arg); + + return 0; +} + +static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) +{ + int cap_sys_admin = capable(CAP_SYS_ADMIN); + + if (arg->minor < 13) + return; + + sanitize_global_limit(&max_user_bgreq); + sanitize_global_limit(&max_user_congthresh); + + if (arg->max_background) { + fc->max_background = arg->max_background; + + if (!cap_sys_admin && fc->max_background > max_user_bgreq) + fc->max_background = max_user_bgreq; + } + if (arg->congestion_threshold) { + fc->congestion_threshold = arg->congestion_threshold; + + if (!cap_sys_admin && + fc->congestion_threshold > max_user_congthresh) + fc->congestion_threshold = max_user_congthresh; + } +} + static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) { struct fuse_init_out *arg = &req->misc.init_out; @@ -736,6 +802,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) else { unsigned long ra_pages; + process_init_limits(fc, arg); + if (arg->minor >= 6) { ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; if (arg->flags & FUSE_ASYNC_READ) @@ -801,6 +869,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { int err; + fc->bdi.name = "fuse"; fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; /* fuse does it's own writeback accounting */ @@ -893,6 +962,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (err) goto err_put_conn; + sb->s_bdi = &fc->bdi; + /* Handle umasking inside the fuse code */ if (sb->s_flags & MS_POSIXACL) fc->dont_mask = 1; @@ -1147,6 +1218,9 @@ static int __init fuse_init(void) if (res) goto err_sysfs_cleanup; + sanitize_global_limit(&max_user_bgreq); + sanitize_global_limit(&max_user_congthresh); + return 0; err_sysfs_cleanup: diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 3da2f1f..21f7e46 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,6 +1,6 @@ EXTRA_CFLAGS := -I$(src) obj-$(CONFIG_GFS2_FS) += gfs2.o -gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ +gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ glops.o inode.o log.o lops.o main.o meta_io.o \ aops.o dentry.o export.o file.o \ ops_fstype.o ops_inode.o quota.o \ diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index fa881bd..3fc4e3a 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -19,8 +19,7 @@ #include "gfs2.h" #include "incore.h" #include "acl.h" -#include "eaops.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "inode.h" #include "meta_io.h" @@ -31,8 +30,7 @@ #define ACL_DEFAULT 0 int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, - struct gfs2_ea_request *er, - int *remove, mode_t *mode) + struct gfs2_ea_request *er, int *remove, mode_t *mode) { struct posix_acl *acl; int error; @@ -83,30 +81,20 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access) return 0; } -static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, - struct gfs2_ea_location *el, char **data, unsigned int *len) +static int acl_get(struct gfs2_inode *ip, const char *name, + struct posix_acl **acl, struct gfs2_ea_location *el, + char **datap, unsigned int *lenp) { - struct gfs2_ea_request er; - struct gfs2_ea_location el_this; + char *data; + unsigned int len; int error; + el->el_bh = NULL; + if (!ip->i_eattr) return 0; - memset(&er, 0, sizeof(struct gfs2_ea_request)); - if (access) { - er.er_name = GFS2_POSIX_ACL_ACCESS; - er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; - } else { - er.er_name = GFS2_POSIX_ACL_DEFAULT; - er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; - } - er.er_type = GFS2_EATYPE_SYS; - - if (!el) - el = &el_this; - - error = gfs2_ea_find(ip, &er, el); + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el); if (error) return error; if (!el->el_ea) @@ -114,32 +102,31 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, if (!GFS2_EA_DATA_LEN(el->el_ea)) goto out; - er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); - er.er_data = kmalloc(er.er_data_len, GFP_NOFS); + len = GFS2_EA_DATA_LEN(el->el_ea); + data = kmalloc(len, GFP_NOFS); error = -ENOMEM; - if (!er.er_data) + if (!data) goto out; - error = gfs2_ea_get_copy(ip, el, er.er_data); - if (error) + error = gfs2_ea_get_copy(ip, el, data, len); + if (error < 0) goto out_kfree; + error = 0; if (acl) { - *acl = posix_acl_from_xattr(er.er_data, er.er_data_len); + *acl = posix_acl_from_xattr(data, len); if (IS_ERR(*acl)) error = PTR_ERR(*acl); } out_kfree: - if (error || !data) - kfree(er.er_data); - else { - *data = er.er_data; - *len = er.er_data_len; + if (error || !datap) { + kfree(data); + } else { + *datap = data; + *lenp = len; } out: - if (error || el == &el_this) - brelse(el->el_bh); return error; } @@ -153,10 +140,12 @@ out: int gfs2_check_acl(struct inode *inode, int mask) { + struct gfs2_ea_location el; struct posix_acl *acl = NULL; int error; - error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL); + error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL); + brelse(el.el_bh); if (error) return error; @@ -196,10 +185,12 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode) int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) { + struct gfs2_ea_location el; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct posix_acl *acl = NULL, *clone; - struct gfs2_ea_request er; mode_t mode = ip->i_inode.i_mode; + char *data = NULL; + unsigned int len; int error; if (!sdp->sd_args.ar_posix_acl) @@ -207,11 +198,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) if (S_ISLNK(ip->i_inode.i_mode)) return 0; - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = GFS2_EATYPE_SYS; - - error = acl_get(dip, ACL_DEFAULT, &acl, NULL, - &er.er_data, &er.er_data_len); + error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len); + brelse(el.el_bh); if (error) return error; if (!acl) { @@ -229,9 +217,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) acl = clone; if (S_ISDIR(ip->i_inode.i_mode)) { - er.er_name = GFS2_POSIX_ACL_DEFAULT; - er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; - error = gfs2_system_eaops.eo_set(ip, &er); + error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS, + GFS2_POSIX_ACL_DEFAULT, data, len, 0); if (error) goto out; } @@ -239,21 +226,19 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) error = posix_acl_create_masq(acl, &mode); if (error < 0) goto out; - if (error > 0) { - er.er_name = GFS2_POSIX_ACL_ACCESS; - er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; - posix_acl_to_xattr(acl, er.er_data, er.er_data_len); - er.er_mode = mode; - er.er_flags = GFS2_ERF_MODE; - error = gfs2_system_eaops.eo_set(ip, &er); - if (error) - goto out; - } else - munge_mode(ip, mode); + if (error == 0) + goto munge; + posix_acl_to_xattr(acl, data, len); + error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS, + GFS2_POSIX_ACL_ACCESS, data, len, 0); + if (error) + goto out; +munge: + error = munge_mode(ip, mode); out: posix_acl_release(acl); - kfree(er.er_data); + kfree(data); return error; } @@ -265,9 +250,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) unsigned int len; int error; - error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len); + error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len); if (error) - return error; + goto out_brelse; if (!acl) return gfs2_setattr_simple(ip, attr); @@ -286,8 +271,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) out: posix_acl_release(acl); - brelse(el.el_bh); kfree(data); +out_brelse: + brelse(el.el_bh); return error; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 7ebae9a..694b5d4 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1135,6 +1135,7 @@ static const struct address_space_operations gfs2_writeback_aops = { .direct_IO = gfs2_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations gfs2_ordered_aops = { @@ -1151,6 +1152,7 @@ static const struct address_space_operations gfs2_ordered_aops = { .direct_IO = gfs2_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations gfs2_jdata_aops = { @@ -1166,6 +1168,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; void gfs2_set_aops(struct inode *inode) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 022c66c..91bedda 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str) return 0; } +static int gfs2_dentry_delete(struct dentry *dentry) +{ + struct gfs2_inode *ginode; + + if (!dentry->d_inode) + return 0; + + ginode = GFS2_I(dentry->d_inode); + if (!ginode->i_iopen_gh.gh_gl) + return 0; + + if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) + return 1; + + return 0; +} + const struct dentry_operations gfs2_dops = { .d_revalidate = gfs2_drevalidate, .d_hash = gfs2_dhash, + .d_delete = gfs2_dentry_delete, }; diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c deleted file mode 100644 index dee9b03..0000000 --- a/fs/gfs2/eaops.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/capability.h> -#include <linux/xattr.h> -#include <linux/gfs2_ondisk.h> -#include <asm/uaccess.h> - -#include "gfs2.h" -#include "incore.h" -#include "acl.h" -#include "eaops.h" -#include "eattr.h" -#include "util.h" - -/** - * gfs2_ea_name2type - get the type of the ea, and truncate type from the name - * @namep: ea name, possibly with type appended - * - * Returns: GFS2_EATYPE_XXX - */ - -unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name) -{ - unsigned int type; - - if (strncmp(name, "system.", 7) == 0) { - type = GFS2_EATYPE_SYS; - if (truncated_name) - *truncated_name = name + sizeof("system.") - 1; - } else if (strncmp(name, "user.", 5) == 0) { - type = GFS2_EATYPE_USR; - if (truncated_name) - *truncated_name = name + sizeof("user.") - 1; - } else if (strncmp(name, "security.", 9) == 0) { - type = GFS2_EATYPE_SECURITY; - if (truncated_name) - *truncated_name = name + sizeof("security.") - 1; - } else { - type = GFS2_EATYPE_UNUSED; - if (truncated_name) - *truncated_name = NULL; - } - - return type; -} - -static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) && - !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 && - (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) || - GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))) - return -EOPNOTSUPP; - - return gfs2_ea_get_i(ip, er); -} - -static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - int remove = 0; - int error; - - if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { - if (!(er->er_flags & GFS2_ERF_MODE)) { - er->er_mode = ip->i_inode.i_mode; - er->er_flags |= GFS2_ERF_MODE; - } - error = gfs2_acl_validate_set(ip, 1, er, - &remove, &er->er_mode); - if (error) - return error; - error = gfs2_ea_set_i(ip, er); - if (error) - return error; - if (remove) - gfs2_ea_remove_i(ip, er); - return 0; - - } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { - error = gfs2_acl_validate_set(ip, 0, er, - &remove, NULL); - if (error) - return error; - if (!remove) - error = gfs2_ea_set_i(ip, er); - else { - error = gfs2_ea_remove_i(ip, er); - if (error == -ENODATA) - error = 0; - } - return error; - } - - return -EPERM; -} - -static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { - int error = gfs2_acl_validate_remove(ip, 1); - if (error) - return error; - - } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { - int error = gfs2_acl_validate_remove(ip, 0); - if (error) - return error; - - } else - return -EPERM; - - return gfs2_ea_remove_i(ip, er); -} - -static const struct gfs2_eattr_operations gfs2_user_eaops = { - .eo_get = gfs2_ea_get_i, - .eo_set = gfs2_ea_set_i, - .eo_remove = gfs2_ea_remove_i, - .eo_name = "user", -}; - -const struct gfs2_eattr_operations gfs2_system_eaops = { - .eo_get = system_eo_get, - .eo_set = system_eo_set, - .eo_remove = system_eo_remove, - .eo_name = "system", -}; - -static const struct gfs2_eattr_operations gfs2_security_eaops = { - .eo_get = gfs2_ea_get_i, - .eo_set = gfs2_ea_set_i, - .eo_remove = gfs2_ea_remove_i, - .eo_name = "security", -}; - -const struct gfs2_eattr_operations *gfs2_ea_ops[] = { - NULL, - &gfs2_user_eaops, - &gfs2_system_eaops, - &gfs2_security_eaops, -}; - diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h deleted file mode 100644 index da2f7fb..0000000 --- a/fs/gfs2/eaops.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __EAOPS_DOT_H__ -#define __EAOPS_DOT_H__ - -struct gfs2_ea_request; -struct gfs2_inode; - -struct gfs2_eattr_operations { - int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er); - int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er); - int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er); - char *eo_name; -}; - -unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name); - -extern const struct gfs2_eattr_operations gfs2_system_eaops; - -extern const struct gfs2_eattr_operations *gfs2_ea_ops[]; - -#endif /* __EAOPS_DOT_H__ */ - diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 9200ef2..d15876e 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child) } static struct dentry *gfs2_get_dentry(struct super_block *sb, - struct gfs2_inum_host *inum) + struct gfs2_inum_host *inum) { struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_holder i_gh, ri_gh, rgd_gh; - struct gfs2_rgrpd *rgd; + struct gfs2_holder i_gh; struct inode *inode; struct dentry *dentry; int error; - /* System files? */ - inode = gfs2_ilookup(sb, inum->no_addr); if (inode) { if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { @@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, if (error) return ERR_PTR(error); - error = gfs2_rindex_hold(sdp, &ri_gh); + error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE); if (error) goto fail; - error = -EINVAL; - rgd = gfs2_blk2rgrpd(sdp, inum->no_addr); - if (!rgd) - goto fail_rindex; - - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); - if (error) - goto fail_rindex; - - error = -ESTALE; - if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE) - goto fail_rgd; - - gfs2_glock_dq_uninit(&rgd_gh); - gfs2_glock_dq_uninit(&ri_gh); - - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, - inum->no_addr, - 0, 0); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto fail; @@ -224,13 +203,6 @@ out_inode: if (!IS_ERR(dentry)) dentry->d_op = &gfs2_dops; return dentry; - -fail_rgd: - gfs2_glock_dq_uninit(&rgd_gh); - -fail_rindex: - gfs2_glock_dq_uninit(&ri_gh); - fail: gfs2_glock_dq_uninit(&i_gh); return ERR_PTR(error); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 73318a3..4eb308a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -38,7 +38,6 @@ #include "rgrp.h" #include "trans.h" #include "util.h" -#include "eaops.h" /** * gfs2_llseek - seek to a location in a file @@ -419,7 +418,7 @@ out: return ret; } -static struct vm_operations_struct gfs2_vm_ops = { +static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, .page_mkwrite = gfs2_page_mkwrite, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 61801ad..6edb423 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -406,6 +406,12 @@ struct gfs2_statfs_change_host { #define GFS2_DATA_WRITEBACK 1 #define GFS2_DATA_ORDERED 2 +#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW +#define GFS2_ERRORS_WITHDRAW 0 +#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */ +#define GFS2_ERRORS_RO 2 /* place holder for future feature */ +#define GFS2_ERRORS_PANIC 3 + struct gfs2_args { char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ @@ -422,6 +428,7 @@ struct gfs2_args { unsigned int ar_data:2; /* ordered/writeback */ unsigned int ar_meta:1; /* mount metafs */ unsigned int ar_discard:1; /* discard requests */ + unsigned int ar_errors:2; /* errors=withdraw | panic */ int ar_commit; /* Commit interval */ }; @@ -489,7 +496,6 @@ struct gfs2_sb_host { */ struct lm_lockstruct { - u32 ls_id; unsigned int ls_jid; unsigned int ls_first; unsigned int ls_first_done; @@ -541,18 +547,12 @@ struct gfs2_sbd { struct dentry *sd_root_dir; struct inode *sd_jindex; - struct inode *sd_inum_inode; struct inode *sd_statfs_inode; - struct inode *sd_ir_inode; struct inode *sd_sc_inode; struct inode *sd_qc_inode; struct inode *sd_rindex; struct inode *sd_quota_inode; - /* Inum stuff */ - - struct mutex sd_inum_mutex; - /* StatFS stuff */ spinlock_t sd_statfs_spin; @@ -580,7 +580,6 @@ struct gfs2_sbd { struct gfs2_holder sd_journal_gh; struct gfs2_holder sd_jinode_gh; - struct gfs2_holder sd_ir_gh; struct gfs2_holder sd_sc_gh; struct gfs2_holder sd_qc_gh; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2f94bd7..fb15d3b1 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -24,7 +24,7 @@ #include "acl.h" #include "bmap.h" #include "dir.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "glops.h" #include "inode.h" @@ -519,139 +519,6 @@ out: return inode ? inode : ERR_PTR(error); } -static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf) -{ - const struct gfs2_inum_range *str = buf; - - ir->ir_start = be64_to_cpu(str->ir_start); - ir->ir_length = be64_to_cpu(str->ir_length); -} - -static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf) -{ - struct gfs2_inum_range *str = buf; - - str->ir_start = cpu_to_be64(ir->ir_start); - str->ir_length = cpu_to_be64(ir->ir_length); -} - -static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino) -{ - struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); - struct buffer_head *bh; - struct gfs2_inum_range_host ir; - int error; - - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error) - return error; - mutex_lock(&sdp->sd_inum_mutex); - - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) { - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); - return error; - } - - gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); - - if (ir.ir_length) { - *formal_ino = ir.ir_start++; - ir.ir_length--; - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_inum_range_out(&ir, - bh->b_data + sizeof(struct gfs2_dinode)); - brelse(bh); - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); - return 0; - } - - brelse(bh); - - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); - - return 1; -} - -static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino) -{ - struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode); - struct gfs2_holder gh; - struct buffer_head *bh; - struct gfs2_inum_range_host ir; - int error; - - error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (error) - return error; - - error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); - if (error) - goto out; - mutex_lock(&sdp->sd_inum_mutex); - - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - goto out_end_trans; - - gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); - - if (!ir.ir_length) { - struct buffer_head *m_bh; - u64 x, y; - __be64 z; - - error = gfs2_meta_inode_buffer(m_ip, &m_bh); - if (error) - goto out_brelse; - - z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)); - x = y = be64_to_cpu(z); - ir.ir_start = x; - ir.ir_length = GFS2_INUM_QUANTUM; - x += GFS2_INUM_QUANTUM; - if (x < y) - gfs2_consist_inode(m_ip); - z = cpu_to_be64(x); - gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); - *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z; - - brelse(m_bh); - } - - *formal_ino = ir.ir_start++; - ir.ir_length--; - - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode)); - -out_brelse: - brelse(bh); -out_end_trans: - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); -out: - gfs2_glock_dq_uninit(&gh); - return error; -} - -static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum) -{ - int error; - - error = pick_formal_ino_1(sdp, inum); - if (error <= 0) - return error; - - error = pick_formal_ino_2(sdp, inum); - - return error; -} - /** * create_ok - OK to create a new on-disk inode here? * @dip: Directory in which dinode is to be created @@ -731,7 +598,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) if (error) goto out_ipreserv; - *no_addr = gfs2_alloc_di(dip, generation); + error = gfs2_alloc_di(dip, no_addr, generation); gfs2_trans_end(sdp); @@ -924,7 +791,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) size_t len; void *value; char *name; - struct gfs2_ea_request er; err = security_inode_init_security(&ip->i_inode, &dip->i_inode, &name, &value, &len); @@ -935,16 +801,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) return err; } - memset(&er, 0, sizeof(struct gfs2_ea_request)); - - er.er_type = GFS2_EATYPE_SECURITY; - er.er_name = name; - er.er_data = value; - er.er_name_len = strlen(name); - er.er_data_len = len; - - err = gfs2_ea_set_i(ip, &er); - + err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0); kfree(value); kfree(name); @@ -991,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock; - error = pick_formal_ino(sdp, &inum.no_formal_ino); - if (error) - goto fail_gunlock; - error = alloc_dinode(dip, &inum.no_addr, &generation); if (error) goto fail_gunlock; + inum.no_formal_ino = generation; error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); @@ -1008,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock2; - inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), - inum.no_addr, - inum.no_formal_ino, 0); + inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, + inum.no_formal_ino, 0); if (IS_ERR(inode)) goto fail_gunlock2; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7bc3c45..52fb6c0 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -84,7 +84,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) gfs2_tune_init(&sdp->sd_tune); - mutex_init(&sdp->sd_inum_mutex); spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); @@ -833,21 +832,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) if (error) goto fail; - /* Read in the master inode number inode */ - sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum"); - if (IS_ERR(sdp->sd_inum_inode)) { - error = PTR_ERR(sdp->sd_inum_inode); - fs_err(sdp, "can't read in inum inode: %d\n", error); - goto fail_journal; - } - - /* Read in the master statfs inode */ sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); if (IS_ERR(sdp->sd_statfs_inode)) { error = PTR_ERR(sdp->sd_statfs_inode); fs_err(sdp, "can't read in statfs inode: %d\n", error); - goto fail_inum; + goto fail_journal; } /* Read in the resource index inode */ @@ -876,8 +866,6 @@ fail_rindex: iput(sdp->sd_rindex); fail_statfs: iput(sdp->sd_statfs_inode); -fail_inum: - iput(sdp->sd_inum_inode); fail_journal: init_journal(sdp, UNDO); fail: @@ -905,20 +893,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) return error; } - sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid); - sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf); - if (IS_ERR(sdp->sd_ir_inode)) { - error = PTR_ERR(sdp->sd_ir_inode); - fs_err(sdp, "can't find local \"ir\" file: %d\n", error); - goto fail; - } - sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); if (IS_ERR(sdp->sd_sc_inode)) { error = PTR_ERR(sdp->sd_sc_inode); fs_err(sdp, "can't find local \"sc\" file: %d\n", error); - goto fail_ir_i; + goto fail; } sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); @@ -932,27 +912,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) iput(pn); pn = NULL; - ip = GFS2_I(sdp->sd_ir_inode); - error = gfs2_glock_nq_init(ip->i_gl, - LM_ST_EXCLUSIVE, 0, - &sdp->sd_ir_gh); - if (error) { - fs_err(sdp, "can't lock local \"ir\" file: %d\n", error); - goto fail_qc_i; - } - ip = GFS2_I(sdp->sd_sc_inode); - error = gfs2_glock_nq_init(ip->i_gl, - LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_sc_gh); if (error) { fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); - goto fail_ir_gh; + goto fail_qc_i; } ip = GFS2_I(sdp->sd_qc_inode); - error = gfs2_glock_nq_init(ip->i_gl, - LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_qc_gh); if (error) { fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); @@ -965,14 +934,10 @@ fail_qc_gh: gfs2_glock_dq_uninit(&sdp->sd_qc_gh); fail_ut_gh: gfs2_glock_dq_uninit(&sdp->sd_sc_gh); -fail_ir_gh: - gfs2_glock_dq_uninit(&sdp->sd_ir_gh); fail_qc_i: iput(sdp->sd_qc_inode); fail_ut_i: iput(sdp->sd_sc_inode); -fail_ir_i: - iput(sdp->sd_ir_inode); fail: if (pn) iput(pn); @@ -1063,7 +1028,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) ls->ls_ops = lm; ls->ls_first = 1; - ls->ls_id = 0; for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { substring_t tmp[MAX_OPT_ARGS]; @@ -1081,10 +1045,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) ls->ls_jid = option; break; case Opt_id: - ret = match_int(&tmp[0], &option); - if (ret) - goto hostdata_error; - ls->ls_id = option; + /* Obsolete, but left for backward compat purposes */ break; case Opt_first: ret = match_int(&tmp[0], &option); @@ -1133,6 +1094,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp) lm->lm_unmount(sdp); } +void gfs2_online_uevent(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp); +} + /** * fill_super - Read in superblock * @sb: The VFS superblock @@ -1157,6 +1129,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; sdp->sd_args.ar_commit = 60; + sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT; error = gfs2_mount_args(sdp, &sdp->sd_args, data); if (error) { @@ -1174,6 +1147,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; sb->s_export_op = &gfs2_export_ops; + sb->s_xattr = gfs2_xattr_handlers; sb->s_time_gran = 1; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -1236,7 +1210,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) } gfs2_glock_dq_uninit(&mount_gh); - + gfs2_online_uevent(sdp); return 0; fail_threads: diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index f8bd20b..247436c 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -12,7 +12,6 @@ #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/namei.h> -#include <linux/utsname.h> #include <linux/mm.h> #include <linux/xattr.h> #include <linux/posix_acl.h> @@ -26,8 +25,7 @@ #include "acl.h" #include "bmap.h" #include "dir.h" -#include "eaops.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "inode.h" #include "meta_io.h" @@ -349,7 +347,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); if (error) - goto out_rgrp; + goto out_gunlock; error = gfs2_dir_del(dip, &dentry->d_name); if (error) @@ -1302,60 +1300,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name, const void *data, size_t size, int flags) { struct inode *inode = dentry->d_inode; - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = gfs2_ea_name2type(name, &er.er_name); - if (er.er_type == GFS2_EATYPE_UNUSED) - return -EOPNOTSUPP; - er.er_data = (char *)data; - er.er_name_len = strlen(er.er_name); - er.er_data_len = size; - er.er_flags = flags; - - gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE)); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; - return gfs2_ea_set(GFS2_I(inode), &er); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_setxattr(dentry, name, data, size, flags); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; } static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, void *data, size_t size) { - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = gfs2_ea_name2type(name, &er.er_name); - if (er.er_type == GFS2_EATYPE_UNUSED) - return -EOPNOTSUPP; - er.er_data = data; - er.er_name_len = strlen(er.er_name); - er.er_data_len = size; - - return gfs2_ea_get(GFS2_I(dentry->d_inode), &er); -} - -static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_data = (size) ? buffer : NULL; - er.er_data_len = size; + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; - return gfs2_ea_list(GFS2_I(dentry->d_inode), &er); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_getxattr(dentry, name, data, size); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; } static int gfs2_removexattr(struct dentry *dentry, const char *name) { - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = gfs2_ea_name2type(name, &er.er_name); - if (er.er_type == GFS2_EATYPE_UNUSED) - return -EOPNOTSUPP; - er.er_name_len = strlen(er.er_name); + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; - return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_removexattr(dentry, name); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; } static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index fba7957..8f1cfb0 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -179,7 +179,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) * always aligned to a 64 bit boundary. * * The size of the buffer is in bytes, but is it assumed that it is - * always ok to to read a complete multiple of 64 bits at the end + * always ok to read a complete multiple of 64 bits at the end * of the block in case the end is no aligned to a natural boundary. * * Return: the block number (bitmap buffer scope) that was found @@ -857,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, goto start_new_extent; if ((start + nr_sects) != blk) { rv = blkdev_issue_discard(bdev, start, - nr_sects, GFP_NOFS); + nr_sects, GFP_NOFS, + DISCARD_FL_BARRIER); if (rv) goto fail; nr_sects = 0; @@ -871,7 +872,8 @@ start_new_extent: } } if (nr_sects) { - rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS); + rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, + DISCARD_FL_BARRIER); if (rv) goto fail; } @@ -1256,7 +1258,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip) * Returns: The block type (GFS2_BLKST_*) */ -unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) +static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) { struct gfs2_bitmap *bi = NULL; u32 length, rgrp_block, buf_block; @@ -1459,6 +1461,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) return 0; } +static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", + (unsigned long long)rgd->rd_addr); + fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); + gfs2_rgrp_dump(NULL, rgd->rd_gl); + rgd->rd_flags |= GFS2_RDF_ERROR; +} + /** * gfs2_alloc_block - Allocate one or more blocks * @ip: the inode to allocate the block for @@ -1520,22 +1532,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) return 0; rgrp_error: - fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", - (unsigned long long)rgd->rd_addr); - fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); - gfs2_rgrp_dump(NULL, rgd->rd_gl); - rgd->rd_flags |= GFS2_RDF_ERROR; + gfs2_rgrp_error(rgd); return -EIO; } /** * gfs2_alloc_di - Allocate a dinode * @dip: the directory that the inode is going in + * @bn: the block number which is allocated + * @generation: the generation number of the inode * - * Returns: the block allocated + * Returns: 0 on success or error */ -u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) +int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_alloc *al = dip->i_alloc; @@ -1546,16 +1556,21 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) blk = rgblk_search(rgd, rgd->rd_last_alloc, GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); - BUG_ON(blk == BFITNOENT); - rgd->rd_last_alloc = blk; + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (blk == BFITNOENT) + goto rgrp_error; + rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; + if (rgd->rd_free == 0) + goto rgrp_error; - gfs2_assert_withdraw(sdp, rgd->rd_free); rgd->rd_free--; rgd->rd_dinodes++; *generation = rgd->rd_igeneration++; + if (*generation == 0) + *generation = rgd->rd_igeneration++; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1568,7 +1583,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) rgd->rd_free_clone--; spin_unlock(&sdp->sd_rindex_spin); trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); - return block; + *bn = block; + return 0; + +rgrp_error: + gfs2_rgrp_error(rgd); + return -EIO; } /** @@ -1676,6 +1696,46 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) } /** + * gfs2_check_blk_type - Check the type of a block + * @sdp: The superblock + * @no_addr: The block number to check + * @type: The block type we are looking for + * + * Returns: 0 if the block type matches the expected type + * -ESTALE if it doesn't match + * or -ve errno if something went wrong while checking + */ + +int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh, rgd_gh; + int error; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto fail; + + error = -EINVAL; + rgd = gfs2_blk2rgrpd(sdp, no_addr); + if (!rgd) + goto fail_rindex; + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + if (error) + goto fail_rindex; + + if (gfs2_get_block_type(rgd, no_addr) != type) + error = -ESTALE; + + gfs2_glock_dq_uninit(&rgd_gh); +fail_rindex: + gfs2_glock_dq_uninit(&ri_gh); +fail: + return error; +} + +/** * gfs2_rlist_add - add a RG to a list of RGs * @sdp: the filesystem * @rlist: the list of resource groups diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 1e76ff0..b4106dd 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -44,15 +44,15 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) extern void gfs2_inplace_release(struct gfs2_inode *ip); -extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); - extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); -extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); +extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); extern void gfs2_unlink_di(struct inode *inode); +extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, + unsigned int type); struct gfs2_rgrp_list { unsigned int rl_rgrps; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index f522bb0..0ec3ec6 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -38,7 +38,7 @@ #include "trans.h" #include "util.h" #include "sys.h" -#include "eattr.h" +#include "xattr.h" #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) @@ -68,6 +68,8 @@ enum { Opt_discard, Opt_nodiscard, Opt_commit, + Opt_err_withdraw, + Opt_err_panic, Opt_error, }; @@ -97,6 +99,8 @@ static const match_table_t tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_commit, "commit=%d"}, + {Opt_err_withdraw, "errors=withdraw"}, + {Opt_err_panic, "errors=panic"}, {Opt_error, NULL} }; @@ -152,6 +156,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) args->ar_localcaching = 1; break; case Opt_debug: + if (args->ar_errors == GFS2_ERRORS_PANIC) { + fs_info(sdp, "-o debug and -o errors=panic " + "are mutually exclusive.\n"); + return -EINVAL; + } args->ar_debug = 1; break; case Opt_nodebug: @@ -205,6 +214,17 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) return rv ? rv : -EINVAL; } break; + case Opt_err_withdraw: + args->ar_errors = GFS2_ERRORS_WITHDRAW; + break; + case Opt_err_panic: + if (args->ar_debug) { + fs_info(sdp, "-o debug and -o errors=panic " + "are mutually exclusive.\n"); + return -EINVAL; + } + args->ar_errors = GFS2_ERRORS_PANIC; + break; case Opt_error: default: fs_info(sdp, "invalid mount option: %s\n", o); @@ -768,7 +788,6 @@ restart: /* Release stuff */ iput(sdp->sd_jindex); - iput(sdp->sd_inum_inode); iput(sdp->sd_statfs_inode); iput(sdp->sd_rindex); iput(sdp->sd_quota_inode); @@ -779,10 +798,8 @@ restart: if (!sdp->sd_args.ar_spectator) { gfs2_glock_dq_uninit(&sdp->sd_journal_gh); gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); - gfs2_glock_dq_uninit(&sdp->sd_ir_gh); gfs2_glock_dq_uninit(&sdp->sd_sc_gh); gfs2_glock_dq_uninit(&sdp->sd_qc_gh); - iput(sdp->sd_ir_inode); iput(sdp->sd_sc_inode); iput(sdp->sd_qc_inode); } @@ -1084,6 +1101,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) gt->gt_log_flush_secs = args.ar_commit; spin_unlock(>->gt_spin); + gfs2_online_uevent(sdp); return 0; } @@ -1225,6 +1243,22 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) lfsecs = sdp->sd_tune.gt_log_flush_secs; if (lfsecs != 60) seq_printf(s, ",commit=%d", lfsecs); + if (args->ar_errors != GFS2_ERRORS_DEFAULT) { + const char *state; + + switch (args->ar_errors) { + case GFS2_ERRORS_WITHDRAW: + state = "withdraw"; + break; + case GFS2_ERRORS_PANIC: + state = "panic"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",errors=%s", state); + } return 0; } @@ -1252,6 +1286,10 @@ static void gfs2_delete_inode(struct inode *inode) goto out; } + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; + gfs2_glock_dq_wait(&ip->i_iopen_gh); gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); error = gfs2_glock_nq(&ip->i_iopen_gh); diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 22e0417..235db36 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -25,7 +25,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) return x; } -void gfs2_jindex_free(struct gfs2_sbd *sdp); +extern void gfs2_jindex_free(struct gfs2_sbd *sdp); extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data); @@ -36,7 +36,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, struct gfs2_inode **ipp); extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); - +extern void gfs2_online_uevent(struct gfs2_sbd *sdp); extern int gfs2_statfs_init(struct gfs2_sbd *sdp); extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); @@ -54,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type; extern const struct export_operations gfs2_export_ops; extern const struct super_operations gfs2_super_ops; extern const struct dentry_operations gfs2_dops; +extern struct xattr_handler *gfs2_xattr_handlers[]; #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index a7cbfbd..4463297 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -16,6 +16,7 @@ #include <linux/kobject.h> #include <asm/uaccess.h> #include <linux/gfs2_ondisk.h> +#include <linux/genhd.h> #include "gfs2.h" #include "incore.h" @@ -319,12 +320,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) return ret; } -static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf) -{ - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - return sprintf(buf, "%u\n", ls->ls_id); -} - static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -389,7 +384,6 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); GDLM_ATTR(block, 0644, block_show, block_store); GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(id, 0444, lkid_show, NULL); GDLM_ATTR(jid, 0444, jid_show, NULL); GDLM_ATTR(first, 0444, lkfirst_show, NULL); GDLM_ATTR(first_done, 0444, first_done_show, NULL); @@ -401,7 +395,6 @@ static struct attribute *lock_module_attrs[] = { &gdlm_attr_proto_name.attr, &gdlm_attr_block.attr, &gdlm_attr_withdraw.attr, - &gdlm_attr_id.attr, &gdlm_attr_jid.attr, &gdlm_attr_first.attr, &gdlm_attr_first_done.attr, @@ -519,7 +512,14 @@ static struct attribute_group lock_module_group = { int gfs2_sys_fs_add(struct gfs2_sbd *sdp) { + struct super_block *sb = sdp->sd_vfs; int error; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); sdp->sd_kobj.kset = gfs2_kset; error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, @@ -535,9 +535,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) if (error) goto fail_tune; - kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); + error = sysfs_create_link(&sdp->sd_kobj, + &disk_to_dev(sb->s_bdev->bd_disk)->kobj, + "device"); + if (error) + goto fail_lock_module; + + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp); return 0; +fail_lock_module: + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); fail_tune: sysfs_remove_group(&sdp->sd_kobj, &tune_group); fail_reg: @@ -549,12 +557,12 @@ fail: void gfs2_sys_fs_del(struct gfs2_sbd *sdp) { + sysfs_remove_link(&sdp->sd_kobj, "device"); sysfs_remove_group(&sdp->sd_kobj, &tune_group); sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); } - static int gfs2_uevent(struct kset *kset, struct kobject *kobj, struct kobj_uevent_env *env) { @@ -563,6 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); + if (!sdp->sd_args.ar_spectator) + add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); if (gfs2_uuid_valid(uuid)) { add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" "%02X%02X-%02X%02X%02X%02X%02X%02X", @@ -578,7 +588,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = { .uevent = gfs2_uevent, }; - int gfs2_sys_init(void) { gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 9d12b11..f6a7efa 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) const struct lm_lockops *lm = ls->ls_ops; va_list args; - if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && + test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) return 0; va_start(args, fmt); vprintk(fmt, args); va_end(args); - fs_err(sdp, "about to withdraw this file system\n"); - BUG_ON(sdp->sd_args.ar_debug); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { + fs_err(sdp, "about to withdraw this file system\n"); + BUG_ON(sdp->sd_args.ar_debug); - kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); + kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); - if (lm->lm_unmount) { - fs_err(sdp, "telling LM to unmount\n"); - lm->lm_unmount(sdp); + if (lm->lm_unmount) { + fs_err(sdp, "telling LM to unmount\n"); + lm->lm_unmount(sdp); + } + fs_err(sdp, "withdrawn\n"); + dump_stack(); } - fs_err(sdp, "withdrawn\n"); - dump_stack(); + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname); return -1; } @@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, gfs2_tune_get(sdp, gt_complain_secs) * HZ)) return -2; - printk(KERN_WARNING - "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, assertion, - sdp->sd_fsname, function, file, line); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) + printk(KERN_WARNING + "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); if (sdp->sd_args.ar_debug) BUG(); else dump_stack(); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + sdp->sd_last_warning = jiffies; return -1; diff --git a/fs/gfs2/eattr.c b/fs/gfs2/xattr.c index 07ea952..8a0f8ef 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/xattr.c @@ -18,8 +18,7 @@ #include "gfs2.h" #include "incore.h" #include "acl.h" -#include "eaops.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "inode.h" #include "meta_io.h" @@ -38,26 +37,32 @@ * Returns: 1 if the EA should be stuffed */ -static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er, +static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize, unsigned int *size) { - *size = GFS2_EAREQ_SIZE_STUFFED(er); - if (*size <= sdp->sd_jbsize) + unsigned int jbsize = sdp->sd_jbsize; + + /* Stuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8); + + if (*size <= jbsize) return 1; - *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er); + /* Unstuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + + (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8); return 0; } -static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er) +static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize) { unsigned int size; - if (er->er_data_len > GFS2_EA_MAX_DATA_LEN) + if (dsize > GFS2_EA_MAX_DATA_LEN) return -ERANGE; - ea_calc_size(sdp, er, &size); + ea_calc_size(sdp, nsize, dsize, &size); /* This can only happen with 512 byte blocks */ if (size > sdp->sd_jbsize) @@ -151,7 +156,9 @@ out: } struct ea_find { - struct gfs2_ea_request *ef_er; + int type; + const char *name; + size_t namel; struct gfs2_ea_location *ef_el; }; @@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, void *private) { struct ea_find *ef = private; - struct gfs2_ea_request *er = ef->ef_er; if (ea->ea_type == GFS2_EATYPE_UNUSED) return 0; - if (ea->ea_type == er->er_type) { - if (ea->ea_name_len == er->er_name_len && - !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) { + if (ea->ea_type == ef->type) { + if (ea->ea_name_len == ef->namel && + !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) { struct gfs2_ea_location *el = ef->ef_el; get_bh(bh); el->el_bh = bh; @@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, return 0; } -int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er, +int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, struct gfs2_ea_location *el) { struct ea_find ef; int error; - ef.ef_er = er; + ef.type = type; + ef.name = name; + ef.namel = strlen(name); ef.ef_el = el; memset(el, 0, sizeof(struct gfs2_ea_location)); @@ -344,6 +352,20 @@ struct ea_list { unsigned int ei_size; }; +static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) +{ + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + return 5 + ea->ea_name_len + 1; + case GFS2_EATYPE_SYS: + return 7 + ea->ea_name_len + 1; + case GFS2_EATYPE_SECURITY: + return 9 + ea->ea_name_len + 1; + default: + return 0; + } +} + static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private) @@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, } /** - * gfs2_ea_list - - * @ip: - * @er: + * gfs2_listxattr - List gfs2 extended attributes + * @dentry: The dentry whose inode we are interested in + * @buffer: The buffer to write the results + * @size: The size of the buffer * * Returns: actual size of data on success, -errno on error */ -int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) +ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) { + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_ea_request er; struct gfs2_holder i_gh; int error; - if (!er->er_data || !er->er_data_len) { - er->er_data = NULL; - er->er_data_len = 0; + memset(&er, 0, sizeof(struct gfs2_ea_request)); + if (size) { + er.er_data = buffer; + er.er_data_len = size; } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); @@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) return error; if (ip->i_eattr) { - struct ea_list ei = { .ei_er = er, .ei_size = 0 }; + struct ea_list ei = { .ei_er = &er, .ei_size = 0 }; error = ea_foreach(ip, ea_list_i, &ei); if (!error) @@ -491,84 +517,61 @@ out: } int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, - char *data) + char *data, size_t size) { + int ret; + size_t len = GFS2_EA_DATA_LEN(el->el_ea); + if (len > size) + return -ERANGE; + if (GFS2_EA_IS_STUFFED(el->el_ea)) { - memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea)); - return 0; - } else - return ea_get_unstuffed(ip, el->el_ea, data); + memcpy(data, GFS2_EA2DATA(el->el_ea), len); + return len; + } + ret = ea_get_unstuffed(ip, el->el_ea, data); + if (ret < 0) + return ret; + return len; } /** - * gfs2_ea_get_i - - * @ip: The GFS2 inode - * @er: The request structure + * gfs2_xattr_get - Get a GFS2 extended attribute + * @inode: The inode + * @type: The type of extended attribute + * @name: The name of the extended attribute + * @buffer: The buffer to write the result into + * @size: The size of the buffer * * Returns: actual size of data on success, -errno on error */ -int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +int gfs2_xattr_get(struct inode *inode, int type, const char *name, + void *buffer, size_t size) { + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_ea_location el; int error; if (!ip->i_eattr) return -ENODATA; + if (strlen(name) > GFS2_EA_MAX_NAME_LEN) + return -EINVAL; - error = gfs2_ea_find(ip, er, &el); + error = gfs2_ea_find(ip, type, name, &el); if (error) return error; if (!el.el_ea) return -ENODATA; - - if (er->er_data_len) { - if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len) - error = -ERANGE; - else - error = gfs2_ea_get_copy(ip, &el, er->er_data); - } - if (!error) + if (size) + error = gfs2_ea_get_copy(ip, &el, buffer, size); + else error = GFS2_EA_DATA_LEN(el.el_ea); - brelse(el.el_bh); return error; } /** - * gfs2_ea_get - - * @ip: The GFS2 inode - * @er: The request structure - * - * Returns: actual size of data on success, -errno on error - */ - -int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct gfs2_holder i_gh; - int error; - - if (!er->er_name_len || - er->er_name_len > GFS2_EA_MAX_NAME_LEN) - return -EINVAL; - if (!er->er_data || !er->er_data_len) { - er->er_data = NULL; - er->er_data_len = 0; - } - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return error; - - error = gfs2_ea_ops[er->er_type]->eo_get(ip, er); - - gfs2_glock_dq_uninit(&i_gh); - - return error; -} - -/** * ea_alloc_blk - allocates a new block for extended attributes. * @ip: A pointer to the inode that's getting extended attributes * @bhp: Pointer to pointer to a struct buffer_head @@ -713,12 +716,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - if (er->er_flags & GFS2_ERF_MODE) { - gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), - (ip->i_inode.i_mode & S_IFMT) == - (er->er_mode & S_IFMT)); - ip->i_inode.i_mode = er->er_mode; - } ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -762,15 +759,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, * Returns: errno */ -static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er) +static int ea_init(struct gfs2_inode *ip, int type, const char *name, + const void *data, size_t size) { + struct gfs2_ea_request er; unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; unsigned int blks = 1; - if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize) - blks += DIV_ROUND_UP(er->er_data_len, jbsize); + er.er_type = type; + er.er_name = name; + er.er_name_len = strlen(name); + er.er_data = (void *)data; + er.er_data_len = size; + + if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize) + blks += DIV_ROUND_UP(er.er_data_len, jbsize); - return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL); + return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL); } static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) @@ -848,12 +853,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out; - - if (er->er_flags & GFS2_ERF_MODE) { - gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), - (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT)); - ip->i_inode.i_mode = er->er_mode; - } ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -894,7 +893,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh, int stuffed; int error; - stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size); + stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len, + es->es_er->er_data_len, &size); if (ea->ea_type == GFS2_EATYPE_UNUSED) { if (GFS2_EA_REC_LEN(ea) < size) @@ -1005,15 +1005,22 @@ out: return error; } -static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, - struct gfs2_ea_location *el) +static int ea_set_i(struct gfs2_inode *ip, int type, const char *name, + const void *value, size_t size, struct gfs2_ea_location *el) { + struct gfs2_ea_request er; struct ea_set es; unsigned int blks = 2; int error; + er.er_type = type; + er.er_name = name; + er.er_data = (void *)value; + er.er_name_len = strlen(name); + er.er_data_len = size; + memset(&es, 0, sizeof(struct ea_set)); - es.es_er = er; + es.es_er = &er; es.es_el = el; error = ea_foreach(ip, ea_set_simple, &es); @@ -1024,10 +1031,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) blks++; - if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) - blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); + if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize) + blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); - return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); + return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el); } static int ea_set_remove_unstuffed(struct gfs2_inode *ip, @@ -1039,75 +1046,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip, GFS2_EA2NEXT(el->el_prev) == el->el_ea); } - return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0); -} - -int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct gfs2_ea_location el; - int error; - - if (!ip->i_eattr) { - if (er->er_flags & XATTR_REPLACE) - return -ENODATA; - return ea_init(ip, er); - } - - error = gfs2_ea_find(ip, er, &el); - if (error) - return error; - - if (el.el_ea) { - if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { - brelse(el.el_bh); - return -EPERM; - } - - error = -EEXIST; - if (!(er->er_flags & XATTR_CREATE)) { - int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); - error = ea_set_i(ip, er, &el); - if (!error && unstuffed) - ea_set_remove_unstuffed(ip, &el); - } - - brelse(el.el_bh); - } else { - error = -ENODATA; - if (!(er->er_flags & XATTR_REPLACE)) - error = ea_set_i(ip, er, NULL); - } - - return error; -} - -int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct gfs2_holder i_gh; - int error; - - if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) - return -EINVAL; - if (!er->er_data || !er->er_data_len) { - er->er_data = NULL; - er->er_data_len = 0; - } - error = ea_check_size(GFS2_SB(&ip->i_inode), er); - if (error) - return error; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); - if (error) - return error; - - if (IS_IMMUTABLE(&ip->i_inode)) - error = -EPERM; - else - error = gfs2_ea_ops[er->er_type]->eo_set(ip, er); - - gfs2_glock_dq_uninit(&i_gh); - - return error; + return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0); } static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) @@ -1131,8 +1070,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) if (GFS2_EA_IS_LAST(ea)) prev->ea_flags |= GFS2_EAFLAG_LAST; - } else + } else { ea->ea_type = GFS2_EATYPE_UNUSED; + } error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { @@ -1147,15 +1087,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) return error; } -int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +/** + * gfs2_xattr_remove - Remove a GFS2 extended attribute + * @inode: The inode + * @type: The type of the extended attribute + * @name: The name of the extended attribute + * + * This is not called directly by the VFS since we use the (common) + * scheme of making a "set with NULL data" mean a remove request. Note + * that this is different from a set with zero length data. + * + * Returns: 0, or errno on failure + */ + +static int gfs2_xattr_remove(struct inode *inode, int type, const char *name) { + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_ea_location el; int error; if (!ip->i_eattr) return -ENODATA; - error = gfs2_ea_find(ip, er, &el); + error = gfs2_ea_find(ip, type, name, &el); if (error) return error; if (!el.el_ea) @@ -1164,8 +1118,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) if (GFS2_EA_IS_STUFFED(el.el_ea)) error = ea_remove_stuffed(ip, &el); else - error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, - 0); + error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0); brelse(el.el_bh); @@ -1173,31 +1126,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) } /** - * gfs2_ea_remove - sets (or creates or replaces) an extended attribute - * @ip: pointer to the inode of the target file - * @er: request information + * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute + * @inode: The inode + * @type: The type of the extended attribute + * @name: The name of the extended attribute + * @value: The value of the extended attribute (NULL for remove) + * @size: The size of the @value argument + * @flags: Create or Replace * - * Returns: errno + * See gfs2_xattr_remove() for details of the removal of xattrs. + * + * Returns: 0 or errno on failure */ -int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) +int gfs2_xattr_set(struct inode *inode, int type, const char *name, + const void *value, size_t size, int flags) { - struct gfs2_holder i_gh; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_ea_location el; + unsigned int namel = strlen(name); int error; - if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) - return -EINVAL; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + if (namel > GFS2_EA_MAX_NAME_LEN) + return -ERANGE; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (value == NULL) + return gfs2_xattr_remove(inode, type, name); + + if (ea_check_size(sdp, namel, size)) + return -ERANGE; + + if (!ip->i_eattr) { + if (flags & XATTR_REPLACE) + return -ENODATA; + return ea_init(ip, type, name, value, size); + } + + error = gfs2_ea_find(ip, type, name, &el); if (error) return error; - if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) - error = -EPERM; - else - error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er); + if (el.el_ea) { + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { + brelse(el.el_bh); + return -EPERM; + } - gfs2_glock_dq_uninit(&i_gh); + error = -EEXIST; + if (!(flags & XATTR_CREATE)) { + int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); + error = ea_set_i(ip, type, name, value, size, &el); + if (!error && unstuffed) + ea_set_remove_unstuffed(ip, &el); + } + + brelse(el.el_bh); + return error; + } + + error = -ENODATA; + if (!(flags & XATTR_REPLACE)) + error = ea_set_i(ip, type, name, value, size, NULL); return error; } @@ -1503,3 +1495,64 @@ out_alloc: return error; } +static int gfs2_xattr_user_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size); +} + +static int gfs2_xattr_user_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags); +} + +static int gfs2_xattr_system_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size); +} + +static int gfs2_xattr_system_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags); +} + +static int gfs2_xattr_security_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size); +} + +static int gfs2_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags); +} + +static struct xattr_handler gfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = gfs2_xattr_user_get, + .set = gfs2_xattr_user_set, +}; + +static struct xattr_handler gfs2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = gfs2_xattr_security_get, + .set = gfs2_xattr_security_set, +}; + +static struct xattr_handler gfs2_xattr_system_handler = { + .prefix = XATTR_SYSTEM_PREFIX, + .get = gfs2_xattr_system_get, + .set = gfs2_xattr_system_set, +}; + +struct xattr_handler *gfs2_xattr_handlers[] = { + &gfs2_xattr_user_handler, + &gfs2_xattr_security_handler, + &gfs2_xattr_system_handler, + NULL, +}; + diff --git a/fs/gfs2/eattr.h b/fs/gfs2/xattr.h index c82dbe0..cbdfd77 100644 --- a/fs/gfs2/eattr.h +++ b/fs/gfs2/xattr.h @@ -19,7 +19,7 @@ struct iattr; #define GFS2_EA_SIZE(ea) \ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ - (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) + (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) #define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) #define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) @@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ #define GFS2_EAREQ_SIZE_STUFFED(er) \ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) -#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \ -ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \ - sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8) - #define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) #define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) @@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \ #define GFS2_EA_BH2FIRST(bh) \ ((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) -#define GFS2_ERF_MODE 0x80000000 - struct gfs2_ea_request { const char *er_name; char *er_data; unsigned int er_name_len; unsigned int er_data_len; unsigned int er_type; /* GFS2_EATYPE_... */ - int er_flags; - mode_t er_mode; }; struct gfs2_ea_location { @@ -61,40 +53,20 @@ struct gfs2_ea_location { struct gfs2_ea_header *el_prev; }; -int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); - -int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er); - -int gfs2_ea_dealloc(struct gfs2_inode *ip); +extern int gfs2_xattr_get(struct inode *inode, int type, const char *name, + void *buffer, size_t size); +extern int gfs2_xattr_set(struct inode *inode, int type, const char *name, + const void *value, size_t size, int flags); +extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); +extern int gfs2_ea_dealloc(struct gfs2_inode *ip); /* Exported to acl.c */ -int gfs2_ea_find(struct gfs2_inode *ip, - struct gfs2_ea_request *er, - struct gfs2_ea_location *el); -int gfs2_ea_get_copy(struct gfs2_inode *ip, - struct gfs2_ea_location *el, - char *data); -int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, - struct iattr *attr, char *data); - -static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) -{ - switch (ea->ea_type) { - case GFS2_EATYPE_USR: - return 5 + ea->ea_name_len + 1; - case GFS2_EATYPE_SYS: - return 7 + ea->ea_name_len + 1; - case GFS2_EATYPE_SECURITY: - return 9 + ea->ea_name_len + 1; - default: - return 0; - } -} +extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, + struct gfs2_ea_location *el); +extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, + char *data, size_t size); +extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, + struct iattr *attr, char *data); #endif /* __EATTR_DOT_H__ */ diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 9b9d639..052f214 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -58,6 +58,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke } unlock_new_inode(tree->inode); + if (!HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n"); + goto free_inode; + } + mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 7b6165f..8bbe03c 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -344,10 +344,8 @@ void hfs_mdb_put(struct super_block *sb) brelse(HFS_SB(sb)->mdb_bh); brelse(HFS_SB(sb)->alt_mdb_bh); - if (HFS_SB(sb)->nls_io) - unload_nls(HFS_SB(sb)->nls_io); - if (HFS_SB(sb)->nls_disk) - unload_nls(HFS_SB(sb)->nls_disk); + unload_nls(HFS_SB(sb)->nls_io); + unload_nls(HFS_SB(sb)->nls_disk); free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0); kfree(HFS_SB(sb)); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index c0759fe..43022f3 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -229,8 +229,7 @@ static void hfsplus_put_super(struct super_block *sb) iput(HFSPLUS_SB(sb).alloc_file); iput(HFSPLUS_SB(sb).hidden_dir); brelse(HFSPLUS_SB(sb).s_vhbh); - if (HFSPLUS_SB(sb).nls) - unload_nls(HFSPLUS_SB(sb).nls); + unload_nls(HFSPLUS_SB(sb).nls); kfree(sb->s_fs_info); sb->s_fs_info = NULL; @@ -464,8 +463,7 @@ out: cleanup: hfsplus_put_super(sb); - if (nls) - unload_nls(nls); + unload_nls(nls); return err; } diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 175d08e..bed78ac8 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -99,6 +99,10 @@ int hfsplus_read_wrapper(struct super_block *sb) if (hfsplus_get_last_session(sb, &part_start, &part_size)) return -EINVAL; + if ((u64)part_start + part_size > 0x100000000ULL) { + pr_err("hfs: volumes larger than 2TB are not supported yet\n"); + return -EINVAL; + } while (1) { bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); if (!bh) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index cb88dac..87a1258 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -31,12 +31,10 @@ #include <linux/statfs.h> #include <linux/security.h> #include <linux/ima.h> +#include <linux/magic.h> #include <asm/uaccess.h> -/* some random number */ -#define HUGETLBFS_MAGIC 0x958458f6 - static const struct super_operations hugetlbfs_ops; static const struct address_space_operations hugetlbfs_aops; const struct file_operations hugetlbfs_file_operations; @@ -44,6 +42,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; static struct backing_dev_info hugetlbfs_backing_dev_info = { + .name = "hugetlbfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; @@ -381,36 +380,11 @@ static void hugetlbfs_delete_inode(struct inode *inode) static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock) { - struct super_block *sb = inode->i_sb; - - if (!hlist_unhashed(&inode->i_hash)) { - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - list_move(&inode->i_list, &inode_unused); - inodes_stat.nr_unused++; - if (!sb || (sb->s_flags & MS_ACTIVE)) { - spin_unlock(&inode_lock); - return; - } - inode->i_state |= I_WILL_FREE; - spin_unlock(&inode_lock); - /* - * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK - * in our backing_dev_info. - */ - write_inode_now(inode, 1); - spin_lock(&inode_lock); - inode->i_state &= ~I_WILL_FREE; - inodes_stat.nr_unused--; - hlist_del_init(&inode->i_hash); + if (generic_detach_inode(inode)) { + truncate_hugepages(inode, 0); + clear_inode(inode); + destroy_inode(inode); } - list_del_init(&inode->i_list); - list_del_init(&inode->i_sb_list); - inode->i_state |= I_FREEING; - inodes_stat.nr_inodes--; - spin_unlock(&inode_lock); - truncate_hugepages(inode, 0); - clear_inode(inode); - destroy_inode(inode); } static void hugetlbfs_drop_inode(struct inode *inode) @@ -506,6 +480,13 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; INIT_LIST_HEAD(&inode->i_mapping->private_list); info = HUGETLBFS_I(inode); + /* + * The policy is initialized here even if we are creating a + * private inode because initialization simply creates an + * an empty rb tree and calls spin_lock_init(), later when we + * call mpol_free_shared_policy() it will just return because + * the rb tree will still be empty. + */ mpol_shared_policy_init(&info->policy, NULL); switch (mode & S_IFMT) { default: @@ -936,7 +917,7 @@ static int can_do_hugetlb_shm(void) } struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, - struct user_struct **user) + struct user_struct **user, int creat_flags) { int error = -ENOMEM; struct file *file; @@ -948,7 +929,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); - if (!can_do_hugetlb_shm()) { + if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { *user = current_user(); if (user_shm_lock(size, *user)) { WARN_ONCE(1, @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/wait.h> +#include <linux/rwsem.h> #include <linux/hash.h> #include <linux/swap.h> #include <linux/security.h> @@ -87,14 +88,18 @@ static struct hlist_head *inode_hashtable __read_mostly; DEFINE_SPINLOCK(inode_lock); /* - * iprune_mutex provides exclusion between the kswapd or try_to_free_pages + * iprune_sem provides exclusion between the kswapd or try_to_free_pages * icache shrinking path, and the umount path. Without this exclusion, * by the time prune_icache calls iput for the inode whose pages it has * been invalidating, or by the time it calls clear_inode & destroy_inode * from its final dispose_list, the struct super_block they refer to * (for inode->i_sb->s_op) may already have been freed and reused. + * + * We make this an rwsem because the fastpath is icache shrinking. In + * some cases a filesystem may be doing a significant amount of work in + * its inode reclaim code, so this should improve parallelism. */ -static DEFINE_MUTEX(iprune_mutex); +static DECLARE_RWSEM(iprune_sem); /* * Statistics gathering.. @@ -123,7 +128,7 @@ static void wake_up_inode(struct inode *inode) int inode_init_always(struct super_block *sb, struct inode *inode) { static const struct address_space_operations empty_aops; - static struct inode_operations empty_iops; + static const struct inode_operations empty_iops; static const struct file_operations empty_fops; struct address_space *const mapping = &inode->i_data; @@ -182,9 +187,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) if (sb->s_bdev) { struct backing_dev_info *bdi; - bdi = sb->s_bdev->bd_inode_backing_dev_info; - if (!bdi) - bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; mapping->backing_dev_info = bdi; } inode->i_private = NULL; @@ -383,7 +386,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) /* * We can reschedule here without worrying about the list's * consistency because the per-sb list of inodes must not - * change during umount anymore, and because iprune_mutex keeps + * change during umount anymore, and because iprune_sem keeps * shrink_icache_memory() away. */ cond_resched_lock(&inode_lock); @@ -422,7 +425,7 @@ int invalidate_inodes(struct super_block *sb) int busy; LIST_HEAD(throw_away); - mutex_lock(&iprune_mutex); + down_write(&iprune_sem); spin_lock(&inode_lock); inotify_unmount_inodes(&sb->s_inodes); fsnotify_unmount_inodes(&sb->s_inodes); @@ -430,7 +433,7 @@ int invalidate_inodes(struct super_block *sb) spin_unlock(&inode_lock); dispose_list(&throw_away); - mutex_unlock(&iprune_mutex); + up_write(&iprune_sem); return busy; } @@ -469,7 +472,7 @@ static void prune_icache(int nr_to_scan) int nr_scanned; unsigned long reap = 0; - mutex_lock(&iprune_mutex); + down_read(&iprune_sem); spin_lock(&inode_lock); for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { struct inode *inode; @@ -511,7 +514,7 @@ static void prune_icache(int nr_to_scan) spin_unlock(&inode_lock); dispose_list(&freeable); - mutex_unlock(&iprune_mutex); + up_read(&iprune_sem); } /* @@ -697,13 +700,15 @@ void unlock_new_inode(struct inode *inode) } #endif /* - * This is special! We do not need the spinlock - * when clearing I_LOCK, because we're guaranteed - * that nobody else tries to do anything about the - * state of the inode when it is locked, as we - * just created it (so there can be no old holders - * that haven't tested I_LOCK). + * This is special! We do not need the spinlock when clearing I_LOCK, + * because we're guaranteed that nobody else tries to do anything about + * the state of the inode when it is locked, as we just created it (so + * there can be no old holders that haven't tested I_LOCK). + * However we must emit the memory barrier so that other CPUs reliably + * see the clearing of I_LOCK after the other inode initialisation has + * completed. */ + smp_mb(); WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW)); inode->i_state &= ~(I_LOCK|I_NEW); wake_up_inode(inode); @@ -1236,7 +1241,16 @@ void generic_delete_inode(struct inode *inode) } EXPORT_SYMBOL(generic_delete_inode); -static void generic_forget_inode(struct inode *inode) +/** + * generic_detach_inode - remove inode from inode lists + * @inode: inode to remove + * + * Remove inode from inode lists, write it if it's dirty. This is just an + * internal VFS helper exported for hugetlbfs. Do not use! + * + * Returns 1 if inode should be completely destroyed. + */ +int generic_detach_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -1246,7 +1260,7 @@ static void generic_forget_inode(struct inode *inode) inodes_stat.nr_unused++; if (sb->s_flags & MS_ACTIVE) { spin_unlock(&inode_lock); - return; + return 0; } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_WILL_FREE; @@ -1264,6 +1278,14 @@ static void generic_forget_inode(struct inode *inode) inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); + return 1; +} +EXPORT_SYMBOL_GPL(generic_detach_inode); + +static void generic_forget_inode(struct inode *inode) +{ + if (!generic_detach_inode(inode)) + return; if (inode->i_data.nrpages) truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); @@ -1394,31 +1416,31 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct timespec now; - if (mnt_want_write(mnt)) - return; if (inode->i_flags & S_NOATIME) - goto out; + return; if (IS_NOATIME(inode)) - goto out; + return; if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) - goto out; + return; if (mnt->mnt_flags & MNT_NOATIME) - goto out; + return; if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) - goto out; + return; now = current_fs_time(inode->i_sb); if (!relatime_need_update(mnt, inode, now)) - goto out; + return; if (timespec_equal(&inode->i_atime, &now)) - goto out; + return; + + if (mnt_want_write(mnt)) + return; inode->i_atime = now; mark_inode_dirty_sync(inode); -out: mnt_drop_write(mnt); } EXPORT_SYMBOL(touch_atime); @@ -1439,34 +1461,37 @@ void file_update_time(struct file *file) { struct inode *inode = file->f_path.dentry->d_inode; struct timespec now; - int sync_it = 0; - int err; + enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; + /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return; - err = mnt_want_write_file(file); - if (err) - return; - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_mtime, &now)) { - inode->i_mtime = now; - sync_it = 1; - } + if (!timespec_equal(&inode->i_mtime, &now)) + sync_it = S_MTIME; - if (!timespec_equal(&inode->i_ctime, &now)) { - inode->i_ctime = now; - sync_it = 1; - } + if (!timespec_equal(&inode->i_ctime, &now)) + sync_it |= S_CTIME; - if (IS_I_VERSION(inode)) { - inode_inc_iversion(inode); - sync_it = 1; - } + if (IS_I_VERSION(inode)) + sync_it |= S_VERSION; + + if (!sync_it) + return; - if (sync_it) - mark_inode_dirty_sync(inode); + /* Finally allowed to write? Takes lock. */ + if (mnt_want_write_file(file)) + return; + + /* Only change inode inside the lock region */ + if (sync_it & S_VERSION) + inode_inc_iversion(inode); + if (sync_it & S_CTIME) + inode->i_ctime = now; + if (sync_it & S_MTIME) + inode->i_mtime = now; + mark_inode_dirty_sync(inode); mnt_drop_write(file->f_path.mnt); } EXPORT_SYMBOL(file_update_time); @@ -1594,7 +1619,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) else if (S_ISSOCK(mode)) inode->i_fop = &bad_sock_fops; else - printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", - mode); + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" + " inode %s:%lu\n", mode, inode->i_sb->s_id, + inode->i_ino); } EXPORT_SYMBOL(init_special_inode); diff --git a/fs/internal.h b/fs/internal.h index d55ef56..515175b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -57,6 +57,7 @@ extern int check_unsafe_exec(struct linux_binprm *); * namespace.c */ extern int copy_mount_options(const void __user *, unsigned long *); +extern int copy_mount_string(const void __user *, char **); extern void free_vfsmnt(struct vfsmount *); extern struct vfsmount *alloc_vfsmnt(const char *); @@ -162,20 +162,21 @@ EXPORT_SYMBOL(fiemap_check_flags); static int fiemap_check_ranges(struct super_block *sb, u64 start, u64 len, u64 *new_len) { + u64 maxbytes = (u64) sb->s_maxbytes; + *new_len = len; if (len == 0) return -EINVAL; - if (start > sb->s_maxbytes) + if (start > maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ - if ((len > sb->s_maxbytes) || - (sb->s_maxbytes - len) < start) - *new_len = sb->s_maxbytes - start; + if (len > maxbytes || (maxbytes - len) < start) + *new_len = maxbytes - start; return 0; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 85f96bc..6b4dcd4 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -46,10 +46,7 @@ static void isofs_put_super(struct super_block *sb) #ifdef CONFIG_JOLIET lock_kernel(); - if (sbi->s_nls_iocharset) { - unload_nls(sbi->s_nls_iocharset); - sbi->s_nls_iocharset = NULL; - } + unload_nls(sbi->s_nls_iocharset); unlock_kernel(); #endif @@ -912,8 +909,7 @@ out_no_root: printk(KERN_WARNING "%s: get root inode failed\n", __func__); out_no_inode: #ifdef CONFIG_JOLIET - if (sbi->s_nls_iocharset) - unload_nls(sbi->s_nls_iocharset); + unload_nls(sbi->s_nls_iocharset); #endif goto out_freesbi; out_no_read: diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 61f32f3..b0435dd 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal) { transaction_t * transaction; tid_t first_tid; - unsigned long blocknr, freed; + unsigned int blocknr, freed; if (is_journal_aborted(journal)) return 1; @@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal) freed = freed + journal->j_last - journal->j_first; jbd_debug(1, - "Cleaning journal tail from %d to %d (offset %lu), " - "freeing %lu\n", + "Cleaning journal tail from %d to %d (offset %u), " + "freeing %u\n", journal->j_tail_sequence, first_tid, blocknr, freed); journal->j_free += freed; diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 618e21c..4bd8825 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal) int bufs; int flags; int err; - unsigned long blocknr; + unsigned int blocknr; ktime_t start_time; u64 commit_time; char *tagp = NULL; diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index f96f850..bd3c073 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal) int journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct journal_head **jh_out, - unsigned long blocknr) + unsigned int blocknr) { int need_copy_out = 0; int done_copy_out = 0; @@ -567,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid) * Log buffer allocation routines: */ -int journal_next_log_block(journal_t *journal, unsigned long *retp) +int journal_next_log_block(journal_t *journal, unsigned int *retp) { - unsigned long blocknr; + unsigned int blocknr; spin_lock(&journal->j_state_lock); J_ASSERT(journal->j_free > 1); @@ -590,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp) * this is a no-op. If needed, we can use j_blk_offset - everything is * ready. */ -int journal_bmap(journal_t *journal, unsigned long blocknr, - unsigned long *retp) +int journal_bmap(journal_t *journal, unsigned int blocknr, + unsigned int *retp) { int err = 0; - unsigned long ret; + unsigned int ret; if (journal->j_inode) { ret = bmap(journal->j_inode, blocknr); @@ -604,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr, char b[BDEVNAME_SIZE]; printk(KERN_ALERT "%s: journal block not found " - "at offset %lu on %s\n", + "at offset %u on %s\n", __func__, blocknr, bdevname(journal->j_dev, b)); @@ -630,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr, struct journal_head *journal_get_descriptor_buffer(journal_t *journal) { struct buffer_head *bh; - unsigned long blocknr; + unsigned int blocknr; int err; err = journal_next_log_block(journal, &blocknr); @@ -774,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode) journal_t *journal = journal_init_common(); int err; int n; - unsigned long blocknr; + unsigned int blocknr; if (!journal) return NULL; @@ -846,12 +846,12 @@ static void journal_fail_superblock (journal_t *journal) static int journal_reset(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; - unsigned long first, last; + unsigned int first, last; first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { - printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n", + printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n", first, last); journal_fail_superblock(journal); return -EINVAL; @@ -885,7 +885,7 @@ static int journal_reset(journal_t *journal) **/ int journal_create(journal_t *journal) { - unsigned long blocknr; + unsigned int blocknr; struct buffer_head *bh; journal_superblock_t *sb; int i, err; @@ -969,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait) if (sb->s_start == 0 && journal->j_tail_sequence == journal->j_transaction_sequence) { jbd_debug(1,"JBD: Skipping superblock update on recovered sb " - "(start %ld, seq %d, errno %d)\n", + "(start %u, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); goto out; } spin_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", + jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); @@ -1371,7 +1371,7 @@ int journal_flush(journal_t *journal) { int err = 0; transaction_t *transaction = NULL; - unsigned long old_tail; + unsigned int old_tail; spin_lock(&journal->j_state_lock); diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index db5e982..cb1a49a 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start) { int err; unsigned int max, nbufs, next; - unsigned long blocknr; + unsigned int blocknr; struct buffer_head *bh; struct buffer_head * bufs[MAXBUF]; @@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, unsigned int offset) { int err; - unsigned long blocknr; + unsigned int blocknr; struct buffer_head *bh; *bhp = NULL; @@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { unsigned int first_commit_ID, next_commit_ID; - unsigned long next_log_block; + unsigned int next_log_block; int err, success = 0; journal_superblock_t * sb; journal_header_t * tmp; @@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal, if (tid_geq(next_commit_ID, info->end_transaction)) break; - jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", + jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n", next_commit_ID, next_log_block, journal->j_last); /* Skip over each chunk of the transaction looking * either the next descriptor block or the final commit * record. */ - jbd_debug(3, "JBD: checking block %ld\n", next_log_block); + jbd_debug(3, "JBD: checking block %u\n", next_log_block); err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal, tagp = &bh->b_data[sizeof(journal_header_t)]; while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) <= journal->j_blocksize) { - unsigned long io_block; + unsigned int io_block; tag = (journal_block_tag_t *) tagp; flags = be32_to_cpu(tag->t_flags); @@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal, success = err; printk (KERN_ERR "JBD: IO error %d recovering " - "block %ld in log\n", + "block %u in log\n", err, io_block); } else { - unsigned long blocknr; + unsigned int blocknr; J_ASSERT(obh != NULL); blocknr = be32_to_cpu(tag->t_blocknr); @@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, max = be32_to_cpu(header->r_count); while (offset < max) { - unsigned long blocknr; + unsigned int blocknr; int err; blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index da6cd9b..ad71732 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -101,7 +101,7 @@ struct jbd_revoke_record_s { struct list_head hash; tid_t sequence; /* Used for recovery only */ - unsigned long blocknr; + unsigned int blocknr; }; @@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int); /* Utility functions to maintain the revoke table */ /* Borrowed from buffer.c: this is a tried and tested block hash function */ -static inline int hash(journal_t *journal, unsigned long block) +static inline int hash(journal_t *journal, unsigned int block) { struct jbd_revoke_table_s *table = journal->j_revoke; int hash_shift = table->hash_shift; @@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block) (block << (hash_shift - 12))) & (table->hash_size - 1); } -static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, +static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, tid_t seq) { struct list_head *hash_list; @@ -166,7 +166,7 @@ oom: /* Find a revoke record in the journal's hash table. */ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, - unsigned long blocknr) + unsigned int blocknr) { struct list_head *hash_list; struct jbd_revoke_record_s *record; @@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal) * by one. */ -int journal_revoke(handle_t *handle, unsigned long blocknr, +int journal_revoke(handle_t *handle, unsigned int blocknr, struct buffer_head *bh_in) { struct buffer_head *bh = NULL; @@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr, } } - jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); + jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); @@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal, */ int journal_set_revoke(journal_t *journal, - unsigned long blocknr, + unsigned int blocknr, tid_t sequence) { struct jbd_revoke_record_s *record; @@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal, */ int journal_test_revoke(journal_t *journal, - unsigned long blocknr, + unsigned int blocknr, tid_t sequence) { struct jbd_revoke_record_s *record; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index c03ac11..006f9ad 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction) spin_lock_init(&transaction->t_handle_lock); /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); + journal->j_commit_timer.expires = + round_jiffies_up(transaction->t_expires); add_timer(&journal->j_commit_timer); J_ASSERT(journal->j_running_transaction == NULL); @@ -228,6 +229,8 @@ repeat_locked: __log_space_left(journal)); spin_unlock(&transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); out: if (unlikely(new_transaction)) /* It's usually NULL */ kfree(new_transaction); @@ -292,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks) handle = ERR_PTR(err); goto out; } - - lock_map_acquire(&handle->h_lockdep_map); - out: return handle; } @@ -416,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks) __log_start_commit(journal, transaction->t_tid); spin_unlock(&journal->j_state_lock); + lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; ret = start_this_handle(journal, handle); return ret; diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 5d70b3e..ca0f5eb 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -643,6 +643,7 @@ out: int __jbd2_journal_remove_checkpoint(struct journal_head *jh) { + struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; int ret = 0; @@ -679,6 +680,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ + stats = &transaction->t_chp_stats; + if (stats->cs_chp_time) + stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, + jiffies); + trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev, + transaction->t_tid, stats); __jbd2_journal_drop_transaction(journal, transaction); kfree(transaction); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7b4088b..d4cfd6d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -25,6 +25,7 @@ #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/bio.h> +#include <linux/blkdev.h> #include <trace/events/jbd2.h> /* @@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal, bh->b_end_io = journal_end_buffer_io_sync; if (journal->j_flags & JBD2_BARRIER && - !JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + !JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { set_buffer_ordered(bh); barrier_done = 1; } @@ -220,7 +221,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping) .nr_to_write = mapping->nrpages * 2, .range_start = 0, .range_end = i_size_read(mapping->host), - .for_writepages = 1, }; ret = generic_writepages(mapping, &wbc); @@ -410,10 +410,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (commit_transaction->t_synchronous_commit) write_op = WRITE_SYNC_PLUG; trace_jbd2_commit_locking(journal, commit_transaction); - stats.u.run.rs_wait = commit_transaction->t_max_wait; - stats.u.run.rs_locked = jiffies; - stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, - stats.u.run.rs_locked); + stats.run.rs_wait = commit_transaction->t_max_wait; + stats.run.rs_locked = jiffies; + stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, + stats.run.rs_locked); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { @@ -486,9 +486,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_switch_revoke_table(journal); trace_jbd2_commit_flushing(journal, commit_transaction); - stats.u.run.rs_flushing = jiffies; - stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, - stats.u.run.rs_flushing); + stats.run.rs_flushing = jiffies; + stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, + stats.run.rs_flushing); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; @@ -523,11 +523,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_unlock(&journal->j_state_lock); trace_jbd2_commit_logging(journal, commit_transaction); - stats.u.run.rs_logging = jiffies; - stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, - stats.u.run.rs_logging); - stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; - stats.u.run.rs_blocks_logged = 0; + stats.run.rs_logging = jiffies; + stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, + stats.run.rs_logging); + stats.run.rs_blocks = commit_transaction->t_outstanding_credits; + stats.run.rs_blocks_logged = 0; J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); @@ -695,7 +695,7 @@ start_journal_io: submit_bh(write_op, bh); } cond_resched(); - stats.u.run.rs_blocks_logged += bufs; + stats.run.rs_blocks_logged += bufs; /* Force a new descriptor to be generated next time round the loop. */ @@ -707,11 +707,13 @@ start_journal_io: /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); + if (journal->j_flags & JBD2_BARRIER) + blkdev_issue_flush(journal->j_dev, NULL); } /* @@ -834,7 +836,7 @@ wait_for_iobuf: jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) @@ -986,33 +988,30 @@ restart_loop: J_ASSERT(commit_transaction->t_state == T_COMMIT); commit_transaction->t_start = jiffies; - stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, - commit_transaction->t_start); + stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, + commit_transaction->t_start); /* - * File the transaction for history + * File the transaction statistics */ - stats.ts_type = JBD2_STATS_RUN; stats.ts_tid = commit_transaction->t_tid; - stats.u.run.rs_handle_count = commit_transaction->t_handle_count; - spin_lock(&journal->j_history_lock); - memcpy(journal->j_history + journal->j_history_cur, &stats, - sizeof(stats)); - if (++journal->j_history_cur == journal->j_history_max) - journal->j_history_cur = 0; + stats.run.rs_handle_count = commit_transaction->t_handle_count; + trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, + commit_transaction->t_tid, &stats.run); /* * Calculate overall stats */ + spin_lock(&journal->j_history_lock); journal->j_stats.ts_tid++; - journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; - journal->j_stats.u.run.rs_running += stats.u.run.rs_running; - journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; - journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; - journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; - journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; - journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; - journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; + journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_running += stats.run.rs_running; + journal->j_stats.run.rs_locked += stats.run.rs_locked; + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; + journal->j_stats.run.rs_logging += stats.run.rs_logging; + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; spin_unlock(&journal->j_history_lock); commit_transaction->t_state = T_FINISHED; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e378cb3..b0ab521 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -136,10 +136,6 @@ static int kjournald2(void *arg) journal->j_task = current; wake_up(&journal->j_wait_done_commit); - printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, " - "commit interval %ld seconds\n", current->pid, - journal->j_devname, journal->j_commit_interval / HZ); - /* * And now, wait forever for commit wakeup events. */ @@ -223,7 +219,8 @@ static int jbd2_journal_start_thread(journal_t *journal) { struct task_struct *t; - t = kthread_run(kjournald2, journal, "kjournald2"); + t = kthread_run(kjournald2, journal, "jbd2/%s", + journal->j_devname); if (IS_ERR(t)) return PTR_ERR(t); @@ -679,153 +676,6 @@ struct jbd2_stats_proc_session { int max; }; -static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s, - struct transaction_stats_s *ts, - int first) -{ - if (ts == s->stats + s->max) - ts = s->stats; - if (!first && ts == s->stats + s->start) - return NULL; - while (ts->ts_type == 0) { - ts++; - if (ts == s->stats + s->max) - ts = s->stats; - if (ts == s->stats + s->start) - return NULL; - } - return ts; - -} - -static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos) -{ - struct jbd2_stats_proc_session *s = seq->private; - struct transaction_stats_s *ts; - int l = *pos; - - if (l == 0) - return SEQ_START_TOKEN; - ts = jbd2_history_skip_empty(s, s->stats + s->start, 1); - if (!ts) - return NULL; - l--; - while (l) { - ts = jbd2_history_skip_empty(s, ++ts, 0); - if (!ts) - break; - l--; - } - return ts; -} - -static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct jbd2_stats_proc_session *s = seq->private; - struct transaction_stats_s *ts = v; - - ++*pos; - if (v == SEQ_START_TOKEN) - return jbd2_history_skip_empty(s, s->stats + s->start, 1); - else - return jbd2_history_skip_empty(s, ++ts, 0); -} - -static int jbd2_seq_history_show(struct seq_file *seq, void *v) -{ - struct transaction_stats_s *ts = v; - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s " - "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid", - "wait", "run", "lock", "flush", "log", "hndls", - "block", "inlog", "ctime", "write", "drop", - "close"); - return 0; - } - if (ts->ts_type == JBD2_STATS_RUN) - seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u " - "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid, - jiffies_to_msecs(ts->u.run.rs_wait), - jiffies_to_msecs(ts->u.run.rs_running), - jiffies_to_msecs(ts->u.run.rs_locked), - jiffies_to_msecs(ts->u.run.rs_flushing), - jiffies_to_msecs(ts->u.run.rs_logging), - ts->u.run.rs_handle_count, - ts->u.run.rs_blocks, - ts->u.run.rs_blocks_logged); - else if (ts->ts_type == JBD2_STATS_CHECKPOINT) - seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n", - "C", ts->ts_tid, " ", - jiffies_to_msecs(ts->u.chp.cs_chp_time), - ts->u.chp.cs_written, ts->u.chp.cs_dropped, - ts->u.chp.cs_forced_to_close); - else - J_ASSERT(0); - return 0; -} - -static void jbd2_seq_history_stop(struct seq_file *seq, void *v) -{ -} - -static struct seq_operations jbd2_seq_history_ops = { - .start = jbd2_seq_history_start, - .next = jbd2_seq_history_next, - .stop = jbd2_seq_history_stop, - .show = jbd2_seq_history_show, -}; - -static int jbd2_seq_history_open(struct inode *inode, struct file *file) -{ - journal_t *journal = PDE(inode)->data; - struct jbd2_stats_proc_session *s; - int rc, size; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s == NULL) - return -ENOMEM; - size = sizeof(struct transaction_stats_s) * journal->j_history_max; - s->stats = kmalloc(size, GFP_KERNEL); - if (s->stats == NULL) { - kfree(s); - return -ENOMEM; - } - spin_lock(&journal->j_history_lock); - memcpy(s->stats, journal->j_history, size); - s->max = journal->j_history_max; - s->start = journal->j_history_cur % s->max; - spin_unlock(&journal->j_history_lock); - - rc = seq_open(file, &jbd2_seq_history_ops); - if (rc == 0) { - struct seq_file *m = file->private_data; - m->private = s; - } else { - kfree(s->stats); - kfree(s); - } - return rc; - -} - -static int jbd2_seq_history_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct jbd2_stats_proc_session *s = seq->private; - - kfree(s->stats); - kfree(s); - return seq_release(inode, file); -} - -static struct file_operations jbd2_seq_history_fops = { - .owner = THIS_MODULE, - .open = jbd2_seq_history_open, - .read = seq_read, - .llseek = seq_lseek, - .release = jbd2_seq_history_release, -}; - static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) { return *pos ? NULL : SEQ_START_TOKEN; @@ -842,29 +692,29 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v) if (v != SEQ_START_TOKEN) return 0; - seq_printf(seq, "%lu transaction, each upto %u blocks\n", + seq_printf(seq, "%lu transaction, each up to %u blocks\n", s->stats->ts_tid, s->journal->j_max_transaction_buffers); if (s->stats->ts_tid == 0) return 0; seq_printf(seq, "average: \n %ums waiting for transaction\n", - jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); seq_printf(seq, " %ums running transaction\n", - jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid)); seq_printf(seq, " %ums transaction was being locked\n", - jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid)); seq_printf(seq, " %ums flushing data (in ordered mode)\n", - jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid)); seq_printf(seq, " %ums logging transaction\n", - jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid)); seq_printf(seq, " %lluus average transaction commit time\n", div_u64(s->journal->j_average_commit_time, 1000)); seq_printf(seq, " %lu handles per transaction\n", - s->stats->u.run.rs_handle_count / s->stats->ts_tid); + s->stats->run.rs_handle_count / s->stats->ts_tid); seq_printf(seq, " %lu blocks per transaction\n", - s->stats->u.run.rs_blocks / s->stats->ts_tid); + s->stats->run.rs_blocks / s->stats->ts_tid); seq_printf(seq, " %lu logged blocks per transaction\n", - s->stats->u.run.rs_blocks_logged / s->stats->ts_tid); + s->stats->run.rs_blocks_logged / s->stats->ts_tid); return 0; } @@ -872,7 +722,7 @@ static void jbd2_seq_info_stop(struct seq_file *seq, void *v) { } -static struct seq_operations jbd2_seq_info_ops = { +static const struct seq_operations jbd2_seq_info_ops = { .start = jbd2_seq_info_start, .next = jbd2_seq_info_next, .stop = jbd2_seq_info_stop, @@ -920,7 +770,7 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static struct file_operations jbd2_seq_info_fops = { +static const struct file_operations jbd2_seq_info_fops = { .owner = THIS_MODULE, .open = jbd2_seq_info_open, .read = seq_read, @@ -934,8 +784,6 @@ static void jbd2_stats_proc_init(journal_t *journal) { journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); if (journal->j_proc_entry) { - proc_create_data("history", S_IRUGO, journal->j_proc_entry, - &jbd2_seq_history_fops, journal); proc_create_data("info", S_IRUGO, journal->j_proc_entry, &jbd2_seq_info_fops, journal); } @@ -944,27 +792,9 @@ static void jbd2_stats_proc_init(journal_t *journal) static void jbd2_stats_proc_exit(journal_t *journal) { remove_proc_entry("info", journal->j_proc_entry); - remove_proc_entry("history", journal->j_proc_entry); remove_proc_entry(journal->j_devname, proc_jbd2_stats); } -static void journal_init_stats(journal_t *journal) -{ - int size; - - if (!proc_jbd2_stats) - return; - - journal->j_history_max = 100; - size = sizeof(struct transaction_stats_s) * journal->j_history_max; - journal->j_history = kzalloc(size, GFP_KERNEL); - if (!journal->j_history) { - journal->j_history_max = 0; - return; - } - spin_lock_init(&journal->j_history_lock); -} - /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -1009,7 +839,7 @@ static journal_t * journal_init_common (void) goto fail; } - journal_init_stats(journal); + spin_lock_init(&journal->j_history_lock); return journal; fail: @@ -1115,7 +945,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) while ((p = strchr(p, '/'))) *p = '!'; p = journal->j_devname + strlen(journal->j_devname); - sprintf(p, ":%lu", journal->j_inode->i_ino); + sprintf(p, "-%lu", journal->j_inode->i_ino); jbd_debug(1, "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", journal, inode->i_sb->s_id, inode->i_ino, @@ -1187,6 +1017,12 @@ static int journal_reset(journal_t *journal) first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); + if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } journal->j_first = first; journal->j_last = last; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6213ac7..a051270 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); + journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); add_timer(&journal->j_commit_timer); J_ASSERT(journal->j_running_transaction == NULL); @@ -238,6 +238,8 @@ repeat_locked: __jbd2_log_space_left(journal)); spin_unlock(&transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); out: if (unlikely(new_transaction)) /* It's usually NULL */ kfree(new_transaction); @@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) handle = ERR_PTR(err); goto out; } - - lock_map_acquire(&handle->h_lockdep_map); out: return handle; } @@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) __jbd2_log_start_commit(journal, transaction->t_tid); spin_unlock(&journal->j_state_lock); + lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; ret = start_this_handle(journal, handle); return ret; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 8fcb62392..7edb62e 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -static int jffs2_check_acl(struct inode *inode, int mask) +int jffs2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; int rc; @@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jffs2_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, jffs2_check_acl); -} - int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) { struct posix_acl *acl, *clone; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index fc929f2..f0ba63e 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -26,7 +26,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -extern int jffs2_permission(struct inode *, int); +extern int jffs2_check_acl(struct inode *, int); extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); @@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler; #else -#define jffs2_permission (NULL) +#define jffs2_check_acl (NULL) #define jffs2_acl_chmod(inode) (0) #define jffs2_init_acl_pre(dir_i,inode,mode) (0) #define jffs2_init_acl_post(inode) (0) diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index e958010..3ff50da 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -15,6 +15,7 @@ #include <linux/completion.h> #include <linux/sched.h> #include <linux/freezer.h> +#include <linux/kthread.h> #include "nodelist.h" @@ -31,7 +32,7 @@ void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) /* This must only ever be called when no GC thread is currently running */ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) { - pid_t pid; + struct task_struct *tsk; int ret = 0; BUG_ON(c->gc_task); @@ -39,15 +40,16 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) init_completion(&c->gc_thread_start); init_completion(&c->gc_thread_exit); - pid = kernel_thread(jffs2_garbage_collect_thread, c, CLONE_FS|CLONE_FILES); - if (pid < 0) { - printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %d\n", -pid); + tsk = kthread_run(jffs2_garbage_collect_thread, c, "jffs2_gcd_mtd%d", c->mtd->index); + if (IS_ERR(tsk)) { + printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %ld\n", -PTR_ERR(tsk)); complete(&c->gc_thread_exit); - ret = pid; + ret = PTR_ERR(tsk); } else { /* Wait for it... */ - D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", pid)); + D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", tsk->pid)); wait_for_completion(&c->gc_thread_start); + ret = tsk->pid; } return ret; @@ -71,7 +73,6 @@ static int jffs2_garbage_collect_thread(void *_c) { struct jffs2_sb_info *c = _c; - daemonize("jffs2_gcd_mtd%d", c->mtd->index); allow_signal(SIGKILL); allow_signal(SIGSTOP); allow_signal(SIGCONT); @@ -107,6 +108,11 @@ static int jffs2_garbage_collect_thread(void *_c) * the GC thread get there first. */ schedule_timeout_interruptible(msecs_to_jiffies(50)); + if (kthread_should_stop()) { + D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): kthread_stop() called.\n")); + goto die; + } + /* Put_super will send a SIGKILL and then wait on the sem. */ while (signal_pending(current) || freezing(current)) { diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 6f60cc9..7aa4417 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rmdir = jffs2_rmdir, .mknod = jffs2_mknod, .rename = jffs2_rename, - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 23c9475..b7b74e2 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index 9eff2bd..c082868 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -39,13 +39,13 @@ int __init jffs2_create_slab_caches(void) raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent", sizeof(struct jffs2_raw_dirent), - 0, 0, NULL); + 0, SLAB_HWCACHE_ALIGN, NULL); if (!raw_dirent_slab) goto err; raw_inode_slab = kmem_cache_create("jffs2_raw_inode", sizeof(struct jffs2_raw_inode), - 0, 0, NULL); + 0, SLAB_HWCACHE_ALIGN, NULL); if (!raw_inode_slab) goto err; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 0035c02..9a80e8e 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -123,7 +123,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child) return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino)); } -static struct export_operations jffs2_export_ops = { +static const struct export_operations jffs2_export_ops = { .get_parent = jffs2_get_parent, .fh_to_dentry = jffs2_fh_to_dentry, .fh_to_parent = jffs2_fh_to_parent, diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index b7339c3..4ec11e8 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index a29c7c3..d66477c 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -114,7 +114,7 @@ out: return rc; } -static int jfs_check_acl(struct inode *inode, int mask) +int jfs_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); @@ -129,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jfs_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, jfs_check_acl); -} - int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 7f6063a..2b70fa7 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = { .removexattr = jfs_removexattr, #ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, - .permission = jfs_permission, + .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 88475f1..b07bd41 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_permission(struct inode *, int); +int jfs_check_acl(struct inode *, int); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_setattr(struct dentry *, struct iattr *); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 514ee2e..c79a4270 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = { .removexattr = jfs_removexattr, #ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, - .permission = jfs_permission, + .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 37e6dcd..2234c73 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -178,13 +178,11 @@ static void jfs_put_super(struct super_block *sb) rc = jfs_umount(sb); if (rc) jfs_err("jfs_umount failed with return code %d", rc); - if (sbi->nls_tab) - unload_nls(sbi->nls_tab); - sbi->nls_tab = NULL; + + unload_nls(sbi->nls_tab); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); iput(sbi->direct_inode); - sbi->direct_inode = NULL; kfree(sbi); @@ -347,8 +345,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, if (nls_map != (void *) -1) { /* Discard old (if remount) */ - if (sbi->nls_tab) - unload_nls(sbi->nls_tab); + unload_nls(sbi->nls_tab); sbi->nls_tab = nls_map; } return 1; @@ -527,14 +527,18 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; + size_t ret; + if (pos < 0) return -EINVAL; - if (pos >= available) + if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; - if (copy_to_user(to, from + pos, count)) + ret = copy_to_user(to, from + pos, count); + if (ret == count) return -EFAULT; + count -= ret; *ppos = pos + count; return count; } @@ -735,10 +739,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, if (copy_from_user(attr->set_buf, buf, size)) goto out; - ret = len; /* claim we got the whole input */ attr->set_buf[size] = '\0'; val = simple_strtol(attr->set_buf, NULL, 0); - attr->set(attr->data, val); + ret = attr->set(attr->data, val); + if (ret == 0) + ret = len; /* on success, claim we got the whole input */ out: mutex_unlock(&attr->mutex); return ret; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 1f3b0fc..fc9032d 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -166,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) + if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 4336adb..c81249f 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -458,7 +458,7 @@ static void nlmclnt_locks_release_private(struct file_lock *fl) nlm_put_lockowner(fl->fl_u.nfs_fl.owner); } -static struct file_lock_operations nlmclnt_lock_ops = { +static const struct file_lock_operations nlmclnt_lock_ops = { .fl_copy_lock = nlmclnt_locks_copy_lock, .fl_release_private = nlmclnt_locks_release_private, }; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 99d737b..4600c20 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap) return hash & (NLM_HOST_NRHASH - 1); } -static void nlm_clear_port(struct sockaddr *sap) -{ - switch (sap->sa_family) { - case AF_INET: - ((struct sockaddr_in *)sap)->sin_port = 0; - break; - case AF_INET6: - ((struct sockaddr_in6 *)sap)->sin6_port = 0; - break; - } -} - /* * Common host lookup routine for server & client */ @@ -123,7 +111,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) */ chain = &nlm_hosts[nlm_hash_address(ni->sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { - if (!nlm_cmp_addr(nlm_addr(host), ni->sap)) + if (!rpc_cmp_addr(nlm_addr(host), ni->sap)) continue; /* See if we have an NSM handle for this client */ @@ -137,7 +125,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) if (host->h_server != ni->server) continue; if (ni->server && - !nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap)) + !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap)) continue; /* Move to head of hash chain. */ @@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) host->h_addrbuf = nsm->sm_addrbuf; memcpy(nlm_addr(host), ni->sap, ni->salen); host->h_addrlen = ni->salen; - nlm_clear_port(nlm_addr(host)); + rpc_set_port(nlm_addr(host), 0); memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); host->h_version = ni->version; host->h_proto = ni->protocol; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 7fce1b5..f956651 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) return (struct sockaddr *)&nsm->sm_addr; } -static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf, - const size_t len) -{ - const struct sockaddr_in *sin = (struct sockaddr_in *)sap; - snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); -} - -static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf, - const size_t len) -{ - const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - - if (ipv6_addr_v4mapped(&sin6->sin6_addr)) - snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]); - else if (sin6->sin6_scope_id != 0) - snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr, - sin6->sin6_scope_id); - else - snprintf(buf, len, "%pI6", &sin6->sin6_addr); -} - -static void nsm_display_address(const struct sockaddr *sap, - char *buf, const size_t len) -{ - switch (sap->sa_family) { - case AF_INET: - nsm_display_ipv4_address(sap, buf, len); - break; - case AF_INET6: - nsm_display_ipv6_address(sap, buf, len); - break; - default: - snprintf(buf, len, "unsupported address family"); - break; - } -} - static struct rpc_clnt *nsm_create(void) { struct sockaddr_in sin = { @@ -246,7 +209,7 @@ static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap) struct nsm_handle *nsm; list_for_each_entry(nsm, &nsm_handles, sm_link) - if (nlm_cmp_addr(nsm_addr(nsm), sap)) + if (rpc_cmp_addr(nsm_addr(nsm), sap)) return nsm; return NULL; } @@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, memcpy(nsm_addr(new), sap, salen); new->sm_addrlen = salen; nsm_init_private(new); - nsm_display_address((const struct sockaddr *)&new->sm_addr, - new->sm_addrbuf, sizeof(new->sm_addrbuf)); + + if (rpc_ntop(nsm_addr(new), new->sm_addrbuf, + sizeof(new->sm_addrbuf)) == 0) + (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf), + "unsupported address family"); memcpy(new->sm_name, hostname, hostname_len); new->sm_name[hostname_len] = '\0'; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e577a78..d100179 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -705,7 +705,7 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2) return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid; } -struct lock_manager_operations nlmsvc_lock_operations = { +const struct lock_manager_operations nlmsvc_lock_operations = { .fl_compare_owner = nlmsvc_same_owner, .fl_notify = nlmsvc_notify_blocked, .fl_grant = nlmsvc_grant_deferred, diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 9e4d6aab..ad478da 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); static int nlmsvc_match_ip(void *datap, struct nlm_host *host) { - return nlm_cmp_addr(nlm_srcaddr(host), datap); + return rpc_cmp_addr(nlm_srcaddr(host), datap); } /** diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 0336f2b..b583ab0 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -8,7 +8,6 @@ #include <linux/types.h> #include <linux/sched.h> -#include <linux/utsname.h> #include <linux/nfs.h> #include <linux/sunrpc/xdr.h> diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index e1d5286..ad9dbbc 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -9,7 +9,6 @@ #include <linux/types.h> #include <linux/sched.h> -#include <linux/utsname.h> #include <linux/nfs.h> #include <linux/sunrpc/xdr.h> @@ -434,7 +434,7 @@ static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try) return fl->fl_file == try->fl_file; } -static struct lock_manager_operations lease_manager_ops = { +static const struct lock_manager_operations lease_manager_ops = { .fl_break = lease_break_callback, .fl_release_private = lease_release_private_callback, .fl_mylease = lease_mylease_callback, @@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) * give it the opportunity to lock the file. */ if (found) - cond_resched_bkl(); + cond_resched(); find_conflict: for_each_lock(inode, before) { @@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (can_sleep) lock->fl_flags |= FL_SLEEP; - error = security_file_lock(filp, cmd); + error = security_file_lock(filp, lock->fl_type); if (error) goto out_free; diff --git a/fs/minix/dir.c b/fs/minix/dir.c index d407e7a..6198731 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -308,14 +308,18 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page) struct inode *inode = (struct inode*)mapping->host; char *kaddr = page_address(page); loff_t pos = page_offset(page) + (char*)de - kaddr; - unsigned len = minix_sb(inode->i_sb)->s_dirsize; + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + unsigned len = sbi->s_dirsize; int err; lock_page(page); err = __minix_write_begin(NULL, mapping, pos, len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); if (err == 0) { - de->inode = 0; + if (sbi->s_version == MINIX_V3) + ((minix3_dirent *) de)->inode = 0; + else + de->inode = 0; err = dir_commit_chunk(page, pos, len); } else { unlock_page(page); @@ -440,7 +444,10 @@ void minix_set_link(struct minix_dir_entry *de, struct page *page, err = __minix_write_begin(NULL, mapping, pos, sbi->s_dirsize, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); if (err == 0) { - de->inode = inode->i_ino; + if (sbi->s_version == MINIX_V3) + ((minix3_dirent *) de)->inode = inode->i_ino; + else + de->inode = inode->i_ino; err = dir_commit_chunk(page, pos, sbi->s_dirsize); } else { unlock_page(page); @@ -470,7 +477,14 @@ ino_t minix_inode_by_name(struct dentry *dentry) ino_t res = 0; if (de) { - res = de->inode; + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct minix_sb_info *sbi = minix_sb(inode->i_sb); + + if (sbi->s_version == MINIX_V3) + res = ((minix3_dirent *) de)->inode; + else + res = de->inode; dir_put_page(page); } return res; @@ -169,19 +169,10 @@ void putname(const char *name) EXPORT_SYMBOL(putname); #endif - -/** - * generic_permission - check for access rights on a Posix-like filesystem - * @inode: inode to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) - * @check_acl: optional callback to check for Posix ACLs - * - * Used to check for read/write/execute permissions on a file. - * We use "fsuid" for this, letting us set arbitrary permissions - * for filesystem access without changing the "normal" uids which - * are used for other things.. +/* + * This does basic POSIX ACL permission checking */ -int generic_permission(struct inode *inode, int mask, +static int acl_permission_check(struct inode *inode, int mask, int (*check_acl)(struct inode *inode, int mask)) { umode_t mode = inode->i_mode; @@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask, else { if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { int error = check_acl(inode, mask); - if (error == -EACCES) - goto check_capabilities; - else if (error != -EAGAIN) + if (error != -EAGAIN) return error; } @@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask, */ if ((mask & ~mode) == 0) return 0; + return -EACCES; +} + +/** + * generic_permission - check for access rights on a Posix-like filesystem + * @inode: inode to check access rights for + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @check_acl: optional callback to check for Posix ACLs + * + * Used to check for read/write/execute permissions on a file. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things.. + */ +int generic_permission(struct inode *inode, int mask, + int (*check_acl)(struct inode *inode, int mask)) +{ + int ret; + + /* + * Do the basic POSIX ACL permission checks. + */ + ret = acl_permission_check(inode, mask, check_acl); + if (ret != -EACCES) + return ret; - check_capabilities: /* * Read/write DACs are always overridable. * Executable DACs are overridable if at least one exec bit is set. @@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask) if (inode->i_op->permission) retval = inode->i_op->permission(inode, mask); else - retval = generic_permission(inode, mask, NULL); + retval = generic_permission(inode, mask, inode->i_op->check_acl); if (retval) return retval; @@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, */ static int exec_permission_lite(struct inode *inode) { - umode_t mode = inode->i_mode; - - if (inode->i_op->permission) - return -EAGAIN; + int ret; - if (current_fsuid() == inode->i_uid) - mode >>= 6; - else if (in_group_p(inode->i_gid)) - mode >>= 3; - - if (mode & MAY_EXEC) - goto ok; - - if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE)) - goto ok; - - if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE)) + if (inode->i_op->permission) { + ret = inode->i_op->permission(inode, MAY_EXEC); + if (!ret) + goto ok; + return ret; + } + ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); + if (!ret) goto ok; - if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH)) + if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) goto ok; - return -EACCES; + return ret; ok: return security_inode_permission(inode, MAY_EXEC); } @@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) nd->flags |= LOOKUP_CONTINUE; err = exec_permission_lite(inode); - if (err == -EAGAIN) - err = inode_permission(nd->path.dentry->d_inode, - MAY_EXEC); - if (!err) - err = ima_path_check(&nd->path, MAY_EXEC, - IMA_COUNT_UPDATE); if (err) break; @@ -1533,9 +1533,11 @@ int may_open(struct path *path, int acc_mode, int flag) if (error) return error; - error = ima_path_check(path, - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), + error = ima_path_check(path, acc_mode ? + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : + ACC_MODE(flag) & (MAY_READ | MAY_WRITE), IMA_COUNT_UPDATE); + if (error) return error; /* diff --git a/fs/namespace.c b/fs/namespace.c index 7230787..bdc3cb4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1640,7 +1640,7 @@ static int do_new_mount(struct path *path, char *type, int flags, { struct vfsmount *mnt; - if (!type || !memchr(type, 0, PAGE_SIZE)) + if (!type) return -EINVAL; /* we need capabilities... */ @@ -1871,6 +1871,23 @@ int copy_mount_options(const void __user * data, unsigned long *where) return 0; } +int copy_mount_string(const void __user *data, char **where) +{ + char *tmp; + + if (!data) { + *where = NULL; + return 0; + } + + tmp = strndup_user(data, PAGE_SIZE); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + *where = tmp; + return 0; +} + /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). @@ -1900,8 +1917,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) return -EINVAL; - if (dev_name && !memchr(dev_name, 0, PAGE_SIZE)) - return -EINVAL; if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; @@ -2070,40 +2085,42 @@ EXPORT_SYMBOL(create_mnt_ns); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { - int retval; + int ret; + char *kernel_type; + char *kernel_dir; + char *kernel_dev; unsigned long data_page; - unsigned long type_page; - unsigned long dev_page; - char *dir_page; - retval = copy_mount_options(type, &type_page); - if (retval < 0) - return retval; + ret = copy_mount_string(type, &kernel_type); + if (ret < 0) + goto out_type; - dir_page = getname(dir_name); - retval = PTR_ERR(dir_page); - if (IS_ERR(dir_page)) - goto out1; + kernel_dir = getname(dir_name); + if (IS_ERR(kernel_dir)) { + ret = PTR_ERR(kernel_dir); + goto out_dir; + } - retval = copy_mount_options(dev_name, &dev_page); - if (retval < 0) - goto out2; + ret = copy_mount_string(dev_name, &kernel_dev); + if (ret < 0) + goto out_dev; - retval = copy_mount_options(data, &data_page); - if (retval < 0) - goto out3; + ret = copy_mount_options(data, &data_page); + if (ret < 0) + goto out_data; - retval = do_mount((char *)dev_page, dir_page, (char *)type_page, - flags, (void *)data_page); - free_page(data_page); + ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, + (void *) data_page); -out3: - free_page(dev_page); -out2: - putname(dir_page); -out1: - free_page(type_page); - return retval; + free_page(data_page); +out_data: + kfree(kernel_dev); +out_dev: + putname(kernel_dir); +out_dir: + kfree(kernel_type); +out_type: + return ret; } /* diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 9c59072..b8b5b30 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -1241,7 +1241,7 @@ ncp_date_unix2dos(int unix_date, __le16 *time, __le16 *date) month = 2; } else { nl_day = (year & 3) || day <= 59 ? day : day - 1; - for (month = 0; month < 12; month++) + for (month = 1; month < 12; month++) if (day_n[month] > nl_day) break; } diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index b99ce20..cf98da1 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -746,16 +746,8 @@ static void ncp_put_super(struct super_block *sb) #ifdef CONFIG_NCPFS_NLS /* unload the NLS charsets */ - if (server->nls_vol) - { - unload_nls(server->nls_vol); - server->nls_vol = NULL; - } - if (server->nls_io) - { - unload_nls(server->nls_io); - server->nls_io = NULL; - } + unload_nls(server->nls_vol); + unload_nls(server->nls_io); #endif /* CONFIG_NCPFS_NLS */ if (server->info_filp) diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index fa038df..0d58caf 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -223,10 +223,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) oldset_io = server->nls_io; server->nls_io = iocharset; - if (oldset_cp) - unload_nls(oldset_cp); - if (oldset_io) - unload_nls(oldset_io); + unload_nls(oldset_cp); + unload_nls(oldset_io); return 0; } @@ -442,7 +440,7 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp, if (dentry) { struct inode* s_inode = dentry->d_inode; - if (inode) { + if (s_inode) { NCP_FINFO(s_inode)->volNumber = vnum; NCP_FINFO(s_inode)->dirEntNum = de; NCP_FINFO(s_inode)->DosDirNum = dosde; diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 5d8dcb9..15458de 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -95,7 +95,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area, return VM_FAULT_MAJOR; } -static struct vm_operations_struct ncp_file_mmap = +static const struct vm_operations_struct ncp_file_mmap = { .fault = ncp_file_mmap_fault, }; diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 8451598..da7fda6 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ direct.o pagelist.o proc.o read.o symlink.o unlink.o \ - write.o namespace.o mount_clnt.o + write.o namespace.o mount_clnt.o \ + dns_resolve.o cache_lib.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c new file mode 100644 index 0000000..b4ffd01 --- /dev/null +++ b/fs/nfs/cache_lib.c @@ -0,0 +1,140 @@ +/* + * linux/fs/nfs/cache_lib.c + * + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +#include "cache_lib.h" + +#define NFS_CACHE_UPCALL_PATHLEN 256 +#define NFS_CACHE_UPCALL_TIMEOUT 15 + +static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] = + "/sbin/nfs_cache_getent"; +static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT; + +module_param_string(cache_getent, nfs_cache_getent_prog, + sizeof(nfs_cache_getent_prog), 0600); +MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program"); +module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600); +MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which " + "the cache upcall is assumed to have failed"); + +int nfs_cache_upcall(struct cache_detail *cd, char *entry_name) +{ + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[] = { + nfs_cache_getent_prog, + cd->name, + entry_name, + NULL + }; + int ret = -EACCES; + + if (nfs_cache_getent_prog[0] == '\0') + goto out; + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or + * EACCES error. The admin can re-enable it on the fly by using + * sysfs to set the 'cache_getent' parameter once the problem + * has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) + nfs_cache_getent_prog[0] = '\0'; +out: + return ret > 0 ? 0 : ret; +} + +/* + * Deferred request handling + */ +void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) +{ + if (atomic_dec_and_test(&dreq->count)) + kfree(dreq); +} + +static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); + + complete_all(&dreq->completion); + nfs_cache_defer_req_put(dreq); +} + +static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(req, struct nfs_cache_defer_req, req); + dreq->deferred_req.revisit = nfs_dns_cache_revisit; + atomic_inc(&dreq->count); + + return &dreq->deferred_req; +} + +struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) +{ + struct nfs_cache_defer_req *dreq; + + dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); + if (dreq) { + init_completion(&dreq->completion); + atomic_set(&dreq->count, 1); + dreq->req.defer = nfs_dns_cache_defer; + } + return dreq; +} + +int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq) +{ + if (wait_for_completion_timeout(&dreq->completion, + nfs_cache_getent_timeout * HZ) == 0) + return -ETIMEDOUT; + return 0; +} + +int nfs_cache_register(struct cache_detail *cd) +{ + struct nameidata nd; + struct vfsmount *mnt; + int ret; + + mnt = rpc_get_mount(); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd); + if (ret) + goto err; + ret = sunrpc_cache_register_pipefs(nd.path.dentry, + cd->name, 0600, cd); + path_put(&nd.path); + if (!ret) + return ret; +err: + rpc_put_mount(); + return ret; +} + +void nfs_cache_unregister(struct cache_detail *cd) +{ + sunrpc_cache_unregister_pipefs(cd); + rpc_put_mount(); +} + diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h new file mode 100644 index 0000000..76f856e --- /dev/null +++ b/fs/nfs/cache_lib.h @@ -0,0 +1,27 @@ +/* + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + */ + +#include <linux/completion.h> +#include <linux/sunrpc/cache.h> +#include <asm/atomic.h> + +/* + * Deferred request handling + */ +struct nfs_cache_defer_req { + struct cache_req req; + struct cache_deferred_req deferred_req; + struct completion completion; + atomic_t count; +}; + +extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); +extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); +extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); +extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); + +extern int nfs_cache_register(struct cache_detail *cd); +extern void nfs_cache_unregister(struct cache_detail *cd); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 7f604c7..293fa05 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program; unsigned int nfs_callback_set_tcpport; unsigned short nfs_callback_tcpport; unsigned short nfs_callback_tcpport6; -static const int nfs_set_port_min = 0; -static const int nfs_set_port_max = 65535; +#define NFS_CALLBACK_MAXPORTNR (65535U) -static int param_set_port(const char *val, struct kernel_param *kp) +static int param_set_portnr(const char *val, struct kernel_param *kp) { - char *endp; - int num = simple_strtol(val, &endp, 0); - if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) + unsigned long num; + int ret; + + if (!val) + return -EINVAL; + ret = strict_strtoul(val, 0, &num); + if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) return -EINVAL; - *((int *)kp->arg) = num; + *((unsigned int *)kp->arg) = num; return 0; } -module_param_call(callback_tcpport, param_set_port, param_get_int, - &nfs_callback_set_tcpport, 0644); +static int param_get_portnr(char *buffer, struct kernel_param *kp) +{ + return param_get_uint(buffer, kp); +} +#define param_check_portnr(name, p) __param_check(name, p, unsigned int); + +module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); /* * This is the NFSv4 callback kernel thread. diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index e5a2dac..76b0aa0f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -222,7 +222,7 @@ static unsigned decode_sessionid(struct xdr_stream *xdr, p = read_buf(xdr, len); if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE);; + return htonl(NFS4ERR_RESOURCE); memcpy(sid->data, p, len); return 0; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 8d25ccb..99ea196 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -648,8 +648,6 @@ static int nfs_start_lockd(struct nfs_server *server) .hostname = clp->cl_hostname, .address = (struct sockaddr *)&clp->cl_addr, .addrlen = clp->cl_addrlen, - .protocol = server->flags & NFS_MOUNT_TCP ? - IPPROTO_TCP : IPPROTO_UDP, .nfs_version = clp->rpc_ops->version, .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 1 : 0, @@ -660,6 +658,14 @@ static int nfs_start_lockd(struct nfs_server *server) if (server->flags & NFS_MOUNT_NONLM) return 0; + switch (clp->cl_proto) { + default: + nlm_init.protocol = IPPROTO_TCP; + break; + case XPRT_TRANSPORT_UDP: + nlm_init.protocol = IPPROTO_UDP; + } + host = nlmclnt_init(&nlm_init); if (IS_ERR(host)) return PTR_ERR(host); @@ -787,7 +793,7 @@ static int nfs_init_server(struct nfs_server *server, dprintk("--> nfs_init_server()\n"); #ifdef CONFIG_NFS_V3 - if (data->flags & NFS_MOUNT_VER3) + if (data->version == 3) cl_init.rpc_ops = &nfs_v3_clientops; #endif @@ -809,6 +815,9 @@ static int nfs_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; server->options = data->options; + server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -879,6 +888,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + server->backing_dev_info.name = "nfs"; server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; if (server->wsize > max_rpc_payload) @@ -929,10 +939,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str goto out_error; nfs_server_set_fsinfo(server, &fsinfo); - error = bdi_init(&server->backing_dev_info); - if (error) - goto out_error; - /* Get some general file system info */ if (server->namelen == 0) { @@ -964,6 +970,7 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve target->acdirmin = source->acdirmin; target->acdirmax = source->acdirmax; target->caps = source->caps; + target->options = source->options; } /* @@ -991,6 +998,12 @@ static struct nfs_server *nfs_alloc_server(void) return NULL; } + if (bdi_init(&server->backing_dev_info)) { + nfs_free_iostats(server->io_stats); + kfree(server); + return NULL; + } + return server; } @@ -1074,10 +1087,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - spin_lock(&nfs_client_lock); list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); list_add_tail(&server->master_link, &nfs_volume_list); @@ -1171,7 +1180,7 @@ static int nfs4_init_client(struct nfs_client *clp, 1, flags & NFS_MOUNT_NORESVPORT); if (error < 0) goto error; - memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); error = nfs_idmap_new(clp); if (error < 0) { @@ -1274,7 +1283,7 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; server->options = data->options; /* Get a client record */ @@ -1359,10 +1368,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - spin_lock(&nfs_client_lock); list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); list_add_tail(&server->master_link, &nfs_volume_list); @@ -1400,7 +1405,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); - server->caps |= NFS_CAP_ATOMIC_OPEN; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; /* Get a client representation. * Note: NFSv4 always uses TCP, */ @@ -1533,7 +1538,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_server_list_stop(struct seq_file *p, void *v); static int nfs_server_list_show(struct seq_file *m, void *v); -static struct seq_operations nfs_server_list_ops = { +static const struct seq_operations nfs_server_list_ops = { .start = nfs_server_list_start, .next = nfs_server_list_next, .stop = nfs_server_list_stop, @@ -1554,7 +1559,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_volume_list_stop(struct seq_file *p, void *v); static int nfs_volume_list_show(struct seq_file *m, void *v); -static struct seq_operations nfs_volume_list_ops = { +static const struct seq_operations nfs_volume_list_ops = { .start = nfs_volume_list_start, .next = nfs_volume_list_next, .stop = nfs_volume_list_stop, diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e4e089a..6c32100 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -934,9 +934,6 @@ out: * back into its cache. We let the server do generic write * parameter checking and report problems. * - * We also avoid an unnecessary invocation of generic_osync_inode(), - * as it is fairly meaningless to sync the metadata of an NFS file. - * * We eliminate local atime updates, see direct read above. * * We avoid unnecessary page cache invalidations for normal cached diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c new file mode 100644 index 0000000..f4d54ba --- /dev/null +++ b/fs/nfs/dns_resolve.c @@ -0,0 +1,335 @@ +/* + * linux/fs/nfs/dns_resolve.c + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + * + * Resolves DNS hostnames into valid ip addresses + */ + +#include <linux/hash.h> +#include <linux/string.h> +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/seq_file.h> +#include <linux/inet.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/svcauth.h> + +#include "dns_resolve.h" +#include "cache_lib.h" + +#define NFS_DNS_HASHBITS 4 +#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) + +static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE]; + +struct nfs_dns_ent { + struct cache_head h; + + char *hostname; + size_t namelen; + + struct sockaddr_storage addr; + size_t addrlen; +}; + + +static void nfs_dns_ent_init(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + kfree(new->hostname); + new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); + if (new->hostname) { + new->namelen = key->namelen; + memcpy(&new->addr, &key->addr, key->addrlen); + new->addrlen = key->addrlen; + } else { + new->namelen = 0; + new->addrlen = 0; + } +} + +static void nfs_dns_ent_put(struct kref *ref) +{ + struct nfs_dns_ent *item; + + item = container_of(ref, struct nfs_dns_ent, h.ref); + kfree(item->hostname); + kfree(item); +} + +static struct cache_head *nfs_dns_ent_alloc(void) +{ + struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL); + + if (item != NULL) { + item->hostname = NULL; + item->namelen = 0; + item->addrlen = 0; + return &item->h; + } + return NULL; +}; + +static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key) +{ + return hash_str(key->hostname, NFS_DNS_HASHBITS); +} + +static void nfs_dns_request(struct cache_detail *cd, + struct cache_head *ch, + char **bpp, int *blen) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + + qword_add(bpp, blen, key->hostname); + (*bpp)[-1] = '\n'; +} + +static int nfs_dns_upcall(struct cache_detail *cd, + struct cache_head *ch) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + int ret; + + ret = nfs_cache_upcall(cd, key->hostname); + if (ret) + ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request); + return ret; +} + +static int nfs_dns_match(struct cache_head *ca, + struct cache_head *cb) +{ + struct nfs_dns_ent *a; + struct nfs_dns_ent *b; + + a = container_of(ca, struct nfs_dns_ent, h); + b = container_of(cb, struct nfs_dns_ent, h); + + if (a->namelen == 0 || a->namelen != b->namelen) + return 0; + return memcmp(a->hostname, b->hostname, a->namelen) == 0; +} + +static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, + struct cache_head *h) +{ + struct nfs_dns_ent *item; + long ttl; + + if (h == NULL) { + seq_puts(m, "# ip address hostname ttl\n"); + return 0; + } + item = container_of(h, struct nfs_dns_ent, h); + ttl = (long)item->h.expiry_time - (long)get_seconds(); + if (ttl < 0) + ttl = 0; + + if (!test_bit(CACHE_NEGATIVE, &h->flags)) { + char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1]; + + rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf)); + seq_printf(m, "%15s ", buf); + } else + seq_puts(m, "<none> "); + seq_printf(m, "%15s %ld\n", item->hostname, ttl); + return 0; +} + +struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_lookup(cd, + &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, + struct nfs_dns_ent *new, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_update(cd, + &new->h, &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) +{ + char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; + struct nfs_dns_ent key, *item; + unsigned long ttl; + ssize_t len; + int ret = -EINVAL; + + if (buf[buflen-1] != '\n') + goto out; + buf[buflen-1] = '\0'; + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + key.addrlen = rpc_pton(buf1, len, + (struct sockaddr *)&key.addr, + sizeof(key.addr)); + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + + key.hostname = buf1; + key.namelen = len; + memset(&key.h, 0, sizeof(key.h)); + + ttl = get_expiry(&buf); + if (ttl == 0) + goto out; + key.h.expiry_time = ttl + get_seconds(); + + ret = -ENOMEM; + item = nfs_dns_lookup(cd, &key); + if (item == NULL) + goto out; + + if (key.addrlen == 0) + set_bit(CACHE_NEGATIVE, &key.h.flags); + + item = nfs_dns_update(cd, &key, item); + if (item == NULL) + goto out; + + ret = 0; + cache_put(&item->h, cd); +out: + return ret; +} + +static struct cache_detail nfs_dns_resolve = { + .owner = THIS_MODULE, + .hash_size = NFS_DNS_HASHTBL_SIZE, + .hash_table = nfs_dns_table, + .name = "dns_resolve", + .cache_put = nfs_dns_ent_put, + .cache_upcall = nfs_dns_upcall, + .cache_parse = nfs_dns_parse, + .cache_show = nfs_dns_show, + .match = nfs_dns_match, + .init = nfs_dns_ent_init, + .update = nfs_dns_ent_init, + .alloc = nfs_dns_ent_alloc, +}; + +static int do_cache_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item, + struct nfs_cache_defer_req *dreq) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (*item) { + ret = cache_check(cd, &(*item)->h, &dreq->req); + if (ret) + *item = NULL; + } + return ret; +} + +static int do_cache_lookup_nowait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (!*item) + goto out_err; + ret = -ETIMEDOUT; + if (!test_bit(CACHE_VALID, &(*item)->h.flags) + || (*item)->h.expiry_time < get_seconds() + || cd->flush_time > (*item)->h.last_refresh) + goto out_put; + ret = -ENOENT; + if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags)) + goto out_put; + return 0; +out_put: + cache_put(&(*item)->h, cd); +out_err: + *item = NULL; + return ret; +} + +static int do_cache_lookup_wait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + struct nfs_cache_defer_req *dreq; + int ret = -ENOMEM; + + dreq = nfs_cache_defer_req_alloc(); + if (!dreq) + goto out; + ret = do_cache_lookup(cd, key, item, dreq); + if (ret == -EAGAIN) { + ret = nfs_cache_wait_for_upcall(dreq); + if (!ret) + ret = do_cache_lookup_nowait(cd, key, item); + } + nfs_cache_defer_req_put(dreq); +out: + return ret; +} + +ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen) +{ + struct nfs_dns_ent key = { + .hostname = name, + .namelen = namelen, + }; + struct nfs_dns_ent *item = NULL; + ssize_t ret; + + ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); + if (ret == 0) { + if (salen >= item->addrlen) { + memcpy(sa, &item->addr, item->addrlen); + ret = item->addrlen; + } else + ret = -EOVERFLOW; + cache_put(&item->h, &nfs_dns_resolve); + } else if (ret == -ENOENT) + ret = -ESRCH; + return ret; +} + +int nfs_dns_resolver_init(void) +{ + return nfs_cache_register(&nfs_dns_resolve); +} + +void nfs_dns_resolver_destroy(void) +{ + nfs_cache_unregister(&nfs_dns_resolve); +} + diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h new file mode 100644 index 0000000..a3f0938 --- /dev/null +++ b/fs/nfs/dns_resolve.h @@ -0,0 +1,14 @@ +/* + * Resolve DNS hostnames into valid ip addresses + */ +#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H +#define __LINUX_FS_NFS_DNS_RESOLVE_H + +#define NFS_DNS_HOSTNAME_MAXLEN (128) + +extern int nfs_dns_resolver_init(void); +extern void nfs_dns_resolver_destroy(void); +extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen); + +#endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0506232..f5fdd39 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -59,7 +59,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); -static struct vm_operations_struct nfs_file_vm_ops; +static const struct vm_operations_struct nfs_file_vm_ops; const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, @@ -328,6 +328,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) } /* + * Decide whether a read/modify/write cycle may be more efficient + * then a modify/write/read cycle when writing to a page in the + * page cache. + * + * The modify/write/read cycle may occur if a page is read before + * being completely filled by the writer. In this situation, the + * page must be completely written to stable storage on the server + * before it can be refilled by reading in the page from the server. + * This can lead to expensive, small, FILE_SYNC mode writes being + * done. + * + * It may be more efficient to read the page first if the file is + * open for reading in addition to writing, the page is not marked + * as Uptodate, it is not dirty or waiting to be committed, + * indicating that it was previously allocated and then modified, + * that there were valid bytes of data in that range of the file, + * and that the new data won't completely replace the old data in + * that range of the file. + */ +static int nfs_want_read_modify_write(struct file *file, struct page *page, + loff_t pos, unsigned len) +{ + unsigned int pglen = nfs_page_length(page); + unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned int end = offset + len; + + if ((file->f_mode & FMODE_READ) && /* open for read? */ + !PageUptodate(page) && /* Uptodate? */ + !PagePrivate(page) && /* i/o request already? */ + pglen && /* valid bytes of file? */ + (end < pglen || offset)) /* replace all valid bytes? */ + return 1; + return 0; +} + +/* * This does the "real" work of the write. We must allocate and lock the * page to be sent back to the generic routine, which then copies the * data from user space. @@ -340,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { int ret; - pgoff_t index; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct page *page; - index = pos >> PAGE_CACHE_SHIFT; + int once_thru = 0; dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); +start: /* * Prevent starvation issues if someone is doing a consistency * sync-to-disk @@ -367,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, if (ret) { unlock_page(page); page_cache_release(page); + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; + ret = nfs_readpage(file, page); + page_cache_release(page); + if (!ret) + goto start; } return ret; } @@ -479,7 +523,9 @@ const struct address_space_operations nfs_file_aops = { .invalidatepage = nfs_invalidate_page, .releasepage = nfs_release_page, .direct_IO = nfs_direct_IO, + .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, + .error_remove_page = generic_error_remove_page, }; /* @@ -526,7 +572,7 @@ out_unlock: return VM_FAULT_SIGBUS; } -static struct vm_operations_struct nfs_file_vm_ops = { +static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = nfs_vm_page_mkwrite, }; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 379be67..70fad69 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -58,17 +58,34 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp) /* * Get the cache cookie for an NFS superblock. We have to handle * uniquification here because the cache doesn't do it for us. + * + * The default uniquifier is just an empty string, but it may be overridden + * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent + * superblock across an automount point of some nature. */ -void nfs_fscache_get_super_cookie(struct super_block *sb, - struct nfs_parsed_mount_data *data) +void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, + struct nfs_clone_mount *mntdata) { struct nfs_fscache_key *key, *xkey; struct nfs_server *nfss = NFS_SB(sb); struct rb_node **p, *parent; - const char *uniq = data->fscache_uniq ?: ""; int diff, ulen; - ulen = strlen(uniq); + if (uniq) { + ulen = strlen(uniq); + } else if (mntdata) { + struct nfs_server *mnt_s = NFS_SB(mntdata->sb); + if (mnt_s->fscache_key) { + uniq = mnt_s->fscache_key->key.uniquifier; + ulen = mnt_s->fscache_key->key.uniq_len; + } + } + + if (!uniq) { + uniq = ""; + ulen = 1; + } + key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); if (!key) return; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 6e809bb..b9c572d 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -74,7 +74,8 @@ extern void nfs_fscache_get_client_cookie(struct nfs_client *); extern void nfs_fscache_release_client_cookie(struct nfs_client *); extern void nfs_fscache_get_super_cookie(struct super_block *, - struct nfs_parsed_mount_data *); + const char *, + struct nfs_clone_mount *); extern void nfs_fscache_release_super_cookie(struct super_block *); extern void nfs_fscache_init_inode_cookie(struct inode *); @@ -173,7 +174,8 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} static inline void nfs_fscache_get_super_cookie( struct super_block *sb, - struct nfs_parsed_mount_data *data) + const char *uniq, + struct nfs_clone_mount *mntdata) { } static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 86147b0..21a84d45 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); static unsigned int fnvhash32(const void *, size_t); -static struct rpc_pipe_ops idmap_upcall_ops = { +static const struct rpc_pipe_ops idmap_upcall_ops = { .upcall = idmap_pipe_upcall, .downcall = idmap_pipe_downcall, .destroy_msg = idmap_pipe_destroy_msg, @@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp) if (idmap == NULL) return -ENOMEM; - idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap", - idmap, &idmap_upcall_ops, 0); + idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, + "idmap", idmap, &idmap_upcall_ops, 0); if (IS_ERR(idmap->idmap_dentry)) { error = PTR_ERR(idmap->idmap_dentry); kfree(idmap); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bd7938e..faa0918 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -46,6 +46,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "dns_resolve.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -286,6 +287,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) /* We can't support update_atime(), since the server will reset it */ inode->i_flags |= S_NOATIME|S_NOCMTIME; inode->i_mode = fattr->mode; + if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 + && nfs_server_capable(inode, NFS_CAP_MODE)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -330,20 +336,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; + else if (nfs_server_capable(inode, NFS_CAP_ATIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; + else if (nfs_server_capable(inode, NFS_CAP_MTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; + else if (nfs_server_capable(inode, NFS_CAP_CTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_CHANGE) nfsi->change_attr = fattr->change_attr; + else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); + else + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE; if (fattr->valid & NFS_ATTR_FATTR_NLINK) inode->i_nlink = fattr->nlink; + else if (nfs_server_capable(inode, NFS_CAP_NLINK)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -426,49 +458,21 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) */ static int nfs_vmtruncate(struct inode * inode, loff_t offset) { - if (i_size_read(inode) < offset) { - unsigned long limit; - - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - spin_lock(&inode->i_lock); - i_size_write(inode, offset); - spin_unlock(&inode->i_lock); - } else { - struct address_space *mapping = inode->i_mapping; + loff_t oldsize; + int err; - /* - * truncation of in-use swapfiles is disallowed - it would - * cause subsequent swapout to scribble on the now-freed - * blocks. - */ - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - spin_lock(&inode->i_lock); - i_size_write(inode, offset); - spin_unlock(&inode->i_lock); + err = inode_newsize_ok(inode, offset); + if (err) + goto out; - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - } - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out_big: - return -EFBIG; + spin_lock(&inode->i_lock); + oldsize = inode->i_size; + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + + truncate_pagecache(inode, oldsize, offset); +out: + return err; } /** @@ -1145,6 +1149,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) loff_t cur_isize, new_isize; unsigned long invalid = 0; unsigned long now = jiffies; + unsigned long save_cache_validity; dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, @@ -1171,10 +1176,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfsi->read_cache_jiffies = fattr->time_start; - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); + save_cache_validity = nfsi->cache_validity; + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED + | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); @@ -1189,7 +1195,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); nfsi->change_attr = fattr->change_attr; } - } + } else if (server->caps & NFS_CAP_CHANGE_ATTR) + invalid |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { /* NFSv2/v3: Check if the mtime agrees */ @@ -1201,7 +1208,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } - } + } else if (server->caps & NFS_CAP_MTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { @@ -1215,7 +1227,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } - } + } else if (server->caps & NFS_CAP_CTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -1231,30 +1247,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dprintk("NFS: isize change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); } - } + } else + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_ATIME) memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + else if (server->caps & NFS_CAP_ATIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_mode = fattr->mode; } - } + } else if (server->caps & NFS_CAP_MODE) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (inode->i_uid != fattr->uid) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; } - } + } else if (server->caps & NFS_CAP_OWNER) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (inode->i_gid != fattr->gid) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; } - } + } else if (server->caps & NFS_CAP_OWNER_GROUP) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { @@ -1263,7 +1299,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_DATA; inode->i_nlink = fattr->nlink; } - } + } else if (server->caps & NFS_CAP_NLINK) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* @@ -1293,9 +1331,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; if (!nfs_have_delegation(inode, FMODE_READ) || - (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) + (save_cache_validity & NFS_INO_REVAL_FORCED)) nfsi->cache_validity |= invalid; - nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED; return 0; out_changed: @@ -1442,6 +1479,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_dns_resolver_init(); + if (err < 0) + goto out8; + err = nfs_fscache_register(); if (err < 0) goto out7; @@ -1500,6 +1541,8 @@ out5: out6: nfs_fscache_unregister(); out7: + nfs_dns_resolver_destroy(); +out8: return err; } @@ -1511,6 +1554,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); nfs_fscache_unregister(); + nfs_dns_resolver_destroy(); #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7dd90a6..e21b1bb 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -49,6 +49,11 @@ struct nfs_clone_mount { #define NFS_MAX_SECFLAVORS (12) /* + * Value used if the user did not specify a port value. + */ +#define NFS_UNSPEC_PORT (-1) + +/* * In-kernel mount arguments */ struct nfs_parsed_mount_data { @@ -63,6 +68,7 @@ struct nfs_parsed_mount_data { unsigned int auth_flavor_len; rpc_authflavor_t auth_flavors[1]; char *client_address; + unsigned int version; unsigned int minorversion; char *fscache_uniq; @@ -71,7 +77,7 @@ struct nfs_parsed_mount_data { size_t addrlen; char *hostname; u32 version; - unsigned short port; + int port; unsigned short protocol; } mount_server; @@ -80,7 +86,7 @@ struct nfs_parsed_mount_data { size_t addrlen; char *hostname; char *export_path; - unsigned short port; + int port; unsigned short protocol; } nfs_server; @@ -102,6 +108,7 @@ struct nfs_mount_request { }; extern int nfs_mount(struct nfs_mount_request *info); +extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern struct rpc_program nfs_program; @@ -213,7 +220,6 @@ void nfs_zap_acl_cache(struct inode *inode); extern int nfs_wait_bit_killable(void *word); /* super.c */ -void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); extern struct file_system_type nfs_xdev_fs_type; #ifdef CONFIG_NFS_V4 extern struct file_system_type nfs4_xdev_fs_type; @@ -248,6 +254,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +#ifdef CONFIG_MIGRATION +extern int nfs_migrate_page(struct address_space *, + struct page *, struct page *); +#else +#define nfs_migrate_page NULL +#endif /* nfs4proc.c */ extern int _nfs4_call_sync(struct nfs_server *server, @@ -368,24 +380,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> PAGE_SHIFT; } - -#define IPV6_SCOPE_DELIMITER '%' - -/* - * Set the port number in an address. Be agnostic about the address - * family. - */ -static inline void nfs_set_port(struct sockaddr *sap, unsigned short port) -{ - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap; - - switch (sap->sa_family) { - case AF_INET: - ap->sin_port = htons(port); - break; - case AF_INET6: - ap6->sin6_port = htons(port); - break; - } -} diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 38ef9ea..0adefc4 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -209,6 +209,71 @@ out_mnt_err: goto out; } +/** + * nfs_umount - Notify a server that we have unmounted this export + * @info: pointer to umount request arguments + * + * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always + * use UDP. + */ +void nfs_umount(const struct nfs_mount_request *info) +{ + static const struct rpc_timeout nfs_umnt_timeout = { + .to_initval = 1 * HZ, + .to_maxval = 3 * HZ, + .to_retries = 2, + }; + struct rpc_create_args args = { + .protocol = IPPROTO_UDP, + .address = info->sap, + .addrsize = info->salen, + .timeout = &nfs_umnt_timeout, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + .flags = RPC_CLNT_CREATE_NOPING, + }; + struct mountres result; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + .rpc_resp = &result, + }; + struct rpc_clnt *clnt; + int status; + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + clnt = rpc_create(&args); + if (unlikely(IS_ERR(clnt))) + goto out_clnt_err; + + dprintk("NFS: sending UMNT request for %s:%s\n", + (info->hostname ? info->hostname : "server"), info->dirpath); + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; + else + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; + + status = rpc_call_sync(clnt, &msg, 0); + rpc_shutdown_client(clnt); + + if (unlikely(status < 0)) + goto out_call_err; + + return; + +out_clnt_err: + dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", + PTR_ERR(clnt)); + return; + +out_call_err: + dprintk("NFS: UMNT request failed, status=%d\n", status); +} + /* * XDR encode/decode functions for MOUNT */ @@ -258,7 +323,7 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res) return -EIO; status = ntohl(*p); - for (i = 0; i <= ARRAY_SIZE(mnt_errtbl); i++) { + for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { if (mnt_errtbl[i].status == status) { res->errno = mnt_errtbl[i].errno; return 0; @@ -309,7 +374,7 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) return -EIO; status = ntohl(*p); - for (i = 0; i <= ARRAY_SIZE(mnt3_errtbl); i++) { + for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { if (mnt3_errtbl[i].status == status) { res->errno = mnt3_errtbl[i].errno; return 0; @@ -407,6 +472,13 @@ static struct rpc_procinfo mnt_procedures[] = { .p_statidx = MOUNTPROC_MNT, .p_name = "MOUNT", }, + [MOUNTPROC_UMNT] = { + .p_proc = MOUNTPROC_UMNT, + .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC_UMNT, + .p_name = "UMOUNT", + }, }; static struct rpc_procinfo mnt3_procedures[] = { @@ -419,6 +491,13 @@ static struct rpc_procinfo mnt3_procedures[] = { .p_statidx = MOUNTPROC3_MNT, .p_name = "MOUNT", }, + [MOUNTPROC3_UMNT] = { + .p_proc = MOUNTPROC3_UMNT, + .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC3_UMNT, + .p_name = "UMOUNT", + }, }; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index c862c93..5e078b2 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -13,7 +13,6 @@ #include <linux/time.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/utsname.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/in.h> diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d0cc5ce..3f8881d 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -7,7 +7,6 @@ */ #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/sunrpc/clnt.h> @@ -299,7 +298,6 @@ static void nfs3_free_createdata(struct nfs3_createdata *data) /* * Create a regular file. - * For now, we don't implement O_EXCL. */ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 35869a4..5fe5492 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -10,7 +10,6 @@ #include <linux/time.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/utsname.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/in.h> diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2a2a0a7..fa3408f 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -17,6 +17,7 @@ #include <linux/inet.h> #include "internal.h" #include "nfs4_fs.h" +#include "dns_resolve.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -95,6 +96,20 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, return 0; } +static size_t nfs_parse_server_name(char *string, size_t len, + struct sockaddr *sa, size_t salen) +{ + ssize_t ret; + + ret = rpc_pton(string, len, sa, salen); + if (ret == 0) { + ret = nfs_dns_resolve_name(string, len, sa, salen); + if (ret < 0) + ret = 0; + } + return ret; +} + static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, char *page, char *page2, const struct nfs4_fs_location *location) @@ -106,7 +121,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); if (IS_ERR(mnt_path)) - return mnt; + return ERR_CAST(mnt_path); mountdata->mnt_path = mnt_path; maxbuflen = mnt_path - 1 - page2; @@ -117,15 +132,16 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (buf->len <= 0 || buf->len >= maxbuflen) continue; - mountdata->addr = (struct sockaddr *)&addr; - if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) continue; - nfs_parse_ip_address(buf->data, buf->len, - mountdata->addr, &mountdata->addrlen); - if (mountdata->addr->sa_family == AF_UNSPEC) + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, + (struct sockaddr *)&addr, sizeof(addr)); + if (mountdata->addrlen == 0) continue; - nfs_set_port(mountdata->addr, NFS_PORT); + + mountdata->addr = (struct sockaddr *)&addr; + rpc_set_port(mountdata->addr, NFS_PORT); memcpy(page2, buf->data, buf->len); page2[buf->len] = '\0'; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6917311..ed7c269 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -36,7 +36,6 @@ */ #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/string.h> @@ -61,6 +60,8 @@ #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) +#define NFS4_MAX_LOOP_ON_RECOVER (10) + struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); @@ -426,17 +427,19 @@ out: static int nfs4_recover_session(struct nfs4_session *session) { struct nfs_client *clp = session->clp; + unsigned int loop; int ret; - for (;;) { + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { ret = nfs4_wait_clnt_recover(clp); if (ret != 0) - return ret; + break; if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) break; nfs4_schedule_state_manager(clp); + ret = -EIO; } - return 0; + return ret; } static int nfs41_setup_sequence(struct nfs4_session *session, @@ -1444,18 +1447,20 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) static int nfs4_recover_expired_lease(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; + unsigned int loop; int ret; - for (;;) { + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { ret = nfs4_wait_clnt_recover(clp); if (ret != 0) - return ret; + break; if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) break; nfs4_schedule_state_recovery(clp); + ret = -EIO; } - return 0; + return ret; } /* @@ -1997,12 +2002,34 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f status = nfs4_call_sync(server, &msg, &args, &res, 0); if (status == 0) { memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); + server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| + NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| + NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| + NFS_CAP_CTIME|NFS_CAP_MTIME); if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) server->caps |= NFS_CAP_ACLS; if (res.has_links != 0) server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; + if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID) + server->caps |= NFS_CAP_FILEID; + if (res.attr_bitmask[1] & FATTR4_WORD1_MODE) + server->caps |= NFS_CAP_MODE; + if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS) + server->caps |= NFS_CAP_NLINK; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER) + server->caps |= NFS_CAP_OWNER; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP) + server->caps |= NFS_CAP_OWNER_GROUP; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS) + server->caps |= NFS_CAP_ATIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA) + server->caps |= NFS_CAP_CTIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) + server->caps |= NFS_CAP_MTIME; + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index e27c6ce..0156c01 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -127,12 +127,6 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) } void -nfs4_renewd_prepare_shutdown(struct nfs_server *server) -{ - cancel_delayed_work(&server->nfs_client->cl_renewd); -} - -void nfs4_kill_renewd(struct nfs_client *clp) { cancel_delayed_work_sync(&clp->cl_renewd); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 1434080..2ef4fec 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -638,7 +638,7 @@ static void nfs4_fl_release_lock(struct file_lock *fl) nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner); } -static struct file_lock_operations nfs4_fl_lock_ops = { +static const struct file_lock_operations nfs4_fl_lock_ops = { .fl_copy_lock = nfs4_fl_copy_lock, .fl_release_private = nfs4_fl_release_lock, }; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 617273e..83ad47c 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -39,7 +39,6 @@ #include <linux/time.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/utsname.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/in.h> @@ -702,29 +701,12 @@ struct compound_hdr { u32 minorversion; }; -/* - * START OF "GENERIC" ENCODE ROUTINES. - * These may look a little ugly since they are imported from a "generic" - * set of XDR encode/decode routines which are intended to be shared by - * all of our NFSv4 implementations (OpenBSD, MacOS X...). - * - * If the pain of reading these is too great, it should be a straightforward - * task to translate them into Linux-specific versions which are more - * consistent with the style used in NFSv2/v3... - */ -#define WRITE32(n) *p++ = htonl(n) -#define WRITE64(n) do { \ - *p++ = htonl((uint32_t)((n) >> 32)); \ - *p++ = htonl((uint32_t)(n)); \ -} while (0) -#define WRITEMEM(ptr,nbytes) do { \ - p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ -} while (0) - -#define RESERVE_SPACE(nbytes) do { \ - p = xdr_reserve_space(xdr, nbytes); \ - BUG_ON(!p); \ -} while (0) +static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr_reserve_space(xdr, nbytes); + BUG_ON(!p); + return p; +} static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { @@ -749,12 +731,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr, dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); - RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); - WRITE32(hdr->taglen); - WRITEMEM(hdr->tag, hdr->taglen); - WRITE32(hdr->minorversion); + p = reserve_space(xdr, 4 + hdr->taglen + 8); + p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); + *p++ = cpu_to_be32(hdr->minorversion); hdr->nops_p = p; - WRITE32(hdr->nops); + *p = cpu_to_be32(hdr->nops); } static void encode_nops(struct compound_hdr *hdr) @@ -829,55 +810,53 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len += 16; else if (iap->ia_valid & ATTR_MTIME) len += 4; - RESERVE_SPACE(len); + p = reserve_space(xdr, len); /* * We write the bitmap length now, but leave the bitmap and the attribute * buffer length to be backfilled at the end of this routine. */ - WRITE32(2); + *p++ = cpu_to_be32(2); q = p; p += 3; if (iap->ia_valid & ATTR_SIZE) { bmval0 |= FATTR4_WORD0_SIZE; - WRITE64(iap->ia_size); + p = xdr_encode_hyper(p, iap->ia_size); } if (iap->ia_valid & ATTR_MODE) { bmval1 |= FATTR4_WORD1_MODE; - WRITE32(iap->ia_mode & S_IALLUGO); + *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); } if (iap->ia_valid & ATTR_UID) { bmval1 |= FATTR4_WORD1_OWNER; - WRITE32(owner_namelen); - WRITEMEM(owner_name, owner_namelen); + p = xdr_encode_opaque(p, owner_name, owner_namelen); } if (iap->ia_valid & ATTR_GID) { bmval1 |= FATTR4_WORD1_OWNER_GROUP; - WRITE32(owner_grouplen); - WRITEMEM(owner_group, owner_grouplen); + p = xdr_encode_opaque(p, owner_group, owner_grouplen); } if (iap->ia_valid & ATTR_ATIME_SET) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - WRITE32(NFS4_SET_TO_CLIENT_TIME); - WRITE32(0); - WRITE32(iap->ia_mtime.tv_sec); - WRITE32(iap->ia_mtime.tv_nsec); + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_ATIME) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - WRITE32(NFS4_SET_TO_SERVER_TIME); + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (iap->ia_valid & ATTR_MTIME_SET) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - WRITE32(NFS4_SET_TO_CLIENT_TIME); - WRITE32(0); - WRITE32(iap->ia_mtime.tv_sec); - WRITE32(iap->ia_mtime.tv_nsec); + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_MTIME) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - WRITE32(NFS4_SET_TO_SERVER_TIME); + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } /* @@ -891,7 +870,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len = (char *)p - (char *)q - 12; *q++ = htonl(bmval0); *q++ = htonl(bmval1); - *q++ = htonl(len); + *q = htonl(len); /* out: */ } @@ -900,9 +879,9 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd { __be32 *p; - RESERVE_SPACE(8); - WRITE32(OP_ACCESS); - WRITE32(access); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_ACCESS); + *p = cpu_to_be32(access); hdr->nops++; hdr->replen += decode_access_maxsz; } @@ -911,10 +890,10 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg { __be32 *p; - RESERVE_SPACE(8+NFS4_STATEID_SIZE); - WRITE32(OP_CLOSE); - WRITE32(arg->seqid->sequence->counter); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); + xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_close_maxsz; } @@ -923,10 +902,10 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar { __be32 *p; - RESERVE_SPACE(16); - WRITE32(OP_COMMIT); - WRITE64(args->offset); - WRITE32(args->count); + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(OP_COMMIT); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_commit_maxsz; } @@ -935,30 +914,28 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * { __be32 *p; - RESERVE_SPACE(8); - WRITE32(OP_CREATE); - WRITE32(create->ftype); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_CREATE); + *p = cpu_to_be32(create->ftype); switch (create->ftype) { case NF4LNK: - RESERVE_SPACE(4); - WRITE32(create->u.symlink.len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(create->u.symlink.len); xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); break; case NF4BLK: case NF4CHR: - RESERVE_SPACE(8); - WRITE32(create->u.device.specdata1); - WRITE32(create->u.device.specdata2); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(create->u.device.specdata1); + *p = cpu_to_be32(create->u.device.specdata2); break; default: break; } - RESERVE_SPACE(4 + create->name->len); - WRITE32(create->name->len); - WRITEMEM(create->name->name, create->name->len); + encode_string(xdr, create->name->len, create->name->name); hdr->nops++; hdr->replen += decode_create_maxsz; @@ -969,10 +946,10 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c { __be32 *p; - RESERVE_SPACE(12); - WRITE32(OP_GETATTR); - WRITE32(1); - WRITE32(bitmap); + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bitmap); hdr->nops++; hdr->replen += decode_getattr_maxsz; } @@ -981,11 +958,11 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm { __be32 *p; - RESERVE_SPACE(16); - WRITE32(OP_GETATTR); - WRITE32(2); - WRITE32(bm0); - WRITE32(bm1); + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); hdr->nops++; hdr->replen += decode_getattr_maxsz; } @@ -1012,8 +989,8 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_GETFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_GETFH); hdr->nops++; hdr->replen += decode_getfh_maxsz; } @@ -1022,10 +999,9 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct { __be32 *p; - RESERVE_SPACE(8 + name->len); - WRITE32(OP_LINK); - WRITE32(name->len); - WRITEMEM(name->name, name->len); + p = reserve_space(xdr, 8 + name->len); + *p++ = cpu_to_be32(OP_LINK); + xdr_encode_opaque(p, name->name, name->len); hdr->nops++; hdr->replen += decode_link_maxsz; } @@ -1052,27 +1028,27 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args { __be32 *p; - RESERVE_SPACE(32); - WRITE32(OP_LOCK); - WRITE32(nfs4_lock_type(args->fl, args->block)); - WRITE32(args->reclaim); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); - WRITE32(args->new_lock_owner); + p = reserve_space(xdr, 32); + *p++ = cpu_to_be32(OP_LOCK); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); + *p++ = cpu_to_be32(args->reclaim); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ - RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); - WRITE32(args->open_seqid->sequence->counter); - WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); - WRITE32(args->lock_seqid->sequence->counter); - WRITE64(args->lock_owner.clientid); - WRITE32(16); - WRITEMEM("lock id:", 8); - WRITE64(args->lock_owner.id); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); + p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); + p = xdr_encode_hyper(p, args->lock_owner.clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + xdr_encode_hyper(p, args->lock_owner.id); } else { - RESERVE_SPACE(NFS4_STATEID_SIZE+4); - WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); - WRITE32(args->lock_seqid->sequence->counter); + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); + p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); } hdr->nops++; hdr->replen += decode_lock_maxsz; @@ -1082,15 +1058,15 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar { __be32 *p; - RESERVE_SPACE(52); - WRITE32(OP_LOCKT); - WRITE32(nfs4_lock_type(args->fl, 0)); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); - WRITE64(args->lock_owner.clientid); - WRITE32(16); - WRITEMEM("lock id:", 8); - WRITE64(args->lock_owner.id); + p = reserve_space(xdr, 52); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + p = xdr_encode_hyper(p, args->lock_owner.clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + xdr_encode_hyper(p, args->lock_owner.id); hdr->nops++; hdr->replen += decode_lockt_maxsz; } @@ -1099,13 +1075,13 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar { __be32 *p; - RESERVE_SPACE(12+NFS4_STATEID_SIZE+16); - WRITE32(OP_LOCKU); - WRITE32(nfs4_lock_type(args->fl, 0)); - WRITE32(args->seqid->sequence->counter); - WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); + p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); + p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); hdr->nops++; hdr->replen += decode_locku_maxsz; } @@ -1115,10 +1091,9 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc int len = name->len; __be32 *p; - RESERVE_SPACE(8 + len); - WRITE32(OP_LOOKUP); - WRITE32(len); - WRITEMEM(name->name, len); + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_LOOKUP); + xdr_encode_opaque(p, name->name, len); hdr->nops++; hdr->replen += decode_lookup_maxsz; } @@ -1127,21 +1102,21 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) { __be32 *p; - RESERVE_SPACE(8); + p = reserve_space(xdr, 8); switch (fmode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: - WRITE32(NFS4_SHARE_ACCESS_READ); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ); break; case FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_WRITE); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE); break; case FMODE_READ|FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_BOTH); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH); break; default: - WRITE32(0); + *p++ = cpu_to_be32(0); } - WRITE32(0); /* for linux, share_deny = 0 always */ + *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ } static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) @@ -1151,29 +1126,29 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, * owner 4 = 32 */ - RESERVE_SPACE(8); - WRITE32(OP_OPEN); - WRITE32(arg->seqid->sequence->counter); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_OPEN); + *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); - RESERVE_SPACE(28); - WRITE64(arg->clientid); - WRITE32(16); - WRITEMEM("open id:", 8); - WRITE64(arg->id); + p = reserve_space(xdr, 28); + p = xdr_encode_hyper(p, arg->clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "open id:", 8); + xdr_encode_hyper(p, arg->id); } static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch(arg->open_flags & O_EXCL) { case 0: - WRITE32(NFS4_CREATE_UNCHECKED); + *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); encode_attrs(xdr, arg->u.attrs, arg->server); break; default: - WRITE32(NFS4_CREATE_EXCLUSIVE); + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); encode_nfs4_verifier(xdr, &arg->u.verifier); } } @@ -1182,14 +1157,14 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch (arg->open_flags & O_CREAT) { case 0: - WRITE32(NFS4_OPEN_NOCREATE); + *p = cpu_to_be32(NFS4_OPEN_NOCREATE); break; default: BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); - WRITE32(NFS4_OPEN_CREATE); + *p = cpu_to_be32(NFS4_OPEN_CREATE); encode_createmode(xdr, arg); } } @@ -1198,16 +1173,16 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch (delegation_type) { case 0: - WRITE32(NFS4_OPEN_DELEGATE_NONE); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); break; case FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_READ); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); break; case FMODE_WRITE|FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_WRITE); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); break; default: BUG(); @@ -1218,8 +1193,8 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr * { __be32 *p; - RESERVE_SPACE(4); - WRITE32(NFS4_OPEN_CLAIM_NULL); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); encode_string(xdr, name->len, name->name); } @@ -1227,8 +1202,8 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); encode_delegation_type(xdr, type); } @@ -1236,9 +1211,9 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); - WRITEMEM(stateid->data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); encode_string(xdr, name->len, name->name); } @@ -1267,10 +1242,10 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); - WRITE32(OP_OPEN_CONFIRM); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - WRITE32(arg->seqid->sequence->counter); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); hdr->nops++; hdr->replen += decode_open_confirm_maxsz; } @@ -1279,10 +1254,10 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); - WRITE32(OP_OPEN_DOWNGRADE); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - WRITE32(arg->seqid->sequence->counter); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); hdr->nops++; hdr->replen += decode_open_downgrade_maxsz; @@ -1294,10 +1269,9 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd int len = fh->size; __be32 *p; - RESERVE_SPACE(8 + len); - WRITE32(OP_PUTFH); - WRITE32(len); - WRITEMEM(fh->data, len); + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_PUTFH); + xdr_encode_opaque(p, fh->data, len); hdr->nops++; hdr->replen += decode_putfh_maxsz; } @@ -1306,8 +1280,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_PUTROOTFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_PUTROOTFH); hdr->nops++; hdr->replen += decode_putrootfh_maxsz; } @@ -1317,26 +1291,26 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context nfs4_stateid stateid; __be32 *p; - RESERVE_SPACE(NFS4_STATEID_SIZE); + p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); - WRITEMEM(stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); } else - WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); } static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_READ); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READ); encode_stateid(xdr, args->context); - RESERVE_SPACE(12); - WRITE64(args->offset); - WRITE32(args->count); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_read_maxsz; } @@ -1349,20 +1323,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg }; __be32 *p; - RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); - WRITE32(OP_READDIR); - WRITE64(readdir->cookie); - WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE); - WRITE32(readdir->count >> 1); /* We're not doing readdirplus */ - WRITE32(readdir->count); - WRITE32(2); + p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); + *p++ = cpu_to_be32(OP_READDIR); + p = xdr_encode_hyper(p, readdir->cookie); + p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ + *p++ = cpu_to_be32(readdir->count); + *p++ = cpu_to_be32(2); /* Switch to mounted_on_fileid if the server supports it */ if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) attrs[0] &= ~FATTR4_WORD0_FILEID; else attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - WRITE32(attrs[0] & readdir->bitmask[0]); - WRITE32(attrs[1] & readdir->bitmask[1]); + *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); + *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); hdr->nops++; hdr->replen += decode_readdir_maxsz; dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", @@ -1378,8 +1352,8 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink * { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_READLINK); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READLINK); hdr->nops++; hdr->replen += decode_readlink_maxsz; } @@ -1388,10 +1362,9 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc { __be32 *p; - RESERVE_SPACE(8 + name->len); - WRITE32(OP_REMOVE); - WRITE32(name->len); - WRITEMEM(name->name, name->len); + p = reserve_space(xdr, 8 + name->len); + *p++ = cpu_to_be32(OP_REMOVE); + xdr_encode_opaque(p, name->name, name->len); hdr->nops++; hdr->replen += decode_remove_maxsz; } @@ -1400,14 +1373,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co { __be32 *p; - RESERVE_SPACE(8 + oldname->len); - WRITE32(OP_RENAME); - WRITE32(oldname->len); - WRITEMEM(oldname->name, oldname->len); - - RESERVE_SPACE(4 + newname->len); - WRITE32(newname->len); - WRITEMEM(newname->name, newname->len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RENAME); + encode_string(xdr, oldname->len, oldname->name); + encode_string(xdr, newname->len, newname->name); hdr->nops++; hdr->replen += decode_rename_maxsz; } @@ -1416,9 +1385,9 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client { __be32 *p; - RESERVE_SPACE(12); - WRITE32(OP_RENEW); - WRITE64(client_stateid->cl_clientid); + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(OP_RENEW); + xdr_encode_hyper(p, client_stateid->cl_clientid); hdr->nops++; hdr->replen += decode_renew_maxsz; } @@ -1428,8 +1397,8 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_RESTOREFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RESTOREFH); hdr->nops++; hdr->replen += decode_restorefh_maxsz; } @@ -1439,16 +1408,16 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); - WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); - RESERVE_SPACE(2*4); - WRITE32(1); - WRITE32(FATTR4_WORD0_ACL); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); if (arg->acl_len % 4) return -EINVAL; - RESERVE_SPACE(4); - WRITE32(arg->acl_len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); hdr->nops++; hdr->replen += decode_setacl_maxsz; @@ -1460,8 +1429,8 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_SAVEFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_SAVEFH); hdr->nops++; hdr->replen += decode_savefh_maxsz; } @@ -1470,9 +1439,9 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); - WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); + xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_setattr_maxsz; encode_attrs(xdr, arg->iap, server); @@ -1482,17 +1451,17 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie { __be32 *p; - RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID); - WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); + p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID); + xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); - RESERVE_SPACE(4); - WRITE32(setclientid->sc_prog); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_prog); encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); - RESERVE_SPACE(4); - WRITE32(setclientid->sc_cb_ident); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_cb_ident); hdr->nops++; hdr->replen += decode_setclientid_maxsz; } @@ -1501,10 +1470,10 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_ { __be32 *p; - RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID_CONFIRM); - WRITE64(client_state->cl_clientid); - WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); + p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); + p = xdr_encode_hyper(p, client_state->cl_clientid); + xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); hdr->nops++; hdr->replen += decode_setclientid_confirm_maxsz; } @@ -1513,15 +1482,15 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_WRITE); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_WRITE); encode_stateid(xdr, args->context); - RESERVE_SPACE(16); - WRITE64(args->offset); - WRITE32(args->stable); - WRITE32(args->count); + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); xdr_write_pages(xdr, args->pages, args->pgbase, args->count); hdr->nops++; @@ -1532,10 +1501,10 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); - WRITE32(OP_DELEGRETURN); - WRITEMEM(stateid->data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_DELEGRETURN); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_delegreturn_maxsz; } @@ -1548,16 +1517,16 @@ static void encode_exchange_id(struct xdr_stream *xdr, { __be32 *p; - RESERVE_SPACE(4 + sizeof(args->verifier->data)); - WRITE32(OP_EXCHANGE_ID); - WRITEMEM(args->verifier->data, sizeof(args->verifier->data)); + p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); + *p++ = cpu_to_be32(OP_EXCHANGE_ID); + xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); encode_string(xdr, args->id_len, args->id); - RESERVE_SPACE(12); - WRITE32(args->flags); - WRITE32(0); /* zero length state_protect4_a */ - WRITE32(0); /* zero length implementation id array */ + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(args->flags); + *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ + *p = cpu_to_be32(0); /* zero length implementation id array */ hdr->nops++; hdr->replen += decode_exchange_id_maxsz; } @@ -1571,55 +1540,43 @@ static void encode_create_session(struct xdr_stream *xdr, uint32_t len; struct nfs_client *clp = args->client; - RESERVE_SPACE(4); - WRITE32(OP_CREATE_SESSION); - - RESERVE_SPACE(8); - WRITE64(clp->cl_ex_clid); + len = scnprintf(machine_name, sizeof(machine_name), "%s", + clp->cl_ipaddr); - RESERVE_SPACE(8); - WRITE32(clp->cl_seqid); /*Sequence id */ - WRITE32(args->flags); /*flags */ + p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); + *p++ = cpu_to_be32(OP_CREATE_SESSION); + p = xdr_encode_hyper(p, clp->cl_ex_clid); + *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ + *p++ = cpu_to_be32(args->flags); /*flags */ - RESERVE_SPACE(2*28); /* 2 channel_attrs */ /* Fore Channel */ - WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ - WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */ - WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */ - WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ - WRITE32(args->fc_attrs.max_ops); /* max operations */ - WRITE32(args->fc_attrs.max_reqs); /* max requests */ - WRITE32(0); /* rdmachannel_attrs */ + *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ /* Back Channel */ - WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ - WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */ - WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */ - WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ - WRITE32(args->bc_attrs.max_ops); /* max operations */ - WRITE32(args->bc_attrs.max_reqs); /* max requests */ - WRITE32(0); /* rdmachannel_attrs */ - - RESERVE_SPACE(4); - WRITE32(args->cb_program); /* cb_program */ - - RESERVE_SPACE(4); /* # of security flavors */ - WRITE32(1); - - RESERVE_SPACE(4); - WRITE32(RPC_AUTH_UNIX); /* auth_sys */ + *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ + + *p++ = cpu_to_be32(args->cb_program); /* cb_program */ + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ /* authsys_parms rfc1831 */ - RESERVE_SPACE(4); - WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ - len = scnprintf(machine_name, sizeof(machine_name), "%s", - clp->cl_ipaddr); - RESERVE_SPACE(16 + len); - WRITE32(len); - WRITEMEM(machine_name, len); - WRITE32(0); /* UID */ - WRITE32(0); /* GID */ - WRITE32(0); /* No more gids */ + *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ + p = xdr_encode_opaque(p, machine_name, len); + *p++ = cpu_to_be32(0); /* UID */ + *p++ = cpu_to_be32(0); /* GID */ + *p = cpu_to_be32(0); /* No more gids */ hdr->nops++; hdr->replen += decode_create_session_maxsz; } @@ -1629,9 +1586,9 @@ static void encode_destroy_session(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN); - WRITE32(OP_DESTROY_SESSION); - WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(OP_DESTROY_SESSION); + xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); hdr->nops++; hdr->replen += decode_destroy_session_maxsz; } @@ -1655,8 +1612,8 @@ static void encode_sequence(struct xdr_stream *xdr, WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); slot = tp->slots + args->sa_slotid; - RESERVE_SPACE(4); - WRITE32(OP_SEQUENCE); + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); + *p++ = cpu_to_be32(OP_SEQUENCE); /* * Sessionid + seqid + slotid + max slotid + cache_this @@ -1670,12 +1627,11 @@ static void encode_sequence(struct xdr_stream *xdr, ((u32 *)session->sess_id.data)[3], slot->seq_nr, args->sa_slotid, tp->highest_used_slotid, args->sa_cache_this); - RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16); - WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(slot->seq_nr); - WRITE32(args->sa_slotid); - WRITE32(tp->highest_used_slotid); - WRITE32(args->sa_cache_this); + p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(slot->seq_nr); + *p++ = cpu_to_be32(args->sa_slotid); + *p++ = cpu_to_be32(tp->highest_used_slotid); + *p = cpu_to_be32(args->sa_cache_this); hdr->nops++; hdr->replen += decode_sequence_maxsz; #endif /* CONFIG_NFS_V4_1 */ @@ -2466,68 +2422,53 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, } #endif /* CONFIG_NFS_V4_1 */ -/* - * START OF "GENERIC" DECODE ROUTINES. - * These may look a little ugly since they are imported from a "generic" - * set of XDR encode/decode routines which are intended to be shared by - * all of our NFSv4 implementations (OpenBSD, MacOS X...). - * - * If the pain of reading these is too great, it should be a straightforward - * task to translate them into Linux-specific versions which are more - * consistent with the style used in NFSv2/v3... - */ -#define READ32(x) (x) = ntohl(*p++) -#define READ64(x) do { \ - (x) = (u64)ntohl(*p++) << 32; \ - (x) |= ntohl(*p++); \ -} while (0) -#define READTIME(x) do { \ - p++; \ - (x.tv_sec) = ntohl(*p++); \ - (x.tv_nsec) = ntohl(*p++); \ -} while (0) -#define COPYMEM(x,nbytes) do { \ - memcpy((x), p, nbytes); \ - p += XDR_QUADLEN(nbytes); \ -} while (0) - -#define READ_BUF(nbytes) do { \ - p = xdr_inline_decode(xdr, nbytes); \ - if (unlikely(!p)) { \ - dprintk("nfs: %s: prematurely hit end of receive" \ - " buffer\n", __func__); \ - dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ - __func__, xdr->p, nbytes, xdr->end); \ - return -EIO; \ - } \ -} while (0) +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("nfs: %s: prematurely hit end of receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { __be32 *p; - READ_BUF(4); - READ32(*len); - READ_BUF(*len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, *len); + if (unlikely(!p)) + goto out_overflow; *string = (char *)p; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - READ_BUF(8); - READ32(hdr->status); - READ32(hdr->taglen); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + hdr->taglen = be32_to_cpup(p); - READ_BUF(hdr->taglen + 4); + p = xdr_inline_decode(xdr, hdr->taglen + 4); + if (unlikely(!p)) + goto out_overflow; hdr->tag = (char *)p; p += XDR_QUADLEN(hdr->taglen); - READ32(hdr->nops); + hdr->nops = be32_to_cpup(p); if (unlikely(hdr->nops < 1)) return nfs4_stat_to_errno(hdr->status); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) @@ -2536,18 +2477,23 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) uint32_t opnum; int32_t nfserr; - READ_BUF(8); - READ32(opnum); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); if (opnum != expected) { dprintk("nfs: Server returned operation" " %d but we issued a request for %d\n", opnum, expected); return -EIO; } - READ32(nfserr); + nfserr = be32_to_cpup(p); if (nfserr != NFS_OK) return nfs4_stat_to_errno(nfserr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* Dummy routine */ @@ -2557,8 +2503,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) unsigned int strlen; char *str; - READ_BUF(12); - return decode_opaque_inline(xdr, &strlen, &str); + p = xdr_inline_decode(xdr, 12); + if (likely(p)) + return decode_opaque_inline(xdr, &strlen, &str); + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) @@ -2566,27 +2515,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) uint32_t bmlen; __be32 *p; - READ_BUF(4); - READ32(bmlen); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); bitmap[0] = bitmap[1] = 0; - READ_BUF((bmlen << 2)); + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; if (bmlen > 0) { - READ32(bitmap[0]); + bitmap[0] = be32_to_cpup(p++); if (bmlen > 1) - READ32(bitmap[1]); + bitmap[1] = be32_to_cpup(p); } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) { __be32 *p; - READ_BUF(4); - READ32(*attrlen); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *attrlen = be32_to_cpup(p); *savep = xdr->p; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) @@ -2609,8 +2570,10 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { - READ_BUF(4); - READ32(*type); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *type = be32_to_cpup(p); if (*type < NF4REG || *type > NF4NAMEDATTR) { dprintk("%s: bad type %d\n", __func__, *type); return -EIO; @@ -2620,6 +2583,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * } dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) @@ -2631,14 +2597,19 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { - READ_BUF(8); - READ64(*change); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; ret = NFS_ATTR_FATTR_CHANGE; } dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) @@ -2650,13 +2621,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { - READ_BUF(8); - READ64(*size); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, size); bitmap[0] &= ~FATTR4_WORD0_SIZE; ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2667,12 +2643,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2683,12 +2664,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) @@ -2701,9 +2687,11 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { - READ_BUF(16); - READ64(fsid->major); - READ64(fsid->minor); + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &fsid->major); + xdr_decode_hyper(p, &fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; ret = NFS_ATTR_FATTR_FSID; } @@ -2711,6 +2699,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs (unsigned long long)fsid->major, (unsigned long long)fsid->minor); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2721,12 +2712,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2737,12 +2733,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -2754,13 +2755,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { - READ_BUF(8); - READ64(*fileid); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -2772,13 +2778,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { - READ_BUF(8); - READ64(*fileid); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2790,12 +2801,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2807,12 +2823,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2824,12 +2845,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) @@ -2838,8 +2864,10 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) __be32 *p; int status = 0; - READ_BUF(4); - READ32(n); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); if (n == 0) goto root_path; dprintk("path "); @@ -2873,6 +2901,9 @@ out_eio: dprintk(" status %d", status); status = -EIO; goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) @@ -2890,8 +2921,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st status = decode_pathname(xdr, &res->fs_path); if (unlikely(status != 0)) goto out; - READ_BUF(4); - READ32(n); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); if (n <= 0) goto out_eio; res->nlocations = 0; @@ -2899,8 +2932,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st u32 m; struct nfs4_fs_location *loc = &res->locations[res->nlocations]; - READ_BUF(4); - READ32(m); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + m = be32_to_cpup(p); loc->nservers = 0; dprintk("%s: servers ", __func__); @@ -2939,6 +2974,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; +out_overflow: + print_overflow_msg(__func__, xdr); out_eio: status = -EIO; goto out; @@ -2953,12 +2990,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) @@ -2970,12 +3012,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { - READ_BUF(4); - READ32(*maxlink); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxlink = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } dprintk("%s: maxlink=%u\n", __func__, *maxlink); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) @@ -2987,12 +3034,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { - READ_BUF(4); - READ32(*maxname); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxname = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } dprintk("%s: maxname=%u\n", __func__, *maxname); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3005,8 +3057,10 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { uint64_t maxread; - READ_BUF(8); - READ64(maxread); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxread); if (maxread > 0x7FFFFFFF) maxread = 0x7FFFFFFF; *res = (uint32_t)maxread; @@ -3014,6 +3068,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ } dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3026,8 +3083,10 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { uint64_t maxwrite; - READ_BUF(8); - READ64(maxwrite); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxwrite); if (maxwrite > 0x7FFFFFFF) maxwrite = 0x7FFFFFFF; *res = (uint32_t)maxwrite; @@ -3035,6 +3094,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 } dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) @@ -3047,14 +3109,19 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { - READ_BUF(4); - READ32(tmp); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + tmp = be32_to_cpup(p); *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; ret = NFS_ATTR_FATTR_MODE; } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) @@ -3066,16 +3133,22 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { - READ_BUF(4); - READ32(*nlink); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *nlink = be32_to_cpup(p); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) +static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_client *clp, uint32_t *uid, int may_sleep) { uint32_t len; __be32 *p; @@ -3085,10 +3158,16 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { - READ_BUF(4); - READ32(len); - READ_BUF(len); - if (len < XDR_MAX_NETOBJ) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (!may_sleep) { + /* do nothing */ + } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) ret = NFS_ATTR_FATTR_OWNER; else @@ -3101,9 +3180,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf } dprintk("%s: uid=%d\n", __func__, (int)*uid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) +static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_client *clp, uint32_t *gid, int may_sleep) { uint32_t len; __be32 *p; @@ -3113,10 +3196,16 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { - READ_BUF(4); - READ32(len); - READ_BUF(len); - if (len < XDR_MAX_NETOBJ) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (!may_sleep) { + /* do nothing */ + } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) ret = NFS_ATTR_FATTR_GROUP; else @@ -3129,6 +3218,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf } dprintk("%s: gid=%d\n", __func__, (int)*gid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) @@ -3143,9 +3235,11 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { dev_t tmp; - READ_BUF(8); - READ32(major); - READ32(minor); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + major = be32_to_cpup(p++); + minor = be32_to_cpup(p); tmp = MKDEV(major, minor); if (MAJOR(tmp) == major && MINOR(tmp) == minor) *rdev = tmp; @@ -3154,6 +3248,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3165,12 +3262,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3182,12 +3284,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3199,12 +3306,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) @@ -3216,14 +3328,19 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { - READ_BUF(8); - READ64(*used); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; ret = NFS_ATTR_FATTR_SPACE_USED; } dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) @@ -3232,12 +3349,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) uint64_t sec; uint32_t nsec; - READ_BUF(12); - READ64(sec); - READ32(nsec); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &sec); + nsec = be32_to_cpup(p); time->tv_sec = (time_t)sec; time->tv_nsec = (long)nsec; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) @@ -3315,11 +3437,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c { __be32 *p; - READ_BUF(20); - READ32(cinfo->atomic); - READ64(cinfo->before); - READ64(cinfo->after); + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + cinfo->atomic = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &cinfo->before); + xdr_decode_hyper(p, &cinfo->after); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) @@ -3331,40 +3458,62 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) status = decode_op_hdr(xdr, OP_ACCESS); if (status) return status; - READ_BUF(8); - READ32(supp); - READ32(acc); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + supp = be32_to_cpup(p++); + acc = be32_to_cpup(p); access->supported = supp; access->access = acc; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) { __be32 *p; + + p = xdr_inline_decode(xdr, len); + if (likely(p)) { + memcpy(buf, p, len); + return 0; + } + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); +} + +static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +{ int status; status = decode_op_hdr(xdr, OP_CLOSE); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_verifier(struct xdr_stream *xdr, void *verifier) +{ + return decode_opaque_fixed(xdr, verifier, 8); } static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_COMMIT); - if (status) - return status; - READ_BUF(8); - COPYMEM(res->verf->verifier, 8); - return 0; + if (!status) + status = decode_verifier(xdr, res->verf->verifier); + return status; } static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -3378,10 +3527,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) return status; if ((status = decode_change_info(xdr, cinfo))) return status; - READ_BUF(4); - READ32(bmlen); - READ_BUF(bmlen << 2); - return 0; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) @@ -3466,7 +3621,8 @@ xdr_error: return status; } -static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server) +static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + const struct nfs_server *server, int may_sleep) { __be32 *savep; uint32_t attrlen, @@ -3538,12 +3694,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons goto xdr_error; fattr->valid |= status; - status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid); + status = decode_attr_owner(xdr, bitmap, server->nfs_client, + &fattr->uid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; - status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid); + status = decode_attr_group(xdr, bitmap, server->nfs_client, + &fattr->gid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; @@ -3633,14 +3791,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) if (status) return status; - READ_BUF(4); - READ32(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; fh->size = len; - READ_BUF(len); - COPYMEM(fh->data, len); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + memcpy(fh->data, p, len); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -3662,10 +3827,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) __be32 *p; uint32_t namelen, type; - READ_BUF(32); - READ64(offset); - READ64(length); - READ32(type); + p = xdr_inline_decode(xdr, 32); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &offset); + p = xdr_decode_hyper(p, &length); + type = be32_to_cpup(p++); if (fl != NULL) { fl->fl_start = (loff_t)offset; fl->fl_end = fl->fl_start + (loff_t)length - 1; @@ -3676,23 +3843,27 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_type = F_RDLCK; fl->fl_pid = 0; } - READ64(clientid); - READ32(namelen); - READ_BUF(namelen); - return -NFS4ERR_DENIED; + p = xdr_decode_hyper(p, &clientid); + namelen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, namelen); + if (likely(p)) + return -NFS4ERR_DENIED; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_LOCK); if (status == -EIO) goto out; if (status == 0) { - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + goto out; } else if (status == -NFS4ERR_DENIED) status = decode_lock_denied(xdr, NULL); if (res->open_seqid != NULL) @@ -3713,16 +3884,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res) static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_LOCKU); if (status != -EIO) nfs_increment_lock_seqid(status, res->seqid); - if (status == 0) { - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - } + if (status == 0) + status = decode_stateid(xdr, &res->stateid); return status; } @@ -3737,34 +3905,46 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) __be32 *p; uint32_t limit_type, nblocks, blocksize; - READ_BUF(12); - READ32(limit_type); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + limit_type = be32_to_cpup(p++); switch (limit_type) { case 1: - READ64(*maxsize); + xdr_decode_hyper(p, maxsize); break; case 2: - READ32(nblocks); - READ32(blocksize); + nblocks = be32_to_cpup(p++); + blocksize = be32_to_cpup(p); *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) { __be32 *p; uint32_t delegation_type; + int status; - READ_BUF(4); - READ32(delegation_type); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + delegation_type = be32_to_cpup(p); if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { res->delegation_type = 0; return 0; } - READ_BUF(NFS4_STATEID_SIZE+4); - COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); - READ32(res->do_recall); + status = decode_stateid(xdr, &res->delegation); + if (unlikely(status)) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->do_recall = be32_to_cpup(p); switch (delegation_type) { case NFS4_OPEN_DELEGATE_READ: @@ -3776,6 +3956,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) return -EIO; } return decode_ace(xdr, NULL, res->server->nfs_client); +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) @@ -3787,23 +3970,27 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) status = decode_op_hdr(xdr, OP_OPEN); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) + if (!status) + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); decode_change_info(xdr, &res->cinfo); - READ_BUF(8); - READ32(res->rflags); - READ32(bmlen); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + res->rflags = be32_to_cpup(p++); + bmlen = be32_to_cpup(p); if (bmlen > 10) goto xdr_error; - READ_BUF(bmlen << 2); + p = xdr_inline_decode(xdr, bmlen << 2); + if (unlikely(!p)) + goto out_overflow; savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); for (i = 0; i < savewords; ++i) - READ32(res->attrset[i]); + res->attrset[i] = be32_to_cpup(p++); for (; i < NFS4_BITMAP_SIZE; i++) res->attrset[i] = 0; @@ -3811,36 +3998,33 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) xdr_error: dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_putfh(struct xdr_stream *xdr) @@ -3863,9 +4047,11 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ status = decode_op_hdr(xdr, OP_READ); if (status) return status; - READ_BUF(8); - READ32(eof); - READ32(count); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + eof = be32_to_cpup(p++); + count = be32_to_cpup(p); hdrlen = (u8 *) p - (u8 *) iov->iov_base; recvd = req->rq_rcv_buf.len - hdrlen; if (count > recvd) { @@ -3878,6 +4064,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ res->eof = eof; res->count = count; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) @@ -3892,17 +4081,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n int status; status = decode_op_hdr(xdr, OP_READDIR); - if (status) + if (!status) + status = decode_verifier(xdr, readdir->verifier.data); + if (unlikely(status)) return status; - READ_BUF(8); - COPYMEM(readdir->verifier.data, 8); dprintk("%s: verifier = %08x:%08x\n", __func__, ((u32 *)readdir->verifier.data)[0], ((u32 *)readdir->verifier.data)[1]); - hdrlen = (char *) p - (char *) iov->iov_base; + hdrlen = (char *) xdr->p - (char *) iov->iov_base; recvd = rcvbuf->len - hdrlen; if (pglen > recvd) pglen = recvd; @@ -3990,8 +4179,10 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) return status; /* Convert length of symlink */ - READ_BUF(4); - READ32(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); if (len >= rcvbuf->page_len || len <= 0) { dprintk("nfs: server returned giant symlink!\n"); return -ENAMETOOLONG; @@ -4015,6 +4206,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) kaddr[len+rcvbuf->page_base] = '\0'; kunmap_atomic(kaddr, KM_USER0); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -4112,10 +4306,16 @@ static int decode_setattr(struct xdr_stream *xdr) status = decode_op_hdr(xdr, OP_SETATTR); if (status) return status; - READ_BUF(4); - READ32(bmlen); - READ_BUF(bmlen << 2); - return 0; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) @@ -4124,35 +4324,50 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) uint32_t opnum; int32_t nfserr; - READ_BUF(8); - READ32(opnum); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); if (opnum != OP_SETCLIENTID) { dprintk("nfs: decode_setclientid: Server returned operation" " %d\n", opnum); return -EIO; } - READ32(nfserr); + nfserr = be32_to_cpup(p); if (nfserr == NFS_OK) { - READ_BUF(8 + NFS4_VERIFIER_SIZE); - READ64(clp->cl_clientid); - COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE); + p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &clp->cl_clientid); + memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { uint32_t len; /* skip netid string */ - READ_BUF(4); - READ32(len); - READ_BUF(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; /* skip uaddr string */ - READ_BUF(4); - READ32(len); - READ_BUF(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; return -NFSERR_CLID_INUSE; } else return nfs4_stat_to_errno(nfserr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_setclientid_confirm(struct xdr_stream *xdr) @@ -4169,11 +4384,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) if (status) return status; - READ_BUF(16); - READ32(res->count); - READ32(res->verf->committed); - COPYMEM(res->verf->verifier, 8); + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + res->count = be32_to_cpup(p++); + res->verf->committed = be32_to_cpup(p++); + memcpy(res->verf->verifier, p, 8); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_delegreturn(struct xdr_stream *xdr) @@ -4187,6 +4407,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, { __be32 *p; uint32_t dummy; + char *dummy_str; int status; struct nfs_client *clp = res->client; @@ -4194,36 +4415,45 @@ static int decode_exchange_id(struct xdr_stream *xdr, if (status) return status; - READ_BUF(8); - READ64(clp->cl_ex_clid); - READ_BUF(12); - READ32(clp->cl_seqid); - READ32(clp->cl_exchange_flags); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &clp->cl_ex_clid); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + clp->cl_exchange_flags = be32_to_cpup(p++); /* We ask for SP4_NONE */ - READ32(dummy); + dummy = be32_to_cpup(p); if (dummy != SP4_NONE) return -EIO; /* Throw away minor_id */ - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; /* Throw away Major id */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; /* Throw away server_scope */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; /* Throw away Implementation id array */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_chan_attrs(struct xdr_stream *xdr, @@ -4232,22 +4462,35 @@ static int decode_chan_attrs(struct xdr_stream *xdr, __be32 *p; u32 nr_attrs; - READ_BUF(28); - READ32(attrs->headerpadsz); - READ32(attrs->max_rqst_sz); - READ32(attrs->max_resp_sz); - READ32(attrs->max_resp_sz_cached); - READ32(attrs->max_ops); - READ32(attrs->max_reqs); - READ32(nr_attrs); + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; + attrs->headerpadsz = be32_to_cpup(p++); + attrs->max_rqst_sz = be32_to_cpup(p++); + attrs->max_resp_sz = be32_to_cpup(p++); + attrs->max_resp_sz_cached = be32_to_cpup(p++); + attrs->max_ops = be32_to_cpup(p++); + attrs->max_reqs = be32_to_cpup(p++); + nr_attrs = be32_to_cpup(p); if (unlikely(nr_attrs > 1)) { printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", __func__, nr_attrs); return -EINVAL; } - if (nr_attrs == 1) - READ_BUF(4); /* skip rdma_attrs */ + if (nr_attrs == 1) { + p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ + if (unlikely(!p)) + goto out_overflow; + } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) +{ + return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN); } static int decode_create_session(struct xdr_stream *xdr, @@ -4259,24 +4502,26 @@ static int decode_create_session(struct xdr_stream *xdr, struct nfs4_session *session = clp->cl_session; status = decode_op_hdr(xdr, OP_CREATE_SESSION); - - if (status) + if (!status) + status = decode_sessionid(xdr, &session->sess_id); + if (unlikely(status)) return status; - /* sessionid */ - READ_BUF(NFS4_MAX_SESSIONID_LEN); - COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN); - /* seqid, flags */ - READ_BUF(8); - READ32(clp->cl_seqid); - READ32(session->flags); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + session->flags = be32_to_cpup(p); /* Channel attributes */ status = decode_chan_attrs(xdr, &session->fc_attrs); if (!status) status = decode_chan_attrs(xdr, &session->bc_attrs); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) @@ -4300,7 +4545,9 @@ static int decode_sequence(struct xdr_stream *xdr, return 0; status = decode_op_hdr(xdr, OP_SEQUENCE); - if (status) + if (!status) + status = decode_sessionid(xdr, &id); + if (unlikely(status)) goto out_err; /* @@ -4309,36 +4556,43 @@ static int decode_sequence(struct xdr_stream *xdr, */ status = -ESERVERFAULT; - slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; - READ_BUF(NFS4_MAX_SESSIONID_LEN + 20); - COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN); if (memcmp(id.data, res->sr_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("%s Invalid session id\n", __func__); goto out_err; } + + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + /* seqid */ - READ32(dummy); + slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; + dummy = be32_to_cpup(p++); if (dummy != slot->seq_nr) { dprintk("%s Invalid sequence number\n", __func__); goto out_err; } /* slot id */ - READ32(dummy); + dummy = be32_to_cpup(p++); if (dummy != res->sr_slotid) { dprintk("%s Invalid slot id\n", __func__); goto out_err; } /* highest slot id - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* target highest slot id - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* result flags - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p); status = 0; out_err: res->sr_status = status; return status; +out_overflow: + print_overflow_msg(__func__, xdr); + status = -EIO; + goto out_err; #else /* CONFIG_NFS_V4_1 */ return 0; #endif /* CONFIG_NFS_V4_1 */ @@ -4370,7 +4624,8 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct status = decode_open_downgrade(&xdr, res); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4397,7 +4652,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac status = decode_access(&xdr, res); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4424,7 +4680,8 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo goto out; if ((status = decode_getfh(&xdr, res->fh)) != 0) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server + ,!RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4448,7 +4705,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf if ((status = decode_putrootfh(&xdr)) != 0) goto out; if ((status = decode_getfh(&xdr, res->fh)) == 0) - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4473,7 +4731,8 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem goto out; if ((status = decode_remove(&xdr, &res->cinfo)) != 0) goto out; - decode_getfattr(&xdr, &res->dir_attr, res->server); + decode_getfattr(&xdr, &res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4503,11 +4762,13 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) goto out; /* Current FH is target directory */ - if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0) + if (decode_getfattr(&xdr, res->new_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if ((status = decode_restorefh(&xdr)) != 0) goto out; - decode_getfattr(&xdr, res->old_fattr, res->server); + decode_getfattr(&xdr, res->old_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4540,11 +4801,13 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link * Note order: OP_LINK leaves the directory as the current * filehandle. */ - if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0) + if (decode_getfattr(&xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if ((status = decode_restorefh(&xdr)) != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4573,11 +4836,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr goto out; if ((status = decode_getfh(&xdr, res->fh)) != 0) goto out; - if (decode_getfattr(&xdr, res->fattr, res->server) != 0) + if (decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if ((status = decode_restorefh(&xdr)) != 0) goto out; - decode_getfattr(&xdr, res->dir_fattr, res->server); + decode_getfattr(&xdr, res->dir_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4609,7 +4874,8 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g status = decode_putfh(&xdr); if (status) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4716,7 +4982,8 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos * an ESTALE error. Shouldn't be a problem, * though, since fattr->valid will remain unset. */ - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4748,11 +5015,13 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr goto out; if (decode_getfh(&xdr, &res->fh) != 0) goto out; - if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) + if (decode_getfattr(&xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if (decode_restorefh(&xdr) != 0) goto out; - decode_getfattr(&xdr, res->dir_attr, res->server); + decode_getfattr(&xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4800,7 +5069,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf status = decode_open(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->f_attr, res->server); + decode_getfattr(&xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4827,7 +5097,8 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se status = decode_setattr(&xdr); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5001,7 +5272,8 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ status = decode_write(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); if (!status) status = res->count; out: @@ -5030,7 +5302,8 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri status = decode_commit(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5194,7 +5467,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf if (status != 0) goto out; status = decode_delegreturn(&xdr); - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5222,7 +5496,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, goto out; xdr_enter_page(&xdr, PAGE_SIZE); status = decode_getfattr(&xdr, &res->fs_locations->fattr, - res->fs_locations->server); + res->fs_locations->server, + !RPC_IS_ASYNC(req->rq_task)); out: return status; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 7be72d9..ef58385 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -32,7 +32,6 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/in.h> diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0b4cbdc..90be551 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -73,7 +73,7 @@ enum { Opt_cto, Opt_nocto, Opt_ac, Opt_noac, Opt_lock, Opt_nolock, - Opt_v2, Opt_v3, + Opt_v2, Opt_v3, Opt_v4, Opt_udp, Opt_tcp, Opt_rdma, Opt_acl, Opt_noacl, Opt_rdirplus, Opt_nordirplus, @@ -127,6 +127,7 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_nolock, "nolock" }, { Opt_v2, "v2" }, { Opt_v3, "v3" }, + { Opt_v4, "v4" }, { Opt_udp, "udp" }, { Opt_tcp, "tcp" }, { Opt_rdma, "rdma" }, @@ -158,7 +159,7 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_mountvers, "mountvers=%s" }, { Opt_nfsvers, "nfsvers=%s" }, { Opt_nfsvers, "vers=%s" }, - { Opt_minorversion, "minorversion=%u" }, + { Opt_minorversion, "minorversion=%s" }, { Opt_sec, "sec=%s" }, { Opt_proto, "proto=%s" }, @@ -272,6 +273,10 @@ static const struct super_operations nfs_sops = { }; #ifdef CONFIG_NFS_V4 +static int nfs4_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, const char *dev_name); +static int nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data, struct vfsmount *mnt); static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static int nfs4_remote_get_sb(struct file_system_type *fs_type, @@ -723,6 +728,29 @@ static void nfs_umount_begin(struct super_block *sb) unlock_kernel(); } +static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) +{ + struct nfs_parsed_mount_data *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data) { + data->rsize = NFS_MAX_FILE_IO_SIZE; + data->wsize = NFS_MAX_FILE_IO_SIZE; + data->acregmin = NFS_DEF_ACREGMIN; + data->acregmax = NFS_DEF_ACREGMAX; + data->acdirmin = NFS_DEF_ACDIRMIN; + data->acdirmax = NFS_DEF_ACDIRMAX; + data->mount_server.port = NFS_UNSPEC_PORT; + data->nfs_server.port = NFS_UNSPEC_PORT; + data->nfs_server.protocol = XPRT_TRANSPORT_TCP; + data->auth_flavors[0] = RPC_AUTH_UNIX; + data->auth_flavor_len = 1; + data->version = version; + data->minorversion = 0; + } + return data; +} + /* * Sanity-check a server address provided by the mount command. * @@ -742,127 +770,21 @@ static int nfs_verify_server_address(struct sockaddr *addr) } } + dfprintk(MOUNT, "NFS: Invalid IP address specified\n"); return 0; } -static void nfs_parse_ipv4_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) -{ - struct sockaddr_in *sin = (struct sockaddr_in *)sap; - u8 *addr = (u8 *)&sin->sin_addr.s_addr; - - if (str_len <= INET_ADDRSTRLEN) { - dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n", - (int)str_len, string); - - sin->sin_family = AF_INET; - *addr_len = sizeof(*sin); - if (in4_pton(string, str_len, addr, '\0', NULL)) - return; - } - - sap->sa_family = AF_UNSPEC; - *addr_len = 0; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, - const char *delim, - struct sockaddr_in6 *sin6) -{ - char *p; - size_t len; - - if ((string + str_len) == delim) - return 1; - - if (*delim != IPV6_SCOPE_DELIMITER) - return 0; - - if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) - return 0; - - len = (string + str_len) - delim - 1; - p = kstrndup(delim + 1, len, GFP_KERNEL); - if (p) { - unsigned long scope_id = 0; - struct net_device *dev; - - dev = dev_get_by_name(&init_net, p); - if (dev != NULL) { - scope_id = dev->ifindex; - dev_put(dev); - } else { - if (strict_strtoul(p, 10, &scope_id) == 0) { - kfree(p); - return 0; - } - } - - kfree(p); - - sin6->sin6_scope_id = scope_id; - dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); - return 1; - } - - return 0; -} - -static void nfs_parse_ipv6_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) -{ - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; - const char *delim; - - if (str_len <= INET6_ADDRSTRLEN) { - dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n", - (int)str_len, string); - - sin6->sin6_family = AF_INET6; - *addr_len = sizeof(*sin6); - if (in6_pton(string, str_len, addr, - IPV6_SCOPE_DELIMITER, &delim) != 0) { - if (nfs_parse_ipv6_scope_id(string, str_len, - delim, sin6) != 0) - return; - } - } - - sap->sa_family = AF_UNSPEC; - *addr_len = 0; -} -#else -static void nfs_parse_ipv6_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) -{ - sap->sa_family = AF_UNSPEC; - *addr_len = 0; -} -#endif - /* - * Construct a sockaddr based on the contents of a string that contains - * an IP address in presentation format. - * - * If there is a problem constructing the new sockaddr, set the address - * family to AF_UNSPEC. + * Select between a default port value and a user-specified port value. + * If a zero value is set, then autobind will be used. */ -void nfs_parse_ip_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) +static void nfs_set_port(struct sockaddr *sap, int *port, + const unsigned short default_port) { - unsigned int i, colons; - - colons = 0; - for (i = 0; i < str_len; i++) - if (string[i] == ':') - colons++; + if (*port == NFS_UNSPEC_PORT) + *port = default_port; - if (colons >= 2) - nfs_parse_ipv6_address(string, str_len, sap, addr_len); - else - nfs_parse_ipv4_address(string, str_len, sap, addr_len); + rpc_set_port(sap, *port); } /* @@ -904,8 +826,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) /* * Parse the value of the 'sec=' option. - * - * The flavor_len setting is for v4 mounts. */ static int nfs_parse_security_flavors(char *value, struct nfs_parsed_mount_data *mnt) @@ -916,53 +836,43 @@ static int nfs_parse_security_flavors(char *value, switch (match_token(value, nfs_secflavor_tokens, args)) { case Opt_sec_none: - mnt->auth_flavor_len = 0; mnt->auth_flavors[0] = RPC_AUTH_NULL; break; case Opt_sec_sys: - mnt->auth_flavor_len = 0; mnt->auth_flavors[0] = RPC_AUTH_UNIX; break; case Opt_sec_krb5: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; break; case Opt_sec_krb5i: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; break; case Opt_sec_krb5p: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; break; case Opt_sec_lkey: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; break; case Opt_sec_lkeyi: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; break; case Opt_sec_lkeyp: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; break; case Opt_sec_spkm: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; break; case Opt_sec_spkmi: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; break; case Opt_sec_spkmp: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; break; default: return 0; } + mnt->auth_flavor_len = 1; return 1; } @@ -1001,7 +911,6 @@ static int nfs_parse_mount_options(char *raw, while ((p = strsep(&raw, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; unsigned long option; - int int_option; int token; if (!*p) @@ -1047,10 +956,18 @@ static int nfs_parse_mount_options(char *raw, break; case Opt_v2: mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 2; break; case Opt_v3: mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; + break; +#ifdef CONFIG_NFS_V4 + case Opt_v4: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 4; break; +#endif case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; @@ -1264,20 +1181,33 @@ static int nfs_parse_mount_options(char *raw, switch (option) { case NFS2_VERSION: mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 2; break; case NFS3_VERSION: mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; + break; +#ifdef CONFIG_NFS_V4 + case NFS4_VERSION: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 4; break; +#endif default: goto out_invalid_value; } break; case Opt_minorversion: - if (match_int(args, &int_option)) - return 0; - if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION) - return 0; - mnt->minorversion = int_option; + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + rc = strict_strtoul(string, 10, &option); + kfree(string); + if (rc != 0) + goto out_invalid_value; + if (option > NFS4_MAX_MINOR_VERSION) + goto out_invalid_value; + mnt->minorversion = option; break; /* @@ -1323,6 +1253,7 @@ static int nfs_parse_mount_options(char *raw, default: dfprintk(MOUNT, "NFS: unrecognized " "transport protocol\n"); + kfree(string); return 0; } break; @@ -1352,11 +1283,14 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_ip_address(string, strlen(string), - (struct sockaddr *) - &mnt->nfs_server.address, - &mnt->nfs_server.addrlen); + mnt->nfs_server.addrlen = + rpc_pton(string, strlen(string), + (struct sockaddr *) + &mnt->nfs_server.address, + sizeof(mnt->nfs_server.address)); kfree(string); + if (mnt->nfs_server.addrlen == 0) + goto out_invalid_address; break; case Opt_clientaddr: string = match_strdup(args); @@ -1376,11 +1310,14 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_ip_address(string, strlen(string), - (struct sockaddr *) - &mnt->mount_server.address, - &mnt->mount_server.addrlen); + mnt->mount_server.addrlen = + rpc_pton(string, strlen(string), + (struct sockaddr *) + &mnt->mount_server.address, + sizeof(mnt->mount_server.address)); kfree(string); + if (mnt->mount_server.addrlen == 0) + goto out_invalid_address; break; case Opt_lookupcache: string = match_strdup(args); @@ -1432,8 +1369,11 @@ static int nfs_parse_mount_options(char *raw, return 1; +out_invalid_address: + printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); + return 0; out_invalid_value: - printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p); + printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); return 0; out_nomem: printk(KERN_INFO "NFS: not enough memory to parse option\n"); @@ -1445,13 +1385,60 @@ out_security_failure: } /* + * Match the requested auth flavors with the list returned by + * the server. Returns zero and sets the mount's authentication + * flavor on success; returns -EACCES if server does not support + * the requested flavor. + */ +static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, + struct nfs_mount_request *request) +{ + unsigned int i, j, server_authlist_len = *(request->auth_flav_len); + + /* + * Certain releases of Linux's mountd return an empty + * flavor list. To prevent behavioral regression with + * these servers (ie. rejecting mounts that used to + * succeed), revert to pre-2.6.32 behavior (no checking) + * if the returned flavor list is empty. + */ + if (server_authlist_len == 0) + return 0; + + /* + * We avoid sophisticated negotiating here, as there are + * plenty of cases where we can get it wrong, providing + * either too little or too much security. + * + * RFC 2623, section 2.7 suggests we SHOULD prefer the + * flavor listed first. However, some servers list + * AUTH_NULL first. Our caller plants AUTH_SYS, the + * preferred default, in args->auth_flavors[0] if user + * didn't specify sec= mount option. + */ + for (i = 0; i < args->auth_flavor_len; i++) + for (j = 0; j < server_authlist_len; j++) + if (args->auth_flavors[i] == request->auth_flavs[j]) { + dfprintk(MOUNT, "NFS: using auth flavor %d\n", + request->auth_flavs[j]); + args->auth_flavors[0] = request->auth_flavs[j]; + return 0; + } + + dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n"); + nfs_umount(request); + return -EACCES; +} + +/* * Use the remote server's MOUNT service to request the NFS file handle * corresponding to the provided path. */ static int nfs_try_mount(struct nfs_parsed_mount_data *args, struct nfs_fh *root_fh) { - unsigned int auth_flavor_len = 0; + rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS]; + unsigned int server_authlist_len = ARRAY_SIZE(server_authlist); struct nfs_mount_request request = { .sap = (struct sockaddr *) &args->mount_server.address, @@ -1459,15 +1446,19 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, .protocol = args->mount_server.protocol, .fh = root_fh, .noresvport = args->flags & NFS_MOUNT_NORESVPORT, - .auth_flav_len = &auth_flavor_len, + .auth_flav_len = &server_authlist_len, + .auth_flavs = server_authlist, }; int status; if (args->mount_server.version == 0) { - if (args->flags & NFS_MOUNT_VER3) - args->mount_server.version = NFS_MNT3_VERSION; - else - args->mount_server.version = NFS_MNT_VERSION; + switch (args->version) { + default: + args->mount_server.version = NFS_MNT3_VERSION; + break; + case 2: + args->mount_server.version = NFS_MNT_VERSION; + } } request.version = args->mount_server.version; @@ -1485,23 +1476,25 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, args->mount_server.addrlen = args->nfs_server.addrlen; } request.salen = args->mount_server.addrlen; - - /* - * autobind will be used if mount_server.port == 0 - */ - nfs_set_port(request.sap, args->mount_server.port); + nfs_set_port(request.sap, &args->mount_server.port, 0); /* * Now ask the mount server to map our export path * to a file handle. */ status = nfs_mount(&request); - if (status == 0) - return 0; + if (status != 0) { + dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", + request.hostname, status); + return status; + } - dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", - request.hostname, status); - return status; + /* + * MNTv1 (NFSv2) does not support auth flavor negotiation. + */ + if (args->mount_server.version != NFS_MNT3_VERSION) + return 0; + return nfs_walk_authlist(args, &request); } static int nfs_parse_simple_hostname(const char *dev_name, @@ -1661,22 +1654,11 @@ static int nfs_validate_mount_data(void *options, const char *dev_name) { struct nfs_mount_data *data = (struct nfs_mount_data *)options; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; if (data == NULL) goto out_no_data; - args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); - args->rsize = NFS_MAX_FILE_IO_SIZE; - args->wsize = NFS_MAX_FILE_IO_SIZE; - args->acregmin = NFS_DEF_ACREGMIN; - args->acregmax = NFS_DEF_ACREGMAX; - args->acdirmin = NFS_DEF_ACDIRMIN; - args->acdirmax = NFS_DEF_ACDIRMAX; - args->mount_server.port = 0; /* autobind unless user sets port */ - args->nfs_server.port = 0; /* autobind unless user sets port */ - args->nfs_server.protocol = XPRT_TRANSPORT_TCP; - args->auth_flavors[0] = RPC_AUTH_UNIX; - switch (data->version) { case 1: data->namlen = 0; @@ -1697,8 +1679,11 @@ static int nfs_validate_mount_data(void *options, if (data->root.size > NFS3_FHSIZE || data->root.size == 0) goto out_invalid_fh; mntfh->size = data->root.size; - } else + args->version = 3; + } else { mntfh->size = NFS2_FHSIZE; + args->version = 2; + } memcpy(mntfh->data, data->root.data, mntfh->size); @@ -1720,11 +1705,9 @@ static int nfs_validate_mount_data(void *options, args->acdirmin = data->acdirmin; args->acdirmax = data->acdirmax; - memcpy(&args->nfs_server.address, &data->addr, - sizeof(data->addr)); + memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) @@ -1772,12 +1755,18 @@ static int nfs_validate_mount_data(void *options, if (nfs_parse_mount_options((char *)options, args) == 0) return -EINVAL; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; - nfs_set_port((struct sockaddr *)&args->nfs_server.address, - args->nfs_server.port); + if (args->version == 4) +#ifdef CONFIG_NFS_V4 + return nfs4_validate_text_mount_data(options, + args, dev_name); +#else + goto out_v4_not_compiled; +#endif + + nfs_set_port(sap, &args->nfs_server.port, 0); nfs_set_mount_transport_protocol(args); @@ -1800,7 +1789,7 @@ static int nfs_validate_mount_data(void *options, } #ifndef CONFIG_NFS_V3 - if (args->flags & NFS_MOUNT_VER3) + if (args->version == 3) goto out_v3_not_compiled; #endif /* !CONFIG_NFS_V3 */ @@ -1825,6 +1814,12 @@ out_v3_not_compiled: return -EPROTONOSUPPORT; #endif /* !CONFIG_NFS_V3 */ +#ifndef CONFIG_NFS_V4 +out_v4_not_compiled: + dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); + return -EPROTONOSUPPORT; +#endif /* !CONFIG_NFS_V4 */ + out_nomem: dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); return -ENOMEM; @@ -1852,9 +1847,10 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->acdirmin != nfss->acdirmin / HZ || data->acdirmax != nfss->acdirmax / HZ || data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.port != nfss->port || data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || - memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr, - data->nfs_server.addrlen) != 0) + !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address, + (struct sockaddr *)&nfss->nfs_client->cl_addr)) return -EINVAL; return 0; @@ -1897,6 +1893,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->acdirmin = nfss->acdirmin / HZ; data->acdirmax = nfss->acdirmax / HZ; data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.port = nfss->port; data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, data->nfs_server.addrlen); @@ -1934,6 +1931,8 @@ static inline void nfs_initialise_sb(struct super_block *sb) if (server->flags & NFS_MOUNT_NOAC) sb->s_flags |= MS_SYNCHRONOUS; + sb->s_bdi = &server->backing_dev_info; + nfs_super_set_maxbytes(sb, server->maxfilesize); } @@ -1950,7 +1949,7 @@ static void nfs_fill_super(struct super_block *sb, if (data->bsize) sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); - if (server->flags & NFS_MOUNT_VER3) { + if (server->nfs_client->rpc_ops->version == 3) { /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. */ @@ -1974,7 +1973,7 @@ static void nfs_clone_super(struct super_block *sb, sb->s_blocksize = old_sb->s_blocksize; sb->s_maxbytes = old_sb->s_maxbytes; - if (server->flags & NFS_MOUNT_VER3) { + if (server->nfs_client->rpc_ops->version == 3) { /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. */ @@ -2108,7 +2107,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, }; int error = -ENOMEM; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = nfs_alloc_parsed_mount_data(3); mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); if (data == NULL || mntfh == NULL) goto out_free_fh; @@ -2120,6 +2119,14 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (error < 0) goto out; +#ifdef CONFIG_NFS_V4 + if (data->version == 4) { + error = nfs4_try_mount(flags, dev_name, data, mnt); + kfree(data->client_address); + goto out; + } +#endif /* CONFIG_NFS_V4 */ + /* Get a volume representation */ server = nfs_create_server(data, mntfh); if (IS_ERR(server)) { @@ -2150,7 +2157,8 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ nfs_fill_super(s, data); - nfs_fscache_get_super_cookie(s, data); + nfs_fscache_get_super_cookie( + s, data ? data->fscache_uniq : NULL, NULL); } mntroot = nfs_get_root(s, mntfh); @@ -2196,8 +2204,8 @@ static void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); - bdi_unregister(&server->backing_dev_info); kill_anon_super(s); + bdi_unregister(&server->backing_dev_info); nfs_fscache_release_super_cookie(s); nfs_free_server(server); } @@ -2251,6 +2259,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, if (!s->s_root) { /* initial superblock/root creation */ nfs_clone_super(s, data->sb); + nfs_fscache_get_super_cookie(s, NULL, data); } mntroot = nfs_get_root(s, data->fh); @@ -2317,6 +2326,43 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); } +static int nfs4_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) +{ + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + + nfs_set_port(sap, &args->nfs_server.port, NFS_PORT); + + nfs_validate_transport_protocol(args); + + nfs4_validate_mount_flags(args); + + if (args->version != 4) { + dfprintk(MOUNT, + "NFS4: Illegal mount version\n"); + return -EINVAL; + } + + if (args->auth_flavor_len > 1) { + dfprintk(MOUNT, + "NFS4: Too many RPC auth flavours specified\n"); + return -EINVAL; + } + + if (args->client_address == NULL) { + dfprintk(MOUNT, + "NFS4: mount program didn't pass callback address\n"); + return -EINVAL; + } + + return nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + NFS4_MAXNAMLEN, + &args->nfs_server.export_path, + NFS4_MAXPATHLEN); +} + /* * Validate NFSv4 mount options */ @@ -2324,36 +2370,23 @@ static int nfs4_validate_mount_data(void *options, struct nfs_parsed_mount_data *args, const char *dev_name) { - struct sockaddr_in *ap; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; char *c; if (data == NULL) goto out_no_data; - args->rsize = NFS_MAX_FILE_IO_SIZE; - args->wsize = NFS_MAX_FILE_IO_SIZE; - args->acregmin = NFS_DEF_ACREGMIN; - args->acregmax = NFS_DEF_ACREGMAX; - args->acdirmin = NFS_DEF_ACDIRMIN; - args->acdirmax = NFS_DEF_ACDIRMAX; - args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ - args->auth_flavors[0] = RPC_AUTH_UNIX; - args->auth_flavor_len = 0; - args->minorversion = 0; - switch (data->version) { case 1: - ap = (struct sockaddr_in *)&args->nfs_server.address; if (data->host_addrlen > sizeof(args->nfs_server.address)) goto out_no_address; if (data->host_addrlen == 0) goto out_no_address; args->nfs_server.addrlen = data->host_addrlen; - if (copy_from_user(ap, data->host_addr, data->host_addrlen)) + if (copy_from_user(sap, data->host_addr, data->host_addrlen)) return -EFAULT; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; if (data->auth_flavourlen) { @@ -2399,39 +2432,14 @@ static int nfs4_validate_mount_data(void *options, nfs_validate_transport_protocol(args); break; - default: { - int status; - + default: if (nfs_parse_mount_options((char *)options, args) == 0) return -EINVAL; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) return -EINVAL; - nfs_set_port((struct sockaddr *)&args->nfs_server.address, - args->nfs_server.port); - - nfs_validate_transport_protocol(args); - - nfs4_validate_mount_flags(args); - - if (args->auth_flavor_len > 1) - goto out_inval_auth; - - if (args->client_address == NULL) - goto out_no_client_address; - - status = nfs_parse_devname(dev_name, - &args->nfs_server.hostname, - NFS4_MAXNAMLEN, - &args->nfs_server.export_path, - NFS4_MAXPATHLEN); - if (status < 0) - return status; - - break; - } + return nfs4_validate_text_mount_data(options, args, dev_name); } return 0; @@ -2448,10 +2456,6 @@ out_inval_auth: out_no_address: dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); return -EINVAL; - -out_no_client_address: - dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n"); - return -EINVAL; } /* @@ -2507,7 +2511,8 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ nfs4_fill_super(s); - nfs_fscache_get_super_cookie(s, data); + nfs_fscache_get_super_cookie( + s, data ? data->fscache_uniq : NULL, NULL); } mntroot = nfs4_get_root(s, mntfh); @@ -2618,6 +2623,34 @@ out_err: return ret; } +static int nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data, + struct vfsmount *mnt) +{ + char *export_path; + struct vfsmount *root_mnt; + int error; + + dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + + export_path = data->nfs_server.export_path; + data->nfs_server.export_path = "/"; + root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, + data->nfs_server.hostname); + data->nfs_server.export_path = export_path; + + error = PTR_ERR(root_mnt); + if (IS_ERR(root_mnt)) + goto out; + + error = nfs_follow_remote_path(root_mnt, export_path, mnt); + +out: + dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error, + error != 0 ? " [error]" : ""); + return error; +} + /* * Get the superblock for an NFS4 mountpoint */ @@ -2625,11 +2658,9 @@ static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { struct nfs_parsed_mount_data *data; - char *export_path; - struct vfsmount *root_mnt; int error = -ENOMEM; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = nfs_alloc_parsed_mount_data(4); if (data == NULL) goto out_free_data; @@ -2638,17 +2669,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, if (error < 0) goto out; - export_path = data->nfs_server.export_path; - data->nfs_server.export_path = "/"; - root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, - data->nfs_server.hostname); - data->nfs_server.export_path = export_path; - - error = PTR_ERR(root_mnt); - if (IS_ERR(root_mnt)) - goto out; - - error = nfs_follow_remote_path(root_mnt, export_path, mnt); + error = nfs4_try_mount(flags, dev_name, data, mnt); out: kfree(data->client_address); @@ -2669,7 +2690,6 @@ static void nfs4_kill_super(struct super_block *sb) dprintk("--> %s\n", __func__); nfs_super_return_all_delegations(sb); kill_anon_super(sb); - nfs4_renewd_prepare_shutdown(server); nfs_fscache_release_super_cookie(sb); nfs_free_server(server); dprintk("<-- %s\n", __func__); @@ -2724,6 +2744,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, if (!s->s_root) { /* initial superblock/root creation */ nfs4_clone_super(s, data->sb); + nfs_fscache_get_super_cookie(s, NULL, data); } mntroot = nfs4_get_root(s, data->fh); @@ -2805,6 +2826,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ nfs4_fill_super(s); + nfs_fscache_get_super_cookie(s, NULL, data); } mntroot = nfs4_get_root(s, &mntfh); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index a34fae2..53eb26c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -13,6 +13,7 @@ #include <linux/file.h> #include <linux/writeback.h> #include <linux/swap.h> +#include <linux/migrate.h> #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> @@ -26,6 +27,7 @@ #include "internal.h" #include "iostat.h" #include "nfs4_fs.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -218,24 +220,17 @@ static void nfs_end_page_writeback(struct page *page) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } -/* - * Find an associated nfs write request, and prepare to flush it out - * May return an error if the user signalled nfs_wait_on_request(). - */ -static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page) +static struct nfs_page *nfs_find_and_lock_request(struct page *page) { struct inode *inode = page->mapping->host; struct nfs_page *req; int ret; spin_lock(&inode->i_lock); - for(;;) { + for (;;) { req = nfs_page_find_request_locked(page); - if (req == NULL) { - spin_unlock(&inode->i_lock); - return 0; - } + if (req == NULL) + break; if (nfs_set_page_tag_locked(req)) break; /* Note: If we hold the page lock, as is the case in nfs_writepage, @@ -247,23 +242,40 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, ret = nfs_wait_on_request(req); nfs_release_request(req); if (ret != 0) - return ret; + return ERR_PTR(ret); spin_lock(&inode->i_lock); } - if (test_bit(PG_CLEAN, &req->wb_flags)) { - spin_unlock(&inode->i_lock); - BUG(); - } - if (nfs_set_page_writeback(page) != 0) { - spin_unlock(&inode->i_lock); - BUG(); - } spin_unlock(&inode->i_lock); + return req; +} + +/* + * Find an associated nfs write request, and prepare to flush it out + * May return an error if the user signalled nfs_wait_on_request(). + */ +static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, + struct page *page) +{ + struct nfs_page *req; + int ret = 0; + + req = nfs_find_and_lock_request(page); + if (!req) + goto out; + ret = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + ret = nfs_set_page_writeback(page); + BUG_ON(ret != 0); + BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); + if (!nfs_pageio_add_request(pgio, req)) { nfs_redirty_request(req); - return pgio->pg_error; + ret = pgio->pg_error; } - return 0; +out: + return ret; } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) @@ -1478,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how) .nr_to_write = LONG_MAX, .range_start = 0, .range_end = LLONG_MAX, - .for_writepages = 1, }; return __nfs_write_mapping(mapping, &wbc, how); @@ -1580,6 +1591,41 @@ int nfs_wb_page(struct inode *inode, struct page* page) return nfs_wb_page_priority(inode, page, FLUSH_STABLE); } +#ifdef CONFIG_MIGRATION +int nfs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page) +{ + struct nfs_page *req; + int ret; + + if (PageFsCache(page)) + nfs_fscache_release_page(page, GFP_KERNEL); + + req = nfs_find_and_lock_request(page); + ret = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + ret = migrate_page(mapping, newpage, page); + if (!req) + goto out; + if (ret) + goto out_unlock; + page_cache_get(newpage); + req->wb_page = newpage; + SetPagePrivate(newpage); + set_page_private(newpage, page_private(page)); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); +out_unlock: + nfs_clear_page_tag_locked(req); + nfs_release_request(req); +out: + return ret; +} +#endif + int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 5573508..36fcabbf 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) int flags = nfsexp_flags(rqstp, exp); int ret; + validate_process_creds(); + /* discard any old override before preparing the new set */ revert_creds(get_cred(current->real_cred)); new = prepare_creds(); @@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); + validate_process_creds(); put_cred(override_creds(new)); put_cred(new); + validate_process_creds(); return 0; oom: diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index b92a276..c1c9e03 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -85,6 +85,11 @@ static void expkey_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } +static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, expkey_request); +} + static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); static struct cache_detail svc_expkey_cache; @@ -259,7 +264,7 @@ static struct cache_detail svc_expkey_cache = { .hash_table = expkey_table, .name = "nfsd.fh", .cache_put = expkey_put, - .cache_request = expkey_request, + .cache_upcall = expkey_upcall, .cache_parse = expkey_parse, .cache_show = expkey_show, .match = expkey_match, @@ -355,6 +360,11 @@ static void svc_export_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } +static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); +} + static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); @@ -724,7 +734,7 @@ struct cache_detail svc_export_cache = { .hash_table = export_table, .name = "nfsd.export", .cache_put = svc_export_put, - .cache_request = svc_export_request, + .cache_upcall = svc_export_upcall, .cache_parse = svc_export_parse, .cache_show = svc_export_show, .match = svc_export_match, @@ -1331,6 +1341,8 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) if (rv) goto out; rv = check_nfsd_access(exp, rqstp); + if (rv) + fh_put(fhp); out: exp_put(exp); return rv; @@ -1505,7 +1517,7 @@ static int e_show(struct seq_file *m, void *p) return svc_export_show(m, &svc_export_cache, cp); } -struct seq_operations nfs_exports_op = { +const struct seq_operations nfs_exports_op = { .start = e_start, .next = e_next, .stop = e_stop, diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 01d4ec1..edf926e 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -814,17 +814,6 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, return p; } -static __be32 * -encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, - struct svc_fh *fhp) -{ - p = encode_post_op_attr(cd->rqstp, p, fhp); - *p++ = xdr_one; /* yes, a file handle follows */ - p = encode_fh(p, fhp); - fh_put(fhp); - return p; -} - static int compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, const char *name, int namlen) @@ -836,29 +825,54 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, dparent = cd->fh.fh_dentry; exp = cd->fh.fh_export; - fh_init(fhp, NFS3_FHSIZE); if (isdotent(name, namlen)) { if (namlen == 2) { dchild = dget_parent(dparent); if (dchild == dparent) { /* filesystem root - cannot return filehandle for ".." */ dput(dchild); - return 1; + return -ENOENT; } } else dchild = dget(dparent); } else dchild = lookup_one_len(name, dparent, namlen); if (IS_ERR(dchild)) - return 1; - if (d_mountpoint(dchild) || - fh_compose(fhp, exp, dchild, &cd->fh) != 0 || - !dchild->d_inode) - rv = 1; + return -ENOENT; + rv = -ENOENT; + if (d_mountpoint(dchild)) + goto out; + rv = fh_compose(fhp, exp, dchild, &cd->fh); + if (rv) + goto out; + if (!dchild->d_inode) + goto out; + rv = 0; +out: dput(dchild); return rv; } +__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) +{ + struct svc_fh fh; + int err; + + fh_init(&fh, NFS3_FHSIZE); + err = compose_entry_fh(cd, &fh, name, namlen); + if (err) { + *p++ = 0; + *p++ = 0; + goto out; + } + p = encode_post_op_attr(cd->rqstp, p, &fh); + *p++ = xdr_one; /* yes, a file handle follows */ + p = encode_fh(p, &fh); +out: + fh_put(&fh); + return p; +} + /* * Encode a directory entry. This one works for both normal readdir * and readdirplus. @@ -929,16 +943,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, p = encode_entry_baggage(cd, p, name, namlen, ino); - /* throw in readdirplus baggage */ - if (plus) { - struct svc_fh fh; - - if (compose_entry_fh(cd, &fh, name, namlen) > 0) { - *p++ = 0; - *p++ = 0; - } else - p = encode_entryplus_baggage(cd, p, &fh); - } + if (plus) + p = encode_entryplus_baggage(cd, p, name, namlen); num_entry_words = p - cd->buffer; } else if (cd->rqstp->rq_respages[pn+1] != NULL) { /* temporarily encode entry into next page, then move back to @@ -951,17 +957,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, p1 = encode_entry_baggage(cd, p1, name, namlen, ino); - /* throw in readdirplus baggage */ - if (plus) { - struct svc_fh fh; - - if (compose_entry_fh(cd, &fh, name, namlen) > 0) { - /* zero out the filehandle */ - *p1++ = 0; - *p1++ = 0; - } else - p1 = encode_entryplus_baggage(cd, p1, &fh); - } + if (plus) + p = encode_entryplus_baggage(cd, p1, name, namlen); /* determine entry word length and lengths to go in pages */ num_entry_words = p1 - tmp; diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 54b8b41..725d02f 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -321,7 +321,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, deny = ~pas.group & pas.other; if (deny) { ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; - ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->flag = eflag; ace->access_mask = deny_mask_from_posix(deny, flags); ace->whotype = NFS4_ACL_WHO_GROUP; ace++; @@ -335,7 +335,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, if (deny) { ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; - ace->access_mask = mask_from_posix(deny, flags); + ace->access_mask = deny_mask_from_posix(deny, flags); ace->whotype = NFS4_ACL_WHO_NAMED; ace->who = pa->e_id; ace++; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3fd23f7..24e8d78 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -43,25 +43,30 @@ #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/svcsock.h> #include <linux/nfsd/nfsd.h> #include <linux/nfsd/state.h> #include <linux/sunrpc/sched.h> #include <linux/nfs4.h> +#include <linux/sunrpc/xprtsock.h> #define NFSDDBG_FACILITY NFSDDBG_PROC #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 +#define NFS4_STATEID_SIZE 16 /* Index of predefined Linux callback client operations */ enum { - NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_NULL = 0, NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, }; enum nfs_cb_opnum4 { OP_CB_RECALL = 4, + OP_CB_SEQUENCE = 11, }; #define NFS4_MAXTAGLEN 20 @@ -70,17 +75,29 @@ enum nfs_cb_opnum4 { #define NFS4_dec_cb_null_sz 0 #define cb_compound_enc_hdr_sz 4 #define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) +#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) +#define cb_sequence_enc_sz (sessionid_sz + 4 + \ + 1 /* no referring calls list yet */) +#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) + #define op_enc_sz 1 #define op_dec_sz 2 #define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) #define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) #define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ 1 + enc_stateid_sz + \ enc_nfs4_fh_sz) #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ op_dec_sz) +struct nfs4_rpc_args { + void *args_op; + struct nfsd4_cb_sequence args_seq; +}; + /* * Generic encode routines from fs/nfs/nfs4xdr.c */ @@ -137,11 +154,13 @@ xdr_error: \ } while (0) struct nfs4_cb_compound_hdr { - int status; - u32 ident; + /* args */ + u32 ident; /* minorversion 0 only */ u32 nops; __be32 *nops_p; u32 minorversion; + /* res */ + int status; u32 taglen; char *tag; }; @@ -238,6 +257,27 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, hdr->nops++; } +static void +encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + if (hdr->minorversion == 0) + return; + + RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20); + + WRITE32(OP_CB_SEQUENCE); + WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(args->cbs_clp->cl_cb_seq_nr); + WRITE32(0); /* slotid, always 0 */ + WRITE32(0); /* highest slotid always 0 */ + WRITE32(0); /* cachethis always 0 */ + WRITE32(0); /* FIXME: support referring_call_lists */ + hdr->nops++; +} + static int nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) { @@ -249,15 +289,19 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) } static int -nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_delegation *args) +nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, + struct nfs4_rpc_args *rpc_args) { struct xdr_stream xdr; + struct nfs4_delegation *args = rpc_args->args_op; struct nfs4_cb_compound_hdr hdr = { .ident = args->dl_ident, + .minorversion = rpc_args->args_seq.cbs_minorversion, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_cb_compound_hdr(&xdr, &hdr); + encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); encode_cb_recall(&xdr, args, &hdr); encode_cb_nops(&hdr); return 0; @@ -299,6 +343,57 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) return 0; } +/* + * Our current back channel implmentation supports a single backchannel + * with a single slot. + */ +static int +decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res, + struct rpc_rqst *rqstp) +{ + struct nfs4_sessionid id; + int status; + u32 dummy; + __be32 *p; + + if (res->cbs_minorversion == 0) + return 0; + + status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE); + if (status) + return status; + + /* + * If the server returns different values for sessionID, slotID or + * sequence number, the server is looney tunes. + */ + status = -ESERVERFAULT; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); + memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); + p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); + if (memcmp(id.data, res->cbs_clp->cl_sessionid.data, + NFS4_MAX_SESSIONID_LEN)) { + dprintk("%s Invalid session id\n", __func__); + goto out; + } + READ32(dummy); + if (dummy != res->cbs_clp->cl_cb_seq_nr) { + dprintk("%s Invalid sequence number\n", __func__); + goto out; + } + READ32(dummy); /* slotid must be 0 */ + if (dummy != 0) { + dprintk("%s Invalid slotid\n", __func__); + goto out; + } + /* FIXME: process highest slotid and target highest slotid */ + status = 0; +out: + return status; +} + + static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) { @@ -306,7 +401,8 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) } static int -nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p) +nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p, + struct nfsd4_cb_sequence *seq) { struct xdr_stream xdr; struct nfs4_cb_compound_hdr hdr; @@ -316,6 +412,11 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p) status = decode_cb_compound_hdr(&xdr, &hdr); if (status) goto out; + if (seq) { + status = decode_cb_sequence(&xdr, seq, rqstp); + if (status) + goto out; + } status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); out: return status; @@ -377,16 +478,15 @@ static int max_cb_time(void) int setup_callback_client(struct nfs4_client *clp) { - struct sockaddr_in addr; struct nfs4_cb_conn *cb = &clp->cl_cb_conn; struct rpc_timeout timeparms = { .to_initval = max_cb_time(), .to_retries = 0, }; struct rpc_create_args args = { - .protocol = IPPROTO_TCP, - .address = (struct sockaddr *)&addr, - .addrsize = sizeof(addr), + .protocol = XPRT_TRANSPORT_TCP, + .address = (struct sockaddr *) &cb->cb_addr, + .addrsize = cb->cb_addrlen, .timeout = &timeparms, .program = &cb_program, .prognumber = cb->cb_prog, @@ -399,13 +499,10 @@ int setup_callback_client(struct nfs4_client *clp) if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) return -EINVAL; - - /* Initialize address */ - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_port = htons(cb->cb_port); - addr.sin_addr.s_addr = htonl(cb->cb_addr); - + if (cb->cb_minorversion) { + args.bc_xprt = clp->cl_cb_xprt; + args.protocol = XPRT_TRANSPORT_BC_TCP; + } /* Create RPC client */ client = rpc_create(&args); if (IS_ERR(client)) { @@ -439,42 +536,29 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = { .rpc_call_done = nfsd4_cb_probe_done, }; -static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb) -{ - struct auth_cred acred = { - .machine_cred = 1 - }; +static struct rpc_cred *callback_cred; - /* - * Note in the gss case this doesn't actually have to wait for a - * gss upcall (or any calls to the client); this just creates a - * non-uptodate cred which the rpc state machine will fill in with - * a refresh_upcall later. - */ - return rpcauth_lookup_credcache(cb->cb_client->cl_auth, &acred, - RPCAUTH_LOOKUP_NEW); +int set_callback_cred(void) +{ + callback_cred = rpc_lookup_machine_cred(); + if (!callback_cred) + return -ENOMEM; + return 0; } + void do_probe_callback(struct nfs4_client *clp) { struct nfs4_cb_conn *cb = &clp->cl_cb_conn; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], .rpc_argp = clp, + .rpc_cred = callback_cred }; - struct rpc_cred *cred; int status; - cred = lookup_cb_cred(cb); - if (IS_ERR(cred)) { - status = PTR_ERR(cred); - goto out; - } - cb->cb_cred = cred; - msg.rpc_cred = cb->cb_cred; status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT, &nfsd4_cb_probe_ops, (void *)clp); -out: if (status) { warn_no_callback_path(clp, status); put_nfs4_client(clp); @@ -503,11 +587,95 @@ nfsd4_probe_callback(struct nfs4_client *clp) do_probe_callback(clp); } +/* + * There's currently a single callback channel slot. + * If the slot is available, then mark it busy. Otherwise, set the + * thread for sleeping on the callback RPC wait queue. + */ +static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, + struct rpc_task *task) +{ + struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; + u32 *ptr = (u32 *)clp->cl_sessionid.data; + int status = 0; + + dprintk("%s: %u:%u:%u:%u\n", __func__, + ptr[0], ptr[1], ptr[2], ptr[3]); + + if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); + dprintk("%s slot is busy\n", __func__); + status = -EAGAIN; + goto out; + } + + /* + * We'll need the clp during XDR encoding and decoding, + * and the sequence during decoding to verify the reply + */ + args->args_seq.cbs_clp = clp; + task->tk_msg.rpc_resp = &args->args_seq; + +out: + dprintk("%s status=%d\n", __func__, status); + return status; +} + +/* + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. + */ +static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_delegation *dp = calldata; + struct nfs4_client *clp = dp->dl_client; + struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; + u32 minorversion = clp->cl_cb_conn.cb_minorversion; + int status = 0; + + args->args_seq.cbs_minorversion = minorversion; + if (minorversion) { + status = nfsd41_cb_setup_sequence(clp, task); + if (status) { + if (status != -EAGAIN) { + /* terminate rpc task */ + task->tk_status = status; + task->tk_action = NULL; + } + return; + } + } + rpc_call_start(task); +} + +static void nfsd4_cb_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_delegation *dp = calldata; + struct nfs4_client *clp = dp->dl_client; + + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_cb_conn.cb_minorversion); + + if (clp->cl_cb_conn.cb_minorversion) { + /* No need for lock, access serialized in nfsd4_cb_prepare */ + ++clp->cl_cb_seq_nr; + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_wake_up_next(&clp->cl_cb_waitq); + dprintk("%s: freed slot, new seqid=%d\n", __func__, + clp->cl_cb_seq_nr); + + /* We're done looking into the sequence information */ + task->tk_msg.rpc_resp = NULL; + } +} + static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) { struct nfs4_delegation *dp = calldata; struct nfs4_client *clp = dp->dl_client; + nfsd4_cb_done(task, calldata); + switch (task->tk_status) { case -EIO: /* Network partition? */ @@ -520,16 +688,19 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) break; default: /* success, or error we can't handle */ - return; + goto done; } if (dp->dl_retries--) { rpc_delay(task, 2*HZ); task->tk_status = 0; rpc_restart_call(task); + return; } else { atomic_set(&clp->cl_cb_conn.cb_set, 0); warn_no_callback_path(clp, task->tk_status); } +done: + kfree(task->tk_msg.rpc_argp); } static void nfsd4_cb_recall_release(void *calldata) @@ -542,6 +713,7 @@ static void nfsd4_cb_recall_release(void *calldata) } static const struct rpc_call_ops nfsd4_cb_recall_ops = { + .rpc_call_prepare = nfsd4_cb_prepare, .rpc_call_done = nfsd4_cb_recall_done, .rpc_release = nfsd4_cb_recall_release, }; @@ -554,17 +726,24 @@ nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfs4_client *clp = dp->dl_client; struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; + struct nfs4_rpc_args *args; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], - .rpc_argp = dp, - .rpc_cred = clp->cl_cb_conn.cb_cred + .rpc_cred = callback_cred }; - int status; + int status = -ENOMEM; + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (!args) + goto out; + args->args_op = dp; + msg.rpc_argp = args; dp->dl_retries = 1; status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp); +out: if (status) { + kfree(args); put_nfs4_client(clp); nfs4_put_delegation(dp); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 5b39842..ba2c199 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -38,7 +38,6 @@ #include <linux/init.h> #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/sunrpc/clnt.h> @@ -146,6 +145,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, } static int +idtoname_upcall(struct cache_detail *cd, struct cache_head *ch) +{ + return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request); +} + +static int idtoname_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); @@ -175,10 +180,10 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) } static void -warn_no_idmapd(struct cache_detail *detail) +warn_no_idmapd(struct cache_detail *detail, int has_died) { printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", - detail->last_close? "died" : "not been started"); + has_died ? "died" : "not been started"); } @@ -192,7 +197,7 @@ static struct cache_detail idtoname_cache = { .hash_table = idtoname_table, .name = "nfs4.idtoname", .cache_put = ent_put, - .cache_request = idtoname_request, + .cache_upcall = idtoname_upcall, .cache_parse = idtoname_parse, .cache_show = idtoname_show, .warn_no_listener = warn_no_idmapd, @@ -325,6 +330,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, } static int +nametoid_upcall(struct cache_detail *cd, struct cache_head *ch) +{ + return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request); +} + +static int nametoid_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); @@ -363,7 +374,7 @@ static struct cache_detail nametoid_cache = { .hash_table = nametoid_table, .name = "nfs4.nametoid", .cache_put = ent_put, - .cache_request = nametoid_request, + .cache_upcall = nametoid_upcall, .cache_parse = nametoid_parse, .cache_show = nametoid_show, .warn_no_listener = warn_no_idmapd, diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 7c88017..bebc0c2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -68,7 +68,6 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, u32 *bmval, u32 *writable) { struct dentry *dentry = cstate->current_fh.fh_dentry; - struct svc_export *exp = cstate->current_fh.fh_export; /* * Check about attributes are supported by the NFSv4 server or not. @@ -80,17 +79,13 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_attrnotsupp; /* - * Check FATTR4_WORD0_ACL & FATTR4_WORD0_FS_LOCATIONS can be supported + * Check FATTR4_WORD0_ACL can be supported * in current environment or not. */ if (bmval[0] & FATTR4_WORD0_ACL) { if (!IS_POSIXACL(dentry->d_inode)) return nfserr_attrnotsupp; } - if (bmval[0] & FATTR4_WORD0_FS_LOCATIONS) { - if (exp->ex_fslocs.locations == NULL) - return nfserr_attrnotsupp; - } /* * According to spec, read-only attributes return ERR_INVAL. @@ -123,6 +118,35 @@ nfsd4_check_open_attributes(struct svc_rqst *rqstp, return status; } +static int +is_create_with_attrs(struct nfsd4_open *open) +{ + return open->op_create == NFS4_OPEN_CREATE + && (open->op_createmode == NFS4_CREATE_UNCHECKED + || open->op_createmode == NFS4_CREATE_GUARDED + || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1); +} + +/* + * if error occurs when setting the acl, just clear the acl bit + * in the returned attr bitmap. + */ +static void +do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl, u32 *bmval) +{ + __be32 status; + + status = nfsd4_set_nfs4_acl(rqstp, fhp, acl); + if (status) + /* + * We should probably fail the whole open at this point, + * but we've already created the file, so it's too late; + * So this seems the least of evils: + */ + bmval[0] &= ~FATTR4_WORD0_ACL; +} + static inline void fh_dup2(struct svc_fh *dst, struct svc_fh *src) { @@ -206,6 +230,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o if (status) goto out; + if (is_create_with_attrs(open) && open->op_acl != NULL) + do_set_nfs4_acl(rqstp, &resfh, open->op_acl, open->op_bmval); + set_change_info(&open->op_cinfo, current_fh); fh_dup2(current_fh, &resfh); @@ -536,12 +563,17 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_badtype; } - if (!status) { - fh_unlock(&cstate->current_fh); - set_change_info(&create->cr_cinfo, &cstate->current_fh); - fh_dup2(&cstate->current_fh, &resfh); - } + if (status) + goto out; + + if (create->cr_acl != NULL) + do_set_nfs4_acl(rqstp, &resfh, create->cr_acl, + create->cr_bmval); + fh_unlock(&cstate->current_fh); + set_change_info(&create->cr_cinfo, &cstate->current_fh); + fh_dup2(&cstate->current_fh, &resfh); +out: fh_put(&resfh); return status; } @@ -947,34 +979,6 @@ static struct nfsd4_operation nfsd4_ops[]; static const char *nfsd4_op_name(unsigned opnum); /* - * This is a replay of a compound for which no cache entry pages - * were used. Encode the sequence operation, and if cachethis is FALSE - * encode the uncache rep error on the next operation. - */ -static __be32 -nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args, - struct nfsd4_compoundres *resp) -{ - struct nfsd4_op *op; - - dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__, - resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis); - - /* Encode the replayed sequence operation */ - BUG_ON(resp->opcnt != 1); - op = &args->ops[resp->opcnt - 1]; - nfsd4_encode_operation(resp, op); - - /*return nfserr_retry_uncached_rep in next operation. */ - if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) { - op = &args->ops[resp->opcnt++]; - op->status = nfserr_retry_uncached_rep; - nfsd4_encode_operation(resp, op); - } - return op->status; -} - -/* * Enforce NFSv4.1 COMPOUND ordering rules. * * TODO: @@ -1083,13 +1087,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, BUG_ON(op->status == nfs_ok); encode_op: - /* Only from SEQUENCE or CREATE_SESSION */ + /* Only from SEQUENCE */ if (resp->cstate.status == nfserr_replay_cache) { dprintk("%s NFS4.1 replay from cache\n", __func__); - if (nfsd4_not_cached(resp)) - status = nfsd4_enc_uncached_replay(args, resp); - else - status = op->status; + status = op->status; goto out; } if (op->status == nfserr_replay_me) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 980a216..2153f9bd 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -55,6 +55,7 @@ #include <linux/lockd/bind.h> #include <linux/module.h> #include <linux/sunrpc/svcauth_gss.h> +#include <linux/sunrpc/clnt.h> #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -413,36 +414,65 @@ gen_sessionid(struct nfsd4_session *ses) } /* - * Give the client the number of slots it requests bound by - * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages. + * The protocol defines ca_maxresponssize_cached to include the size of + * the rpc header, but all we need to cache is the data starting after + * the end of the initial SEQUENCE operation--the rest we regenerate + * each time. Therefore we can advertise a ca_maxresponssize_cached + * value that is the number of bytes in our cache plus a few additional + * bytes. In order to stay on the safe side, and not promise more than + * we can cache, those additional bytes must be the minimum possible: 24 + * bytes of rpc header (xid through accept state, with AUTH_NULL + * verifier), 12 for the compound header (with zero-length tag), and 44 + * for the SEQUENCE op response: + */ +#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) + +/* + * Give the client the number of ca_maxresponsesize_cached slots it + * requests, of size bounded by NFSD_SLOT_CACHE_SIZE, + * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more + * than NFSD_MAX_SLOTS_PER_SESSION. * - * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we - * should (up to a point) re-negotiate active sessions and reduce their - * slot usage to make rooom for new connections. For now we just fail the - * create session. + * If we run out of reserved DRC memory we should (up to a point) + * re-negotiate active sessions and reduce their slot usage to make + * rooom for new connections. For now we just fail the create session. */ -static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan) +static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan) { - int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT; + int mem, size = fchan->maxresp_cached; if (fchan->maxreqs < 1) return nfserr_inval; - else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) - fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION; - spin_lock(&nfsd_serv->sv_lock); - if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages) - np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used; - nfsd_serv->sv_drc_pages_used += np; - spin_unlock(&nfsd_serv->sv_lock); + if (size < NFSD_MIN_HDR_SEQ_SZ) + size = NFSD_MIN_HDR_SEQ_SZ; + size -= NFSD_MIN_HDR_SEQ_SZ; + if (size > NFSD_SLOT_CACHE_SIZE) + size = NFSD_SLOT_CACHE_SIZE; + + /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */ + mem = fchan->maxreqs * size; + if (mem > NFSD_MAX_MEM_PER_SESSION) { + fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size; + if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) + fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION; + mem = fchan->maxreqs * size; + } - if (np <= 0) { - status = nfserr_resource; - fchan->maxreqs = 0; - } else - fchan->maxreqs = np / NFSD_PAGES_PER_SLOT; + spin_lock(&nfsd_drc_lock); + /* bound the total session drc memory ussage */ + if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) { + fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size; + mem = fchan->maxreqs * size; + } + nfsd_drc_mem_used += mem; + spin_unlock(&nfsd_drc_lock); - return status; + if (fchan->maxreqs == 0) + return nfserr_serverfault; + + fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ; + return 0; } /* @@ -466,36 +496,41 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp, fchan->maxresp_sz = maxcount; session_fchan->maxresp_sz = fchan->maxresp_sz; - /* Set the max response cached size our default which is - * a multiple of PAGE_SIZE and small */ - session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE; - fchan->maxresp_cached = session_fchan->maxresp_cached; - /* Use the client's maxops if possible */ if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; session_fchan->maxops = fchan->maxops; - /* try to use the client requested number of slots */ - if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) - fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION; - /* FIXME: Error means no more DRC pages so the server should * recover pages from existing sessions. For now fail session * creation. */ - status = set_forechannel_maxreqs(fchan); + status = set_forechannel_drc_size(fchan); + session_fchan->maxresp_cached = fchan->maxresp_cached; session_fchan->maxreqs = fchan->maxreqs; + + dprintk("%s status %d\n", __func__, status); return status; } +static void +free_session_slots(struct nfsd4_session *ses) +{ + int i; + + for (i = 0; i < ses->se_fchannel.maxreqs; i++) + kfree(ses->se_slots[i]); +} + static int alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses) { struct nfsd4_session *new, tmp; - int idx, status = nfserr_resource, slotsize; + struct nfsd4_slot *sp; + int idx, slotsize, cachesize, i; + int status; memset(&tmp, 0, sizeof(tmp)); @@ -506,14 +541,27 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, if (status) goto out; - /* allocate struct nfsd4_session and slot table in one piece */ - slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot); + BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot) + + sizeof(struct nfsd4_session) > PAGE_SIZE); + + status = nfserr_serverfault; + /* allocate struct nfsd4_session and slot table pointers in one piece */ + slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *); new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); if (!new) goto out; memcpy(new, &tmp, sizeof(*new)); + /* allocate each struct nfsd4_slot and data cache in one piece */ + cachesize = new->se_fchannel.maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; + for (i = 0; i < new->se_fchannel.maxreqs; i++) { + sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL); + if (!sp) + goto out_free; + new->se_slots[i] = sp; + } + new->se_client = clp; gen_sessionid(new); idx = hash_sessionid(&new->se_sessionid); @@ -530,6 +578,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, status = nfs_ok; out: return status; +out_free: + free_session_slots(new); + kfree(new); + goto out; } /* caller must hold sessionid_lock */ @@ -572,19 +624,16 @@ release_session(struct nfsd4_session *ses) nfsd4_put_session(ses); } -static void nfsd4_release_respages(struct page **respages, short resused); - void free_session(struct kref *kref) { struct nfsd4_session *ses; - int i; ses = container_of(kref, struct nfsd4_session, se_ref); - for (i = 0; i < ses->se_fchannel.maxreqs; i++) { - struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry; - nfsd4_release_respages(e->ce_respages, e->ce_resused); - } + spin_lock(&nfsd_drc_lock); + nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE; + spin_unlock(&nfsd_drc_lock); + free_session_slots(ses); kfree(ses); } @@ -647,18 +696,14 @@ shutdown_callback_client(struct nfs4_client *clp) clp->cl_cb_conn.cb_client = NULL; rpc_shutdown_client(clnt); } - if (clp->cl_cb_conn.cb_cred) { - put_rpccred(clp->cl_cb_conn.cb_cred); - clp->cl_cb_conn.cb_cred = NULL; - } } static inline void free_client(struct nfs4_client *clp) { shutdown_callback_client(clp); - nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages, - clp->cl_slot.sl_cache_entry.ce_resused); + if (clp->cl_cb_xprt) + svc_xprt_put(clp->cl_cb_xprt); if (clp->cl_cred.cr_group_info) put_group_info(clp->cl_cred.cr_group_info); kfree(clp->cl_principal); @@ -714,25 +759,6 @@ expire_client(struct nfs4_client *clp) put_nfs4_client(clp); } -static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir) -{ - struct nfs4_client *clp; - - clp = alloc_client(name); - if (clp == NULL) - return NULL; - memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); - atomic_set(&clp->cl_count, 1); - atomic_set(&clp->cl_cb_conn.cb_set, 0); - INIT_LIST_HEAD(&clp->cl_idhash); - INIT_LIST_HEAD(&clp->cl_strhash); - INIT_LIST_HEAD(&clp->cl_openowners); - INIT_LIST_HEAD(&clp->cl_delegations); - INIT_LIST_HEAD(&clp->cl_sessions); - INIT_LIST_HEAD(&clp->cl_lru); - return clp; -} - static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) { memcpy(target->cl_verifier.data, source->data, @@ -795,6 +821,46 @@ static void gen_confirm(struct nfs4_client *clp) *p++ = i++; } +static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, + struct svc_rqst *rqstp, nfs4_verifier *verf) +{ + struct nfs4_client *clp; + struct sockaddr *sa = svc_addr(rqstp); + char *princ; + + clp = alloc_client(name); + if (clp == NULL) + return NULL; + + princ = svc_gss_principal(rqstp); + if (princ) { + clp->cl_principal = kstrdup(princ, GFP_KERNEL); + if (clp->cl_principal == NULL) { + free_client(clp); + return NULL; + } + } + + memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); + atomic_set(&clp->cl_count, 1); + atomic_set(&clp->cl_cb_conn.cb_set, 0); + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_sessions); + INIT_LIST_HEAD(&clp->cl_lru); + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); + copy_verf(clp, verf); + rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); + clp->cl_flavor = rqstp->rq_flavor; + copy_cred(&clp->cl_cred, &rqstp->rq_cred); + gen_confirm(clp); + + return clp; +} + static int check_name(struct xdr_netobj name) { if (name.len == 0) @@ -902,93 +968,40 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, return NULL; } -/* a helper function for parse_callback */ -static int -parse_octet(unsigned int *lenp, char **addrp) -{ - unsigned int len = *lenp; - char *p = *addrp; - int n = -1; - char c; - - for (;;) { - if (!len) - break; - len--; - c = *p++; - if (c == '.') - break; - if ((c < '0') || (c > '9')) { - n = -1; - break; - } - if (n < 0) - n = 0; - n = (n * 10) + (c - '0'); - if (n > 255) { - n = -1; - break; - } - } - *lenp = len; - *addrp = p; - return n; -} - -/* parse and set the setclientid ipv4 callback address */ -static int -parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp) -{ - int temp = 0; - u32 cbaddr = 0; - u16 cbport = 0; - u32 addrlen = addr_len; - char *addr = addr_val; - int i, shift; - - /* ipaddress */ - shift = 24; - for(i = 4; i > 0 ; i--) { - if ((temp = parse_octet(&addrlen, &addr)) < 0) { - return 0; - } - cbaddr |= (temp << shift); - if (shift > 0) - shift -= 8; - } - *cbaddrp = cbaddr; - - /* port */ - shift = 8; - for(i = 2; i > 0 ; i--) { - if ((temp = parse_octet(&addrlen, &addr)) < 0) { - return 0; - } - cbport |= (temp << shift); - if (shift > 0) - shift -= 8; - } - *cbportp = cbport; - return 1; -} - static void -gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) +gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) { struct nfs4_cb_conn *cb = &clp->cl_cb_conn; - - /* Currently, we only support tcp for the callback channel */ - if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3)) + unsigned short expected_family; + + /* Currently, we only support tcp and tcp6 for the callback channel */ + if (se->se_callback_netid_len == 3 && + !memcmp(se->se_callback_netid_val, "tcp", 3)) + expected_family = AF_INET; + else if (se->se_callback_netid_len == 4 && + !memcmp(se->se_callback_netid_val, "tcp6", 4)) + expected_family = AF_INET6; + else goto out_err; - if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val, - &cb->cb_addr, &cb->cb_port))) + cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, + se->se_callback_addr_len, + (struct sockaddr *) &cb->cb_addr, + sizeof(cb->cb_addr)); + + if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family) goto out_err; + + if (cb->cb_addr.ss_family == AF_INET6) + ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid; + cb->cb_minorversion = 0; cb->cb_prog = se->se_callback_prog; cb->cb_ident = se->se_callback_ident; return; out_err: + cb->cb_addr.ss_family = AF_UNSPEC; + cb->cb_addrlen = 0; dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " "will not receive delegations\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); @@ -996,175 +1009,87 @@ out_err: return; } -void -nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) -{ - struct nfsd4_compoundres *resp = rqstp->rq_resp; - - resp->cstate.statp = statp; -} - /* - * Dereference the result pages. + * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size. */ -static void -nfsd4_release_respages(struct page **respages, short resused) +void +nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) { - int i; + struct nfsd4_slot *slot = resp->cstate.slot; + unsigned int base; - dprintk("--> %s\n", __func__); - for (i = 0; i < resused; i++) { - if (!respages[i]) - continue; - put_page(respages[i]); - respages[i] = NULL; - } -} + dprintk("--> %s slot %p\n", __func__, slot); -static void -nfsd4_copy_pages(struct page **topages, struct page **frompages, short count) -{ - int i; + slot->sl_opcnt = resp->opcnt; + slot->sl_status = resp->cstate.status; - for (i = 0; i < count; i++) { - topages[i] = frompages[i]; - if (!topages[i]) - continue; - get_page(topages[i]); + if (nfsd4_not_cached(resp)) { + slot->sl_datalen = 0; + return; } + slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap; + base = (char *)resp->cstate.datap - + (char *)resp->xbuf->head[0].iov_base; + if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data, + slot->sl_datalen)) + WARN("%s: sessions DRC could not cache compound\n", __func__); + return; } /* - * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous - * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total - * length of the XDR response is less than se_fmaxresp_cached - * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages is used for a - * of the reply (e.g. readdir). + * Encode the replay sequence operation from the slot values. + * If cachethis is FALSE encode the uncached rep error on the next + * operation which sets resp->p and increments resp->opcnt for + * nfs4svc_encode_compoundres. * - * Store the base and length of the rq_req.head[0] page - * of the NFSv4.1 data, just past the rpc header. */ -void -nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) +static __be32 +nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) { - struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; - struct svc_rqst *rqstp = resp->rqstp; - struct nfsd4_compoundargs *args = rqstp->rq_argp; - struct nfsd4_op *op = &args->ops[resp->opcnt]; - struct kvec *resv = &rqstp->rq_res.head[0]; - - dprintk("--> %s entry %p\n", __func__, entry); - - /* Don't cache a failed OP_SEQUENCE. */ - if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status) - return; + struct nfsd4_op *op; + struct nfsd4_slot *slot = resp->cstate.slot; - nfsd4_release_respages(entry->ce_respages, entry->ce_resused); - entry->ce_opcnt = resp->opcnt; - entry->ce_status = resp->cstate.status; + dprintk("--> %s resp->opcnt %d cachethis %u \n", __func__, + resp->opcnt, resp->cstate.slot->sl_cachethis); - /* - * Don't need a page to cache just the sequence operation - the slot - * does this for us! - */ + /* Encode the replayed sequence operation */ + op = &args->ops[resp->opcnt - 1]; + nfsd4_encode_operation(resp, op); - if (nfsd4_not_cached(resp)) { - entry->ce_resused = 0; - entry->ce_rpchdrlen = 0; - dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__, - resp->cstate.slot->sl_cache_entry.ce_cachethis); - return; - } - entry->ce_resused = rqstp->rq_resused; - if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1) - entry->ce_resused = NFSD_PAGES_PER_SLOT + 1; - nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages, - entry->ce_resused); - entry->ce_datav.iov_base = resp->cstate.statp; - entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp - - (char *)page_address(rqstp->rq_respages[0])); - /* Current request rpc header length*/ - entry->ce_rpchdrlen = (char *)resp->cstate.statp - - (char *)page_address(rqstp->rq_respages[0]); -} - -/* - * We keep the rpc header, but take the nfs reply from the replycache. - */ -static int -nfsd41_copy_replay_data(struct nfsd4_compoundres *resp, - struct nfsd4_cache_entry *entry) -{ - struct svc_rqst *rqstp = resp->rqstp; - struct kvec *resv = &resp->rqstp->rq_res.head[0]; - int len; - - /* Current request rpc header length*/ - len = (char *)resp->cstate.statp - - (char *)page_address(rqstp->rq_respages[0]); - if (entry->ce_datav.iov_len + len > PAGE_SIZE) { - dprintk("%s v41 cached reply too large (%Zd).\n", __func__, - entry->ce_datav.iov_len); - return 0; + /* Return nfserr_retry_uncached_rep in next operation. */ + if (args->opcnt > 1 && slot->sl_cachethis == 0) { + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); } - /* copy the cached reply nfsd data past the current rpc header */ - memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base, - entry->ce_datav.iov_len); - resv->iov_len = len + entry->ce_datav.iov_len; - return 1; + return op->status; } /* - * Keep the first page of the replay. Copy the NFSv4.1 data from the first - * cached page. Replace any futher replay pages from the cache. + * The sequence operation is not cached because we can use the slot and + * session values. */ __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq) { - struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; + struct nfsd4_slot *slot = resp->cstate.slot; __be32 status; - dprintk("--> %s entry %p\n", __func__, entry); - - /* - * If this is just the sequence operation, we did not keep - * a page in the cache entry because we can just use the - * slot info stored in struct nfsd4_sequence that was checked - * against the slot in nfsd4_sequence(). - * - * This occurs when seq->cachethis is FALSE, or when the client - * session inactivity timer fires and a solo sequence operation - * is sent (lease renewal). - */ - if (seq && nfsd4_not_cached(resp)) { - seq->maxslots = resp->cstate.session->se_fchannel.maxreqs; - return nfs_ok; - } - - if (!nfsd41_copy_replay_data(resp, entry)) { - /* - * Not enough room to use the replay rpc header, send the - * cached header. Release all the allocated result pages. - */ - svc_free_res_pages(resp->rqstp); - nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages, - entry->ce_resused); - } else { - /* Release all but the first allocated result page */ + dprintk("--> %s slot %p\n", __func__, slot); - resp->rqstp->rq_resused--; - svc_free_res_pages(resp->rqstp); + /* Either returns 0 or nfserr_retry_uncached */ + status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); + if (status == nfserr_retry_uncached_rep) + return status; - nfsd4_copy_pages(&resp->rqstp->rq_respages[1], - &entry->ce_respages[1], - entry->ce_resused - 1); - } + /* The sequence operation has been encoded, cstate->datap set. */ + memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen); - resp->rqstp->rq_resused = entry->ce_resused; - resp->opcnt = entry->ce_opcnt; - resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen; - status = entry->ce_status; + resp->opcnt = slot->sl_opcnt; + resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen); + status = slot->sl_status; return status; } @@ -1194,13 +1119,15 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, int status; unsigned int strhashval; char dname[HEXDIR_LEN]; + char addr_str[INET6_ADDRSTRLEN]; nfs4_verifier verf = exid->verifier; - u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; + struct sockaddr *sa = svc_addr(rqstp); + rpc_ntop(sa, addr_str, sizeof(addr_str)); dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " - " ip_addr=%u flags %x, spa_how %d\n", + "ip_addr=%s flags %x, spa_how %d\n", __func__, rqstp, exid, exid->clname.len, exid->clname.data, - ip_addr, exid->flags, exid->spa_how); + addr_str, exid->flags, exid->spa_how); if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) return nfserr_inval; @@ -1281,28 +1208,23 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, out_new: /* Normal case */ - new = create_client(exid->clname, dname); + new = create_client(exid->clname, dname, rqstp, &verf); if (new == NULL) { - status = nfserr_resource; + status = nfserr_serverfault; goto out; } - copy_verf(new, &verf); - copy_cred(&new->cl_cred, &rqstp->rq_cred); - new->cl_addr = ip_addr; gen_clid(new); - gen_confirm(new); add_to_unconfirmed(new, strhashval); out_copy: exid->clientid.cl_boot = new->cl_clientid.cl_boot; exid->clientid.cl_id = new->cl_clientid.cl_id; - new->cl_slot.sl_seqid = 0; exid->seqid = 1; nfsd4_set_ex_flags(new, exid); dprintk("nfsd4_exchange_id seqid %d flags %x\n", - new->cl_slot.sl_seqid, new->cl_exchange_flags); + new->cl_cs_slot.sl_seqid, new->cl_exchange_flags); status = nfs_ok; out: @@ -1313,40 +1235,60 @@ error: } static int -check_slot_seqid(u32 seqid, struct nfsd4_slot *slot) +check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) { - dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid, - slot->sl_seqid); + dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid, + slot_seqid); /* The slot is in use, and no response has been sent. */ - if (slot->sl_inuse) { - if (seqid == slot->sl_seqid) + if (slot_inuse) { + if (seqid == slot_seqid) return nfserr_jukebox; else return nfserr_seq_misordered; } /* Normal */ - if (likely(seqid == slot->sl_seqid + 1)) + if (likely(seqid == slot_seqid + 1)) return nfs_ok; /* Replay */ - if (seqid == slot->sl_seqid) + if (seqid == slot_seqid) return nfserr_replay_cache; /* Wraparound */ - if (seqid == 1 && (slot->sl_seqid + 1) == 0) + if (seqid == 1 && (slot_seqid + 1) == 0) return nfs_ok; /* Misordered replay or misordered new request */ return nfserr_seq_misordered; } +/* + * Cache the create session result into the create session single DRC + * slot cache by saving the xdr structure. sl_seqid has been set. + * Do this for solo or embedded create session operations. + */ +static void +nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot, int nfserr) +{ + slot->sl_status = nfserr; + memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses)); +} + +static __be32 +nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot) +{ + memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses)); + return slot->sl_status; +} + __be32 nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_create_session *cr_ses) { - u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; - struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; - struct nfsd4_slot *slot = NULL; + struct nfsd4_clid_slot *cs_slot = NULL; int status = 0; nfs4_lock_state(); @@ -1354,40 +1296,38 @@ nfsd4_create_session(struct svc_rqst *rqstp, conf = find_confirmed_client(&cr_ses->clientid); if (conf) { - slot = &conf->cl_slot; - status = check_slot_seqid(cr_ses->seqid, slot); + cs_slot = &conf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); if (status == nfserr_replay_cache) { dprintk("Got a create_session replay! seqid= %d\n", - slot->sl_seqid); - cstate->slot = slot; - cstate->status = status; + cs_slot->sl_seqid); /* Return the cached reply status */ - status = nfsd4_replay_cache_entry(resp, NULL); + status = nfsd4_replay_create_session(cr_ses, cs_slot); goto out; - } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) { + } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { status = nfserr_seq_misordered; dprintk("Sequence misordered!\n"); dprintk("Expected seqid= %d but got seqid= %d\n", - slot->sl_seqid, cr_ses->seqid); + cs_slot->sl_seqid, cr_ses->seqid); goto out; } - conf->cl_slot.sl_seqid++; + cs_slot->sl_seqid++; } else if (unconf) { if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || - (ip_addr != unconf->cl_addr)) { + !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { status = nfserr_clid_inuse; goto out; } - slot = &unconf->cl_slot; - status = check_slot_seqid(cr_ses->seqid, slot); + cs_slot = &unconf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); if (status) { /* an unconfirmed replay returns misordered */ status = nfserr_seq_misordered; - goto out; + goto out_cache; } - slot->sl_seqid++; /* from 0 to 1 */ + cs_slot->sl_seqid++; /* from 0 to 1 */ move_to_confirmed(unconf); /* @@ -1396,6 +1336,19 @@ nfsd4_create_session(struct svc_rqst *rqstp, cr_ses->flags &= ~SESSION4_PERSIST; cr_ses->flags &= ~SESSION4_RDMA; + if (cr_ses->flags & SESSION4_BACK_CHAN) { + unconf->cl_cb_xprt = rqstp->rq_xprt; + svc_xprt_get(unconf->cl_cb_xprt); + rpc_copy_addr( + (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, + sa); + unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); + unconf->cl_cb_conn.cb_minorversion = + cstate->minorversion; + unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; + unconf->cl_cb_seq_nr = 1; + nfsd4_probe_callback(unconf); + } conf = unconf; } else { status = nfserr_stale_clientid; @@ -1408,12 +1361,11 @@ nfsd4_create_session(struct svc_rqst *rqstp, memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN); - cr_ses->seqid = slot->sl_seqid; + cr_ses->seqid = cs_slot->sl_seqid; - slot->sl_inuse = true; - cstate->slot = slot; - /* Ensure a page is used for the cache */ - slot->sl_cache_entry.ce_cachethis = 1; +out_cache: + /* cache solo and embedded create sessions under the state lock */ + nfsd4_cache_create_session(cr_ses, cs_slot, status); out: nfs4_unlock_state(); dprintk("%s returns %d\n", __func__, ntohl(status)); @@ -1478,18 +1430,23 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (seq->slotid >= session->se_fchannel.maxreqs) goto out; - slot = &session->se_slots[seq->slotid]; + slot = session->se_slots[seq->slotid]; dprintk("%s: slotid %d\n", __func__, seq->slotid); - status = check_slot_seqid(seq->seqid, slot); + /* We do not negotiate the number of slots yet, so set the + * maxslots to the session maxreqs which is used to encode + * sr_highest_slotid and the sr_target_slot id to maxslots */ + seq->maxslots = session->se_fchannel.maxreqs; + + status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_inuse); if (status == nfserr_replay_cache) { cstate->slot = slot; cstate->session = session; /* Return the cached reply status and set cstate->status - * for nfsd4_svc_encode_compoundres processing */ + * for nfsd4_proc_compound processing */ status = nfsd4_replay_cache_entry(resp, seq); cstate->status = nfserr_replay_cache; - goto replay_cache; + goto out; } if (status) goto out; @@ -1497,23 +1454,23 @@ nfsd4_sequence(struct svc_rqst *rqstp, /* Success! bump slot seqid */ slot->sl_inuse = true; slot->sl_seqid = seq->seqid; - slot->sl_cache_entry.ce_cachethis = seq->cachethis; - /* Always set the cache entry cachethis for solo sequence */ - if (nfsd4_is_solo_sequence(resp)) - slot->sl_cache_entry.ce_cachethis = 1; + slot->sl_cachethis = seq->cachethis; cstate->slot = slot; cstate->session = session; -replay_cache: - /* Renew the clientid on success and on replay. - * Hold a session reference until done processing the compound: + /* Hold a session reference until done processing the compound: * nfsd4_put_session called only if the cstate slot is set. */ - renew_client(session->se_client); nfsd4_get_session(session); out: spin_unlock(&sessionid_lock); + /* Renew the clientid on success and on replay */ + if (cstate->session) { + nfs4_lock_state(); + renew_client(session->se_client); + nfs4_unlock_state(); + } dprintk("%s: return %d\n", __func__, ntohl(status)); return status; } @@ -1522,7 +1479,7 @@ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { - struct sockaddr_in *sin = svc_addr_in(rqstp); + struct sockaddr *sa = svc_addr(rqstp); struct xdr_netobj clname = { .len = setclid->se_namelen, .data = setclid->se_name, @@ -1531,7 +1488,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, unsigned int strhashval; struct nfs4_client *conf, *unconf, *new; __be32 status; - char *princ; char dname[HEXDIR_LEN]; if (!check_name(clname)) @@ -1554,8 +1510,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* RFC 3530 14.2.33 CASE 0: */ status = nfserr_clid_inuse; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { - dprintk("NFSD: setclientid: string in use by client" - " at %pI4\n", &conf->cl_addr); + char addr_str[INET6_ADDRSTRLEN]; + rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, + sizeof(addr_str)); + dprintk("NFSD: setclientid: string in use by client " + "at %s\n", addr_str); goto out; } } @@ -1573,7 +1532,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ if (unconf) expire_client(unconf); - new = create_client(clname, dname); + new = create_client(clname, dname, rqstp, &clverifier); if (new == NULL) goto out; gen_clid(new); @@ -1590,7 +1549,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ expire_client(unconf); } - new = create_client(clname, dname); + new = create_client(clname, dname, rqstp, &clverifier); if (new == NULL) goto out; copy_clid(new, conf); @@ -1600,7 +1559,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * probable client reboot; state will be removed if * confirmed. */ - new = create_client(clname, dname); + new = create_client(clname, dname, rqstp, &clverifier); if (new == NULL) goto out; gen_clid(new); @@ -1611,25 +1570,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * confirmed. */ expire_client(unconf); - new = create_client(clname, dname); + new = create_client(clname, dname, rqstp, &clverifier); if (new == NULL) goto out; gen_clid(new); } - copy_verf(new, &clverifier); - new->cl_addr = sin->sin_addr.s_addr; - new->cl_flavor = rqstp->rq_flavor; - princ = svc_gss_principal(rqstp); - if (princ) { - new->cl_principal = kstrdup(princ, GFP_KERNEL); - if (new->cl_principal == NULL) { - free_client(new); - goto out; - } - } - copy_cred(&new->cl_cred, &rqstp->rq_cred); - gen_confirm(new); - gen_callback(new, setclid); + gen_callback(new, setclid, rpc_get_scope_id(sa)); add_to_unconfirmed(new, strhashval); setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; setclid->se_clientid.cl_id = new->cl_clientid.cl_id; @@ -1651,7 +1597,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid_confirm *setclientid_confirm) { - struct sockaddr_in *sin = svc_addr_in(rqstp); + struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; nfs4_verifier confirm = setclientid_confirm->sc_confirm; clientid_t * clid = &setclientid_confirm->sc_clientid; @@ -1670,9 +1616,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, unconf = find_unconfirmed_client(clid); status = nfserr_clid_inuse; - if (conf && conf->cl_addr != sin->sin_addr.s_addr) + if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa)) goto out; - if (unconf && unconf->cl_addr != sin->sin_addr.s_addr) + if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa)) goto out; /* @@ -2163,7 +2109,7 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) return -EAGAIN; } -static struct lock_manager_operations nfsd_lease_mng_ops = { +static const struct lock_manager_operations nfsd_lease_mng_ops = { .fl_break = nfsd_break_deleg_cb, .fl_release_private = nfsd_release_deleg_cb, .fl_copy_lock = nfsd_copy_lock_deleg_cb, @@ -3368,7 +3314,7 @@ nfs4_transform_lock_offset(struct file_lock *lock) /* Hack!: For now, we're defining this just so we can use a pointer to it * as a unique cookie to identify our (NFSv4's) posix locks. */ -static struct lock_manager_operations nfsd_posix_mng_ops = { +static const struct lock_manager_operations nfsd_posix_mng_ops = { }; static inline void @@ -4072,7 +4018,7 @@ set_max_delegations(void) /* initialization to perform when the nfsd service is started: */ -static void +static int __nfs4_state_start(void) { unsigned long grace_time; @@ -4084,19 +4030,26 @@ __nfs4_state_start(void) printk(KERN_INFO "NFSD: starting %ld-second grace period\n", grace_time/HZ); laundry_wq = create_singlethread_workqueue("nfsd4"); + if (laundry_wq == NULL) + return -ENOMEM; queue_delayed_work(laundry_wq, &laundromat_work, grace_time); set_max_delegations(); + return set_callback_cred(); } -void +int nfs4_state_start(void) { + int ret; + if (nfs4_init) - return; + return 0; nfsd4_load_reboot_recovery_data(); - __nfs4_state_start(); + ret = __nfs4_state_start(); + if (ret) + return ret; nfs4_init = 1; - return; + return 0; } time_t diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2dcc7fe..0fbd50c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1599,7 +1599,8 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) { struct svc_fh tmp_fh; - char *path, *rootpath; + char *path = NULL, *rootpath; + size_t rootlen; fh_init(&tmp_fh, NFS4_FHSIZE); *stat = exp_pseudoroot(rqstp, &tmp_fh); @@ -1609,14 +1610,18 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 * path = exp->ex_pathname; - if (strncmp(path, rootpath, strlen(rootpath))) { + rootlen = strlen(rootpath); + if (strncmp(path, rootpath, rootlen)) { dprintk("nfsd: fs_locations failed;" "%s is not contained in %s\n", path, rootpath); *stat = nfserr_notsupp; - return NULL; + path = NULL; + goto out; } - - return path + strlen(rootpath); + path += rootlen; +out: + fh_put(&tmp_fh); + return path; } /* @@ -1793,11 +1798,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out_nfserr; } } - if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { - if (exp->ex_fslocs.locations == NULL) { - bmval0 &= ~FATTR4_WORD0_FS_LOCATIONS; - } - } if ((buflen -= 16) < 0) goto out_resource; @@ -1825,8 +1825,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out_resource; if (!aclsupport) word0 &= ~FATTR4_WORD0_ACL; - if (!exp->ex_fslocs.locations) - word0 &= ~FATTR4_WORD0_FS_LOCATIONS; if (!word2) { WRITE32(2); WRITE32(word0); @@ -3064,6 +3062,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, WRITE32(0); ADJUST_ARGS(); + resp->cstate.datap = p; /* DRC cache data pointer */ return 0; } @@ -3166,7 +3165,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) return status; session = resp->cstate.session; - if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0) + if (session == NULL || slot->sl_cachethis == 0) return status; if (resp->opcnt >= args->opcnt) @@ -3291,6 +3290,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo /* * All that remains is to write the tag and operation count... */ + struct nfsd4_compound_state *cs = &resp->cstate; struct kvec *iov; p = resp->tagp; *p++ = htonl(resp->taglen); @@ -3304,17 +3304,11 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo iov = &rqstp->rq_res.head[0]; iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; BUG_ON(iov->iov_len > PAGE_SIZE); - if (nfsd4_has_session(&resp->cstate)) { - if (resp->cstate.status == nfserr_replay_cache && - !nfsd4_not_cached(resp)) { - iov->iov_len = resp->cstate.iovlen; - } else { - nfsd4_store_cache_entry(resp); - dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); - resp->cstate.slot->sl_inuse = 0; - } - if (resp->cstate.session) - nfsd4_put_session(resp->cstate.session); + if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) { + nfsd4_store_cache_entry(resp); + dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); + resp->cstate.slot->sl_inuse = false; + nfsd4_put_session(resp->cstate.session); } return 1; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6d08475..5c01fc1 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -37,6 +37,7 @@ #include <linux/nfsd/xdr.h> #include <linux/nfsd/syscall.h> #include <linux/lockd/lockd.h> +#include <linux/sunrpc/clnt.h> #include <asm/uaccess.h> #include <net/ipv6.h> @@ -173,12 +174,13 @@ static const struct file_operations exports_operations = { }; extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); +extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); -static struct file_operations pool_stats_operations = { +static const struct file_operations pool_stats_operations = { .open = nfsd_pool_stats_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = nfsd_pool_stats_release, .owner = THIS_MODULE, }; @@ -490,22 +492,18 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) * * Input: * buf: '\n'-terminated C string containing a - * presentation format IPv4 address + * presentation format IP address * size: length of C string in @buf * Output: * On success: returns zero if all specified locks were released; * returns one if one or more locks were not released * On error: return code is negative errno value - * - * Note: Only AF_INET client addresses are passed in */ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) { - struct sockaddr_in sin = { - .sin_family = AF_INET, - }; - int b1, b2, b3, b4; - char c; + struct sockaddr_storage address; + struct sockaddr *sap = (struct sockaddr *)&address; + size_t salen = sizeof(address); char *fo_path; /* sanity check */ @@ -519,14 +517,10 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; - /* get ipv4 address */ - if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4) - return -EINVAL; - if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255) + if (rpc_pton(fo_path, size, sap, salen) == 0) return -EINVAL; - sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); + return nlmsvc_unlock_all_by_ip(sap); } /** @@ -783,10 +777,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) size -= len; mesg += len; } - - mutex_unlock(&nfsd_mutex); - return (mesg-buf); - + rv = mesg - buf; out_free: kfree(nthreads); mutex_unlock(&nfsd_mutex); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 8847f3f..01965b2 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -397,44 +397,51 @@ static inline void _fh_update_old(struct dentry *dentry, fh->ofh_dirino = 0; } -__be32 -fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, - struct svc_fh *ref_fh) +static bool is_root_export(struct svc_export *exp) { - /* ref_fh is a reference file handle. - * if it is non-null and for the same filesystem, then we should compose - * a filehandle which is of the same version, where possible. - * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca - * Then create a 32byte filehandle using nfs_fhbase_old - * - */ + return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root; +} - u8 version; - u8 fsid_type = 0; - struct inode * inode = dentry->d_inode; - struct dentry *parent = dentry->d_parent; - __u32 *datap; - dev_t ex_dev = exp->ex_path.dentry->d_inode->i_sb->s_dev; - int root_export = (exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root); +static struct super_block *exp_sb(struct svc_export *exp) +{ + return exp->ex_path.dentry->d_inode->i_sb; +} - dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n", - MAJOR(ex_dev), MINOR(ex_dev), - (long) exp->ex_path.dentry->d_inode->i_ino, - parent->d_name.name, dentry->d_name.name, - (inode ? inode->i_ino : 0)); +static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) +{ + switch (fsid_type) { + case FSID_DEV: + if (!old_valid_dev(exp_sb(exp)->s_dev)) + return 0; + /* FALL THROUGH */ + case FSID_MAJOR_MINOR: + case FSID_ENCODE_DEV: + return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV; + case FSID_NUM: + return exp->ex_flags & NFSEXP_FSID; + case FSID_UUID8: + case FSID_UUID16: + if (!is_root_export(exp)) + return 0; + /* fall through */ + case FSID_UUID4_INUM: + case FSID_UUID16_INUM: + return exp->ex_uuid != NULL; + } + return 1; +} - /* Choose filehandle version and fsid type based on - * the reference filehandle (if it is in the same export) - * or the export options. - */ - retry: + +static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh) +{ + u8 version; + u8 fsid_type; +retry: version = 1; if (ref_fh && ref_fh->fh_export == exp) { version = ref_fh->fh_handle.fh_version; fsid_type = ref_fh->fh_handle.fh_fsid_type; - if (ref_fh == fhp) - fh_put(ref_fh); ref_fh = NULL; switch (version) { @@ -447,58 +454,66 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, goto retry; } - /* Need to check that this type works for this - * export point. As the fsid -> filesystem mapping - * was guided by user-space, there is no guarantee - * that the filesystem actually supports that fsid - * type. If it doesn't we loop around again without - * ref_fh set. + /* + * As the fsid -> filesystem mapping was guided by + * user-space, there is no guarantee that the filesystem + * actually supports that fsid type. If it doesn't we + * loop around again without ref_fh set. */ - switch(fsid_type) { - case FSID_DEV: - if (!old_valid_dev(ex_dev)) - goto retry; - /* FALL THROUGH */ - case FSID_MAJOR_MINOR: - case FSID_ENCODE_DEV: - if (!(exp->ex_path.dentry->d_inode->i_sb->s_type->fs_flags - & FS_REQUIRES_DEV)) - goto retry; - break; - case FSID_NUM: - if (! (exp->ex_flags & NFSEXP_FSID)) - goto retry; - break; - case FSID_UUID8: - case FSID_UUID16: - if (!root_export) - goto retry; - /* fall through */ - case FSID_UUID4_INUM: - case FSID_UUID16_INUM: - if (exp->ex_uuid == NULL) - goto retry; - break; - } + if (!fsid_type_ok_for_exp(fsid_type, exp)) + goto retry; } else if (exp->ex_flags & NFSEXP_FSID) { fsid_type = FSID_NUM; } else if (exp->ex_uuid) { if (fhp->fh_maxsize >= 64) { - if (root_export) + if (is_root_export(exp)) fsid_type = FSID_UUID16; else fsid_type = FSID_UUID16_INUM; } else { - if (root_export) + if (is_root_export(exp)) fsid_type = FSID_UUID8; else fsid_type = FSID_UUID4_INUM; } - } else if (!old_valid_dev(ex_dev)) + } else if (!old_valid_dev(exp_sb(exp)->s_dev)) /* for newer device numbers, we must use a newer fsid format */ fsid_type = FSID_ENCODE_DEV; else fsid_type = FSID_DEV; + fhp->fh_handle.fh_version = version; + if (version) + fhp->fh_handle.fh_fsid_type = fsid_type; +} + +__be32 +fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, + struct svc_fh *ref_fh) +{ + /* ref_fh is a reference file handle. + * if it is non-null and for the same filesystem, then we should compose + * a filehandle which is of the same version, where possible. + * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca + * Then create a 32byte filehandle using nfs_fhbase_old + * + */ + + struct inode * inode = dentry->d_inode; + struct dentry *parent = dentry->d_parent; + __u32 *datap; + dev_t ex_dev = exp_sb(exp)->s_dev; + + dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n", + MAJOR(ex_dev), MINOR(ex_dev), + (long) exp->ex_path.dentry->d_inode->i_ino, + parent->d_name.name, dentry->d_name.name, + (inode ? inode->i_ino : 0)); + + /* Choose filehandle version and fsid type based on + * the reference filehandle (if it is in the same export) + * or the export options. + */ + set_version_and_fsid_type(fhp, exp, ref_fh); if (ref_fh == fhp) fh_put(ref_fh); @@ -516,7 +531,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, fhp->fh_export = exp; cache_get(&exp->h); - if (version == 0xca) { + if (fhp->fh_handle.fh_version == 0xca) { /* old style filehandle please */ memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE); fhp->fh_handle.fh_size = NFS_FHSIZE; @@ -530,22 +545,22 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, _fh_update_old(dentry, exp, &fhp->fh_handle); } else { int len; - fhp->fh_handle.fh_version = 1; fhp->fh_handle.fh_auth_type = 0; datap = fhp->fh_handle.fh_auth+0; - fhp->fh_handle.fh_fsid_type = fsid_type; - mk_fsid(fsid_type, datap, ex_dev, + mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev, exp->ex_path.dentry->d_inode->i_ino, exp->ex_fsid, exp->ex_uuid); - len = key_len(fsid_type); + len = key_len(fhp->fh_handle.fh_fsid_type); datap += len/4; fhp->fh_handle.fh_size = 4 + len; if (inode) _fh_update(fhp, exp, dentry); - if (fhp->fh_handle.fh_fileid_type == 255) + if (fhp->fh_handle.fh_fileid_type == 255) { + fh_put(fhp); return nfserr_opnotsupp; + } } return 0; @@ -639,8 +654,7 @@ enum fsid_source fsid_source(struct svc_fh *fhp) case FSID_DEV: case FSID_ENCODE_DEV: case FSID_MAJOR_MINOR: - if (fhp->fh_export->ex_path.dentry->d_inode->i_sb->s_type->fs_flags - & FS_REQUIRES_DEV) + if (exp_sb(fhp->fh_export)->s_type->fs_flags & FS_REQUIRES_DEV) return FSIDSOURCE_DEV; break; case FSID_NUM: diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 492c79b..67ea83e 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -34,6 +34,7 @@ #include <linux/nfsd/syscall.h> #include <linux/lockd/bind.h> #include <linux/nfsacl.h> +#include <linux/seq_file.h> #define NFSDDBG_FACILITY NFSDDBG_SVC @@ -66,6 +67,16 @@ struct timeval nfssvc_boot; DEFINE_MUTEX(nfsd_mutex); struct svc_serv *nfsd_serv; +/* + * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. + * nfsd_drc_max_pages limits the total amount of memory available for + * version 4.1 DRC caches. + * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. + */ +spinlock_t nfsd_drc_lock; +unsigned int nfsd_drc_max_mem; +unsigned int nfsd_drc_mem_used; + #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; static struct svc_version * nfsd_acl_version[] = { @@ -235,13 +246,12 @@ void nfsd_reset_versions(void) */ static void set_max_drc(void) { - /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */ - #define NFSD_DRC_SIZE_SHIFT 7 - nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages() - >> NFSD_DRC_SIZE_SHIFT; - nfsd_serv->sv_drc_pages_used = 0; - dprintk("%s svc_drc_max_pages %u\n", __func__, - nfsd_serv->sv_drc_max_pages); + #define NFSD_DRC_SIZE_SHIFT 10 + nfsd_drc_max_mem = (nr_free_buffer_pages() + >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; + nfsd_drc_mem_used = 0; + spin_lock_init(&nfsd_drc_lock); + dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem); } int nfsd_create_serv(void) @@ -401,7 +411,9 @@ nfsd_svc(unsigned short port, int nrservs) error = nfsd_racache_init(2*nrservs); if (error<0) goto out; - nfs4_state_start(); + error = nfs4_state_start(); + if (error) + goto out; nfsd_reset_versions(); @@ -496,7 +508,9 @@ nfsd(void *vrqstp) /* Lock the export hash tables for reading. */ exp_readlock(); + validate_process_creds(); svc_process(rqstp); + validate_process_creds(); /* Unlock export hash tables */ exp_readunlock(); @@ -567,10 +581,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) + rqstp->rq_res.head[0].iov_len; rqstp->rq_res.head[0].iov_len += sizeof(__be32); - /* NFSv4.1 DRC requires statp */ - if (rqstp->rq_vers == 4) - nfsd4_set_statp(rqstp, statp); - /* Now call the procedure handler, and encode NFS status. */ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); @@ -605,7 +615,25 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) int nfsd_pool_stats_open(struct inode *inode, struct file *file) { - if (nfsd_serv == NULL) + int ret; + mutex_lock(&nfsd_mutex); + if (nfsd_serv == NULL) { + mutex_unlock(&nfsd_mutex); return -ENODEV; - return svc_pool_stats_open(nfsd_serv, file); + } + /* bump up the psudo refcount while traversing */ + svc_get(nfsd_serv); + ret = svc_pool_stats_open(nfsd_serv, file); + mutex_unlock(&nfsd_mutex); + return ret; +} + +int nfsd_pool_stats_release(struct inode *inode, struct file *file) +{ + int ret = seq_release(inode, file); + mutex_lock(&nfsd_mutex); + /* this function really, really should have been called svc_put() */ + svc_destroy(nfsd_serv); + mutex_unlock(&nfsd_mutex); + return ret; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 23341c1..a293f02 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -89,6 +89,12 @@ struct raparm_hbucket { #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; +static inline int +nfsd_v4client(struct svc_rqst *rq) +{ + return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4; +} + /* * Called from nfsd_lookup and encode_dirent. Check if we have crossed * a mount point. @@ -115,7 +121,8 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, path_put(&path); goto out; } - if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { + if (nfsd_v4client(rqstp) || + (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { /* successfully crossed mount point */ /* * This is subtle: path.dentry is *not* on path.mnt @@ -684,6 +691,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, __be32 err; int host_err; + validate_process_creds(); + /* * If we get here, then the client has already done an "open", * and (hopefully) checked permission - so allow OWNER_OVERRIDE @@ -740,6 +749,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, out_nfserr: err = nfserrno(host_err); out: + validate_process_creds(); return err; } diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 72da095..251da07 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -1,6 +1,6 @@ config NILFS2_FS tristate "NILFS2 file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + depends on EXPERIMENTAL select CRC32 help NILFS2 is a log-structured file system (LFS) supporting continuous diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 99d58a0..08834df 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); } +/** + * nilfs_bmap_lookup_at_level - find a data block or node block + * @bmap: bmap + * @key: key + * @level: level + * @ptrp: place to store the value associated to @key + * + * Description: nilfs_bmap_lookup_at_level() finds a record whose key + * matches @key in the block at @level of the bmap. + * + * Return Value: On success, 0 is returned and the record associated with @key + * is stored in the place pointed by @ptrp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, __u64 *ptrp) { @@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, return ret; } -/** - * nilfs_bmap_lookup - find a record - * @bmap: bmap - * @key: key - * @recp: pointer to record - * - * Description: nilfs_bmap_lookup() finds a record whose key matches @key in - * @bmap. - * - * Return Value: On success, 0 is returned and the record associated with @key - * is stored in the place pointed by @recp. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - A record associated with @key does not exist. - */ -int nilfs_bmap_lookup(struct nilfs_bmap *bmap, - unsigned long key, - unsigned long *recp) -{ - __u64 ptr; - int ret; - - /* XXX: use macro for level 1 */ - ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr); - if (recp != NULL) - *recp = ptr; - return ret; -} - static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; @@ -469,104 +456,6 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) (entries_per_group / NILFS_BMAP_GROUP_DIV); } -int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req, - sector_t blocknr) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - int ret; - - ret = nilfs_dat_prepare_start(dat, &req->bpr_req); - if (likely(!ret)) - nilfs_dat_commit_start(dat, &req->bpr_req, blocknr); - return ret; -} - -int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, - bmap->b_ptr_type == NILFS_BMAP_PTR_VS); -} - -void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr, - sector_t blocknr) -{ - return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr); -} - -int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr) -{ - return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr); -} - -int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - int ret; - - ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req); - if (ret < 0) - return ret; - ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req); - if (ret < 0) - nilfs_dat_abort_end(dat, &oldreq->bpr_req); - - return ret; -} - -void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - - nilfs_dat_commit_end(dat, &oldreq->bpr_req, - bmap->b_ptr_type == NILFS_BMAP_PTR_VS); - nilfs_dat_commit_alloc(dat, &newreq->bpr_req); -} - -void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - - nilfs_dat_abort_end(dat, &oldreq->bpr_req); - nilfs_dat_abort_alloc(dat, &newreq->bpr_req); -} - static struct lock_class_key nilfs_bmap_dat_lock_key; static struct lock_class_key nilfs_bmap_mdt_lock_key; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index b2890cd..9980d7d 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -28,6 +28,7 @@ #include <linux/buffer_head.h> #include <linux/nilfs2_fs.h> #include "alloc.h" +#include "dat.h" #define NILFS_BMAP_INVALID_PTR 0 @@ -141,7 +142,6 @@ struct nilfs_bmap { int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); -int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *); int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); @@ -160,90 +160,76 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); +static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, + __u64 *ptr) +{ + return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr); +} + /* * Internal use only */ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); -int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - return nilfs_bmap_prepare_alloc_v(bmap, req); + if (dat) + return nilfs_dat_prepare_alloc(dat, &req->bpr_req); /* ignore target ptr */ req->bpr_ptr = bmap->b_last_allocated_ptr++; return 0; } static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_commit_alloc_v(bmap, req); + if (dat) + nilfs_dat_commit_alloc(dat, &req->bpr_req); } static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_abort_alloc_v(bmap, req); + if (dat) + nilfs_dat_abort_alloc(dat, &req->bpr_req); else bmap->b_last_allocated_ptr--; } -int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); - static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - return NILFS_BMAP_USE_VBN(bmap) ? - nilfs_bmap_prepare_end_v(bmap, req) : 0; + return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0; } static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_commit_end_v(bmap, req); + if (dat) + nilfs_dat_commit_end(dat, &req->bpr_req, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); } static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_abort_end_v(bmap, req); + if (dat) + nilfs_dat_abort_end(dat, &req->bpr_req); } -int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *, - sector_t); -int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t); -int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64); - - __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, const struct buffer_head *); __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); -int nilfs_bmap_prepare_update_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_update_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_update_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); - void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index c668bca..5941958 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -36,6 +36,7 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc) { + memset(btnc, 0, sizeof(*btnc)); INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC); spin_lock_init(&btnc->tree_lock); INIT_LIST_HEAD(&btnc->private_list); @@ -46,7 +47,7 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc) INIT_LIST_HEAD(&btnc->i_mmap_nonlinear); } -static struct address_space_operations def_btnode_aops = { +static const struct address_space_operations def_btnode_aops = { .sync_page = block_sync_page, }; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index aa41272..e25b507 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void) kmem_cache_destroy(nilfs_btree_path_cache); } -static inline struct nilfs_btree_path * -nilfs_btree_alloc_path(const struct nilfs_btree *btree) +static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void) { - return (struct nilfs_btree_path *) - kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); + return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); } -static inline void nilfs_btree_free_path(const struct nilfs_btree *btree, - struct nilfs_btree_path *path) +static inline void nilfs_btree_free_path(struct nilfs_btree_path *path) { kmem_cache_free(nilfs_btree_path_cache, path); } -static void nilfs_btree_init_path(const struct nilfs_btree *btree, - struct nilfs_btree_path *path) +static void nilfs_btree_init_path(struct nilfs_btree_path *path) { int level; @@ -101,26 +97,13 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree, } } -static void nilfs_btree_clear_path(const struct nilfs_btree *btree, - struct nilfs_btree_path *path) +static void nilfs_btree_release_path(struct nilfs_btree_path *path) { int level; - for (level = NILFS_BTREE_LEVEL_DATA; - level < NILFS_BTREE_LEVEL_MAX; - level++) { - if (path[level].bp_bh != NULL) { - brelse(path[level].bp_bh); - path[level].bp_bh = NULL; - } - /* sib_bh is released or deleted by prepare or commit - * operations. */ - path[level].bp_sib_bh = NULL; - path[level].bp_index = 0; - path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; - path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; - path[level].bp_op = NULL; - } + for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; + level++) + brelse(path[level].bp_bh); } /* @@ -148,129 +131,110 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, } static inline int -nilfs_btree_node_get_flags(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) { return node->bn_flags; } static inline void -nilfs_btree_node_set_flags(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int flags) +nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) { node->bn_flags = flags; } -static inline int nilfs_btree_node_root(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node) { - return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT; + return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; } static inline int -nilfs_btree_node_get_level(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_get_level(const struct nilfs_btree_node *node) { return node->bn_level; } static inline void -nilfs_btree_node_set_level(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int level) +nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) { node->bn_level = level; } static inline int -nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) { return le16_to_cpu(node->bn_nchildren); } static inline void -nilfs_btree_node_set_nchildren(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int nchildren) +nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) { node->bn_nchildren = cpu_to_le16(nchildren); } -static inline int -nilfs_btree_node_size(const struct nilfs_btree *btree) +static inline int nilfs_btree_node_size(const struct nilfs_btree *btree) { return 1 << btree->bt_bmap.b_inode->i_blkbits; } static inline int -nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node, + const struct nilfs_btree *btree) { - return nilfs_btree_node_root(btree, node) ? + return nilfs_btree_node_root(node) ? NILFS_BTREE_ROOT_NCHILDREN_MIN : NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); } static inline int -nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node, + const struct nilfs_btree *btree) { - return nilfs_btree_node_root(btree, node) ? + return nilfs_btree_node_root(node) ? NILFS_BTREE_ROOT_NCHILDREN_MAX : NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); } static inline __le64 * -nilfs_btree_node_dkeys(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) { return (__le64 *)((char *)(node + 1) + - (nilfs_btree_node_root(btree, node) ? + (nilfs_btree_node_root(node) ? 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); } static inline __le64 * -nilfs_btree_node_dptrs(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, + const struct nilfs_btree *btree) { - return (__le64 *)(nilfs_btree_node_dkeys(btree, node) + - nilfs_btree_node_nchildren_max(btree, node)); + return (__le64 *)(nilfs_btree_node_dkeys(node) + + nilfs_btree_node_nchildren_max(node, btree)); } static inline __u64 -nilfs_btree_node_get_key(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, int index) +nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) { - return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) + - index)); + return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index)); } static inline void -nilfs_btree_node_set_key(struct nilfs_btree *btree, - struct nilfs_btree_node *node, int index, __u64 key) +nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) { - *(nilfs_btree_node_dkeys(btree, node) + index) = - nilfs_bmap_key_to_dkey(key); + *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key); } static inline __u64 nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, - int index) + const struct nilfs_btree_node *node, int index) { - return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) + + return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) + index)); } static inline void nilfs_btree_node_set_ptr(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int index, - __u64 ptr) + struct nilfs_btree_node *node, int index, __u64 ptr) { - *(nilfs_btree_node_dptrs(btree, node) + index) = + *(nilfs_btree_node_dptrs(node, btree) + index) = nilfs_bmap_ptr_to_dptr(ptr); } @@ -283,12 +247,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree, __le64 *dptrs; int i; - nilfs_btree_node_set_flags(btree, node, flags); - nilfs_btree_node_set_level(btree, node, level); - nilfs_btree_node_set_nchildren(btree, node, nchildren); + nilfs_btree_node_set_flags(node, flags); + nilfs_btree_node_set_level(node, level); + nilfs_btree_node_set_nchildren(node, nchildren); - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); for (i = 0; i < nchildren; i++) { dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); @@ -305,13 +269,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree, __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; - ldkeys = nilfs_btree_node_dkeys(btree, left); - ldptrs = nilfs_btree_node_dptrs(btree, left); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, btree); + lnchildren = nilfs_btree_node_get_nchildren(left); - rdkeys = nilfs_btree_node_dkeys(btree, right); - rdptrs = nilfs_btree_node_dptrs(btree, right); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, btree); + rnchildren = nilfs_btree_node_get_nchildren(right); memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); @@ -320,8 +284,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree, lnchildren += n; rnchildren -= n; - nilfs_btree_node_set_nchildren(btree, left, lnchildren); - nilfs_btree_node_set_nchildren(btree, right, rnchildren); + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); } /* Assume that the buffer heads corresponding to left and right are locked. */ @@ -334,13 +298,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree, __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; - ldkeys = nilfs_btree_node_dkeys(btree, left); - ldptrs = nilfs_btree_node_dptrs(btree, left); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, btree); + lnchildren = nilfs_btree_node_get_nchildren(left); - rdkeys = nilfs_btree_node_dkeys(btree, right); - rdptrs = nilfs_btree_node_dptrs(btree, right); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, btree); + rnchildren = nilfs_btree_node_get_nchildren(right); memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); @@ -349,8 +313,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree, lnchildren -= n; rnchildren += n; - nilfs_btree_node_set_nchildren(btree, left, lnchildren); - nilfs_btree_node_set_nchildren(btree, right, rnchildren); + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); } /* Assume that the buffer head corresponding to node is locked. */ @@ -362,9 +326,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree, __le64 *dptrs; int nchildren; - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); - nchildren = nilfs_btree_node_get_nchildren(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); + nchildren = nilfs_btree_node_get_nchildren(node); if (index < nchildren) { memmove(dkeys + index + 1, dkeys + index, (nchildren - index) * sizeof(*dkeys)); @@ -374,7 +338,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree, dkeys[index] = nilfs_bmap_key_to_dkey(key); dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); nchildren++; - nilfs_btree_node_set_nchildren(btree, node, nchildren); + nilfs_btree_node_set_nchildren(node, nchildren); } /* Assume that the buffer head corresponding to node is locked. */ @@ -388,11 +352,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree, __le64 *dptrs; int nchildren; - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); key = nilfs_bmap_dkey_to_key(dkeys[index]); ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); - nchildren = nilfs_btree_node_get_nchildren(btree, node); + nchildren = nilfs_btree_node_get_nchildren(node); if (keyp != NULL) *keyp = key; if (ptrp != NULL) @@ -405,11 +369,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree, (nchildren - index - 1) * sizeof(*dptrs)); } nchildren--; - nilfs_btree_node_set_nchildren(btree, node, nchildren); + nilfs_btree_node_set_nchildren(node, nchildren); } -static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, +static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, __u64 key, int *indexp) { __u64 nkey; @@ -417,12 +380,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, /* binary search */ low = 0; - high = nilfs_btree_node_get_nchildren(btree, node) - 1; + high = nilfs_btree_node_get_nchildren(node) - 1; index = 0; s = 0; while (low <= high) { index = (low + high) / 2; - nkey = nilfs_btree_node_get_key(btree, node, index); + nkey = nilfs_btree_node_get_key(node, index); if (nkey == key) { s = 0; goto out; @@ -436,9 +399,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, } /* adjust index */ - if (nilfs_btree_node_get_level(btree, node) > - NILFS_BTREE_LEVEL_NODE_MIN) { - if ((s > 0) && (index > 0)) + if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) { + if (s > 0 && index > 0) index--; } else if (s < 0) index++; @@ -456,25 +418,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree) } static inline struct nilfs_btree_node * -nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree, - const struct nilfs_btree_path *path, - int level) +nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_bh->b_data; } static inline struct nilfs_btree_node * -nilfs_btree_get_sib_node(const struct nilfs_btree *btree, - const struct nilfs_btree_path *path, - int level) +nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; } static inline int nilfs_btree_height(const struct nilfs_btree *btree) { - return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree)) - + 1; + return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; } static inline struct nilfs_btree_node * @@ -484,7 +441,7 @@ nilfs_btree_get_node(const struct nilfs_btree *btree, { return (level == nilfs_btree_height(btree) - 1) ? nilfs_btree_get_root(btree) : - nilfs_btree_get_nonroot_node(btree, path, level); + nilfs_btree_get_nonroot_node(path, level); } static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, @@ -496,12 +453,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, int level, index, found, ret; node = nilfs_btree_get_root(btree); - level = nilfs_btree_node_get_level(btree, node); - if ((level < minlevel) || - (nilfs_btree_node_get_nchildren(btree, node) <= 0)) + level = nilfs_btree_node_get_level(node); + if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0) return -ENOENT; - found = nilfs_btree_node_lookup(btree, node, key, &index); + found = nilfs_btree_node_lookup(node, key, &index); ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_bh = NULL; path[level].bp_index = index; @@ -510,14 +466,13 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); if (ret < 0) return ret; - node = nilfs_btree_get_nonroot_node(btree, path, level); - BUG_ON(level != nilfs_btree_node_get_level(btree, node)); + node = nilfs_btree_get_nonroot_node(path, level); + BUG_ON(level != nilfs_btree_node_get_level(node)); if (!found) - found = nilfs_btree_node_lookup(btree, node, key, - &index); + found = nilfs_btree_node_lookup(node, key, &index); else index = 0; - if (index < nilfs_btree_node_nchildren_max(btree, node)) + if (index < nilfs_btree_node_nchildren_max(node, btree)) ptr = nilfs_btree_node_get_ptr(btree, node, index); else { WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); @@ -544,10 +499,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, int index, level, ret; node = nilfs_btree_get_root(btree); - index = nilfs_btree_node_get_nchildren(btree, node) - 1; + index = nilfs_btree_node_get_nchildren(node) - 1; if (index < 0) return -ENOENT; - level = nilfs_btree_node_get_level(btree, node); + level = nilfs_btree_node_get_level(node); ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_bh = NULL; path[level].bp_index = index; @@ -556,15 +511,15 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); if (ret < 0) return ret; - node = nilfs_btree_get_nonroot_node(btree, path, level); - BUG_ON(level != nilfs_btree_node_get_level(btree, node)); - index = nilfs_btree_node_get_nchildren(btree, node) - 1; + node = nilfs_btree_get_nonroot_node(path, level); + BUG_ON(level != nilfs_btree_node_get_level(node)); + index = nilfs_btree_node_get_nchildren(node) - 1; ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_index = index; } if (keyp != NULL) - *keyp = nilfs_btree_node_get_key(btree, node, index); + *keyp = nilfs_btree_node_get_key(node, index); if (ptrp != NULL) *ptrp = ptr; @@ -580,18 +535,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, int ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); if (ptrp != NULL) *ptrp = ptr; - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -608,10 +563,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, int level = NILFS_BTREE_LEVEL_NODE_MIN; int ret, cnt, index, maxlevel; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); if (ret < 0) goto out; @@ -631,8 +586,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, node = nilfs_btree_get_node(btree, path, level); index = path[level].bp_index + 1; for (;;) { - while (index < nilfs_btree_node_get_nchildren(btree, node)) { - if (nilfs_btree_node_get_key(btree, node, index) != + while (index < nilfs_btree_node_get_nchildren(node)) { + if (nilfs_btree_node_get_key(node, index) != key + cnt) goto end; ptr2 = nilfs_btree_node_get_ptr(btree, node, index); @@ -653,8 +608,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, /* look-up right sibling node */ node = nilfs_btree_get_node(btree, path, level + 1); index = path[level + 1].bp_index + 1; - if (index >= nilfs_btree_node_get_nchildren(btree, node) || - nilfs_btree_node_get_key(btree, node, index) != key + cnt) + if (index >= nilfs_btree_node_get_nchildren(node) || + nilfs_btree_node_get_key(node, index) != key + cnt) break; ptr2 = nilfs_btree_node_get_ptr(btree, node, index); path[level + 1].bp_index = index; @@ -664,7 +619,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); if (ret < 0) goto out; - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); index = 0; path[level].bp_index = index; } @@ -672,8 +627,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, *ptrp = ptr; ret = cnt; out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -685,9 +640,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, do { lock_buffer(path[level].bp_bh); nilfs_btree_node_set_key( - btree, - nilfs_btree_get_nonroot_node( - btree, path, level), + nilfs_btree_get_nonroot_node(path, level), path[level].bp_index, key); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -698,8 +651,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, /* root */ if (level == nilfs_btree_height(btree) - 1) { - nilfs_btree_node_set_key(btree, - nilfs_btree_get_root(btree), + nilfs_btree_node_set_key(nilfs_btree_get_root(btree), path[level].bp_index, key); } } @@ -712,7 +664,7 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, if (level < nilfs_btree_height(btree) - 1) { lock_buffer(path[level].bp_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_insert(btree, node, *keyp, *ptrp, path[level].bp_index); if (!buffer_dirty(path[level].bp_bh)) @@ -721,8 +673,8 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key( - btree, node, 0)); + nilfs_btree_node_get_key(node, + 0)); } else { node = nilfs_btree_get_root(btree); nilfs_btree_node_insert(btree, node, *keyp, *ptrp, @@ -740,10 +692,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - left = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); move = 0; n = (nchildren + lnchildren + 1) / 2 - lnchildren; @@ -764,7 +716,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, unlock_buffer(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, node, 0)); + nilfs_btree_node_get_key(node, 0)); if (move) { brelse(path[level].bp_bh); @@ -791,10 +743,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); move = 0; n = (nchildren + rnchildren + 1) / 2 - rnchildren; @@ -816,15 +768,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, right, 0)); + nilfs_btree_node_get_key(right, 0)); path[level + 1].bp_index--; if (move) { brelse(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; - path[level].bp_index -= - nilfs_btree_node_get_nchildren(btree, node); + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); path[level + 1].bp_index++; } else { brelse(path[level].bp_sib_bh); @@ -846,9 +797,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); move = 0; n = (nchildren + 1) / 2; @@ -867,16 +818,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree, unlock_buffer(path[level].bp_bh); unlock_buffer(path[level].bp_sib_bh); - newkey = nilfs_btree_node_get_key(btree, right, 0); + newkey = nilfs_btree_node_get_key(right, 0); newptr = path[level].bp_newreq.bpr_ptr; if (move) { - path[level].bp_index -= - nilfs_btree_node_get_nchildren(btree, node); + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); nilfs_btree_node_insert(btree, right, *keyp, *ptrp, path[level].bp_index); - *keyp = nilfs_btree_node_get_key(btree, right, 0); + *keyp = nilfs_btree_node_get_key(right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; brelse(path[level].bp_bh); @@ -885,7 +835,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree, } else { nilfs_btree_do_insert(btree, path, level, keyp, ptrp); - *keyp = nilfs_btree_node_get_key(btree, right, 0); + *keyp = nilfs_btree_node_get_key(right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; brelse(path[level].bp_sib_bh); @@ -905,12 +855,12 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, lock_buffer(path[level].bp_sib_bh); root = nilfs_btree_get_root(btree); - child = nilfs_btree_get_sib_node(btree, path, level); + child = nilfs_btree_get_sib_node(path, level); - n = nilfs_btree_node_get_nchildren(btree, root); + n = nilfs_btree_node_get_nchildren(root); nilfs_btree_node_move_right(btree, root, child, n); - nilfs_btree_node_set_level(btree, root, level + 1); + nilfs_btree_node_set_level(root, level + 1); if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); @@ -922,7 +872,7 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, nilfs_btree_do_insert(btree, path, level, keyp, ptrp); - *keyp = nilfs_btree_node_get_key(btree, child, 0); + *keyp = nilfs_btree_node_get_key(child, 0); *ptrp = path[level].bp_newreq.bpr_ptr; } @@ -990,26 +940,29 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, struct nilfs_btree_node *node, *parent, *sib; __u64 sibptr; int pindex, level, ret; + struct inode *dat = NULL; stats->bs_nblocks = 0; level = NILFS_BTREE_LEVEL_DATA; /* allocate a new ptr for data block */ - if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) + if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { path[level].bp_newreq.bpr_ptr = nilfs_btree_find_target_v(btree, path, key); + dat = nilfs_bmap_get_dat(&btree->bt_bmap); + } ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); if (ret < 0) goto err_out_data; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { - node = nilfs_btree_get_nonroot_node(btree, path, level); - if (nilfs_btree_node_get_nchildren(btree, node) < - nilfs_btree_node_nchildren_max(btree, node)) { + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_node_get_nchildren(node) < + nilfs_btree_node_nchildren_max(node, btree)) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; @@ -1026,8 +979,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) < - nilfs_btree_node_nchildren_max(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) < + nilfs_btree_node_nchildren_max(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_left; stats->bs_nblocks++; @@ -1038,15 +991,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* right sibling */ if (pindex < - nilfs_btree_node_get_nchildren(btree, parent) - 1) { + nilfs_btree_node_get_nchildren(parent) - 1) { sibptr = nilfs_btree_node_get_ptr(btree, parent, pindex + 1); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) < - nilfs_btree_node_nchildren_max(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) < + nilfs_btree_node_nchildren_max(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_right; stats->bs_nblocks++; @@ -1059,7 +1012,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; ret = nilfs_btree_get_new_block(btree, @@ -1081,8 +1034,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* root */ node = nilfs_btree_get_root(btree); - if (nilfs_btree_node_get_nchildren(btree, node) < - nilfs_btree_node_nchildren_max(btree, node)) { + if (nilfs_btree_node_get_nchildren(node) < + nilfs_btree_node_nchildren_max(node, btree)) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; @@ -1091,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* grow */ path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, @@ -1119,16 +1072,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* error */ err_out_curr_node: - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); + nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, + dat); err_out_child_node: for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { nilfs_btnode_delete(path[level].bp_sib_bh); nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); } - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); + nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, + dat); err_out_data: *levelp = level; stats->bs_nblocks = 0; @@ -1139,16 +1094,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree, struct nilfs_btree_path *path, int maxlevel, __u64 key, __u64 ptr) { + struct inode *dat = NULL; int level; set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; - if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) + if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { nilfs_btree_set_target_v(btree, key, ptr); + dat = nilfs_bmap_get_dat(&btree->bt_bmap); + } for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, - &path[level - 1].bp_newreq); + &path[level - 1].bp_newreq, dat); path[level].bp_op(btree, path, level, &key, &ptr); } @@ -1164,10 +1122,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) int level, ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN); @@ -1184,8 +1142,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -1197,7 +1155,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, if (level < nilfs_btree_height(btree) - 1) { lock_buffer(path[level].bp_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_delete(btree, node, keyp, ptrp, path[level].bp_index); if (!buffer_dirty(path[level].bp_bh)) @@ -1205,7 +1163,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, unlock_buffer(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, node, 0)); + nilfs_btree_node_get_key(node, 0)); } else { node = nilfs_btree_get_root(btree); nilfs_btree_node_delete(btree, node, keyp, ptrp, @@ -1225,10 +1183,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - left = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); n = (nchildren + lnchildren) / 2 - nchildren; @@ -1243,7 +1201,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, unlock_buffer(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, node, 0)); + nilfs_btree_node_get_key(node, 0)); brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; @@ -1262,10 +1220,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); n = (nchildren + rnchildren) / 2 - nchildren; @@ -1281,7 +1239,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, right, 0)); + nilfs_btree_node_get_key(right, 0)); path[level + 1].bp_index--; brelse(path[level].bp_sib_bh); @@ -1300,10 +1258,10 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - left = nilfs_btree_get_sib_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); - n = nilfs_btree_node_get_nchildren(btree, node); + n = nilfs_btree_node_get_nchildren(node); nilfs_btree_node_move_left(btree, left, node, n); @@ -1316,7 +1274,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; - path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); + path[level].bp_index += nilfs_btree_node_get_nchildren(left); } static void nilfs_btree_concat_right(struct nilfs_btree *btree, @@ -1331,10 +1289,10 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); - n = nilfs_btree_node_get_nchildren(btree, right); + n = nilfs_btree_node_get_nchildren(right); nilfs_btree_node_move_left(btree, node, right, n); @@ -1360,11 +1318,11 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); root = nilfs_btree_get_root(btree); - child = nilfs_btree_get_nonroot_node(btree, path, level); + child = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_delete(btree, root, NULL, NULL, 0); - nilfs_btree_node_set_level(btree, root, level); - n = nilfs_btree_node_get_nchildren(btree, child); + nilfs_btree_node_set_level(root, level); + n = nilfs_btree_node_get_nchildren(child); nilfs_btree_node_move_left(btree, root, child, n); unlock_buffer(path[level].bp_bh); @@ -1376,7 +1334,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, struct nilfs_btree_path *path, int *levelp, - struct nilfs_bmap_stats *stats) + struct nilfs_bmap_stats *stats, + struct inode *dat) { struct buffer_head *bh; struct nilfs_btree_node *node, *parent, *sib; @@ -1388,17 +1347,17 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); path[level].bp_oldreq.bpr_ptr = nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; - if (nilfs_btree_node_get_nchildren(btree, node) > - nilfs_btree_node_nchildren_min(btree, node)) { + if (nilfs_btree_node_get_nchildren(node) > + nilfs_btree_node_nchildren_min(node, btree)) { path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; goto out; @@ -1415,8 +1374,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) > - nilfs_btree_node_nchildren_min(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) > + nilfs_btree_node_nchildren_min(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_left; stats->bs_nblocks++; @@ -1428,7 +1387,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* continue; */ } } else if (pindex < - nilfs_btree_node_get_nchildren(btree, parent) - 1) { + nilfs_btree_node_get_nchildren(parent) - 1) { /* right sibling */ sibptr = nilfs_btree_node_get_ptr(btree, parent, pindex + 1); @@ -1436,8 +1395,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) > - nilfs_btree_node_nchildren_min(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) > + nilfs_btree_node_nchildren_min(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_right; stats->bs_nblocks++; @@ -1452,7 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* no siblings */ /* the only child of the root node */ WARN_ON(level != nilfs_btree_height(btree) - 2); - if (nilfs_btree_node_get_nchildren(btree, node) - 1 <= + if (nilfs_btree_node_get_nchildren(node) - 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { path[level].bp_op = nilfs_btree_shrink; stats->bs_nblocks += 2; @@ -1471,7 +1430,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; @@ -1486,12 +1445,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* error */ err_out_curr_node: - nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq); + nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat); err_out_child_node: for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { brelse(path[level].bp_sib_bh); nilfs_bmap_abort_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); } *levelp = level; stats->bs_nblocks = 0; @@ -1500,13 +1459,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, static void nilfs_btree_commit_delete(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int maxlevel) + int maxlevel, struct inode *dat) { int level; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { nilfs_bmap_commit_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); path[level].bp_op(btree, path, level, NULL, NULL); } @@ -1520,27 +1479,32 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) struct nilfs_btree *btree; struct nilfs_btree_path *path; struct nilfs_bmap_stats stats; + struct inode *dat; int level, ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN); if (ret < 0) goto out; - ret = nilfs_btree_prepare_delete(btree, path, &level, &stats); + + dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ? + nilfs_bmap_get_dat(&btree->bt_bmap) : NULL; + + ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); if (ret < 0) goto out; - nilfs_btree_commit_delete(btree, path, level); + nilfs_btree_commit_delete(btree, path, level, dat); nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -1551,15 +1515,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) int ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -1581,7 +1545,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) node = root; break; case 3: - nchildren = nilfs_btree_node_get_nchildren(btree, root); + nchildren = nilfs_btree_node_get_nchildren(root); if (nchildren > 1) return 0; ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); @@ -1594,10 +1558,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) return 0; } - nchildren = nilfs_btree_node_get_nchildren(btree, node); - maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1); + nchildren = nilfs_btree_node_get_nchildren(node); + maxkey = nilfs_btree_node_get_key(node, nchildren - 1); nextmaxkey = (nchildren > 1) ? - nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; + nilfs_btree_node_get_key(node, nchildren - 2) : 0; if (bh != NULL) brelse(bh); @@ -1623,7 +1587,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, node = root; break; case 3: - nchildren = nilfs_btree_node_get_nchildren(btree, root); + nchildren = nilfs_btree_node_get_nchildren(root); WARN_ON(nchildren > 1); ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); ret = nilfs_btree_get_block(btree, ptr, &bh); @@ -1636,11 +1600,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, return -EINVAL; } - nchildren = nilfs_btree_node_get_nchildren(btree, node); + nchildren = nilfs_btree_node_get_nchildren(node); if (nchildren < nitems) nitems = nchildren; - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); for (i = 0; i < nitems; i++) { keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); @@ -1660,18 +1624,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, struct nilfs_bmap_stats *stats) { struct buffer_head *bh; - struct nilfs_btree *btree; + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; + struct inode *dat = NULL; int ret; - btree = (struct nilfs_btree *)bmap; stats->bs_nblocks = 0; /* for data */ /* cannot find near ptr */ - if (NILFS_BMAP_USE_VBN(bmap)) + if (NILFS_BMAP_USE_VBN(bmap)) { dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); + dat = nilfs_bmap_get_dat(bmap); + } - ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq); + ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat); if (ret < 0) return ret; @@ -1679,7 +1645,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, stats->bs_nblocks++; if (nreq != NULL) { nreq->bpr_ptr = dreq->bpr_ptr + 1; - ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq); + ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat); if (ret < 0) goto err_out_dreq; @@ -1696,9 +1662,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, /* error */ err_out_nreq: - nilfs_bmap_abort_alloc_ptr(bmap, nreq); + nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat); err_out_dreq: - nilfs_bmap_abort_alloc_ptr(bmap, dreq); + nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat); stats->bs_nblocks = 0; return ret; @@ -1713,8 +1679,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *nreq, struct buffer_head *bh) { - struct nilfs_btree *btree; + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; struct nilfs_btree_node *node; + struct inode *dat; __u64 tmpptr; /* free resources */ @@ -1725,11 +1692,11 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); /* convert and insert */ - btree = (struct nilfs_btree *)bmap; + dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; nilfs_btree_init(bmap); if (nreq != NULL) { - nilfs_bmap_commit_alloc_ptr(bmap, dreq); - nilfs_bmap_commit_alloc_ptr(bmap, nreq); + nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); + nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); /* create child node at level 1 */ lock_buffer(bh); @@ -1751,7 +1718,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 2, 1, &keys[0], &tmpptr); } else { - nilfs_bmap_commit_alloc_ptr(bmap, dreq); + nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); /* create root node at level 1 */ node = nilfs_btree_get_root(btree); @@ -1822,7 +1789,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree, static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level) + int level, struct inode *dat) { struct nilfs_btree_node *parent; int ret; @@ -1832,9 +1799,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; - ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); if (ret < 0) return ret; @@ -1846,9 +1812,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, &path[level].bp_ctxt); if (ret < 0) { - nilfs_bmap_abort_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_dat_abort_update(dat, + &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); return ret; } } @@ -1858,13 +1824,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level) + int level, struct inode *dat) { struct nilfs_btree_node *parent; - nilfs_bmap_commit_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req, + btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS); if (buffer_nilfs_node(path[level].bp_bh)) { nilfs_btnode_commit_change_key( @@ -1881,11 +1847,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level) + int level, struct inode *dat) { - nilfs_bmap_abort_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); if (buffer_nilfs_node(path[level].bp_bh)) nilfs_btnode_abort_change_key( &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, @@ -1894,14 +1859,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int minlevel, - int *maxlevelp) + int minlevel, int *maxlevelp, + struct inode *dat) { int level, ret; level = minlevel; if (!buffer_nilfs_volatile(path[level].bp_bh)) { - ret = nilfs_btree_prepare_update_v(btree, path, level); + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); if (ret < 0) return ret; } @@ -1909,7 +1874,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, !buffer_dirty(path[level].bp_bh)) { WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); - ret = nilfs_btree_prepare_update_v(btree, path, level); + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); if (ret < 0) goto out; } @@ -1921,39 +1886,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, /* error */ out: while (--level > minlevel) - nilfs_btree_abort_update_v(btree, path, level); + nilfs_btree_abort_update_v(btree, path, level, dat); if (!buffer_nilfs_volatile(path[level].bp_bh)) - nilfs_btree_abort_update_v(btree, path, level); + nilfs_btree_abort_update_v(btree, path, level, dat); return ret; } static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int minlevel, - int maxlevel, - struct buffer_head *bh) + int minlevel, int maxlevel, + struct buffer_head *bh, + struct inode *dat) { int level; if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) - nilfs_btree_commit_update_v(btree, path, minlevel); + nilfs_btree_commit_update_v(btree, path, minlevel, dat); for (level = minlevel + 1; level <= maxlevel; level++) - nilfs_btree_commit_update_v(btree, path, level); + nilfs_btree_commit_update_v(btree, path, level, dat); } static int nilfs_btree_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level, - struct buffer_head *bh) + int level, struct buffer_head *bh) { int maxlevel, ret; struct nilfs_btree_node *parent; + struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); __u64 ptr; get_bh(bh); path[level].bp_bh = bh; - ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel); + ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel, + dat); if (ret < 0) goto out; @@ -1961,12 +1927,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree, parent = nilfs_btree_get_node(btree, path, level + 1); ptr = nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); - ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr); + ret = nilfs_dat_mark_dirty(dat, ptr); if (ret < 0) goto out; } - nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh); + nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat); out: brelse(path[level].bp_bh); @@ -1986,15 +1952,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, WARN_ON(!buffer_dirty(bh)); btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); if (buffer_nilfs_node(bh)) { node = (struct nilfs_btree_node *)bh->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); - level = nilfs_btree_node_get_level(btree, node); + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); } else { key = nilfs_bmap_data_get_key(bmap, bh); level = NILFS_BTREE_LEVEL_DATA; @@ -2013,8 +1979,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, nilfs_btree_propagate_p(btree, path, level, bh); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -2022,7 +1988,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, struct buffer_head *bh) { - return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr); + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr); } static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, @@ -2037,12 +2003,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, get_bh(bh); node = (struct nilfs_btree_node *)bh->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); - level = nilfs_btree_node_get_level(btree, node); + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); list_for_each(head, &lists[level]) { cbh = list_entry(head, struct buffer_head, b_assoc_buffers); cnode = (struct nilfs_btree_node *)cbh->b_data; - ckey = nilfs_btree_node_get_key(btree, cnode, 0); + ckey = nilfs_btree_node_get_key(cnode, 0); if (key < ckey) break; } @@ -2120,8 +2086,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree, nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, blocknr); - key = nilfs_btree_node_get_key(btree, parent, - path[level + 1].bp_index); + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); binfo->bi_dat.bi_level = level; @@ -2137,6 +2102,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, union nilfs_binfo *binfo) { struct nilfs_btree_node *parent; + struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); __u64 key; __u64 ptr; union nilfs_bmap_ptr_req req; @@ -2146,12 +2112,12 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, ptr = nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); req.bpr_ptr = ptr; - ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr); - if (unlikely(ret < 0)) + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (ret < 0) return ret; + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); - key = nilfs_btree_node_get_key(btree, parent, - path[level + 1].bp_index); + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); @@ -2171,15 +2137,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, int level, ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); - level = nilfs_btree_node_get_level(btree, node); + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); } else { key = nilfs_bmap_data_get_key(bmap, *bh); level = NILFS_BTREE_LEVEL_DATA; @@ -2196,8 +2162,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -2207,19 +2173,18 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, sector_t blocknr, union nilfs_binfo *binfo) { - struct nilfs_btree *btree; struct nilfs_btree_node *node; __u64 key; int ret; - btree = (struct nilfs_btree *)bmap; - ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); + ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr, + blocknr); if (ret < 0) return ret; if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); + key = nilfs_btree_node_get_key(node, 0); } else key = nilfs_bmap_data_get_key(bmap, *bh); @@ -2239,10 +2204,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) int ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); if (ret < 0) { @@ -2262,8 +2227,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) nilfs_bmap_set_dirty(&btree->bt_bmap); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index aec942c..1c6cfb5 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -815,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) void *kaddr; int ret; - if (cno == 0) - return -ENOENT; /* checkpoint number 0 is invalid */ + /* CP number is invalid if it's zero or larger than the + largest exist one.*/ + if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) + return -ENOENT; down_read(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); @@ -824,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) goto out; kaddr = kmap_atomic(bh->b_page, KM_USER0); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); - ret = nilfs_checkpoint_snapshot(cp); + if (nilfs_checkpoint_invalid(cp)) + ret = -ENOENT; + else + ret = nilfs_checkpoint_snapshot(cp); kunmap_atomic(kaddr, KM_USER0); brelse(bh); diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 788a459..debea89 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -27,8 +27,6 @@ #include <linux/buffer_head.h> #include <linux/nilfs2_fs.h> -#define NILFS_CPFILE_GFP NILFS_MDT_GFP - int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, struct nilfs_checkpoint **, diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 8927ca2..1ff8e15 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -109,12 +109,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) nilfs_palloc_commit_free_entry(dat, req); } -void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req) -{ - nilfs_dat_abort_entry(dat, req); - nilfs_palloc_abort_free_entry(dat, req); -} - int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) { int ret; @@ -140,11 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, nilfs_dat_commit_entry(dat, req); } -void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req) -{ - nilfs_dat_abort_entry(dat, req); -} - int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; @@ -222,6 +211,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) nilfs_dat_abort_entry(dat, req); } +int nilfs_dat_prepare_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + int ret; + + ret = nilfs_dat_prepare_end(dat, oldreq); + if (!ret) { + ret = nilfs_dat_prepare_alloc(dat, newreq); + if (ret < 0) + nilfs_dat_abort_end(dat, oldreq); + } + return ret; +} + +void nilfs_dat_commit_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq, int dead) +{ + nilfs_dat_commit_end(dat, oldreq, dead); + nilfs_dat_commit_alloc(dat, newreq); +} + +void nilfs_dat_abort_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + nilfs_dat_abort_end(dat, oldreq); + nilfs_dat_abort_alloc(dat, newreq); +} + /** * nilfs_dat_mark_dirty - * @dat: DAT file inode diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index d328b81..406070d 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -27,7 +27,6 @@ #include <linux/buffer_head.h> #include <linux/fs.h> -#define NILFS_DAT_GFP NILFS_MDT_GFP struct nilfs_palloc_req; @@ -39,10 +38,15 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *); int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, sector_t); -void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *); int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); +void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *, int); +void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); int nilfs_dat_mark_dirty(struct inode *, __u64); int nilfs_dat_freev(struct inode *, __u64 *, size_t); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 1a4fa04..e097099 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -697,7 +697,7 @@ not_empty: return 0; } -struct file_operations nilfs_dir_operations = { +const struct file_operations nilfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = nilfs_readdir, diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 342d976..d369ac7 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct, direct->d_bmap.b_last_allocated_ptr = ptr; } -static int nilfs_direct_prepare_insert(struct nilfs_direct *direct, - __u64 key, - union nilfs_bmap_ptr_req *req, - struct nilfs_bmap_stats *stats) -{ - int ret; - - if (NILFS_BMAP_USE_VBN(&direct->d_bmap)) - req->bpr_ptr = nilfs_direct_find_target_v(direct, key); - ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req); - if (ret < 0) - return ret; - - stats->bs_nblocks = 1; - return 0; -} - -static void nilfs_direct_commit_insert(struct nilfs_direct *direct, - union nilfs_bmap_ptr_req *req, - __u64 key, __u64 ptr) -{ - struct buffer_head *bh; - - /* ptr must be a pointer to a buffer head. */ - bh = (struct buffer_head *)((unsigned long)ptr); - set_buffer_nilfs_volatile(bh); - - nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req); - nilfs_direct_set_ptr(direct, key, req->bpr_ptr); - - if (!nilfs_bmap_dirty(&direct->d_bmap)) - nilfs_bmap_set_dirty(&direct->d_bmap); - - if (NILFS_BMAP_USE_VBN(&direct->d_bmap)) - nilfs_direct_set_target_v(direct, key, req->bpr_ptr); -} - static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { - struct nilfs_direct *direct; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; union nilfs_bmap_ptr_req req; - struct nilfs_bmap_stats stats; + struct inode *dat = NULL; + struct buffer_head *bh; int ret; - direct = (struct nilfs_direct *)bmap; if (key > NILFS_DIRECT_KEY_MAX) return -ENOENT; if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) return -EEXIST; - ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); - if (ret < 0) - return ret; - nilfs_direct_commit_insert(direct, &req, key, ptr); - nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + if (NILFS_BMAP_USE_VBN(bmap)) { + req.bpr_ptr = nilfs_direct_find_target_v(direct, key); + dat = nilfs_bmap_get_dat(bmap); + } + ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat); + if (!ret) { + /* ptr must be a pointer to a buffer head. */ + bh = (struct buffer_head *)((unsigned long)ptr); + set_buffer_nilfs_volatile(bh); - return 0; -} + nilfs_bmap_commit_alloc_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(direct, key, req.bpr_ptr); -static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, - union nilfs_bmap_ptr_req *req, - __u64 key, - struct nilfs_bmap_stats *stats) -{ - int ret; + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); - req->bpr_ptr = nilfs_direct_get_ptr(direct, key); - ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req); - if (!ret) - stats->bs_nblocks = 1; - return ret; -} + if (NILFS_BMAP_USE_VBN(bmap)) + nilfs_direct_set_target_v(direct, key, req.bpr_ptr); -static void nilfs_direct_commit_delete(struct nilfs_direct *direct, - union nilfs_bmap_ptr_req *req, - __u64 key) -{ - nilfs_bmap_commit_end_ptr(&direct->d_bmap, req); - nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); + nilfs_bmap_add_blocks(bmap, 1); + } + return ret; } static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) { - struct nilfs_direct *direct; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; union nilfs_bmap_ptr_req req; - struct nilfs_bmap_stats stats; + struct inode *dat; int ret; - direct = (struct nilfs_direct *)bmap; - if ((key > NILFS_DIRECT_KEY_MAX) || + if (key > NILFS_DIRECT_KEY_MAX || nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) return -ENOENT; - ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); - if (ret < 0) - return ret; - nilfs_direct_commit_delete(direct, &req, key); - nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; + req.bpr_ptr = nilfs_direct_get_ptr(direct, key); - return 0; + ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat); + if (!ret) { + nilfs_bmap_commit_end_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); + nilfs_bmap_sub_blocks(bmap, 1); + } + return ret; } static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) @@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, return 0; } -static int nilfs_direct_propagate_v(struct nilfs_direct *direct, - struct buffer_head *bh) +static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, + struct buffer_head *bh) { - union nilfs_bmap_ptr_req oldreq, newreq; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; + struct nilfs_palloc_req oldreq, newreq; + struct inode *dat; __u64 key; __u64 ptr; int ret; - key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); + if (!NILFS_BMAP_USE_VBN(bmap)) + return 0; + + dat = nilfs_bmap_get_dat(bmap); + key = nilfs_bmap_data_get_key(bmap, bh); ptr = nilfs_direct_get_ptr(direct, key); if (!buffer_nilfs_volatile(bh)) { - oldreq.bpr_ptr = ptr; - newreq.bpr_ptr = ptr; - ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq, - &newreq); + oldreq.pr_entry_nr = ptr; + newreq.pr_entry_nr = ptr; + ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq); if (ret < 0) return ret; - nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq); + nilfs_dat_commit_update(dat, &oldreq, &newreq, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); set_buffer_nilfs_volatile(bh); - nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); + nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr); } else - ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); + ret = nilfs_dat_mark_dirty(dat, ptr); return ret; } -static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, - struct buffer_head *bh) -{ - struct nilfs_direct *direct = (struct nilfs_direct *)bmap; - - return NILFS_BMAP_USE_VBN(bmap) ? - nilfs_direct_propagate_v(direct, bh) : 0; -} - static int nilfs_direct_assign_v(struct nilfs_direct *direct, __u64 key, __u64 ptr, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { + struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap); union nilfs_bmap_ptr_req req; int ret; req.bpr_ptr = ptr; - ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr); - if (unlikely(ret < 0)) - return ret; - - binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); - binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); - - return 0; + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (!ret) { + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); + binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + } + return ret; } static int nilfs_direct_assign_p(struct nilfs_direct *direct, diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 6bd84a0..30292df 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -117,7 +117,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -struct vm_operations_struct nilfs_file_vm_ops = { +static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = nilfs_page_mkwrite, }; @@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) * We have mostly NULL's here: the current defaults are ok for * the nilfs filesystem. */ -struct file_operations nilfs_file_operations = { +const struct file_operations nilfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, @@ -151,7 +151,7 @@ struct file_operations nilfs_file_operations = { .splice_read = generic_file_splice_read, }; -struct inode_operations nilfs_file_inode_operations = { +const struct inode_operations nilfs_file_inode_operations = { .truncate = nilfs_truncate, .setattr = nilfs_setattr, .permission = nilfs_permission, diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 1b3c2bb..e6de0a2 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -52,7 +52,7 @@ #include "dat.h" #include "ifile.h" -static struct address_space_operations def_gcinode_aops = { +static const struct address_space_operations def_gcinode_aops = { .sync_page = block_sync_page, }; diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 5d30a35..ecc3ba76 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -31,7 +31,6 @@ #include "mdt.h" #include "alloc.h" -#define NILFS_IFILE_GFP NILFS_MDT_GFP static inline struct nilfs_inode * nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index fe9d8f2..5040220 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -238,7 +238,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, return size; } -struct address_space_operations nilfs_aops = { +const struct address_space_operations nilfs_aops = { .writepage = nilfs_writepage, .readpage = nilfs_readpage, .sync_page = block_sync_page, @@ -400,6 +400,7 @@ int nilfs_read_inode_common(struct inode *inode, ii->i_dir_acl = S_ISREG(inode->i_mode) ? 0 : le32_to_cpu(raw_inode->i_dir_acl); #endif + ii->i_dir_start_lookup = 0; ii->i_cno = 0; inode->i_generation = le32_to_cpu(raw_inode->i_generation); @@ -430,7 +431,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); - if (nilfs_read_inode_common(inode, raw_inode)) + err = nilfs_read_inode_common(inode, raw_inode); + if (err) goto failed_unmap; if (S_ISREG(inode->i_mode)) { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 6ea5f87..6572ea4 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, const char *msg; int ret; - ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); - if (ret < 0) { - msg = "cannot read source blocks"; - goto failed; - } - ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); if (ret < 0) { /* @@ -548,7 +542,25 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, } } - ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + /* + * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(), + * which will operates an inode list without blocking. + * To protect the list from concurrent operations, + * nilfs_ioctl_move_blocks should be atomic operation. + */ + if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) { + ret = -EBUSY; + goto out_free; + } + + ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); + if (ret < 0) + printk(KERN_ERR "NILFS: GC failed during preparation: " + "cannot read source blocks: err=%d\n", ret); + else + ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + + clear_nilfs_gc_running(nilfs); out_free: while (--n >= 0) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 2dfd477..f632611 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, goto failed_unlock; err = -EEXIST; - if (buffer_uptodate(bh) || buffer_mapped(bh)) + if (buffer_uptodate(bh)) goto failed_bh; -#if 0 - /* The uptodate flag is not protected by the page lock, but - the mapped flag is. Thus, we don't have to wait the buffer. */ + wait_on_buffer(bh); if (buffer_uptodate(bh)) goto failed_bh; -#endif bh->b_bdev = nilfs->ns_bdev; err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); @@ -139,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, int mode, struct buffer_head **out_bh) { struct buffer_head *bh; - unsigned long blknum = 0; + __u64 blknum = 0; int ret = -ENOMEM; bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); @@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, unlock_buffer(bh); goto out; } - if (!buffer_mapped(bh)) { /* unused buffer */ - ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, - &blknum); - if (unlikely(ret)) { - unlock_buffer(bh); - goto failed_bh; - } - bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; - bh->b_blocknr = blknum; - set_buffer_mapped(bh); + + ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum); + if (unlikely(ret)) { + unlock_buffer(bh); + goto failed_bh; } + bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; + bh->b_blocknr = (sector_t)blknum; + set_buffer_mapped(bh); bh->b_end_io = end_buffer_read_sync; get_bh(bh); @@ -402,6 +397,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) struct inode *inode = container_of(page->mapping, struct inode, i_data); struct super_block *sb = inode->i_sb; + struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs; struct nilfs_sb_info *writer = NULL; int err = 0; @@ -411,9 +407,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) if (page->mapping->assoc_mapping) return 0; /* Do not request flush for shadow page cache */ if (!sb) { - writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); + down_read(&nilfs->ns_writer_sem); + writer = nilfs->ns_writer; if (!writer) { - nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); + up_read(&nilfs->ns_writer_sem); return -EROFS; } sb = writer->s_super; @@ -425,18 +422,18 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) nilfs_flush_segment(sb, inode->i_ino); if (writer) - nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); + up_read(&nilfs->ns_writer_sem); return err; } -static struct address_space_operations def_mdt_aops = { +static const struct address_space_operations def_mdt_aops = { .writepage = nilfs_mdt_write_page, .sync_page = block_sync_page, }; -static struct inode_operations def_mdt_iops; -static struct file_operations def_mdt_fops; +static const struct inode_operations def_mdt_iops; +static const struct file_operations def_mdt_fops; /* * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, @@ -516,9 +513,10 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, } struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, - ino_t ino, gfp_t gfp_mask) + ino_t ino) { - struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); + struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, + NILFS_MDT_GFP); if (!inode) return NULL; diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index df683e0..4315997 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -74,8 +74,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long); int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); int nilfs_mdt_fetch_dirty(struct inode *); -struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, - gfp_t); +struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t); struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, ino_t, gfp_t); void nilfs_mdt_destroy(struct inode *); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index df70dad..ed02e88 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -448,7 +448,7 @@ out: return err; } -struct inode_operations nilfs_dir_inode_operations = { +const struct inode_operations nilfs_dir_inode_operations = { .create = nilfs_create, .lookup = nilfs_lookup, .link = nilfs_link, @@ -462,12 +462,12 @@ struct inode_operations nilfs_dir_inode_operations = { .permission = nilfs_permission, }; -struct inode_operations nilfs_special_inode_operations = { +const struct inode_operations nilfs_special_inode_operations = { .setattr = nilfs_setattr, .permission = nilfs_permission, }; -struct inode_operations nilfs_symlink_inode_operations = { +const struct inode_operations nilfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 724c637..4da6f67 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -294,13 +294,13 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *); /* * Inodes and files operations */ -extern struct file_operations nilfs_dir_operations; -extern struct inode_operations nilfs_file_inode_operations; -extern struct file_operations nilfs_file_operations; -extern struct address_space_operations nilfs_aops; -extern struct inode_operations nilfs_dir_inode_operations; -extern struct inode_operations nilfs_special_inode_operations; -extern struct inode_operations nilfs_symlink_inode_operations; +extern const struct file_operations nilfs_dir_operations; +extern const struct inode_operations nilfs_file_inode_operations; +extern const struct file_operations nilfs_file_operations; +extern const struct address_space_operations nilfs_aops; +extern const struct inode_operations nilfs_dir_inode_operations; +extern const struct inode_operations nilfs_special_inode_operations; +extern const struct inode_operations nilfs_symlink_inode_operations; /* * filesystem type diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index d80cc71..6dc8359 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi, printk(KERN_WARNING "NILFS warning: error recovering data block " "(err=%d, ino=%lu, block-offset=%llu)\n", - err, rb->ino, (unsigned long long)rb->blkoff); + err, (unsigned long)rb->ino, + (unsigned long long)rb->blkoff); if (!err2) err2 = err; next: diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 9e3fe17..e6d9e37 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, { struct bio *bio; - bio = bio_alloc(GFP_NOWAIT, nr_vecs); + bio = bio_alloc(GFP_NOIO, nr_vecs); if (bio == NULL) { while (!bio && (nr_vecs >>= 1)) - bio = bio_alloc(GFP_NOWAIT, nr_vecs); + bio = bio_alloc(GFP_NOIO, nr_vecs); } if (likely(bio)) { bio->bi_bdev = sb->s_bdev; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 51ff3d0..683df89 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2501,7 +2501,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && nilfs_discontinued(nilfs)) { down_write(&nilfs->ns_sem); - req->sb_err = nilfs_commit_super(sbi, 0); + req->sb_err = nilfs_commit_super(sbi, + nilfs_altsb_need_update(nilfs)); up_write(&nilfs->ns_sem); } } @@ -2689,6 +2690,7 @@ static int nilfs_segctor_thread(void *arg) } else { DEFINE_WAIT(wait); int should_sleep = 1; + struct the_nilfs *nilfs; prepare_to_wait(&sci->sc_wait_daemon, &wait, TASK_INTERRUPTIBLE); @@ -2709,6 +2711,9 @@ static int nilfs_segctor_thread(void *arg) finish_wait(&sci->sc_wait_daemon, &wait); timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && time_after_eq(jiffies, sci->sc_timer->expires)); + nilfs = sci->sc_sbi->s_nilfs; + if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs)) + set_nilfs_discontinued(nilfs); } goto loop; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index a2c4d76..0e99e5c 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -28,7 +28,6 @@ #include <linux/nilfs2_fs.h> #include "mdt.h" -#define NILFS_SUFILE_GFP NILFS_MDT_GFP static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 151964f..644e667 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -50,6 +50,8 @@ #include <linux/writeback.h> #include <linux/kobject.h> #include <linux/exportfs.h> +#include <linux/seq_file.h> +#include <linux/mount.h> #include "nilfs.h" #include "mdt.h" #include "alloc.h" @@ -65,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); MODULE_LICENSE("GPL"); -static void nilfs_write_super(struct super_block *sb); static int nilfs_remount(struct super_block *sb, int *flags, char *data); /** @@ -311,9 +312,6 @@ static void nilfs_put_super(struct super_block *sb) lock_kernel(); - if (sb->s_dirt) - nilfs_write_super(sb); - nilfs_detach_segment_constructor(sbi); if (!(sb->s_flags & MS_RDONLY)) { @@ -336,63 +334,21 @@ static void nilfs_put_super(struct super_block *sb) unlock_kernel(); } -/** - * nilfs_write_super - write super block(s) of NILFS - * @sb: super_block - * - * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and - * clears s_dirt. This function is called in the section protected by - * lock_super(). - * - * The s_dirt flag is managed by each filesystem and we protect it by ns_sem - * of the struct the_nilfs. Lock order must be as follows: - * - * 1. lock_super() - * 2. down_write(&nilfs->ns_sem) - * - * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer - * of the super block (nilfs->ns_sbp[]). - * - * In most cases, VFS functions call lock_super() before calling these - * methods. So we must be careful not to bring on deadlocks when using - * lock_super(); see generic_shutdown_super(), write_super(), and so on. - * - * Note that order of lock_kernel() and lock_super() depends on contexts - * of VFS. We should also note that lock_kernel() can be used in its - * protective section and only the outermost one has an effect. - */ -static void nilfs_write_super(struct super_block *sb) +static int nilfs_sync_fs(struct super_block *sb, int wait) { struct nilfs_sb_info *sbi = NILFS_SB(sb); struct the_nilfs *nilfs = sbi->s_nilfs; - - down_write(&nilfs->ns_sem); - if (!(sb->s_flags & MS_RDONLY)) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; - u64 t = get_seconds(); - int dupsb; - - if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] && - t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) { - up_write(&nilfs->ns_sem); - return; - } - dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; - nilfs_commit_super(sbi, dupsb); - } - sb->s_dirt = 0; - up_write(&nilfs->ns_sem); -} - -static int nilfs_sync_fs(struct super_block *sb, int wait) -{ int err = 0; - nilfs_write_super(sb); - /* This function is called when super block should be written back */ if (wait) err = nilfs_construct_segment(sb); + + down_write(&nilfs->ns_sem); + if (sb->s_dirt) + nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + return err; } @@ -407,8 +363,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) list_add(&sbi->s_list, &nilfs->ns_supers); up_write(&nilfs->ns_super_sem); - sbi->s_ifile = nilfs_mdt_new( - nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); + sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO); if (!sbi->s_ifile) return -ENOMEM; @@ -529,7 +484,27 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static struct super_operations nilfs_sops = { +static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct super_block *sb = vfs->mnt_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + + if (!nilfs_test_opt(sbi, BARRIER)) + seq_printf(seq, ",barrier=off"); + if (nilfs_test_opt(sbi, SNAPSHOT)) + seq_printf(seq, ",cp=%llu", + (unsigned long long int)sbi->s_snapshot_cno); + if (nilfs_test_opt(sbi, ERRORS_RO)) + seq_printf(seq, ",errors=remount-ro"); + if (nilfs_test_opt(sbi, ERRORS_PANIC)) + seq_printf(seq, ",errors=panic"); + if (nilfs_test_opt(sbi, STRICT_ORDER)) + seq_printf(seq, ",order=strict"); + + return 0; +} + +static const struct super_operations nilfs_sops = { .alloc_inode = nilfs_alloc_inode, .destroy_inode = nilfs_destroy_inode, .dirty_inode = nilfs_dirty_inode, @@ -538,7 +513,7 @@ static struct super_operations nilfs_sops = { /* .drop_inode = nilfs_drop_inode, */ .delete_inode = nilfs_delete_inode, .put_super = nilfs_put_super, - .write_super = nilfs_write_super, + /* .write_super = nilfs_write_super, */ .sync_fs = nilfs_sync_fs, /* .write_super_lockfs */ /* .unlockfs */ @@ -546,7 +521,7 @@ static struct super_operations nilfs_sops = { .remount_fs = nilfs_remount, .clear_inode = nilfs_clear_inode, /* .umount_begin */ - /* .show_options */ + .show_options = nilfs_show_options }; static struct inode * @@ -585,7 +560,7 @@ nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, nilfs_nfs_get_inode); } -static struct export_operations nilfs_export_ops = { +static const struct export_operations nilfs_export_ops = { .fh_to_dentry = nilfs_fh_to_dentry, .fh_to_parent = nilfs_fh_to_parent, .get_parent = nilfs_get_parent, @@ -816,10 +791,15 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, if (sb->s_flags & MS_RDONLY) { if (nilfs_test_opt(sbi, SNAPSHOT)) { + down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, sbi->s_snapshot_cno); - if (err < 0) + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + if (err == -ENOENT) + err = -EINVAL; goto failed_sbi; + } if (!err) { printk(KERN_ERR "NILFS: The specified checkpoint is " @@ -1127,10 +1107,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, */ sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); - if (!sd.cno) - /* trying to get the latest checkpoint. */ - sd.cno = nilfs_last_cno(nilfs); - /* * Get super block instance holding the nilfs_sb_info struct. * A new instance is allocated if no existing mount is present or diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8b88898..ad391a8 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev) nilfs->ns_bdev = bdev; atomic_set(&nilfs->ns_count, 1); - atomic_set(&nilfs->ns_writer_refcount, -1); atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); init_rwsem(&nilfs->ns_super_sem); mutex_init(&nilfs->ns_mount_mutex); - mutex_init(&nilfs->ns_writer_mutex); + init_rwsem(&nilfs->ns_writer_sem); INIT_LIST_HEAD(&nilfs->ns_list); INIT_LIST_HEAD(&nilfs->ns_supers); spin_lock_init(&nilfs->ns_last_segment_lock); @@ -188,23 +187,19 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, inode_size = nilfs->ns_inode_size; err = -ENOMEM; - nilfs->ns_dat = nilfs_mdt_new( - nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); if (unlikely(!nilfs->ns_dat)) goto failed; - nilfs->ns_gc_dat = nilfs_mdt_new( - nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); if (unlikely(!nilfs->ns_gc_dat)) goto failed_dat; - nilfs->ns_cpfile = nilfs_mdt_new( - nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP); + nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO); if (unlikely(!nilfs->ns_cpfile)) goto failed_gc_dat; - nilfs->ns_sufile = nilfs_mdt_new( - nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP); + nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO); if (unlikely(!nilfs->ns_sufile)) goto failed_cpfile; @@ -596,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); - bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; - if (!bdi) - bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; + bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; nilfs->ns_bdi = bdi ? : &default_backing_dev_info; /* Finding last segment */ diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 1b9caaf..20abd55 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -37,6 +37,7 @@ enum { THE_NILFS_LOADED, /* Roll-back/roll-forward has done and the latest checkpoint was loaded */ THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ + THE_NILFS_GC_RUNNING, /* gc process is running */ }; /** @@ -50,8 +51,7 @@ enum { * @ns_sem: semaphore for shared states * @ns_super_sem: semaphore for global operations across super block instances * @ns_mount_mutex: mutex protecting mount process of nilfs - * @ns_writer_mutex: mutex protecting ns_writer attach/detach - * @ns_writer_refcount: number of referrers on ns_writer + * @ns_writer_sem: semaphore protecting ns_writer attach/detach * @ns_current: back pointer to current mount * @ns_sbh: buffer heads of on-disk super blocks * @ns_sbp: pointers to super block data @@ -100,8 +100,7 @@ struct the_nilfs { struct rw_semaphore ns_sem; struct rw_semaphore ns_super_sem; struct mutex ns_mount_mutex; - struct mutex ns_writer_mutex; - atomic_t ns_writer_refcount; + struct rw_semaphore ns_writer_sem; /* * components protected by ns_super_sem @@ -197,11 +196,26 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \ THE_NILFS_FNS(INIT, init) THE_NILFS_FNS(LOADED, loaded) THE_NILFS_FNS(DISCONTINUED, discontinued) +THE_NILFS_FNS(GC_RUNNING, gc_running) /* Minimum interval of periodical update of superblocks (in seconds) */ #define NILFS_SB_FREQ 10 #define NILFS_ALTSB_FREQ 60 /* spare superblock */ +static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) +{ + u64 t = get_seconds(); + return t < nilfs->ns_sbwtime[0] || + t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ; +} + +static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs) +{ + u64 t = get_seconds(); + struct nilfs_super_block **sbp = nilfs->ns_sbp; + return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; +} + void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); struct the_nilfs *find_or_create_nilfs(struct block_device *); void put_nilfs(struct the_nilfs *); @@ -221,34 +235,21 @@ static inline void get_nilfs(struct the_nilfs *nilfs) atomic_inc(&nilfs->ns_count); } -static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs) -{ - if (atomic_inc_and_test(&nilfs->ns_writer_refcount)) - mutex_lock(&nilfs->ns_writer_mutex); - return nilfs->ns_writer; -} - -static inline void nilfs_put_writer(struct the_nilfs *nilfs) -{ - if (atomic_add_negative(-1, &nilfs->ns_writer_refcount)) - mutex_unlock(&nilfs->ns_writer_mutex); -} - static inline void nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) { - mutex_lock(&nilfs->ns_writer_mutex); + down_write(&nilfs->ns_writer_sem); nilfs->ns_writer = sbi; - mutex_unlock(&nilfs->ns_writer_mutex); + up_write(&nilfs->ns_writer_sem); } static inline void nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) { - mutex_lock(&nilfs->ns_writer_mutex); + down_write(&nilfs->ns_writer_sem); if (sbi == nilfs->ns_writer) nilfs->ns_writer = NULL; - mutex_unlock(&nilfs->ns_writer_mutex); + up_write(&nilfs->ns_writer_sem); } static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 477d37d..44a88a9 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -124,10 +124,10 @@ int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) while (*s && len > 0) { if (*s & 0x80) { size = utf8_to_utf32(s, len, &u); - if (size < 0) { - /* Ignore character and move on */ - size = 1; - } else if (u >= PLANE_SIZE) { + if (size < 0) + return -EINVAL; + + if (u >= PLANE_SIZE) { u -= PLANE_SIZE; *op++ = (wchar_t) (SURROGATE_PAIR | ((u >> 10) & SURROGATE_BITS)); @@ -270,7 +270,8 @@ struct nls_table *load_nls(char *charset) void unload_nls(struct nls_table *nls) { - module_put(nls->owner); + if (nls) + module_put(nls->owner); } static const wchar_t charset2uni[256] = { diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 828a889..7e54e52 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -91,6 +91,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; + __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; to_tell = event->to_tell; @@ -106,7 +107,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, spin_lock(&entry->lock); prev = &dnentry->dn; while ((dn = *prev) != NULL) { - if ((dn->dn_mask & event->mask) == 0) { + if ((dn->dn_mask & test_mask) == 0) { prev = &dn->dn_next; continue; } diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index c8a07c6..3165d85 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -324,11 +324,11 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry, spin_lock(&group->mark_lock); spin_lock(&inode->i_lock); - entry->group = group; - entry->inode = inode; - lentry = fsnotify_find_mark_entry(group, inode); if (!lentry) { + entry->group = group; + entry->inode = inode; + hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); list_add(&entry->g_list, &group->mark_entries); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 3816d57..b8bf53b 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -143,7 +143,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new /* remember, after old was put on the wait_q we aren't * allowed to look at the inode any more, only thing * left to check was if the file_name is the same */ - if (old->name_len && + if (!old->name_len || !strcmp(old->file_name, new->file_name)) return true; break; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index b38f944..cfce53c 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1550,6 +1550,7 @@ const struct address_space_operations ntfs_aops = { .migratepage = buffer_migrate_page, /* Move a page cache page from one physical page to an other. */ + .error_remove_page = generic_error_remove_page, }; /** @@ -1569,6 +1570,7 @@ const struct address_space_operations ntfs_mst_aops = { .migratepage = buffer_migrate_page, /* Move a page cache page from one physical page to an other. */ + .error_remove_page = generic_error_remove_page, }; #ifdef NTFS_RW diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 3140a44..663c0e3 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2076,14 +2076,6 @@ err_out: *ppos = pos; if (cached_page) page_cache_release(cached_page); - /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */ - if (likely(!status)) { - if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) { - if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb)) - status = generic_osync_inode(vi, mapping, - OSYNC_METADATA|OSYNC_DATA); - } - } pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? "written" : "status", (unsigned long)written, @@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = sync_page_range(inode, mapping, pos, ret); + if (ret > 0) { + int err = generic_write_sync(file, pos, ret); if (err < 0) ret = err; } @@ -2154,46 +2146,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } /** - * ntfs_file_writev - - * - * Basically the same as generic_file_writev() except that it ends up calling - * ntfs_file_aio_write_nolock() instead of __generic_file_aio_write_nolock(). - */ -static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - struct kiocb kiocb; - ssize_t ret; - - mutex_lock(&inode->i_mutex); - init_sync_kiocb(&kiocb, file); - ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); - mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = sync_page_range(inode, mapping, *ppos - ret, ret); - if (err < 0) - ret = err; - } - return ret; -} - -/** - * ntfs_file_write - simple wrapper for ntfs_file_writev() - */ -static ssize_t ntfs_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct iovec local_iov = { .iov_base = (void __user *)buf, - .iov_len = count }; - - return ntfs_file_writev(file, &local_iov, 1, ppos); -} - -/** * ntfs_file_fsync - sync a file to disk * @filp: file to be synced * @dentry: dentry describing the file to sync @@ -2255,7 +2207,7 @@ const struct file_operations ntfs_file_ops = { .read = do_sync_read, /* Read from file. */ .aio_read = generic_file_aio_read, /* Async read from file. */ #ifdef NTFS_RW - .write = ntfs_file_write, /* Write to file. */ + .write = do_sync_write, /* Write to file. */ .aio_write = ntfs_file_aio_write, /* Async write to file. */ /*.release = ,*/ /* Last file is closed. See fs/ext2/file.c:: diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index 50931b1..8b2549f 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -829,7 +829,7 @@ enum { /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask - is used to to obtain all flags that are valid for setting. */ + is used to obtain all flags that are valid for setting. */ /* * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index cd0be3f..a44b14c 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); /* return (void *)__get_free_page(gfp_mask); */ } - if (likely(size >> PAGE_SHIFT < num_physpages)) + if (likely((size >> PAGE_SHIFT) < totalram_pages)) return __vmalloc(size, gfp_mask, PAGE_KERNEL); return NULL; } diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 23bf684..1caa0ef 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -384,13 +384,12 @@ unm_err_out: * it is dirty in the inode meta data rather than the data page cache of the * inode, and thus there are no data pages that need writing out. Therefore, a * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the - * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to - * ensure ->write_inode is called from generic_osync_inode() and this needs to - * happen or the file data would not necessarily hit the device synchronously, - * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC - * simply "feels" better than just I_DIRTY_SYNC, since the file data has not - * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own - * would suggest. + * other hand, is not sufficient, because ->write_inode needs to be called even + * in case of fdatasync. This needs to happen or the file data would not + * necessarily hit the device synchronously, even though the vfs inode has the + * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just + * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, + * which is not what I_DIRTY_SYNC on its own would suggest. */ void __mark_mft_record_dirty(ntfs_inode *ni) { diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index abaaa1c..80b0477 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -201,8 +201,7 @@ use_utf8: v, old_nls->charset); nls_map = old_nls; } else /* nls_map */ { - if (old_nls) - unload_nls(old_nls); + unload_nls(old_nls); } } else if (!strcmp(p, "utf8")) { bool val = false; @@ -2427,10 +2426,9 @@ static void ntfs_put_super(struct super_block *sb) ntfs_free(vol->upcase); vol->upcase = NULL; } - if (vol->nls_map) { - unload_nls(vol->nls_map); - vol->nls_map = NULL; - } + + unload_nls(vol->nls_map); + sb->s_fs_info = NULL; kfree(vol); diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 0159607..31f25ce 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -28,6 +28,7 @@ ocfs2-objs := \ locks.o \ mmap.o \ namei.o \ + refcounttree.o \ resize.o \ slot_map.o \ suballoc.o \ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ab513dd..38a42f5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -49,10 +49,21 @@ #include "super.h" #include "uptodate.h" #include "xattr.h" +#include "refcounttree.h" #include "buffer_head_io.h" +enum ocfs2_contig_type { + CONTIG_NONE = 0, + CONTIG_LEFT, + CONTIG_RIGHT, + CONTIG_LEFTRIGHT, +}; +static enum ocfs2_contig_type + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); /* * Operations for a specific extent tree type. * @@ -79,18 +90,30 @@ struct ocfs2_extent_tree_operations { * that value. new_clusters is the delta, and must be * added to the total. Required. */ - void (*eo_update_clusters)(struct inode *inode, - struct ocfs2_extent_tree *et, + void (*eo_update_clusters)(struct ocfs2_extent_tree *et, u32 new_clusters); /* + * If this extent tree is supported by an extent map, insert + * a record into the map. + */ + void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + + /* + * If this extent tree is supported by an extent map, truncate the + * map to clusters, + */ + void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et, + u32 clusters); + + /* * If ->eo_insert_check() exists, it is called before rec is * inserted into the extent tree. It is optional. */ - int (*eo_insert_check)(struct inode *inode, - struct ocfs2_extent_tree *et, + int (*eo_insert_check)(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); - int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et); + int (*eo_sanity_check)(struct ocfs2_extent_tree *et); /* * -------------------------------------------------------------- @@ -109,8 +132,17 @@ struct ocfs2_extent_tree_operations { * it exists. If it does not, et->et_max_leaf_clusters is set * to 0 (unlimited). Optional. */ - void (*eo_fill_max_leaf_clusters)(struct inode *inode, - struct ocfs2_extent_tree *et); + void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et); + + /* + * ->eo_extent_contig test whether the 2 ocfs2_extent_rec + * are contiguous or not. Optional. Don't need to set it if use + * ocfs2_extent_rec as the tree leaf. + */ + enum ocfs2_contig_type + (*eo_extent_contig)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); }; @@ -121,19 +153,22 @@ struct ocfs2_extent_tree_operations { static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, u64 blkno); -static void ocfs2_dinode_update_clusters(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, u32 clusters); -static int ocfs2_dinode_insert_check(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters); +static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); -static int ocfs2_dinode_sanity_check(struct inode *inode, - struct ocfs2_extent_tree *et); +static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, .eo_update_clusters = ocfs2_dinode_update_clusters, + .eo_extent_map_insert = ocfs2_dinode_extent_map_insert, + .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate, .eo_insert_check = ocfs2_dinode_insert_check, .eo_sanity_check = ocfs2_dinode_sanity_check, .eo_fill_root_el = ocfs2_dinode_fill_root_el, @@ -156,40 +191,53 @@ static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) return le64_to_cpu(di->i_last_eb_blk); } -static void ocfs2_dinode_update_clusters(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, u32 clusters) { + struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); struct ocfs2_dinode *di = et->et_object; le32_add_cpu(&di->i_clusters, clusters); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); - spin_unlock(&OCFS2_I(inode)->ip_lock); + spin_lock(&oi->ip_lock); + oi->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&oi->ip_lock); } -static int ocfs2_dinode_insert_check(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; + + ocfs2_extent_map_insert_rec(inode, rec); +} + +static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; + + ocfs2_extent_map_trunc(inode, clusters); +} + +static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); + struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb); - BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); + BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL); mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && - (OCFS2_I(inode)->ip_clusters != - le32_to_cpu(rec->e_cpos)), + (oi->ip_clusters != le32_to_cpu(rec->e_cpos)), "Device %s, asking for sparse allocation: inode %llu, " "cpos %u, clusters %u\n", osb->dev_str, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - rec->e_cpos, - OCFS2_I(inode)->ip_clusters); + (unsigned long long)oi->ip_blkno, + rec->e_cpos, oi->ip_clusters); return 0; } -static int ocfs2_dinode_sanity_check(struct inode *inode, - struct ocfs2_extent_tree *et) +static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et) { struct ocfs2_dinode *di = et->et_object; @@ -229,8 +277,7 @@ static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); } -static void ocfs2_xattr_value_update_clusters(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, u32 clusters) { struct ocfs2_xattr_value_buf *vb = et->et_object; @@ -252,12 +299,11 @@ static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) et->et_root_el = &xb->xb_attrs.xb_root.xt_list; } -static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode, - struct ocfs2_extent_tree *et) +static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et) { + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); et->et_max_leaf_clusters = - ocfs2_clusters_for_bytes(inode->i_sb, - OCFS2_MAX_XATTR_TREE_LEAF_SIZE); + ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE); } static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, @@ -277,8 +323,7 @@ static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) return le64_to_cpu(xt->xt_last_eb_blk); } -static void ocfs2_xattr_tree_update_clusters(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, u32 clusters) { struct ocfs2_xattr_block *xb = et->et_object; @@ -309,8 +354,7 @@ static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et) return le64_to_cpu(dx_root->dr_last_eb_blk); } -static void ocfs2_dx_root_update_clusters(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et, u32 clusters) { struct ocfs2_dx_root_block *dx_root = et->et_object; @@ -318,8 +362,7 @@ static void ocfs2_dx_root_update_clusters(struct inode *inode, le32_add_cpu(&dx_root->dr_clusters, clusters); } -static int ocfs2_dx_root_sanity_check(struct inode *inode, - struct ocfs2_extent_tree *et) +static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et) { struct ocfs2_dx_root_block *dx_root = et->et_object; @@ -343,8 +386,54 @@ static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { .eo_fill_root_el = ocfs2_dx_root_fill_root_el, }; +static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + et->et_root_el = &rb->rf_list; +} + +static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + rb->rf_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + return le64_to_cpu(rb->rf_last_eb_blk); +} + +static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + le32_add_cpu(&rb->rf_clusters, clusters); +} + +static enum ocfs2_contig_type +ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) +{ + return CONTIG_NONE; +} + +static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_refcount_tree_update_clusters, + .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el, + .eo_extent_contig = ocfs2_refcount_tree_extent_contig, +}; + static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh, ocfs2_journal_access_func access, void *obj, @@ -352,6 +441,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, { et->et_ops = ops; et->et_root_bh = bh; + et->et_ci = ci; et->et_root_journal_access = access; if (!obj) obj = (void *)bh->b_data; @@ -361,41 +451,49 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, if (!et->et_ops->eo_fill_max_leaf_clusters) et->et_max_leaf_clusters = 0; else - et->et_ops->eo_fill_max_leaf_clusters(inode, et); + et->et_ops->eo_fill_max_leaf_clusters(et); } void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh) { - __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di, + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di, NULL, &ocfs2_dinode_et_ops); } void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh) { - __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb, + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb, NULL, &ocfs2_xattr_tree_et_ops); } void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct ocfs2_xattr_value_buf *vb) { - __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb, + __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb, &ocfs2_xattr_value_et_ops); } void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh) { - __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr, + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr, NULL, &ocfs2_dx_root_et_ops); } +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb, + NULL, &ocfs2_refcount_tree_et_ops); +} + static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, u64 new_last_eb_blk) { @@ -407,78 +505,71 @@ static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) return et->et_ops->eo_get_last_eb_blk(et); } -static inline void ocfs2_et_update_clusters(struct inode *inode, - struct ocfs2_extent_tree *et, +static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et, u32 clusters) { - et->et_ops->eo_update_clusters(inode, et, clusters); + et->et_ops->eo_update_clusters(et, clusters); +} + +static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + if (et->et_ops->eo_extent_map_insert) + et->et_ops->eo_extent_map_insert(et, rec); +} + +static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters) +{ + if (et->et_ops->eo_extent_map_truncate) + et->et_ops->eo_extent_map_truncate(et, clusters); } static inline int ocfs2_et_root_journal_access(handle_t *handle, - struct inode *inode, struct ocfs2_extent_tree *et, int type) { - return et->et_root_journal_access(handle, inode, et->et_root_bh, + return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh, type); } -static inline int ocfs2_et_insert_check(struct inode *inode, - struct ocfs2_extent_tree *et, +static inline enum ocfs2_contig_type + ocfs2_et_extent_contig(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec, + struct ocfs2_extent_rec *insert_rec) +{ + if (et->et_ops->eo_extent_contig) + return et->et_ops->eo_extent_contig(et, rec, insert_rec); + + return ocfs2_extent_rec_contig( + ocfs2_metadata_cache_get_super(et->et_ci), + rec, insert_rec); +} + +static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec) { int ret = 0; if (et->et_ops->eo_insert_check) - ret = et->et_ops->eo_insert_check(inode, et, rec); + ret = et->et_ops->eo_insert_check(et, rec); return ret; } -static inline int ocfs2_et_sanity_check(struct inode *inode, - struct ocfs2_extent_tree *et) +static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et) { int ret = 0; if (et->et_ops->eo_sanity_check) - ret = et->et_ops->eo_sanity_check(inode, et); + ret = et->et_ops->eo_sanity_check(et); return ret; } static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, struct ocfs2_extent_block *eb); - -/* - * Structures which describe a path through a btree, and functions to - * manipulate them. - * - * The idea here is to be as generic as possible with the tree - * manipulation code. - */ -struct ocfs2_path_item { - struct buffer_head *bh; - struct ocfs2_extent_list *el; -}; - -#define OCFS2_MAX_PATH_DEPTH 5 - -struct ocfs2_path { - int p_tree_depth; - ocfs2_journal_access_func p_root_access; - struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; -}; - -#define path_root_bh(_path) ((_path)->p_node[0].bh) -#define path_root_el(_path) ((_path)->p_node[0].el) -#define path_root_access(_path)((_path)->p_root_access) -#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) -#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) -#define path_num_items(_path) ((_path)->p_tree_depth + 1) - -static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, - u32 cpos); -static void ocfs2_adjust_rightmost_records(struct inode *inode, - handle_t *handle, +static void ocfs2_adjust_rightmost_records(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec); /* @@ -486,7 +577,7 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode, * to build another path. Generally, this involves freeing the buffer * heads. */ -static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) { int i, start = 0, depth = 0; struct ocfs2_path_item *node; @@ -515,7 +606,7 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) path->p_tree_depth = depth; } -static void ocfs2_free_path(struct ocfs2_path *path) +void ocfs2_free_path(struct ocfs2_path *path) { if (path) { ocfs2_reinit_path(path, 0); @@ -613,13 +704,13 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, return path; } -static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) { return ocfs2_new_path(path_root_bh(path), path_root_el(path), path_root_access(path)); } -static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) { return ocfs2_new_path(et->et_root_bh, et->et_root_el, et->et_root_journal_access); @@ -632,10 +723,10 @@ static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) * I don't like the way this function's name looks next to * ocfs2_journal_access_path(), but I don't have a better one. */ -static int ocfs2_path_bh_journal_access(handle_t *handle, - struct inode *inode, - struct ocfs2_path *path, - int idx) +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx) { ocfs2_journal_access_func access = path_root_access(path); @@ -645,15 +736,16 @@ static int ocfs2_path_bh_journal_access(handle_t *handle, if (idx) access = ocfs2_journal_access_eb; - return access(handle, inode, path->p_node[idx].bh, + return access(handle, ci, path->p_node[idx].bh, OCFS2_JOURNAL_ACCESS_WRITE); } /* * Convenience function to journal all components in a path. */ -static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, - struct ocfs2_path *path) +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path) { int i, ret = 0; @@ -661,7 +753,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, goto out; for(i = 0; i < path_num_items(path); i++) { - ret = ocfs2_path_bh_journal_access(handle, inode, path, i); + ret = ocfs2_path_bh_journal_access(handle, ci, path, i); if (ret < 0) { mlog_errno(ret); goto out; @@ -702,17 +794,9 @@ int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) return ret; } -enum ocfs2_contig_type { - CONTIG_NONE = 0, - CONTIG_LEFT, - CONTIG_RIGHT, - CONTIG_LEFTRIGHT, -}; - - /* * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and - * ocfs2_extent_contig only work properly against leaf nodes! + * ocfs2_extent_rec_contig only work properly against leaf nodes! */ static int ocfs2_block_extent_contig(struct super_block *sb, struct ocfs2_extent_rec *ext, @@ -738,9 +822,9 @@ static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, } static enum ocfs2_contig_type - ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - struct ocfs2_extent_rec *insert_rec) + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) { u64 blkno = le64_to_cpu(insert_rec->e_blkno); @@ -753,12 +837,12 @@ static enum ocfs2_contig_type return CONTIG_NONE; if (ocfs2_extents_adjacent(ext, insert_rec) && - ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) + ocfs2_block_extent_contig(sb, ext, blkno)) return CONTIG_RIGHT; blkno = le64_to_cpu(ext->e_blkno); if (ocfs2_extents_adjacent(insert_rec, ext) && - ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) + ocfs2_block_extent_contig(sb, insert_rec, blkno)) return CONTIG_LEFT; return CONTIG_NONE; @@ -853,13 +937,13 @@ static int ocfs2_validate_extent_block(struct super_block *sb, return 0; } -int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, +int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, struct buffer_head **bh) { int rc; struct buffer_head *tmp = *bh; - rc = ocfs2_read_block(inode, eb_blkno, &tmp, + rc = ocfs2_read_block(ci, eb_blkno, &tmp, ocfs2_validate_extent_block); /* If ocfs2_read_block() got us a new bh, pass it up. */ @@ -874,7 +958,6 @@ int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, * How many free extents have we got before we need more meta data? */ int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct inode *inode, struct ocfs2_extent_tree *et) { int retval; @@ -889,7 +972,8 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb, last_eb_blk = ocfs2_et_get_last_eb_blk(et); if (last_eb_blk) { - retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); + retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk, + &eb_bh); if (retval < 0) { mlog_errno(retval); goto bail; @@ -913,9 +997,8 @@ bail: * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and * l_count for you */ -static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, +static int ocfs2_create_new_meta_bhs(handle_t *handle, + struct ocfs2_extent_tree *et, int wanted, struct ocfs2_alloc_context *meta_ac, struct buffer_head *bhs[]) @@ -924,6 +1007,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, u16 suballoc_bit_start; u32 num_got; u64 first_blkno; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); struct ocfs2_extent_block *eb; mlog_entry_void(); @@ -949,9 +1034,10 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, mlog_errno(status); goto bail; } - ocfs2_set_new_buffer_uptodate(inode, bhs[i]); + ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]); - status = ocfs2_journal_access_eb(handle, inode, bhs[i], + status = ocfs2_journal_access_eb(handle, et->et_ci, + bhs[i], OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1023,7 +1109,6 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) * extent block's rightmost record. */ static int ocfs2_adjust_rightmost_branch(handle_t *handle, - struct inode *inode, struct ocfs2_extent_tree *et) { int status; @@ -1037,7 +1122,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle, return status; } - status = ocfs2_find_path(inode, path, UINT_MAX); + status = ocfs2_find_path(et->et_ci, path, UINT_MAX); if (status < 0) { mlog_errno(status); goto out; @@ -1050,7 +1135,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle, goto out; } - status = ocfs2_journal_access_path(inode, handle, path); + status = ocfs2_journal_access_path(et->et_ci, handle, path); if (status < 0) { mlog_errno(status); goto out; @@ -1059,7 +1144,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle, el = path_leaf_el(path); rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1]; - ocfs2_adjust_rightmost_records(inode, handle, path, rec); + ocfs2_adjust_rightmost_records(handle, et, path, rec); out: ocfs2_free_path(path); @@ -1068,7 +1153,7 @@ out: /* * Add an entire tree branch to our inode. eb_bh is the extent block - * to start at, if we don't want to start the branch at the dinode + * to start at, if we don't want to start the branch at the root * structure. * * last_eb_bh is required as we have to update it's next_leaf pointer @@ -1077,9 +1162,7 @@ out: * the new branch will be 'empty' in the sense that every block will * contain a single record with cluster count == 0. */ -static int ocfs2_add_branch(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, +static int ocfs2_add_branch(handle_t *handle, struct ocfs2_extent_tree *et, struct buffer_head *eb_bh, struct buffer_head **last_eb_bh, @@ -1123,7 +1206,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, if (root_end > new_cpos) { mlog(0, "adjust the cluster end from %u to %u\n", root_end, new_cpos); - status = ocfs2_adjust_rightmost_branch(handle, inode, et); + status = ocfs2_adjust_rightmost_branch(handle, et); if (status) { mlog_errno(status); goto bail; @@ -1139,7 +1222,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, goto bail; } - status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, + status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, meta_ac, new_eb_bhs); if (status < 0) { mlog_errno(status); @@ -1161,7 +1244,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); eb_el = &eb->h_list; - status = ocfs2_journal_access_eb(handle, inode, bh, + status = ocfs2_journal_access_eb(handle, et->et_ci, bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1201,20 +1284,20 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * journal_dirty erroring as it won't unless we've aborted the * handle (in which case we would never be here) so reserving * the write with journal_access is all we need to do. */ - status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh, + status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } - status = ocfs2_et_root_journal_access(handle, inode, et, + status = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } if (eb_bh) { - status = ocfs2_journal_access_eb(handle, inode, eb_bh, + status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1274,9 +1357,7 @@ bail: * returns back the new extent block so you can add a branch to it * after this call. */ -static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, +static int ocfs2_shift_tree_depth(handle_t *handle, struct ocfs2_extent_tree *et, struct ocfs2_alloc_context *meta_ac, struct buffer_head **ret_new_eb_bh) @@ -1290,7 +1371,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, mlog_entry_void(); - status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, &new_eb_bh); if (status < 0) { mlog_errno(status); @@ -1304,7 +1385,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, eb_el = &eb->h_list; root_el = et->et_root_el; - status = ocfs2_journal_access_eb(handle, inode, new_eb_bh, + status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1323,7 +1404,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } - status = ocfs2_et_root_journal_access(handle, inode, et, + status = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1379,9 +1460,7 @@ bail: * * return status < 0 indicates an error. */ -static int ocfs2_find_branch_target(struct ocfs2_super *osb, - struct inode *inode, - struct ocfs2_extent_tree *et, +static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, struct buffer_head **target_bh) { int status = 0, i; @@ -1399,19 +1478,21 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { - ocfs2_error(inode->i_sb, "Dinode %llu has empty " + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty " "extent list (next_free_rec == 0)", - (unsigned long long)OCFS2_I(inode)->ip_blkno); + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); status = -EIO; goto bail; } i = le16_to_cpu(el->l_next_free_rec) - 1; blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (!blkno) { - ocfs2_error(inode->i_sb, "Dinode %llu has extent " + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has extent " "list where extent # %d has no physical " "block start", - (unsigned long long)OCFS2_I(inode)->ip_blkno, i); + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); status = -EIO; goto bail; } @@ -1419,7 +1500,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, brelse(bh); bh = NULL; - status = ocfs2_read_extent_block(inode, blkno, &bh); + status = ocfs2_read_extent_block(et->et_ci, blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1460,20 +1541,18 @@ bail: * * *last_eb_bh will be updated by ocfs2_add_branch(). */ -static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, - struct ocfs2_extent_tree *et, int *final_depth, - struct buffer_head **last_eb_bh, +static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, + int *final_depth, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { int ret, shift; struct ocfs2_extent_list *el = et->et_root_el; int depth = le16_to_cpu(el->l_tree_depth); - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *bh = NULL; BUG_ON(meta_ac == NULL); - shift = ocfs2_find_branch_target(osb, inode, et, &bh); + shift = ocfs2_find_branch_target(et, &bh); if (shift < 0) { ret = shift; mlog_errno(ret); @@ -1490,8 +1569,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* ocfs2_shift_tree_depth will return us a buffer with * the new extent block (so we can pass that to * ocfs2_add_branch). */ - ret = ocfs2_shift_tree_depth(osb, handle, inode, et, - meta_ac, &bh); + ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1517,7 +1595,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* call ocfs2_add_branch to add the final part of the tree with * the new data. */ mlog(0, "add branch. bh = %p\n", bh); - ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh, + ret = ocfs2_add_branch(handle, et, bh, last_eb_bh, meta_ac); if (ret < 0) { mlog_errno(ret); @@ -1687,7 +1765,7 @@ set_and_inc: * * The array index of the subtree root is passed back. */ -static int ocfs2_find_subtree_root(struct inode *inode, +static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, struct ocfs2_path *left, struct ocfs2_path *right) { @@ -1705,10 +1783,10 @@ static int ocfs2_find_subtree_root(struct inode *inode, * The caller didn't pass two adjacent paths. */ mlog_bug_on_msg(i > left->p_tree_depth, - "Inode %lu, left depth %u, right depth %u\n" + "Owner %llu, left depth %u, right depth %u\n" "left leaf blk %llu, right leaf blk %llu\n", - inode->i_ino, left->p_tree_depth, - right->p_tree_depth, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + left->p_tree_depth, right->p_tree_depth, (unsigned long long)path_leaf_bh(left)->b_blocknr, (unsigned long long)path_leaf_bh(right)->b_blocknr); } while (left->p_node[i].bh->b_blocknr == @@ -1725,7 +1803,7 @@ typedef void (path_insert_t)(void *, struct buffer_head *); * This code can be called with a cpos larger than the tree, in which * case it will return the rightmost path. */ -static int __ocfs2_find_path(struct inode *inode, +static int __ocfs2_find_path(struct ocfs2_caching_info *ci, struct ocfs2_extent_list *root_el, u32 cpos, path_insert_t *func, void *data) { @@ -1736,15 +1814,14 @@ static int __ocfs2_find_path(struct inode *inode, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; - struct ocfs2_inode_info *oi = OCFS2_I(inode); el = root_el; while (el->l_tree_depth) { if (le16_to_cpu(el->l_next_free_rec) == 0) { - ocfs2_error(inode->i_sb, - "Inode %llu has empty extent list at " + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has empty extent list at " "depth %u\n", - (unsigned long long)oi->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth)); ret = -EROFS; goto out; @@ -1767,10 +1844,10 @@ static int __ocfs2_find_path(struct inode *inode, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (blkno == 0) { - ocfs2_error(inode->i_sb, - "Inode %llu has bad blkno in extent list " + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has bad blkno in extent list " "at depth %u (index %d)\n", - (unsigned long long)oi->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth), i); ret = -EROFS; goto out; @@ -1778,7 +1855,7 @@ static int __ocfs2_find_path(struct inode *inode, brelse(bh); bh = NULL; - ret = ocfs2_read_extent_block(inode, blkno, &bh); + ret = ocfs2_read_extent_block(ci, blkno, &bh); if (ret) { mlog_errno(ret); goto out; @@ -1789,10 +1866,10 @@ static int __ocfs2_find_path(struct inode *inode, if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { - ocfs2_error(inode->i_sb, - "Inode %llu has bad count in extent list " + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has bad count in extent list " "at block %llu (next free=%u, count=%u)\n", - (unsigned long long)oi->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)bh->b_blocknr, le16_to_cpu(el->l_next_free_rec), le16_to_cpu(el->l_count)); @@ -1836,14 +1913,14 @@ static void find_path_ins(void *data, struct buffer_head *bh) ocfs2_path_insert_eb(fp->path, fp->index, bh); fp->index++; } -static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, - u32 cpos) +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, u32 cpos) { struct find_path_data data; data.index = 1; data.path = path; - return __ocfs2_find_path(inode, path_root_el(path), cpos, + return __ocfs2_find_path(ci, path_root_el(path), cpos, find_path_ins, &data); } @@ -1868,13 +1945,14 @@ static void find_leaf_ins(void *data, struct buffer_head *bh) * * This function doesn't handle non btree extent lists. */ -int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, - u32 cpos, struct buffer_head **leaf_bh) +int ocfs2_find_leaf(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh) { int ret; struct buffer_head *bh = NULL; - ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); + ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh); if (ret) { mlog_errno(ret); goto out; @@ -1980,7 +2058,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, * - When we've adjusted the last extent record in the left path leaf and the * 1st extent record in the right path leaf during cross extent block merge. */ -static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, +static void ocfs2_complete_edge_insert(handle_t *handle, struct ocfs2_path *left_path, struct ocfs2_path *right_path, int subtree_index) @@ -2058,8 +2136,8 @@ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, mlog_errno(ret); } -static int ocfs2_rotate_subtree_right(struct inode *inode, - handle_t *handle, +static int ocfs2_rotate_subtree_right(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, int subtree_index) @@ -2075,10 +2153,10 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, left_el = path_leaf_el(left_path); if (left_el->l_next_free_rec != left_el->l_count) { - ocfs2_error(inode->i_sb, + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Inode %llu has non-full interior leaf node %llu" "(next free = %u)", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)left_leaf_bh->b_blocknr, le16_to_cpu(left_el->l_next_free_rec)); return -EROFS; @@ -2094,7 +2172,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != right_path->p_node[subtree_index].bh); - ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, subtree_index); if (ret) { mlog_errno(ret); @@ -2102,14 +2180,14 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, } for(i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, i); if (ret) { mlog_errno(ret); @@ -2123,7 +2201,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, /* This is a code error, not a disk corruption. */ mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " "because rightmost leaf block %llu is empty\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)right_leaf_bh->b_blocknr); ocfs2_create_empty_extent(right_el); @@ -2157,8 +2235,8 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, goto out; } - ocfs2_complete_edge_insert(inode, handle, left_path, right_path, - subtree_index); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); out: return ret; @@ -2248,10 +2326,18 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, int op_credits, struct ocfs2_path *path) { + int ret; int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; - if (handle->h_buffer_credits < credits) - return ocfs2_extend_trans(handle, credits); + if (handle->h_buffer_credits < credits) { + ret = ocfs2_extend_trans(handle, + credits - handle->h_buffer_credits); + if (ret) + return ret; + + if (unlikely(handle->h_buffer_credits < credits)) + return ocfs2_extend_trans(handle, credits); + } return 0; } @@ -2321,8 +2407,8 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) * *ret_left_path will contain a valid path which can be passed to * ocfs2_insert_path(). */ -static int ocfs2_rotate_tree_right(struct inode *inode, - handle_t *handle, +static int ocfs2_rotate_tree_right(handle_t *handle, + struct ocfs2_extent_tree *et, enum ocfs2_split_type split, u32 insert_cpos, struct ocfs2_path *right_path, @@ -2331,6 +2417,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, int ret, start, orig_credits = handle->h_buffer_credits; u32 cpos; struct ocfs2_path *left_path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); *ret_left_path = NULL; @@ -2341,7 +2428,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, goto out; } - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); + ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); if (ret) { mlog_errno(ret); goto out; @@ -2379,7 +2466,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", insert_cpos, cpos); - ret = ocfs2_find_path(inode, left_path, cpos); + ret = ocfs2_find_path(et->et_ci, left_path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -2387,10 +2474,11 @@ static int ocfs2_rotate_tree_right(struct inode *inode, mlog_bug_on_msg(path_leaf_bh(left_path) == path_leaf_bh(right_path), - "Inode %lu: error during insert of %u " + "Owner %llu: error during insert of %u " "(left path cpos %u) results in two identical " "paths ending at %llu\n", - inode->i_ino, insert_cpos, cpos, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos, (unsigned long long) path_leaf_bh(left_path)->b_blocknr); @@ -2416,7 +2504,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, goto out_ret_path; } - start = ocfs2_find_subtree_root(inode, left_path, right_path); + start = ocfs2_find_subtree_root(et, left_path, right_path); mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", start, @@ -2430,7 +2518,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, goto out; } - ret = ocfs2_rotate_subtree_right(inode, handle, left_path, + ret = ocfs2_rotate_subtree_right(handle, et, left_path, right_path, start); if (ret) { mlog_errno(ret); @@ -2462,8 +2550,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, */ ocfs2_mv_path(right_path, left_path); - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, - &cpos); + ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); if (ret) { mlog_errno(ret); goto out; @@ -2477,7 +2564,8 @@ out_ret_path: return ret; } -static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, +static int ocfs2_update_edge_lengths(handle_t *handle, + struct ocfs2_extent_tree *et, int subtree_index, struct ocfs2_path *path) { int i, idx, ret; @@ -2502,7 +2590,7 @@ static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_journal_access_path(inode, handle, path); + ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); goto out; @@ -2532,7 +2620,8 @@ out: return ret; } -static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, +static void ocfs2_unlink_path(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_cached_dealloc_ctxt *dealloc, struct ocfs2_path *path, int unlink_start) { @@ -2554,12 +2643,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, mlog(ML_ERROR, "Inode %llu, attempted to remove extent block " "%llu with %u records\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(el->l_next_free_rec)); ocfs2_journal_dirty(handle, bh); - ocfs2_remove_from_cache(inode, bh); + ocfs2_remove_from_cache(et->et_ci, bh); continue; } @@ -2572,11 +2661,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, if (ret) mlog_errno(ret); - ocfs2_remove_from_cache(inode, bh); + ocfs2_remove_from_cache(et->et_ci, bh); } } -static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, +static void ocfs2_unlink_subtree(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, int subtree_index, @@ -2607,17 +2697,17 @@ static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, root_bh); ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); - ocfs2_unlink_path(inode, handle, dealloc, right_path, + ocfs2_unlink_path(handle, et, dealloc, right_path, subtree_index + 1); } -static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, +static int ocfs2_rotate_subtree_left(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, int subtree_index, struct ocfs2_cached_dealloc_ctxt *dealloc, - int *deleted, - struct ocfs2_extent_tree *et) + int *deleted) { int ret, i, del_right_subtree = 0, right_has_empty = 0; struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); @@ -2653,7 +2743,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, return -EAGAIN; if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { - ret = ocfs2_journal_access_eb(handle, inode, + ret = ocfs2_journal_access_eb(handle, et->et_ci, path_leaf_bh(right_path), OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -2672,7 +2762,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, * We have to update i_last_eb_blk during the meta * data delete. */ - ret = ocfs2_et_root_journal_access(handle, inode, et, + ret = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -2688,7 +2778,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, */ BUG_ON(right_has_empty && !del_right_subtree); - ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, subtree_index); if (ret) { mlog_errno(ret); @@ -2696,14 +2786,14 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, } for(i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, i); if (ret) { mlog_errno(ret); @@ -2740,9 +2830,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, mlog_errno(ret); if (del_right_subtree) { - ocfs2_unlink_subtree(inode, handle, left_path, right_path, + ocfs2_unlink_subtree(handle, et, left_path, right_path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + ret = ocfs2_update_edge_lengths(handle, et, subtree_index, left_path); if (ret) { mlog_errno(ret); @@ -2766,7 +2856,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, *deleted = 1; } else - ocfs2_complete_edge_insert(inode, handle, left_path, right_path, + ocfs2_complete_edge_insert(handle, left_path, right_path, subtree_index); out: @@ -2852,8 +2942,8 @@ out: return ret; } -static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, - handle_t *handle, +static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path) { int ret; @@ -2863,7 +2953,7 @@ static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, if (!ocfs2_is_empty_extent(&el->l_recs[0])) return 0; - ret = ocfs2_path_bh_journal_access(handle, inode, path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, path_num_items(path) - 1); if (ret) { mlog_errno(ret); @@ -2880,24 +2970,24 @@ out: return ret; } -static int __ocfs2_rotate_tree_left(struct inode *inode, - handle_t *handle, int orig_credits, +static int __ocfs2_rotate_tree_left(handle_t *handle, + struct ocfs2_extent_tree *et, + int orig_credits, struct ocfs2_path *path, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_path **empty_extent_path, - struct ocfs2_extent_tree *et) + struct ocfs2_path **empty_extent_path) { int ret, subtree_root, deleted; u32 right_cpos; struct ocfs2_path *left_path = NULL; struct ocfs2_path *right_path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); *empty_extent_path = NULL; - ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path, - &right_cpos); + ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); if (ret) { mlog_errno(ret); goto out; @@ -2920,13 +3010,13 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, } while (right_cpos) { - ret = ocfs2_find_path(inode, right_path, right_cpos); + ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (ret) { mlog_errno(ret); goto out; } - subtree_root = ocfs2_find_subtree_root(inode, left_path, + subtree_root = ocfs2_find_subtree_root(et, left_path, right_path); mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", @@ -2946,16 +3036,16 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, * Caller might still want to make changes to the * tree root, so re-add it to the journal here. */ - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, 0); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_rotate_subtree_left(inode, handle, left_path, + ret = ocfs2_rotate_subtree_left(handle, et, left_path, right_path, subtree_root, - dealloc, &deleted, et); + dealloc, &deleted); if (ret == -EAGAIN) { /* * The rotation has to temporarily stop due to @@ -2982,7 +3072,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, ocfs2_mv_path(left_path, right_path); - ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &right_cpos); if (ret) { mlog_errno(ret); @@ -2997,10 +3087,10 @@ out: return ret; } -static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, +static int ocfs2_remove_rightmost_path(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_extent_tree *et) + struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret, subtree_index; u32 cpos; @@ -3009,7 +3099,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, struct ocfs2_extent_list *el; - ret = ocfs2_et_sanity_check(inode, et); + ret = ocfs2_et_sanity_check(et); if (ret) goto out; /* @@ -3024,13 +3114,14 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_journal_access_path(inode, handle, path); + ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + path, &cpos); if (ret) { mlog_errno(ret); goto out; @@ -3048,23 +3139,23 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_find_path(inode, left_path, cpos); + ret = ocfs2_find_path(et->et_ci, left_path, cpos); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access_path(inode, handle, left_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); if (ret) { mlog_errno(ret); goto out; } - subtree_index = ocfs2_find_subtree_root(inode, left_path, path); + subtree_index = ocfs2_find_subtree_root(et, left_path, path); - ocfs2_unlink_subtree(inode, handle, left_path, path, + ocfs2_unlink_subtree(handle, et, left_path, path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + ret = ocfs2_update_edge_lengths(handle, et, subtree_index, left_path); if (ret) { mlog_errno(ret); @@ -3078,10 +3169,10 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, * 'path' is also the leftmost path which * means it must be the only one. This gets * handled differently because we want to - * revert the inode back to having extents + * revert the root back to having extents * in-line. */ - ocfs2_unlink_path(inode, handle, dealloc, path, 1); + ocfs2_unlink_path(handle, et, dealloc, path, 1); el = et->et_root_el; el->l_tree_depth = 0; @@ -3114,10 +3205,10 @@ out: * the rightmost tree leaf record is removed so the caller is * responsible for detecting and correcting that. */ -static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, +static int ocfs2_rotate_tree_left(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_extent_tree *et) + struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret, orig_credits = handle->h_buffer_credits; struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; @@ -3134,8 +3225,7 @@ rightmost_no_delete: * Inline extents. This is trivially handled, so do * it up front. */ - ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, - path); + ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path); if (ret) mlog_errno(ret); goto out; @@ -3151,7 +3241,7 @@ rightmost_no_delete: * * 1) is handled via ocfs2_rotate_rightmost_leaf_left() * 2a) we need the left branch so that we can update it with the unlink - * 2b) we need to bring the inode back to inline extents. + * 2b) we need to bring the root back to inline extents. */ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; @@ -3167,9 +3257,9 @@ rightmost_no_delete: if (le16_to_cpu(el->l_next_free_rec) == 0) { ret = -EIO; - ocfs2_error(inode->i_sb, - "Inode %llu has empty extent block at %llu", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty extent block at %llu", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)le64_to_cpu(eb->h_blkno)); goto out; } @@ -3183,8 +3273,8 @@ rightmost_no_delete: * nonempty list. */ - ret = ocfs2_remove_rightmost_path(inode, handle, path, - dealloc, et); + ret = ocfs2_remove_rightmost_path(handle, et, path, + dealloc); if (ret) mlog_errno(ret); goto out; @@ -3195,8 +3285,8 @@ rightmost_no_delete: * and restarting from there. */ try_rotate: - ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, - dealloc, &restart_path, et); + ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path, + dealloc, &restart_path); if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out; @@ -3206,9 +3296,9 @@ try_rotate: tmp_path = restart_path; restart_path = NULL; - ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, + ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, tmp_path, dealloc, - &restart_path, et); + &restart_path); if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out; @@ -3259,7 +3349,7 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, } } -static int ocfs2_get_right_path(struct inode *inode, +static int ocfs2_get_right_path(struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path **ret_right_path) { @@ -3276,8 +3366,8 @@ static int ocfs2_get_right_path(struct inode *inode, left_el = path_leaf_el(left_path); BUG_ON(left_el->l_next_free_rec != left_el->l_count); - ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, - &right_cpos); + ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + left_path, &right_cpos); if (ret) { mlog_errno(ret); goto out; @@ -3293,7 +3383,7 @@ static int ocfs2_get_right_path(struct inode *inode, goto out; } - ret = ocfs2_find_path(inode, right_path, right_cpos); + ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (ret) { mlog_errno(ret); goto out; @@ -3313,9 +3403,9 @@ out: * For index == l_count - 1, the "next" means the 1st extent rec of the * next extent block. */ -static int ocfs2_merge_rec_right(struct inode *inode, - struct ocfs2_path *left_path, +static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *split_rec, int index) { @@ -3336,7 +3426,7 @@ static int ocfs2_merge_rec_right(struct inode *inode, if (index == le16_to_cpu(el->l_next_free_rec) - 1 && le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { /* we meet with a cross extent block merge. */ - ret = ocfs2_get_right_path(inode, left_path, &right_path); + ret = ocfs2_get_right_path(et, left_path, &right_path); if (ret) { mlog_errno(ret); goto out; @@ -3355,8 +3445,8 @@ static int ocfs2_merge_rec_right(struct inode *inode, le16_to_cpu(left_rec->e_leaf_clusters) != le32_to_cpu(right_rec->e_cpos)); - subtree_index = ocfs2_find_subtree_root(inode, - left_path, right_path); + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); ret = ocfs2_extend_rotate_transaction(handle, subtree_index, handle->h_buffer_credits, @@ -3369,7 +3459,7 @@ static int ocfs2_merge_rec_right(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != right_path->p_node[subtree_index].bh); - ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, subtree_index); if (ret) { mlog_errno(ret); @@ -3378,14 +3468,14 @@ static int ocfs2_merge_rec_right(struct inode *inode, for (i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, i); if (ret) { mlog_errno(ret); @@ -3398,7 +3488,7 @@ static int ocfs2_merge_rec_right(struct inode *inode, right_rec = &el->l_recs[index + 1]; } - ret = ocfs2_path_bh_journal_access(handle, inode, left_path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, path_num_items(left_path) - 1); if (ret) { mlog_errno(ret); @@ -3409,7 +3499,8 @@ static int ocfs2_merge_rec_right(struct inode *inode, le32_add_cpu(&right_rec->e_cpos, -split_clusters); le64_add_cpu(&right_rec->e_blkno, - -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); + -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), + split_clusters)); le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); ocfs2_cleanup_merge(el, index); @@ -3423,8 +3514,8 @@ static int ocfs2_merge_rec_right(struct inode *inode, if (ret) mlog_errno(ret); - ocfs2_complete_edge_insert(inode, handle, left_path, - right_path, subtree_index); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); } out: if (right_path) @@ -3432,7 +3523,7 @@ out: return ret; } -static int ocfs2_get_left_path(struct inode *inode, +static int ocfs2_get_left_path(struct ocfs2_extent_tree *et, struct ocfs2_path *right_path, struct ocfs2_path **ret_left_path) { @@ -3445,7 +3536,7 @@ static int ocfs2_get_left_path(struct inode *inode, /* This function shouldn't be called for non-trees. */ BUG_ON(right_path->p_tree_depth == 0); - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), right_path, &left_cpos); if (ret) { mlog_errno(ret); @@ -3462,7 +3553,7 @@ static int ocfs2_get_left_path(struct inode *inode, goto out; } - ret = ocfs2_find_path(inode, left_path, left_cpos); + ret = ocfs2_find_path(et->et_ci, left_path, left_cpos); if (ret) { mlog_errno(ret); goto out; @@ -3485,12 +3576,11 @@ out: * remove the rightmost leaf extent block in the right_path and change * the right path to indicate the new rightmost path. */ -static int ocfs2_merge_rec_left(struct inode *inode, - struct ocfs2_path *right_path, +static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_extent_tree *et, int index) { int ret, i, subtree_index = 0, has_empty_extent = 0; @@ -3508,7 +3598,7 @@ static int ocfs2_merge_rec_left(struct inode *inode, right_rec = &el->l_recs[index]; if (index == 0) { /* we meet with a cross extent block merge. */ - ret = ocfs2_get_left_path(inode, right_path, &left_path); + ret = ocfs2_get_left_path(et, right_path, &left_path); if (ret) { mlog_errno(ret); goto out; @@ -3524,8 +3614,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, le16_to_cpu(left_rec->e_leaf_clusters) != le32_to_cpu(split_rec->e_cpos)); - subtree_index = ocfs2_find_subtree_root(inode, - left_path, right_path); + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); ret = ocfs2_extend_rotate_transaction(handle, subtree_index, handle->h_buffer_credits, @@ -3538,7 +3628,7 @@ static int ocfs2_merge_rec_left(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != right_path->p_node[subtree_index].bh); - ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, subtree_index); if (ret) { mlog_errno(ret); @@ -3547,14 +3637,14 @@ static int ocfs2_merge_rec_left(struct inode *inode, for (i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, i); if (ret) { mlog_errno(ret); @@ -3567,7 +3657,7 @@ static int ocfs2_merge_rec_left(struct inode *inode, has_empty_extent = 1; } - ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, path_num_items(right_path) - 1); if (ret) { mlog_errno(ret); @@ -3586,7 +3676,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, le32_add_cpu(&right_rec->e_cpos, split_clusters); le64_add_cpu(&right_rec->e_blkno, - ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); + ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), + split_clusters)); le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); ocfs2_cleanup_merge(el, index); @@ -3608,9 +3699,9 @@ static int ocfs2_merge_rec_left(struct inode *inode, if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && le16_to_cpu(el->l_next_free_rec) == 1) { - ret = ocfs2_remove_rightmost_path(inode, handle, + ret = ocfs2_remove_rightmost_path(handle, et, right_path, - dealloc, et); + dealloc); if (ret) { mlog_errno(ret); goto out; @@ -3622,7 +3713,7 @@ static int ocfs2_merge_rec_left(struct inode *inode, ocfs2_mv_path(right_path, left_path); left_path = NULL; } else - ocfs2_complete_edge_insert(inode, handle, left_path, + ocfs2_complete_edge_insert(handle, left_path, right_path, subtree_index); } out: @@ -3631,15 +3722,13 @@ out: return ret; } -static int ocfs2_try_to_merge_extent(struct inode *inode, - handle_t *handle, +static int ocfs2_try_to_merge_extent(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, int split_index, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_merge_ctxt *ctxt, - struct ocfs2_extent_tree *et) - + struct ocfs2_merge_ctxt *ctxt) { int ret = 0; struct ocfs2_extent_list *el = path_leaf_el(path); @@ -3655,8 +3744,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * extents - having more than one in a leaf is * illegal. */ - ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc, et); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); goto out; @@ -3685,8 +3773,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * prevoius extent block. It is more efficient and easier * if we do merge_right first and merge_left later. */ - ret = ocfs2_merge_rec_right(inode, path, - handle, split_rec, + ret = ocfs2_merge_rec_right(path, handle, et, split_rec, split_index); if (ret) { mlog_errno(ret); @@ -3699,8 +3786,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); /* The merge left us with an empty extent, remove it. */ - ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc, et); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); goto out; @@ -3712,18 +3798,15 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * Note that we don't pass split_rec here on purpose - * we've merged it into the rec already. */ - ret = ocfs2_merge_rec_left(inode, path, - handle, rec, - dealloc, et, - split_index); + ret = ocfs2_merge_rec_left(path, handle, et, rec, + dealloc, split_index); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc, et); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); /* * Error from this last rotate is not critical, so * print but don't bubble it up. @@ -3740,19 +3823,16 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * the record on the left (hence the left merge). */ if (ctxt->c_contig_type == CONTIG_RIGHT) { - ret = ocfs2_merge_rec_left(inode, - path, - handle, split_rec, - dealloc, et, + ret = ocfs2_merge_rec_left(path, handle, et, + split_rec, dealloc, split_index); if (ret) { mlog_errno(ret); goto out; } } else { - ret = ocfs2_merge_rec_right(inode, - path, - handle, split_rec, + ret = ocfs2_merge_rec_right(path, handle, + et, split_rec, split_index); if (ret) { mlog_errno(ret); @@ -3765,8 +3845,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * The merge may have left an empty extent in * our leaf. Try to rotate it away. */ - ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc, et); + ret = ocfs2_rotate_tree_left(handle, et, path, + dealloc); if (ret) mlog_errno(ret); ret = 0; @@ -3812,10 +3892,10 @@ static void ocfs2_subtract_from_rec(struct super_block *sb, * list. If this leaf is part of an allocation tree, it is assumed * that the tree above has been prepared. */ -static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, +static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *insert_rec, struct ocfs2_extent_list *el, - struct ocfs2_insert_type *insert, - struct inode *inode) + struct ocfs2_insert_type *insert) { int i = insert->ins_contig_index; unsigned int range; @@ -3827,7 +3907,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); BUG_ON(i == -1); rec = &el->l_recs[i]; - ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec, + ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + insert->ins_split, rec, insert_rec); goto rotate; } @@ -3869,10 +3950,10 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= le16_to_cpu(el->l_count), - "inode %lu, depth %u, count %u, next free %u, " + "owner %llu, depth %u, count %u, next free %u, " "rec.cpos %u, rec.clusters %u, " "insert.cpos %u, insert.clusters %u\n", - inode->i_ino, + ocfs2_metadata_cache_owner(et->et_ci), le16_to_cpu(el->l_tree_depth), le16_to_cpu(el->l_count), le16_to_cpu(el->l_next_free_rec), @@ -3900,8 +3981,8 @@ rotate: ocfs2_rotate_leaf(el, insert_rec); } -static void ocfs2_adjust_rightmost_records(struct inode *inode, - handle_t *handle, +static void ocfs2_adjust_rightmost_records(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec) { @@ -3919,9 +4000,9 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode, next_free = le16_to_cpu(el->l_next_free_rec); if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has a bad extent list", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); ret = -EIO; return; } @@ -3941,7 +4022,8 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode, } } -static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, +static int ocfs2_append_rec_to_path(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *insert_rec, struct ocfs2_path *right_path, struct ocfs2_path **ret_left_path) @@ -3969,8 +4051,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { u32 left_cpos; - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, - &left_cpos); + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + right_path, &left_cpos); if (ret) { mlog_errno(ret); goto out; @@ -3992,7 +4074,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_find_path(inode, left_path, left_cpos); + ret = ocfs2_find_path(et->et_ci, left_path, + left_cpos); if (ret) { mlog_errno(ret); goto out; @@ -4005,13 +4088,13 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, } } - ret = ocfs2_journal_access_path(inode, handle, right_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, right_path); if (ret) { mlog_errno(ret); goto out; } - ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec); + ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec); *ret_left_path = left_path; ret = 0; @@ -4022,7 +4105,7 @@ out: return ret; } -static void ocfs2_split_record(struct inode *inode, +static void ocfs2_split_record(struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, struct ocfs2_extent_rec *split_rec, @@ -4095,7 +4178,8 @@ static void ocfs2_split_record(struct inode *inode, } rec = &el->l_recs[index]; - ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec); + ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + split, rec, split_rec); ocfs2_rotate_leaf(insert_el, split_rec); } @@ -4107,8 +4191,8 @@ static void ocfs2_split_record(struct inode *inode, * in. left_path should only be passed in if we need to update that * portion of the tree after an edge insert. */ -static int ocfs2_insert_path(struct inode *inode, - handle_t *handle, +static int ocfs2_insert_path(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, struct ocfs2_extent_rec *insert_rec, @@ -4134,7 +4218,7 @@ static int ocfs2_insert_path(struct inode *inode, goto out; } - ret = ocfs2_journal_access_path(inode, handle, left_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); if (ret < 0) { mlog_errno(ret); goto out; @@ -4145,7 +4229,7 @@ static int ocfs2_insert_path(struct inode *inode, * Pass both paths to the journal. The majority of inserts * will be touching all components anyway. */ - ret = ocfs2_journal_access_path(inode, handle, right_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, right_path); if (ret < 0) { mlog_errno(ret); goto out; @@ -4157,7 +4241,7 @@ static int ocfs2_insert_path(struct inode *inode, * of splits, but it's easier to just let one separate * function sort it all out. */ - ocfs2_split_record(inode, left_path, right_path, + ocfs2_split_record(et, left_path, right_path, insert_rec, insert->ins_split); /* @@ -4171,8 +4255,8 @@ static int ocfs2_insert_path(struct inode *inode, if (ret) mlog_errno(ret); } else - ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), - insert, inode); + ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), + insert); ret = ocfs2_journal_dirty(handle, leaf_bh); if (ret) @@ -4185,10 +4269,10 @@ static int ocfs2_insert_path(struct inode *inode, * * XXX: Should we extend the transaction here? */ - subtree_index = ocfs2_find_subtree_root(inode, left_path, + subtree_index = ocfs2_find_subtree_root(et, left_path, right_path); - ocfs2_complete_edge_insert(inode, handle, left_path, - right_path, subtree_index); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); } ret = 0; @@ -4196,8 +4280,7 @@ out: return ret; } -static int ocfs2_do_insert_extent(struct inode *inode, - handle_t *handle, +static int ocfs2_do_insert_extent(handle_t *handle, struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *insert_rec, struct ocfs2_insert_type *type) @@ -4210,7 +4293,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, el = et->et_root_el; - ret = ocfs2_et_root_journal_access(handle, inode, et, + ret = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -4218,7 +4301,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, } if (le16_to_cpu(el->l_tree_depth) == 0) { - ocfs2_insert_at_leaf(insert_rec, el, type, inode); + ocfs2_insert_at_leaf(et, insert_rec, el, type); goto out_update_clusters; } @@ -4241,7 +4324,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, cpos = UINT_MAX; } - ret = ocfs2_find_path(inode, right_path, cpos); + ret = ocfs2_find_path(et->et_ci, right_path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -4260,7 +4343,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, * can wind up skipping both of these two special cases... */ if (rotate) { - ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split, + ret = ocfs2_rotate_tree_right(handle, et, type->ins_split, le32_to_cpu(insert_rec->e_cpos), right_path, &left_path); if (ret) { @@ -4272,7 +4355,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, * ocfs2_rotate_tree_right() might have extended the * transaction without re-journaling our tree root. */ - ret = ocfs2_et_root_journal_access(handle, inode, et, + ret = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -4280,7 +4363,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, } } else if (type->ins_appending == APPEND_TAIL && type->ins_contig != CONTIG_LEFT) { - ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, + ret = ocfs2_append_rec_to_path(handle, et, insert_rec, right_path, &left_path); if (ret) { mlog_errno(ret); @@ -4288,7 +4371,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, } } - ret = ocfs2_insert_path(inode, handle, left_path, right_path, + ret = ocfs2_insert_path(handle, et, left_path, right_path, insert_rec, type); if (ret) { mlog_errno(ret); @@ -4297,7 +4380,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, out_update_clusters: if (type->ins_split == SPLIT_NONE) - ocfs2_et_update_clusters(inode, et, + ocfs2_et_update_clusters(et, le16_to_cpu(insert_rec->e_leaf_clusters)); ret = ocfs2_journal_dirty(handle, et->et_root_bh); @@ -4312,7 +4395,8 @@ out: } static enum ocfs2_contig_type -ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, +ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, + struct ocfs2_path *path, struct ocfs2_extent_list *el, int index, struct ocfs2_extent_rec *split_rec) { @@ -4324,12 +4408,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, struct ocfs2_path *left_path = NULL, *right_path = NULL; struct buffer_head *bh; struct ocfs2_extent_block *eb; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); if (index > 0) { rec = &el->l_recs[index - 1]; } else if (path->p_tree_depth > 0) { - status = ocfs2_find_cpos_for_left_leaf(inode->i_sb, - path, &left_cpos); + status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); if (status) goto out; @@ -4338,7 +4422,8 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (!left_path) goto out; - status = ocfs2_find_path(inode, left_path, left_cpos); + status = ocfs2_find_path(et->et_ci, left_path, + left_cpos); if (status) goto out; @@ -4348,7 +4433,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, le16_to_cpu(new_el->l_count)) { bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; - ocfs2_error(inode->i_sb, + ocfs2_error(sb, "Extent block #%llu has an " "invalid l_next_free_rec of " "%d. It should have " @@ -4373,7 +4458,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (split_rec->e_cpos == el->l_recs[index].e_cpos) ret = CONTIG_RIGHT; } else { - ret = ocfs2_extent_contig(inode, rec, split_rec); + ret = ocfs2_et_extent_contig(et, rec, split_rec); } } @@ -4382,8 +4467,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, rec = &el->l_recs[index + 1]; else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && path->p_tree_depth > 0) { - status = ocfs2_find_cpos_for_right_leaf(inode->i_sb, - path, &right_cpos); + status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); if (status) goto out; @@ -4394,7 +4478,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (!right_path) goto out; - status = ocfs2_find_path(inode, right_path, right_cpos); + status = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (status) goto out; @@ -4404,7 +4488,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; - ocfs2_error(inode->i_sb, + ocfs2_error(sb, "Extent block #%llu has an " "invalid l_next_free_rec of %d", (unsigned long long)le64_to_cpu(eb->h_blkno), @@ -4419,7 +4503,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (rec) { enum ocfs2_contig_type contig_type; - contig_type = ocfs2_extent_contig(inode, rec, split_rec); + contig_type = ocfs2_et_extent_contig(et, rec, split_rec); if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) ret = CONTIG_LEFTRIGHT; @@ -4436,11 +4520,10 @@ out: return ret; } -static void ocfs2_figure_contig_type(struct inode *inode, +static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, struct ocfs2_insert_type *insert, struct ocfs2_extent_list *el, - struct ocfs2_extent_rec *insert_rec, - struct ocfs2_extent_tree *et) + struct ocfs2_extent_rec *insert_rec) { int i; enum ocfs2_contig_type contig_type = CONTIG_NONE; @@ -4448,8 +4531,8 @@ static void ocfs2_figure_contig_type(struct inode *inode, BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], - insert_rec); + contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i], + insert_rec); if (contig_type != CONTIG_NONE) { insert->ins_contig_index = i; break; @@ -4530,8 +4613,7 @@ set_tail_append: * All of the information is stored on the ocfs2_insert_type * structure. */ -static int ocfs2_figure_insert_type(struct inode *inode, - struct ocfs2_extent_tree *et, +static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et, struct buffer_head **last_eb_bh, struct ocfs2_extent_rec *insert_rec, int *free_records, @@ -4555,7 +4637,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * ocfs2_figure_insert_type() and ocfs2_add_branch() * may want it later. */ - ret = ocfs2_read_extent_block(inode, + ret = ocfs2_read_extent_block(et->et_ci, ocfs2_et_get_last_eb_blk(et), &bh); if (ret) { @@ -4578,7 +4660,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, le16_to_cpu(el->l_next_free_rec); if (!insert->ins_tree_depth) { - ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); + ocfs2_figure_contig_type(et, insert, el, insert_rec); ocfs2_figure_appending_type(insert, el, insert_rec); return 0; } @@ -4596,7 +4678,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * us the rightmost tree path. This is accounted for below in * the appending code. */ - ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); + ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos)); if (ret) { mlog_errno(ret); goto out; @@ -4612,7 +4694,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * into two types of appends: simple record append, or a * rotate inside the tail leaf. */ - ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); + ocfs2_figure_contig_type(et, insert, el, insert_rec); /* * The insert code isn't quite ready to deal with all cases of @@ -4657,13 +4739,11 @@ out: } /* - * Insert an extent into an inode btree. + * Insert an extent into a btree. * - * The caller needs to update fe->i_clusters + * The caller needs to update the owning btree's cluster count. */ -int ocfs2_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, +int ocfs2_insert_extent(handle_t *handle, struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, @@ -4677,21 +4757,22 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, struct ocfs2_insert_type insert = {0, }; struct ocfs2_extent_rec rec; - mlog(0, "add %u clusters at position %u to inode %llu\n", - new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); + mlog(0, "add %u clusters at position %u to owner %llu\n", + new_clusters, cpos, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); memset(&rec, 0, sizeof(rec)); rec.e_cpos = cpu_to_le32(cpos); rec.e_blkno = cpu_to_le64(start_blk); rec.e_leaf_clusters = cpu_to_le16(new_clusters); rec.e_flags = flags; - status = ocfs2_et_insert_check(inode, et, &rec); + status = ocfs2_et_insert_check(et, &rec); if (status) { mlog_errno(status); goto bail; } - status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec, + status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec, &free_records, &insert); if (status < 0) { mlog_errno(status); @@ -4705,7 +4786,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, free_records, insert.ins_tree_depth); if (insert.ins_contig == CONTIG_NONE && free_records == 0) { - status = ocfs2_grow_tree(inode, handle, et, + status = ocfs2_grow_tree(handle, et, &insert.ins_tree_depth, &last_eb_bh, meta_ac); if (status) { @@ -4715,11 +4796,11 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, } /* Finally, we can add clusters. This might rotate the tree for us. */ - status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert); + status = ocfs2_do_insert_extent(handle, et, &rec, &insert); if (status < 0) mlog_errno(status); - else if (et->et_ops == &ocfs2_dinode_et_ops) - ocfs2_extent_map_insert_rec(inode, &rec); + else + ocfs2_et_extent_map_insert(et, &rec); bail: brelse(last_eb_bh); @@ -4735,13 +4816,11 @@ bail: * it is not limited to the file storage. Any extent tree can use this * function if it implements the proper ocfs2_extent_tree. */ -int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, - struct inode *inode, +int ocfs2_add_clusters_in_btree(handle_t *handle, + struct ocfs2_extent_tree *et, u32 *logical_offset, u32 clusters_to_add, int mark_unwritten, - struct ocfs2_extent_tree *et, - handle_t *handle, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret) @@ -4752,13 +4831,15 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, u32 bit_off, num_bits; u64 block; u8 flags = 0; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); BUG_ON(!clusters_to_add); if (mark_unwritten) flags = OCFS2_EXT_UNWRITTEN; - free_extents = ocfs2_num_free_extents(osb, inode, et); + free_extents = ocfs2_num_free_extents(osb, et); if (free_extents < 0) { status = free_extents; mlog_errno(status); @@ -4795,7 +4876,7 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, BUG_ON(num_bits > clusters_to_add); /* reserve our write early -- insert_extent may update the tree root */ - status = ocfs2_et_root_journal_access(handle, inode, et, + status = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -4803,10 +4884,10 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, } block = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "Allocating %u clusters at block %u for inode %llu\n", - num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_insert_extent(osb, handle, inode, et, - *logical_offset, block, + mlog(0, "Allocating %u clusters at block %u for owner %llu\n", + num_bits, bit_off, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); + status = ocfs2_insert_extent(handle, et, *logical_offset, block, num_bits, flags, meta_ac); if (status < 0) { mlog_errno(status); @@ -4856,10 +4937,9 @@ static void ocfs2_make_right_split_rec(struct super_block *sb, split_rec->e_flags = rec->e_flags; } -static int ocfs2_split_and_insert(struct inode *inode, - handle_t *handle, - struct ocfs2_path *path, +static int ocfs2_split_and_insert(handle_t *handle, struct ocfs2_extent_tree *et, + struct ocfs2_path *path, struct buffer_head **last_eb_bh, int split_index, struct ocfs2_extent_rec *orig_split_rec, @@ -4892,7 +4972,7 @@ leftright: if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, et, + ret = ocfs2_grow_tree(handle, et, &depth, last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); @@ -4921,8 +5001,8 @@ leftright: */ insert.ins_split = SPLIT_RIGHT; - ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range, - &rec); + ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), + &tmprec, insert_range, &rec); split_rec = tmprec; @@ -4930,7 +5010,7 @@ leftright: do_leftright = 1; } - ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); + ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); if (ret) { mlog_errno(ret); goto out; @@ -4946,7 +5026,7 @@ leftright: ocfs2_reinit_path(path, 1); cpos = le32_to_cpu(split_rec.e_cpos); - ret = ocfs2_find_path(inode, path, cpos); + ret = ocfs2_find_path(et->et_ci, path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -4961,8 +5041,8 @@ out: return ret; } -static int ocfs2_replace_extent_rec(struct inode *inode, - handle_t *handle, +static int ocfs2_replace_extent_rec(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct ocfs2_extent_list *el, int split_index, @@ -4970,7 +5050,7 @@ static int ocfs2_replace_extent_rec(struct inode *inode, { int ret; - ret = ocfs2_path_bh_journal_access(handle, inode, path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, path_num_items(path) - 1); if (ret) { mlog_errno(ret); @@ -4985,9 +5065,8 @@ out: } /* - * Mark part or all of the extent record at split_index in the leaf - * pointed to by path as written. This removes the unwritten - * extent flag. + * Split part or all of the extent record at split_index in the leaf + * pointed to by path. Merge with the contiguous extent record if needed. * * Care is taken to handle contiguousness so as to not grow the tree. * @@ -5004,14 +5083,13 @@ out: * have been brought into cache (and pinned via the journal), so the * extra overhead is not expressed in terms of disk reads. */ -static int __ocfs2_mark_extent_written(struct inode *inode, - struct ocfs2_extent_tree *et, - handle_t *handle, - struct ocfs2_path *path, - int split_index, - struct ocfs2_extent_rec *split_rec, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) +int ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret = 0; struct ocfs2_extent_list *el = path_leaf_el(path); @@ -5020,12 +5098,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode, struct ocfs2_merge_ctxt ctxt; struct ocfs2_extent_list *rightmost_el; - if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) { - ret = -EIO; - mlog_errno(ret); - goto out; - } - if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { @@ -5034,19 +5106,19 @@ static int __ocfs2_mark_extent_written(struct inode *inode, goto out; } - ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el, + ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, split_index, split_rec); /* * The core merge / split code wants to know how much room is - * left in this inodes allocation tree, so we pass the + * left in this allocation tree, so we pass the * rightmost extent list. */ if (path->p_tree_depth) { struct ocfs2_extent_block *eb; - ret = ocfs2_read_extent_block(inode, + ret = ocfs2_read_extent_block(et->et_ci, ocfs2_et_get_last_eb_blk(et), &last_eb_bh); if (ret) { @@ -5073,19 +5145,18 @@ static int __ocfs2_mark_extent_written(struct inode *inode, if (ctxt.c_contig_type == CONTIG_NONE) { if (ctxt.c_split_covers_rec) - ret = ocfs2_replace_extent_rec(inode, handle, - path, el, + ret = ocfs2_replace_extent_rec(handle, et, path, el, split_index, split_rec); else - ret = ocfs2_split_and_insert(inode, handle, path, et, + ret = ocfs2_split_and_insert(handle, et, path, &last_eb_bh, split_index, split_rec, meta_ac); if (ret) mlog_errno(ret); } else { - ret = ocfs2_try_to_merge_extent(inode, handle, path, + ret = ocfs2_try_to_merge_extent(handle, et, path, split_index, split_rec, - dealloc, &ctxt, et); + dealloc, &ctxt); if (ret) mlog_errno(ret); } @@ -5096,46 +5167,31 @@ out: } /* - * Mark the already-existing extent at cpos as written for len clusters. + * Change the flags of the already-existing extent at cpos for len clusters. + * + * new_flags: the flags we want to set. + * clear_flags: the flags we want to clear. + * phys: the new physical offset we want this new extent starts from. * * If the existing extent is larger than the request, initiate a * split. An attempt will be made at merging with adjacent extents. * * The caller is responsible for passing down meta_ac if we'll need it. */ -int ocfs2_mark_extent_written(struct inode *inode, - struct ocfs2_extent_tree *et, - handle_t *handle, u32 cpos, u32 len, u32 phys, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags) { int ret, index; - u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys); + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys); struct ocfs2_extent_rec split_rec; struct ocfs2_path *left_path = NULL; struct ocfs2_extent_list *el; - - mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n", - inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno); - - if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " - "that are being written to, but the feature bit " - "is not set in the super block.", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - ret = -EROFS; - goto out; - } - - /* - * XXX: This should be fixed up so that we just re-insert the - * next extent records. - * - * XXX: This is a hack on the extent tree, maybe it should be - * an op? - */ - if (et->et_ops == &ocfs2_dinode_et_ops) - ocfs2_extent_map_trunc(inode, 0); + struct ocfs2_extent_rec *rec; left_path = ocfs2_new_path_from_et(et); if (!left_path) { @@ -5144,7 +5200,7 @@ int ocfs2_mark_extent_written(struct inode *inode, goto out; } - ret = ocfs2_find_path(inode, left_path, cpos); + ret = ocfs2_find_path(et->et_ci, left_path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -5153,34 +5209,102 @@ int ocfs2_mark_extent_written(struct inode *inode, index = ocfs2_search_extent_list(el, cpos); if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " + ocfs2_error(sb, + "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; goto out; } + ret = -EIO; + rec = &el->l_recs[index]; + if (new_flags && (rec->e_flags & new_flags)) { + mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " + "extent that already had them", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + new_flags); + goto out; + } + + if (clear_flags && !(rec->e_flags & clear_flags)) { + mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " + "extent that didn't have them", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + clear_flags); + goto out; + } + memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); split_rec.e_cpos = cpu_to_le32(cpos); split_rec.e_leaf_clusters = cpu_to_le16(len); split_rec.e_blkno = cpu_to_le64(start_blkno); - split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; - split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; - - ret = __ocfs2_mark_extent_written(inode, et, handle, left_path, - index, &split_rec, meta_ac, - dealloc); + split_rec.e_flags = rec->e_flags; + if (new_flags) + split_rec.e_flags |= new_flags; + if (clear_flags) + split_rec.e_flags &= ~clear_flags; + + ret = ocfs2_split_extent(handle, et, left_path, + index, &split_rec, meta_ac, + dealloc); if (ret) mlog_errno(ret); out: ocfs2_free_path(left_path); return ret; + } -static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, - handle_t *handle, struct ocfs2_path *path, +/* + * Mark the already-existing extent at cpos as written for len clusters. + * This removes the unwritten extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. + */ +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n", + inode->i_ino, cpos, len, phys); + + if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " + "that are being written to, but the feature bit " + "is not set in the super block.", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + ret = -EROFS; + goto out; + } + + /* + * XXX: This should be fixed up so that we just re-insert the + * next extent records. + */ + ocfs2_et_extent_map_truncate(et, 0); + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + 0, OCFS2_EXT_UNWRITTEN); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, + struct ocfs2_path *path, int index, u32 new_range, struct ocfs2_alloc_context *meta_ac) { @@ -5197,11 +5321,12 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, */ el = path_leaf_el(path); rec = &el->l_recs[index]; - ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec); + ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), + &split_rec, new_range, rec); depth = path->p_tree_depth; if (depth > 0) { - ret = ocfs2_read_extent_block(inode, + ret = ocfs2_read_extent_block(et->et_ci, ocfs2_et_get_last_eb_blk(et), &last_eb_bh); if (ret < 0) { @@ -5224,7 +5349,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh, + ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); @@ -5238,7 +5363,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, insert.ins_split = SPLIT_RIGHT; insert.ins_tree_depth = depth; - ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); + ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); if (ret) mlog_errno(ret); @@ -5247,23 +5372,23 @@ out: return ret; } -static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, +static int ocfs2_truncate_rec(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, int index, struct ocfs2_cached_dealloc_ctxt *dealloc, - u32 cpos, u32 len, - struct ocfs2_extent_tree *et) + u32 cpos, u32 len) { int ret; u32 left_cpos, rec_range, trunc_range; int wants_rotate = 0, is_rightmost_tree_rec = 0; - struct super_block *sb = inode->i_sb; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); struct ocfs2_path *left_path = NULL; struct ocfs2_extent_list *el = path_leaf_el(path); struct ocfs2_extent_rec *rec; struct ocfs2_extent_block *eb; if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); goto out; @@ -5295,14 +5420,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, * by this leaf and the one to it's left. * * There are two cases we can skip: - * 1) Path is the leftmost one in our inode tree. + * 1) Path is the leftmost one in our btree. * 2) The leaf is rightmost and will be empty after * we remove the extent record - the rotate code * knows how to update the newly formed edge. */ - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, - &left_cpos); + ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); if (ret) { mlog_errno(ret); goto out; @@ -5316,7 +5440,8 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_find_path(inode, left_path, left_cpos); + ret = ocfs2_find_path(et->et_ci, left_path, + left_cpos); if (ret) { mlog_errno(ret); goto out; @@ -5332,13 +5457,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_journal_access_path(inode, handle, path); + ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access_path(inode, handle, left_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); if (ret) { mlog_errno(ret); goto out; @@ -5361,7 +5486,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, * be deleted by the rotate code. */ rec = &el->l_recs[next_free - 1]; - ocfs2_adjust_rightmost_records(inode, handle, path, + ocfs2_adjust_rightmost_records(handle, et, path, rec); } } else if (le32_to_cpu(rec->e_cpos) == cpos) { @@ -5373,11 +5498,12 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, /* Remove rightmost portion of the record */ le16_add_cpu(&rec->e_leaf_clusters, -len); if (is_rightmost_tree_rec) - ocfs2_adjust_rightmost_records(inode, handle, path, rec); + ocfs2_adjust_rightmost_records(handle, et, path, rec); } else { /* Caller should have trapped this. */ - mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) " - "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, + mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) " + "(%u, %u)\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), le32_to_cpu(rec->e_cpos), le16_to_cpu(rec->e_leaf_clusters), cpos, len); BUG(); @@ -5386,14 +5512,14 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, if (left_path) { int subtree_index; - subtree_index = ocfs2_find_subtree_root(inode, left_path, path); - ocfs2_complete_edge_insert(inode, handle, left_path, path, + subtree_index = ocfs2_find_subtree_root(et, left_path, path); + ocfs2_complete_edge_insert(handle, left_path, path, subtree_index); } ocfs2_journal_dirty(handle, path_leaf_bh(path)); - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); goto out; @@ -5404,9 +5530,9 @@ out: return ret; } -int ocfs2_remove_extent(struct inode *inode, +int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et, - u32 cpos, u32 len, handle_t *handle, + u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { @@ -5416,7 +5542,11 @@ int ocfs2_remove_extent(struct inode *inode, struct ocfs2_extent_list *el; struct ocfs2_path *path = NULL; - ocfs2_extent_map_trunc(inode, 0); + /* + * XXX: Why are we truncating to 0 instead of wherever this + * affects us? + */ + ocfs2_et_extent_map_truncate(et, 0); path = ocfs2_new_path_from_et(et); if (!path) { @@ -5425,7 +5555,7 @@ int ocfs2_remove_extent(struct inode *inode, goto out; } - ret = ocfs2_find_path(inode, path, cpos); + ret = ocfs2_find_path(et->et_ci, path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -5434,10 +5564,11 @@ int ocfs2_remove_extent(struct inode *inode, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5464,20 +5595,21 @@ int ocfs2_remove_extent(struct inode *inode, BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); - mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d " + mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d " "(cpos %u, len %u)\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos, len, index, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { - ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, - cpos, len, et); + ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, + cpos, len); if (ret) { mlog_errno(ret); goto out; } } else { - ret = ocfs2_split_tree(inode, et, handle, path, index, + ret = ocfs2_split_tree(handle, et, path, index, trunc_range, meta_ac); if (ret) { mlog_errno(ret); @@ -5490,7 +5622,7 @@ int ocfs2_remove_extent(struct inode *inode, */ ocfs2_reinit_path(path, 1); - ret = ocfs2_find_path(inode, path, cpos); + ret = ocfs2_find_path(et->et_ci, path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -5499,9 +5631,9 @@ int ocfs2_remove_extent(struct inode *inode, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu: split at cpos %u lost record.", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu: split at cpos %u lost record.", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; goto out; @@ -5515,18 +5647,18 @@ int ocfs2_remove_extent(struct inode *inode, rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); if (rec_range != trunc_range) { - ocfs2_error(inode->i_sb, - "Inode %llu: error after split at cpos %u" + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu: error after split at cpos %u" "trunc len %u, existing record is (%u,%u)", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos, len, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; goto out; } - ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, - cpos, len, et); + ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, + cpos, len); if (ret) { mlog_errno(ret); goto out; @@ -5573,7 +5705,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto out; } - ret = ocfs2_et_root_journal_access(handle, inode, et, + ret = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -5583,14 +5715,13 @@ int ocfs2_remove_btree_range(struct inode *inode, vfs_dq_free_space_nodirty(inode, ocfs2_clusters_to_bytes(inode->i_sb, len)); - ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, - dealloc); + ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); if (ret) { mlog_errno(ret); goto out_commit; } - ocfs2_et_update_clusters(inode, et, -len); + ocfs2_et_update_clusters(et, -len); ret = ocfs2_journal_dirty(handle, et->et_root_bh); if (ret) { @@ -5690,7 +5821,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, goto bail; } - status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -5752,7 +5883,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, while (i >= 0) { /* Caller has given us at least enough credits to * update the truncate log dinode */ - status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -6010,7 +6141,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, tl->tl_used = 0; ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check); - status = ocfs2_write_block(osb, tl_bh, tl_inode); + status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode)); if (status < 0) { mlog_errno(status); goto bail; @@ -6400,9 +6531,9 @@ ocfs2_find_per_slot_free_list(int type, return fl; } -static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, - int type, int slot, u64 blkno, - unsigned int bit) +int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + int type, int slot, u64 blkno, + unsigned int bit) { int ret; struct ocfs2_per_slot_free_list *fl; @@ -6518,7 +6649,7 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode, goto out; } - ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); + ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh); if (ret) { mlog_errno(ret); goto out; @@ -6551,7 +6682,7 @@ out: */ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, handle_t *handle, struct ocfs2_truncate_context *tc, - u32 clusters_to_del, u64 *delete_start) + u32 clusters_to_del, u64 *delete_start, u8 *flags) { int ret, i, index = path->p_tree_depth; u32 new_edge = 0; @@ -6561,6 +6692,7 @@ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, struct ocfs2_extent_rec *rec; *delete_start = 0; + *flags = 0; while (index >= 0) { bh = path->p_node[index].bh; @@ -6648,6 +6780,7 @@ find_tail_record: *delete_start = le64_to_cpu(rec->e_blkno) + ocfs2_clusters_to_blocks(inode->i_sb, le16_to_cpu(rec->e_leaf_clusters)); + *flags = rec->e_flags; /* * If it's now empty, remove this record. @@ -6719,7 +6852,7 @@ delete: mlog(0, "deleting this extent block.\n"); - ocfs2_remove_from_cache(inode, bh); + ocfs2_remove_from_cache(INODE_CACHE(inode), bh); BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); @@ -6747,7 +6880,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, struct buffer_head *fe_bh, handle_t *handle, struct ocfs2_truncate_context *tc, - struct ocfs2_path *path) + struct ocfs2_path *path, + struct ocfs2_alloc_context *meta_ac) { int status; struct ocfs2_dinode *fe; @@ -6755,6 +6889,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, struct ocfs2_extent_list *el; struct buffer_head *last_eb_bh = NULL; u64 delete_blk = 0; + u8 rec_flags; fe = (struct ocfs2_dinode *) fe_bh->b_data; @@ -6769,14 +6904,14 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, * Each component will be touched, so we might as well journal * here to avoid having to handle errors later. */ - status = ocfs2_journal_access_path(inode, handle, path); + status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path); if (status < 0) { mlog_errno(status); goto bail; } if (last_eb_bh) { - status = ocfs2_journal_access_eb(handle, inode, last_eb_bh, + status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -6810,7 +6945,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, inode->i_blocks = ocfs2_inode_sector_count(inode); status = ocfs2_trim_tree(inode, path, handle, tc, - clusters_to_del, &delete_blk); + clusters_to_del, &delete_blk, &rec_flags); if (status) { mlog_errno(status); goto bail; @@ -6842,8 +6977,16 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } if (delete_blk) { - status = ocfs2_truncate_log_append(osb, handle, delete_blk, - clusters_to_del); + if (rec_flags & OCFS2_EXT_REFCOUNTED) + status = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + delete_blk), + clusters_to_del, meta_ac, + &tc->tc_dealloc, 1); + else + status = ocfs2_truncate_log_append(osb, handle, + delete_blk, + clusters_to_del); if (status < 0) { mlog_errno(status); goto bail; @@ -6863,9 +7006,9 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) return 0; } -static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, - unsigned int from, unsigned int to, - struct page *page, int zero, u64 *phys) +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys) { int ret, partial = 0; @@ -6933,20 +7076,16 @@ out: ocfs2_unlock_and_free_pages(pages, numpages); } -static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num) +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) { int numpages, ret = 0; - struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; unsigned long index; loff_t last_page_bytes; BUG_ON(start > end); - BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != - (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); - numpages = 0; last_page_bytes = PAGE_ALIGN(end); index = start >> PAGE_CACHE_SHIFT; @@ -6974,6 +7113,17 @@ out: return ret; } +static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) +{ + struct super_block *sb = inode->i_sb; + + BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != + (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); + + return ocfs2_grab_pages(inode, start, end, pages, num); +} + /* * Zero the area past i_size but still within an allocated * cluster. This avoids exposing nonzero data on subsequent file @@ -7138,7 +7288,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, goto out_unlock; } - ret = ocfs2_journal_access_di(handle, inode, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -7218,9 +7368,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * this proves to be false, we could always re-build * the in-inode data from our pages. */ - ocfs2_init_dinode_extent_tree(&et, inode, di_bh); - ret = ocfs2_insert_extent(osb, handle, inode, &et, - 0, block, 1, 0, NULL); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL); if (ret) { mlog_errno(ret); goto out_commit; @@ -7262,11 +7411,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, { int status, i, credits, tl_sem = 0; u32 clusters_to_del, new_highest_cpos, range; + u64 blkno = 0; struct ocfs2_extent_list *el; handle_t *handle = NULL; struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_path *path = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_refcount_tree *ref_tree = NULL; mlog_entry_void(); @@ -7292,10 +7444,12 @@ start: goto bail; } + credits = 0; + /* * Truncate always works against the rightmost tree branch. */ - status = ocfs2_find_path(inode, path, UINT_MAX); + status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX); if (status) { mlog_errno(status); goto bail; @@ -7332,10 +7486,15 @@ start: clusters_to_del = 0; } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); + blkno = le64_to_cpu(el->l_recs[i].e_blkno); } else if (range > new_highest_cpos) { clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + le32_to_cpu(el->l_recs[i].e_cpos)) - new_highest_cpos; + blkno = le64_to_cpu(el->l_recs[i].e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, + ocfs2_rec_clusters(el, &el->l_recs[i]) - + clusters_to_del); } else { status = 0; goto bail; @@ -7344,6 +7503,29 @@ start: mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); + if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) { + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + status = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, NULL); + if (status) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh, + blkno, + clusters_to_del, + &credits, + &meta_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + mutex_lock(&tl_inode->i_mutex); tl_sem = 1; /* ocfs2_truncate_log_needs_flush guarantees us at least one @@ -7357,7 +7539,7 @@ start: } } - credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, + credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, (struct ocfs2_dinode *)fe_bh->b_data, el); handle = ocfs2_start_trans(osb, credits); @@ -7369,7 +7551,7 @@ start: } status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, - tc, path); + tc, path, meta_ac); if (status < 0) { mlog_errno(status); goto bail; @@ -7383,6 +7565,16 @@ start: ocfs2_reinit_path(path, 1); + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + + if (ref_tree) { + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + ref_tree = NULL; + } + /* * The check above will catch the case where we've truncated * away all allocation. @@ -7399,6 +7591,12 @@ bail: if (handle) ocfs2_commit_trans(osb, handle); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + ocfs2_run_deallocs(osb, &tc->tc_dealloc); ocfs2_free_path(path); @@ -7445,7 +7643,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_read_extent_block(inode, + status = ocfs2_read_extent_block(INODE_CACHE(inode), le64_to_cpu(fe->i_last_eb_blk), &last_eb_bh); if (status < 0) { @@ -7507,7 +7705,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, goto out; } - ret = ocfs2_journal_access_di(handle, inode, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 353254b..9c122d57 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -45,7 +45,8 @@ * * ocfs2_extent_tree contains info for the root of the b-tree, it must have a * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree - * functions. With metadata ecc, we now call different journal_access + * functions. It needs the ocfs2_caching_info structure associated with + * I/O on the tree. With metadata ecc, we now call different journal_access * functions for each type of metadata, so it must have the * root_journal_access function. * ocfs2_extent_tree_operations abstract the normal operations we do for @@ -56,6 +57,7 @@ struct ocfs2_extent_tree { struct ocfs2_extent_tree_operations *et_ops; struct buffer_head *et_root_bh; struct ocfs2_extent_list *et_root_el; + struct ocfs2_caching_info *et_ci; ocfs2_journal_access_func et_root_journal_access; void *et_object; unsigned int et_max_leaf_clusters; @@ -66,31 +68,32 @@ struct ocfs2_extent_tree { * specified object buffer. */ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh); void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh); struct ocfs2_xattr_value_buf; void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct ocfs2_xattr_value_buf *vb); void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh); +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); /* * Read an extent block into *bh. If *bh is NULL, a bh will be * allocated. This is a cached read. The extent block will be validated * with ocfs2_validate_extent_block(). */ -int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, +int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, struct buffer_head **bh); struct ocfs2_alloc_context; -int ocfs2_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, +int ocfs2_insert_extent(handle_t *handle, struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, @@ -103,25 +106,36 @@ enum ocfs2_alloc_restarted { RESTART_TRANS, RESTART_META }; -int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, - struct inode *inode, +int ocfs2_add_clusters_in_btree(handle_t *handle, + struct ocfs2_extent_tree *et, u32 *logical_offset, u32 clusters_to_add, int mark_unwritten, - struct ocfs2_extent_tree *et, - handle_t *handle, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret); struct ocfs2_cached_dealloc_ctxt; +struct ocfs2_path; +int ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_mark_extent_written(struct inode *inode, struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); -int ocfs2_remove_extent(struct inode *inode, - struct ocfs2_extent_tree *et, - u32 cpos, u32 len, handle_t *handle, +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags); +int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et, + u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_remove_btree_range(struct inode *inode, @@ -130,7 +144,6 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct inode *inode, struct ocfs2_extent_tree *et); /* @@ -195,6 +208,9 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) } int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, u64 blkno, unsigned int bit); +int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + int type, int slot, u64 blkno, + unsigned int bit); static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) { return c->c_global_allocator != NULL; @@ -222,8 +238,9 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, unsigned int start, unsigned int end, int trunc); -int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, - u32 cpos, struct buffer_head **leaf_bh); +int ocfs2_find_leaf(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh); int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); /* @@ -254,4 +271,50 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) return !rec->e_leaf_clusters; } +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num); +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys); +/* + * Structures which describe a path through a btree, and functions to + * manipulate them. + * + * The idea here is to be as generic as possible with the tree + * manipulation code. + */ +struct ocfs2_path_item { + struct buffer_head *bh; + struct ocfs2_extent_list *el; +}; + +#define OCFS2_MAX_PATH_DEPTH 5 + +struct ocfs2_path { + int p_tree_depth; + ocfs2_journal_access_func p_root_access; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; +}; + +#define path_root_bh(_path) ((_path)->p_node[0].bh) +#define path_root_el(_path) ((_path)->p_node[0].el) +#define path_root_access(_path)((_path)->p_root_access) +#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) +#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) +#define path_num_items(_path) ((_path)->p_tree_depth + 1) + +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root); +void ocfs2_free_path(struct ocfs2_path *path); +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + u32 cpos); +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path); +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et); +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx); +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path); #endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 8a1e615..deb2b13 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -44,6 +44,7 @@ #include "suballoc.h" #include "super.h" #include "symlink.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -126,8 +127,8 @@ bail: return err; } -static int ocfs2_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { int err = 0; unsigned int ext_flags; @@ -590,6 +591,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, goto bail; } + /* We should already CoW the refcounted extent. */ + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); /* * get_more_blocks() expects us to describe a hole by clearing * the mapped bit on bh_result(). @@ -687,6 +690,10 @@ static ssize_t ocfs2_direct_IO(int rw, if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; + /* Fallback to buffered I/O if we are appending. */ + if (i_size_read(inode) <= offset) + return 0; + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, @@ -1259,7 +1266,8 @@ static int ocfs2_write_cluster(struct address_space *mapping, goto out; } } else if (unwritten) { - ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), + wc->w_di_bh); ret = ocfs2_mark_extent_written(inode, &et, wc->w_handle, cpos, 1, phys, meta_ac, &wc->w_dealloc); @@ -1448,6 +1456,9 @@ static int ocfs2_populate_write_desc(struct inode *inode, goto out; } + /* We should already CoW the refcountd extent. */ + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + /* * Assume worst case - that we're writing in * the middle of the extent. @@ -1528,7 +1539,7 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, goto out; } - ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { ocfs2_commit_trans(osb, handle); @@ -1699,6 +1710,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, goto out; } + ret = ocfs2_check_range_for_refcount(inode, pos, len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } else if (ret == 1) { + ret = ocfs2_refcount_cow(inode, di_bh, + wc->w_cpos, wc->w_clen, UINT_MAX); + if (ret) { + mlog_errno(ret); + goto out; + } + } + ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, &extents_to_split); if (ret) { @@ -1726,7 +1750,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), clusters_to_alloc, extents_to_split); - ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), + wc->w_di_bh); ret = ocfs2_lock_allocators(inode, &et, clusters_to_alloc, extents_to_split, &data_ac, &meta_ac); @@ -1773,7 +1798,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * We don't want this to fail in ocfs2_write_end(), so do it * here. */ - ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1997,4 +2022,5 @@ const struct address_space_operations ocfs2_aops = { .releasepage = ocfs2_releasepage, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 503e492..c48e93f 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -57,6 +57,8 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, struct buffer_head *di_bh); int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); +int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); /* all ocfs2_dio_end_io()'s fault */ #define ocfs2_iocb_is_rw_locked(iocb) \ test_bit(0, (unsigned long *)&iocb->private) diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 15c8e6d..d43d34a 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -52,12 +52,12 @@ enum ocfs2_state_bits { BUFFER_FNS(NeedsValidate, needs_validate); int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, - struct inode *inode) + struct ocfs2_caching_info *ci) { int ret = 0; - mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n", - (unsigned long long)bh->b_blocknr, inode); + mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n", + (unsigned long long)bh->b_blocknr, ci); BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); BUG_ON(buffer_jbd(bh)); @@ -70,7 +70,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, goto out; } - mutex_lock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_lock(ci); lock_buffer(bh); set_buffer_uptodate(bh); @@ -85,7 +85,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, wait_on_buffer(bh); if (buffer_uptodate(bh)) { - ocfs2_set_buffer_uptodate(inode, bh); + ocfs2_set_buffer_uptodate(ci, bh); } else { /* We don't need to remove the clustered uptodate * information for this bh as it's not marked locally @@ -94,7 +94,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, put_bh(bh); } - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_unlock(ci); out: mlog_exit(ret); return ret; @@ -177,7 +177,7 @@ bail: return status; } -int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, +int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, struct buffer_head *bh)) @@ -185,11 +185,12 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, int status = 0; int i, ignore_cache = 0; struct buffer_head *bh; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); - mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n", - inode, (unsigned long long)block, nr, flags); + mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n", + ci, (unsigned long long)block, nr, flags); - BUG_ON(!inode); + BUG_ON(!ci); BUG_ON((flags & OCFS2_BH_READAHEAD) && (flags & OCFS2_BH_IGNORE_CACHE)); @@ -212,12 +213,12 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, goto bail; } - mutex_lock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_lock(ci); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { - bhs[i] = sb_getblk(inode->i_sb, block++); + bhs[i] = sb_getblk(sb, block++); if (bhs[i] == NULL) { - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_unlock(ci); status = -EIO; mlog_errno(status); goto bail; @@ -250,11 +251,11 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, * before our is-it-in-flight check. */ - if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) { + if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) { mlog(ML_UPTODATE, - "bh (%llu), inode %llu not uptodate\n", + "bh (%llu), owner %llu not uptodate\n", (unsigned long long)bh->b_blocknr, - (unsigned long long)OCFS2_I(inode)->ip_blkno); + (unsigned long long)ocfs2_metadata_cache_owner(ci)); /* We're using ignore_cache here to say * "go to disk" */ ignore_cache = 1; @@ -283,7 +284,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, * previously submitted request than we are * done here. */ if ((flags & OCFS2_BH_READAHEAD) - && ocfs2_buffer_read_ahead(inode, bh)) + && ocfs2_buffer_read_ahead(ci, bh)) continue; lock_buffer(bh); @@ -305,7 +306,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, * buffer lock. */ if (!(flags & OCFS2_BH_IGNORE_CACHE) && !(flags & OCFS2_BH_READAHEAD) - && ocfs2_buffer_uptodate(inode, bh)) { + && ocfs2_buffer_uptodate(ci, bh)) { unlock_buffer(bh); continue; } @@ -327,7 +328,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, if (!(flags & OCFS2_BH_READAHEAD)) { /* We know this can't have changed as we hold the - * inode sem. Avoid doing any work on the bh if the + * owner sem. Avoid doing any work on the bh if the * journal has it. */ if (!buffer_jbd(bh)) wait_on_buffer(bh); @@ -351,7 +352,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, * that better not have changed */ BUG_ON(buffer_jbd(bh)); clear_buffer_needs_validate(bh); - status = validate(inode->i_sb, bh); + status = validate(sb, bh); if (status) { put_bh(bh); bhs[i] = NULL; @@ -363,9 +364,9 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, /* Always set the buffer in the cache, even if it was * a forced read, or read-ahead which hasn't yet * completed. */ - ocfs2_set_buffer_uptodate(inode, bh); + ocfs2_set_buffer_uptodate(ci, bh); } - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_unlock(ci); mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", (unsigned long long)block, nr, @@ -399,7 +400,7 @@ static void ocfs2_check_super_or_backup(struct super_block *sb, /* * Write super block and backups doesn't need to collaborate with journal, - * so we don't need to lock ip_io_mutex and inode doesn't need to bea passed + * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed * into this function. */ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index c75d682..b97bcc6 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -33,7 +33,7 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh, int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, - struct inode *inode); + struct ocfs2_caching_info *ci); int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, unsigned int nr, struct buffer_head *bhs[]); @@ -44,7 +44,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, * be set even for a READAHEAD call, as it marks the buffer for later * validation. */ -int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, +int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, struct buffer_head *bh)); @@ -55,7 +55,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, #define OCFS2_BH_IGNORE_CACHE 1 #define OCFS2_BH_READAHEAD 8 -static inline int ocfs2_read_block(struct inode *inode, u64 off, +static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off, struct buffer_head **bh, int (*validate)(struct super_block *sb, struct buffer_head *bh)) @@ -68,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off, goto bail; } - status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate); + status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate); bail: return status; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 09cc25d..c452d11 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf, } #endif /* CONFIG_DEBUG_FS */ -static struct file_operations o2hb_debug_fops = { +static const struct file_operations o2hb_debug_fops = { .open = o2hb_debug_open, .release = o2hb_debug_release, .read = o2hb_debug_read, diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 96df541..1cd2934 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -111,6 +111,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(EXPORT), define_mask(XATTR), define_mask(QUOTA), + define_mask(REFCOUNT), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 696c32e..9b4d117 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -113,6 +113,7 @@ #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ #define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ #define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ +#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ /* bits that are infrequently given and frequently matched in the high word */ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index f842487..da794bc 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -163,7 +163,7 @@ static void nst_seq_stop(struct seq_file *seq, void *v) { } -static struct seq_operations nst_seq_ops = { +static const struct seq_operations nst_seq_ops = { .start = nst_seq_start, .next = nst_seq_next, .stop = nst_seq_stop, @@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static struct file_operations nst_seq_fops = { +static const struct file_operations nst_seq_fops = { .open = nst_fop_open, .read = seq_read, .llseek = seq_lseek, @@ -344,7 +344,7 @@ static void sc_seq_stop(struct seq_file *seq, void *v) { } -static struct seq_operations sc_seq_ops = { +static const struct seq_operations sc_seq_ops = { .start = sc_seq_start, .next = sc_seq_next, .stop = sc_seq_stop, @@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static struct file_operations sc_seq_fops = { +static const struct file_operations sc_seq_fops = { .open = sc_fop_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index b358f3b..28c3ec2 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -176,7 +176,7 @@ static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle, struct ocfs2_dx_root_block *dx_root; struct ocfs2_dir_block_trailer *trailer; - ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -564,7 +564,8 @@ static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys, int ret; struct buffer_head *tmp = *bh; - ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block); + ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp, + ocfs2_validate_dir_block); if (ret) { mlog_errno(ret); goto out; @@ -622,7 +623,8 @@ static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, u64 blkno = le64_to_cpu(di->i_dx_root); struct buffer_head *tmp = *dx_root_bh; - ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root); + ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, + ocfs2_validate_dx_root); /* If ocfs2_read_block() got us a new bh, pass it up. */ if (!ret && !*dx_root_bh) @@ -662,7 +664,8 @@ static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, int ret; struct buffer_head *tmp = *dx_leaf_bh; - ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf); + ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, + ocfs2_validate_dx_leaf); /* If ocfs2_read_block() got us a new bh, pass it up. */ if (!ret && !*dx_leaf_bh) @@ -680,7 +683,7 @@ static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num, { int ret; - ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0, + ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0, ocfs2_validate_dx_leaf); if (ret) mlog_errno(ret); @@ -802,7 +805,8 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, struct ocfs2_extent_rec *rec = NULL; if (el->l_tree_depth) { - ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh); + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash, + &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -1133,7 +1137,8 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle, if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) access = ocfs2_journal_access_di; - ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE); + ret = access(handle, INODE_CACHE(dir), de_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -1176,7 +1181,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, goto bail; } if (de == de_del) { - status = access(handle, dir, bh, + status = access(handle, INODE_CACHE(dir), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { status = -EIO; @@ -1326,7 +1331,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, * the entry count needs to be updated. Also, we might be * adding to the start of the free list. */ - ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1334,7 +1339,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, } if (!ocfs2_dx_root_inline(dx_root)) { - ret = ocfs2_journal_access_dl(handle, dir, + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), lookup->dl_dx_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -1493,7 +1498,7 @@ static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle, int ret; struct ocfs2_dx_leaf *dx_leaf; - ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1523,7 +1528,7 @@ static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle, struct ocfs2_dx_root_block *dx_root; struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; - ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1645,11 +1650,13 @@ int __ocfs2_add_entry(handle_t *handle, */ if (ocfs2_free_list_at_root(lookup)) { bh = lookup->dl_dx_root_bh; - retval = ocfs2_journal_access_dr(handle, dir, bh, + retval = ocfs2_journal_access_dr(handle, + INODE_CACHE(dir), bh, OCFS2_JOURNAL_ACCESS_WRITE); } else { bh = lookup->dl_prev_leaf_bh; - retval = ocfs2_journal_access_db(handle, dir, bh, + retval = ocfs2_journal_access_db(handle, + INODE_CACHE(dir), bh, OCFS2_JOURNAL_ACCESS_WRITE); } if (retval) { @@ -1700,11 +1707,13 @@ int __ocfs2_add_entry(handle_t *handle, } if (insert_bh == parent_fe_bh) - status = ocfs2_journal_access_di(handle, dir, + status = ocfs2_journal_access_di(handle, + INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); else { - status = ocfs2_journal_access_db(handle, dir, + status = ocfs2_journal_access_db(handle, + INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -2280,7 +2289,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, struct ocfs2_inline_data *data = &di->id2.i_data; unsigned int size = le16_to_cpu(data->id_count); - ret = ocfs2_journal_access_di(handle, inode, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -2332,9 +2341,9 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, goto bail; } - ocfs2_set_new_buffer_uptodate(inode, new_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); - status = ocfs2_journal_access_db(handle, inode, new_bh, + status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -2418,9 +2427,9 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, ret = -EIO; goto out; } - ocfs2_set_new_buffer_uptodate(dir, dx_root_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh); - ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret < 0) { mlog_errno(ret); @@ -2454,7 +2463,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, if (ret) mlog_errno(ret); - ret = ocfs2_journal_access_di(handle, dir, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); @@ -2495,9 +2504,9 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb, } dx_leaves[i] = bh; - ocfs2_set_new_buffer_uptodate(dir, bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh); - ret = ocfs2_journal_access_dl(handle, dir, bh, + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret < 0) { mlog_errno(ret); @@ -2582,7 +2591,6 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir, { int ret; u64 phys_blkno; - struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves, num_dx_leaves, &phys_blkno); @@ -2591,7 +2599,7 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir, goto out; } - ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0, + ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0, meta_ac); if (ret) mlog_errno(ret); @@ -2895,7 +2903,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, struct ocfs2_extent_tree dx_et; int did_quota = 0, bytes_allocated = 0; - ocfs2_init_dinode_extent_tree(&et, dir, di_bh); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh); alloc = ocfs2_clusters_for_bytes(sb, bytes); dx_alloc = 0; @@ -3005,9 +3013,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out_commit; } - ocfs2_set_new_buffer_uptodate(dir, dirdata_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh); - ret = ocfs2_journal_access_db(handle, dir, dirdata_bh, + ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); @@ -3060,7 +3068,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * We let the later dirent insert modify c/mtime - to the user * the data hasn't changed. */ - ret = ocfs2_journal_access_di(handle, dir, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); @@ -3085,7 +3093,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * This should never fail as our extent list is empty and all * related blocks have been journaled already. */ - ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len, + ret = ocfs2_insert_extent(handle, &et, 0, blkno, len, 0, NULL); if (ret) { mlog_errno(ret); @@ -3117,8 +3125,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_dx_dir_index_root_block(dir, dx_root_bh, dirdata_bh); } else { - ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh); - ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0, + ocfs2_init_dx_root_extent_tree(&dx_et, + INODE_CACHE(dir), + dx_root_bh); + ret = ocfs2_insert_extent(handle, &dx_et, 0, dx_insert_blkno, 1, 0, NULL); if (ret) mlog_errno(ret); @@ -3138,7 +3148,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, } blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); - ret = ocfs2_insert_extent(osb, handle, dir, &et, 1, + ret = ocfs2_insert_extent(handle, &et, 1, blkno, len, 0, NULL); if (ret) { mlog_errno(ret); @@ -3337,8 +3347,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, spin_lock(&OCFS2_I(dir)->ip_lock); if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { spin_unlock(&OCFS2_I(dir)->ip_lock); - ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh); - num_free_extents = ocfs2_num_free_extents(osb, dir, &et); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), + parent_fe_bh); + num_free_extents = ocfs2_num_free_extents(osb, &et); if (num_free_extents < 0) { status = num_free_extents; mlog_errno(status); @@ -3387,9 +3398,9 @@ do_extend: goto bail; } - ocfs2_set_new_buffer_uptodate(dir, new_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh); - status = ocfs2_journal_access_db(handle, dir, new_bh, + status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -3829,7 +3840,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)leaf_blkno, insert_hash); - ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; /* @@ -3885,7 +3896,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, } did_quota = 1; - ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -3949,7 +3960,8 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, } for (i = 0; i < num_dx_leaves; i++) { - ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i], + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + orig_dx_leaves[i], OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -4165,7 +4177,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, * failure to add the dx_root_bh to the journal won't result * us losing clusters. */ - ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -4207,9 +4219,8 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, /* This should never fail considering we start with an empty * dx_root. */ - ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); - ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, - insert_blkno, 1, 0, NULL); + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); + ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL); if (ret) mlog_errno(ret); did_quota = 0; @@ -4469,7 +4480,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, goto out_unlock; } - ret = ocfs2_journal_access_di(handle, dir, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -4532,7 +4543,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) if (ocfs2_dx_root_inline(dx_root)) goto remove_index; - ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); /* XXX: What if dr_clusters is too large? */ while (le32_to_cpu(dx_root->dr_clusters)) { @@ -4565,7 +4576,7 @@ remove_index: goto out; } - ocfs2_remove_from_cache(dir, dx_root_bh); + ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh); out: ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &dealloc); diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 81eff8e..01cf8cc 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 75997b4..ca96bce 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index df52f70..42b0bad 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -27,7 +27,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/sysctl.h> #include <linux/spinlock.h> #include <linux/debugfs.h> @@ -479,7 +478,7 @@ bail: return -ENOMEM; } -static struct file_operations debug_purgelist_fops = { +static const struct file_operations debug_purgelist_fops = { .open = debug_purgelist_open, .release = debug_buffer_release, .read = debug_buffer_read, @@ -539,7 +538,7 @@ bail: return -ENOMEM; } -static struct file_operations debug_mle_fops = { +static const struct file_operations debug_mle_fops = { .open = debug_mle_open, .release = debug_buffer_release, .read = debug_buffer_read, @@ -683,7 +682,7 @@ static int lockres_seq_show(struct seq_file *s, void *v) return 0; } -static struct seq_operations debug_lockres_ops = { +static const struct seq_operations debug_lockres_ops = { .start = lockres_seq_start, .stop = lockres_seq_stop, .next = lockres_seq_next, @@ -742,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static struct file_operations debug_lockres_fops = { +static const struct file_operations debug_lockres_fops = { .open = debug_lockres_open, .release = debug_lockres_release, .read = seq_read, @@ -926,7 +925,7 @@ bail: return -ENOMEM; } -static struct file_operations debug_state_fops = { +static const struct file_operations debug_state_fops = { .open = debug_state_open, .release = debug_buffer_release, .read = debug_buffer_read, diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 4d9e6b2..0334000 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -28,7 +28,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/delay.h> diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 1c9efb4..02bf178 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -325,6 +325,7 @@ clear_fields: } static struct backing_dev_info dlmfs_backing_dev_info = { + .name = "ocfs2-dlmfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 83a9f29..437698e 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index f8b653f..83bcaf2 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 43e6e32..d9fa3d2 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index d490b66..52ec020 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> @@ -212,14 +211,18 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->spinlock); } + spin_lock(&res->spinlock); if (!list_empty(&res->purge)) { mlog(0, "removing lockres %.*s:%p from purgelist, " "master = %d\n", res->lockname.len, res->lockname.name, res, master); list_del_init(&res->purge); + spin_unlock(&res->spinlock); dlm_lockres_put(res); dlm->purge_count--; - } + } else + spin_unlock(&res->spinlock); + __dlm_unhash_lockres(res); /* lockres is not in the hash now. drop the flag and wake up diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 756f5b0..00f53b2 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 110bb57..0d38d67 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -53,6 +53,7 @@ #include "super.h" #include "uptodate.h" #include "quota.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -110,6 +111,11 @@ static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres); +static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) /* This aids in debugging situations where a bad LVB might be involved. */ @@ -278,6 +284,12 @@ static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, }; +static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = { + .check_downconvert = ocfs2_check_refcount_downconvert, + .downconvert_worker = ocfs2_refcount_convert_worker, + .flags = 0, +}; + static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) { return lockres->l_type == OCFS2_LOCK_TYPE_META || @@ -306,6 +318,12 @@ static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_re return (struct ocfs2_mem_dqinfo *)lockres->l_priv; } +static inline struct ocfs2_refcount_tree * +ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res) +{ + return container_of(res, struct ocfs2_refcount_tree, rf_lockres); +} + static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) { if (lockres->l_ops->get_osb) @@ -693,6 +711,17 @@ void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, info); } +void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_super *osb, u64 ref_blkno, + unsigned int generation) +{ + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno, + generation, lockres->l_name); + ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT, + &ocfs2_refcount_block_lops, osb); +} + void ocfs2_lock_res_free(struct ocfs2_lock_res *res) { mlog_entry_void(); @@ -1548,8 +1577,10 @@ int ocfs2_rw_lock(struct inode *inode, int write) (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); - if (ocfs2_mount_local(osb)) + if (ocfs2_mount_local(osb)) { + mlog_exit(0); return 0; + } lockres = &OCFS2_I(inode)->ip_rw_lockres; @@ -2127,7 +2158,7 @@ static int ocfs2_inode_lock_update(struct inode *inode, /* This will discard any caching information we might have had * for the inode metadata. */ - ocfs2_metadata_cache_purge(inode); + ocfs2_metadata_cache_purge(INODE_CACHE(inode)); ocfs2_extent_map_trunc(inode, 0); @@ -3009,6 +3040,7 @@ static void ocfs2_unlock_ast(void *opaque, int error) "unlock_action %d\n", error, lockres->l_name, lockres->l_unlock_action); spin_unlock_irqrestore(&lockres->l_lock, flags); + mlog_exit_void(); return; } @@ -3495,11 +3527,11 @@ out: return UNBLOCK_CONTINUE; } -static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, - int new_level) +static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci, + struct ocfs2_lock_res *lockres, + int new_level) { - struct inode *inode = ocfs2_lock_res_inode(lockres); - int checkpointed = ocfs2_inode_fully_checkpointed(inode); + int checkpointed = ocfs2_ci_fully_checkpointed(ci); BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR); BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed); @@ -3507,10 +3539,18 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, if (checkpointed) return 1; - ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb)); + ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci))); return 0; } +static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level); +} + static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres) { struct inode *inode = ocfs2_lock_res_inode(lockres); @@ -3640,6 +3680,26 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, return UNBLOCK_CONTINUE_POST; } +static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + struct ocfs2_refcount_tree *tree = + ocfs2_lock_res_refcount_tree(lockres); + + return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level); +} + +static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct ocfs2_refcount_tree *tree = + ocfs2_lock_res_refcount_tree(lockres); + + ocfs2_metadata_cache_purge(&tree->rf_ci); + + return UNBLOCK_CONTINUE; +} + static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) { struct ocfs2_qinfo_lvb *lvb; @@ -3752,6 +3812,37 @@ bail: return status; } +int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex) +{ + int status; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres; + struct ocfs2_super *osb = lockres->l_priv; + + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres; + struct ocfs2_super *osb = lockres->l_priv; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); +} + /* * This is the filesystem locking protocol. It provides the lock handling * hooks for the underlying DLM. It has a maximum version number. diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 7553836..d1ce48e 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -101,6 +101,9 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_mem_dqinfo; void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_mem_dqinfo *info); +void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_super *osb, u64 ref_blkno, + unsigned int generation); void ocfs2_lock_res_free(struct ocfs2_lock_res *res); int ocfs2_create_new_inode_locks(struct inode *inode); int ocfs2_drop_inode_locks(struct inode *inode); @@ -148,6 +151,9 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock); void ocfs2_file_unlock(struct file *file); int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex); void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex); +struct ocfs2_refcount_tree; +int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex); +void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index f2bb1a0..843db64 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); + ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -353,11 +353,11 @@ static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, * eb_bh is NULL. Otherwise, eb_bh should point to the extent block * containing el. */ -static int ocfs2_figure_hole_clusters(struct inode *inode, - struct ocfs2_extent_list *el, - struct buffer_head *eb_bh, - u32 v_cluster, - u32 *num_clusters) +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters) { int ret, i; struct buffer_head *next_eb_bh = NULL; @@ -375,7 +375,7 @@ static int ocfs2_figure_hole_clusters(struct inode *inode, if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) goto no_more_extents; - ret = ocfs2_read_extent_block(inode, + ret = ocfs2_read_extent_block(ci, le64_to_cpu(eb->h_next_leaf_blk), &next_eb_bh); if (ret) { @@ -428,7 +428,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, tree_height = le16_to_cpu(el->l_tree_depth); if (tree_height > 0) { - ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, + &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -455,7 +456,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, * field. */ if (hole_len) { - ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, + ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode), + el, eb_bh, v_cluster, &len); if (ret) { mlog_errno(ret); @@ -539,7 +541,8 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb, int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, - struct ocfs2_extent_list *el) + struct ocfs2_extent_list *el, + unsigned int *extent_flags) { int ret = 0, i; struct buffer_head *eb_bh = NULL; @@ -548,7 +551,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, u32 coff; if (el->l_tree_depth) { - ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, + &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -590,6 +594,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, *p_cluster = *p_cluster + coff; if (num_clusters) *num_clusters = ocfs2_rec_clusters(el, rec) - coff; + + if (extent_flags) + *extent_flags = rec->e_flags; } out: if (eb_bh) @@ -862,8 +869,8 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); } - rc = ocfs2_read_blocks(inode, p_block, count, bhs + done, - flags, validate); + rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count, + bhs + done, flags, validate); if (rc) { mlog_errno(rc); break; diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index b7dd973..e79d41c 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -55,12 +55,18 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, - struct ocfs2_extent_list *el); + struct ocfs2_extent_list *el, + unsigned int *extent_flags); int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, struct buffer_head *bh)); +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters); static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block, struct buffer_head **bh, int (*validate)(struct super_block *sb, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index aa501d3..89fc8ee 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -59,6 +59,7 @@ #include "xattr.h" #include "acl.h" #include "quota.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -259,7 +260,7 @@ int ocfs2_update_inode_atime(struct inode *inode, goto out; } - ret = ocfs2_journal_access_di(handle, inode, bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -334,6 +335,39 @@ out: return ret; } +static int ocfs2_cow_file_pos(struct inode *inode, + struct buffer_head *fe_bh, + u64 offset) +{ + int status; + u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + /* + * If the new offset is aligned to the range of the cluster, there is + * no space for ocfs2_zero_range_for_truncate to fill, so no need to + * CoW either. + */ + if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0) + return 0; + + status = ocfs2_get_clusters(inode, cpos, &phys, + &num_clusters, &ext_flags); + if (status) { + mlog_errno(status); + goto out; + } + + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); + +out: + return status; +} + static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh, @@ -346,6 +380,17 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, mlog_entry_void(); + /* + * We need to CoW the cluster contains the offset if it is reflinked + * since we will call ocfs2_zero_range_for_truncate later which will + * write "0" from offset to the end of the cluster. + */ + status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size); + if (status) { + mlog_errno(status); + return status; + } + /* TODO: This needs to actually orphan the inode in this * transaction. */ @@ -356,7 +401,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, goto out; } - status = ocfs2_journal_access_di(handle, inode, fe_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -486,6 +531,8 @@ bail_unlock_sem: up_write(&OCFS2_I(inode)->ip_alloc_sem); bail: + if (!status && OCFS2_I(inode)->ip_clusters == 0) + status = ocfs2_try_remove_refcount_tree(inode, di_bh); mlog_exit(status); return status; @@ -515,11 +562,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, int ret; struct ocfs2_extent_tree et; - ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); - ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, - clusters_to_add, mark_unwritten, - &et, handle, - data_ac, meta_ac, reason_ret); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); + ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, + clusters_to_add, mark_unwritten, + data_ac, meta_ac, reason_ret); return ret; } @@ -564,7 +610,7 @@ restart_all: (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), clusters_to_add); - ocfs2_init_dinode_extent_tree(&et, inode, bh); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, &data_ac, &meta_ac); if (status) { @@ -593,7 +639,7 @@ restarted_transaction: /* reserve a write to the file entry early on - that we if we * run out of credits in the allocation path, we can still * update i_size. */ - status = ocfs2_journal_access_di(handle, inode, bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1131,7 +1177,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode, goto out; } - ret = ocfs2_journal_access_di(handle, inode, bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); @@ -1395,7 +1441,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, struct address_space *mapping = inode->i_mapping; struct ocfs2_extent_tree et; - ocfs2_init_dinode_extent_tree(&et, inode, di_bh); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); ocfs2_init_dealloc_ctxt(&dealloc); if (byte_len == 0) @@ -1657,6 +1703,70 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, OCFS2_IOC_RESVSP64, &sr, change_size); } +int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, + size_t count) +{ + int ret = 0; + unsigned int extent_flags; + u32 cpos, clusters, extent_len, phys_cpos; + struct super_block *sb = inode->i_sb; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || + !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + return 0; + + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, + &extent_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) { + ret = 1; + break; + } + + if (extent_len > clusters) + extent_len = clusters; + + clusters -= extent_len; + cpos += extent_len; + } +out: + return ret; +} + +static int ocfs2_prepare_inode_for_refcount(struct inode *inode, + loff_t pos, size_t count, + int *meta_level) +{ + int ret; + struct buffer_head *di_bh = NULL; + u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 clusters = + ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + *meta_level = 1; + + ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); + if (ret) + mlog_errno(ret); +out: + brelse(di_bh); + return ret; +} + static int ocfs2_prepare_inode_for_write(struct dentry *dentry, loff_t *ppos, size_t count, @@ -1713,6 +1823,22 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, end = saved_pos + count; + ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); + if (ret == 1) { + ocfs2_inode_unlock(inode, meta_level); + meta_level = -1; + + ret = ocfs2_prepare_inode_for_refcount(inode, + saved_pos, + count, + &meta_level); + } + + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + /* * Skip the O_DIRECT checks if we don't need * them. @@ -1759,7 +1885,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, *ppos = saved_pos; out_unlock: - ocfs2_inode_unlock(inode, meta_level); + if (meta_level >= 0) + ocfs2_inode_unlock(inode, meta_level); out: return ret; @@ -1871,8 +1998,7 @@ relock: goto out_dio; } } else { - written = generic_file_aio_write_nolock(iocb, iov, nr_segs, - *ppos); + written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); } out_dio: @@ -1880,18 +2006,21 @@ out_dio: BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { - /* - * The generic write paths have handled getting data - * to disk, but since we don't make use of the dirty - * inode list, a manual journal commit is necessary - * here. - */ - if (old_size != i_size_read(inode) || - old_clusters != OCFS2_I(inode)->ip_clusters) { + ret = filemap_fdatawrite_range(file->f_mapping, pos, + pos + count - 1); + if (ret < 0) + written = ret; + + if (!ret && (old_size != i_size_read(inode) || + old_clusters != OCFS2_I(inode)->ip_clusters)) { ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; } + + if (!ret) + ret = filemap_fdatawait_range(file->f_mapping, pos, + pos + count - 1); } /* @@ -1991,31 +2120,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, if (ret > 0) { unsigned long nr_pages; + int err; - *ppos += ret; nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - - mutex_lock(&inode->i_mutex); - err = ocfs2_rw_lock(inode, 1); - if (err < 0) { - mlog_errno(err); - } else { - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - ocfs2_rw_unlock(inode, 1); - } - mutex_unlock(&inode->i_mutex); + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; - if (err) - ret = err; - } balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 172f9fb..d66cf4f 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -69,4 +69,6 @@ int ocfs2_update_inode_atime(struct inode *inode, int ocfs2_change_file_space(struct file *file, unsigned int cmd, struct ocfs2_space_resv *sr); +int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, + size_t count); #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 4dc8890..0297fb8 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -53,6 +53,7 @@ #include "sysfile.h" #include "uptodate.h" #include "xattr.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -562,7 +563,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, goto out; } - status = ocfs2_journal_access_di(handle, inode, fe_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -646,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode, } /* set the inodes dtime */ - status = ocfs2_journal_access_di(handle, inode, di_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -662,7 +664,7 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail_commit; } - ocfs2_remove_from_cache(inode, di_bh); + ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); vfs_dq_free_inode(inode); status = ocfs2_free_dinode(handle, inode_alloc_inode, @@ -781,6 +783,12 @@ static int ocfs2_wipe_inode(struct inode *inode, goto bail_unlock_dir; } + status = ocfs2_remove_refcount_tree(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, orphan_dir_bh); if (status < 0) @@ -1112,13 +1120,14 @@ void ocfs2_clear_inode(struct inode *inode) ocfs2_lock_res_free(&oi->ip_inode_lockres); ocfs2_lock_res_free(&oi->ip_open_lockres); - ocfs2_metadata_cache_purge(inode); + ocfs2_metadata_cache_exit(INODE_CACHE(inode)); - mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached, + mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached, "Clear inode of %llu, inode has %u cache items\n", - (unsigned long long)oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached); + (unsigned long long)oi->ip_blkno, + INODE_CACHE(inode)->ci_num_cached); - mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), + mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE), "Clear inode of %llu, inode has a bad flag\n", (unsigned long long)oi->ip_blkno); @@ -1145,9 +1154,7 @@ void ocfs2_clear_inode(struct inode *inode) (unsigned long long)oi->ip_blkno, oi->ip_open_count); /* Clear all other flags. */ - oi->ip_flags = OCFS2_INODE_CACHE_INLINE; - oi->ip_created_trans = 0; - oi->ip_last_trans = 0; + oi->ip_flags = 0; oi->ip_dir_start_lookup = 0; oi->ip_blkno = 0ULL; @@ -1239,7 +1246,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle, mlog_entry("(inode %llu)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_journal_access_di(handle, inode, bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1380,8 +1387,8 @@ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int rc; struct buffer_head *tmp = *bh; - rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp, - flags, ocfs2_validate_inode_block); + rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, ocfs2_validate_inode_block); /* If ocfs2_read_blocks() got us a new bh, pass it up. */ if (!rc && !*bh) @@ -1394,3 +1401,56 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh) { return ocfs2_read_inode_block_full(inode, bh, 0); } + + +static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + return oi->ip_blkno; +} + +static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + return oi->vfs_inode.i_sb; +} + +static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + spin_lock(&oi->ip_lock); +} + +static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + spin_unlock(&oi->ip_lock); +} + +static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + mutex_lock(&oi->ip_io_mutex); +} + +static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + mutex_unlock(&oi->ip_io_mutex); +} + +const struct ocfs2_caching_operations ocfs2_inode_caching_ops = { + .co_owner = ocfs2_inode_cache_owner, + .co_get_super = ocfs2_inode_cache_get_super, + .co_cache_lock = ocfs2_inode_cache_lock, + .co_cache_unlock = ocfs2_inode_cache_unlock, + .co_io_lock = ocfs2_inode_cache_io_lock, + .co_io_unlock = ocfs2_inode_cache_io_unlock, +}; + diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ea71525..ba4fe07 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -60,12 +60,6 @@ struct ocfs2_inode_info u32 ip_dir_start_lookup; - /* next two are protected by trans_inc_lock */ - /* which transaction were we created on? Zero if none. */ - unsigned long ip_created_trans; - /* last transaction we were a part of. */ - unsigned long ip_last_trans; - struct ocfs2_caching_info ip_metadata_cache; struct ocfs2_extent_map ip_extent_map; @@ -106,8 +100,6 @@ struct ocfs2_inode_info #define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 /* Does someone have the file open O_DIRECT */ #define OCFS2_INODE_OPEN_DIRECT 0x00000040 -/* Indicates that the metadata cache should be used as an array. */ -#define OCFS2_INODE_CACHE_INLINE 0x00000080 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { @@ -120,6 +112,12 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) extern struct kmem_cache *ocfs2_inode_cache; extern const struct address_space_operations ocfs2_aops; +extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops; + +static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode) +{ + return &OCFS2_I(inode)->ip_metadata_cache; +} void ocfs2_clear_inode(struct inode *inode); void ocfs2_delete_inode(struct inode *inode); @@ -172,4 +170,10 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh); /* The same, but can be passed OCFS2_BH_* flags */ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int flags); + +static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci) +{ + return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache); +} + #endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 467b413..31fbb06 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -21,6 +21,7 @@ #include "ocfs2_fs.h" #include "ioctl.h" #include "resize.h" +#include "refcounttree.h" #include <linux/ext2_fs.h> @@ -115,6 +116,9 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) int status; struct ocfs2_space_resv sr; struct ocfs2_new_group_input input; + struct reflink_arguments args; + const char *old_path, *new_path; + bool preserve; switch (cmd) { case OCFS2_IOC_GETFLAGS: @@ -160,6 +164,15 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -EFAULT; return ocfs2_group_add(inode, &input); + case OCFS2_IOC_REFLINK: + if (copy_from_user(&args, (struct reflink_arguments *)arg, + sizeof(args))) + return -EFAULT; + old_path = (const char *)(unsigned long)args.old_path; + new_path = (const char *)(unsigned long)args.new_path; + preserve = (args.preserve != 0); + + return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); default: return -ENOTTY; } @@ -182,6 +195,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: + case OCFS2_IOC_REFLINK: break; default: return -ENOIOCTLCMD; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index c48b93a..54c16b6 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -48,6 +48,7 @@ #include "slot_map.h" #include "super.h" #include "sysfile.h" +#include "uptodate.h" #include "quota.h" #include "buffer_head_io.h" @@ -554,6 +555,14 @@ static struct ocfs2_triggers eb_triggers = { .ot_offset = offsetof(struct ocfs2_extent_block, h_check), }; +static struct ocfs2_triggers rb_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), +}; + static struct ocfs2_triggers gd_triggers = { .ot_triggers = { .t_commit = ocfs2_commit_trigger, @@ -601,14 +610,16 @@ static struct ocfs2_triggers dl_triggers = { }; static int __ocfs2_journal_access(handle_t *handle, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh, struct ocfs2_triggers *triggers, int type) { int status; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); - BUG_ON(!inode); + BUG_ON(!ci || !ci->ci_ops); BUG_ON(!handle); BUG_ON(!bh); @@ -627,15 +638,15 @@ static int __ocfs2_journal_access(handle_t *handle, BUG(); } - /* Set the current transaction information on the inode so + /* Set the current transaction information on the ci so * that the locking code knows whether it can drop it's locks - * on this inode or not. We're protected from the commit + * on this ci or not. We're protected from the commit * thread updating the current transaction id until * ocfs2_commit_trans() because ocfs2_start_trans() took * j_trans_barrier for us. */ - ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); + ocfs2_set_ci_lock_trans(osb->journal, ci); - mutex_lock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_lock(ci); switch (type) { case OCFS2_JOURNAL_ACCESS_CREATE: case OCFS2_JOURNAL_ACCESS_WRITE: @@ -650,9 +661,9 @@ static int __ocfs2_journal_access(handle_t *handle, status = -EINVAL; mlog(ML_ERROR, "Uknown access type!\n"); } - if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers) + if (!status && ocfs2_meta_ecc(osb) && triggers) jbd2_journal_set_triggers(bh, &triggers->ot_triggers); - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_unlock(ci); if (status < 0) mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", @@ -662,66 +673,65 @@ static int __ocfs2_journal_access(handle_t *handle, return status; } -int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, - struct buffer_head *bh, int type) +int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &di_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type); } -int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &eb_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type); } -int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &gd_triggers, + return __ocfs2_journal_access(handle, ci, bh, &rb_triggers, type); } -int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &db_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type); } -int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &xb_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type); } -int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &dq_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type); } -int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &dr_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type); } -int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &dl_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type); +} + +int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type); } -int ocfs2_journal_access(handle_t *handle, struct inode *inode, +int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, NULL, type); + return __ocfs2_journal_access(handle, ci, bh, NULL, type); } int ocfs2_journal_dirty(handle_t *handle, @@ -898,7 +908,7 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, ocfs2_bump_recovery_generation(fe); ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); - status = ocfs2_write_block(osb, bh, journal->j_inode); + status = ocfs2_write_block(osb, bh, INODE_CACHE(journal->j_inode)); if (status < 0) mlog_errno(status); @@ -1642,7 +1652,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, ocfs2_get_recovery_generation(fe); ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); - status = ocfs2_write_block(osb, bh, inode); + status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); if (status < 0) mlog_errno(status); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 2c3222a..3f74e09 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -90,56 +90,66 @@ static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j) return old_id; } -static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal, - struct inode *inode) +static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal, + struct ocfs2_caching_info *ci) { spin_lock(&trans_inc_lock); - OCFS2_I(inode)->ip_last_trans = journal->j_trans_id; + ci->ci_last_trans = journal->j_trans_id; spin_unlock(&trans_inc_lock); } /* Used to figure out whether it's safe to drop a metadata lock on an - * inode. Returns true if all the inodes changes have been + * cached object. Returns true if all the object's changes have been * checkpointed to disk. You should be holding the spinlock on the * metadata lock while calling this to be sure that nobody can take * the lock and put it on another transaction. */ -static inline int ocfs2_inode_fully_checkpointed(struct inode *inode) +static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci) { int ret; - struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal; + struct ocfs2_journal *journal = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal; spin_lock(&trans_inc_lock); - ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans); + ret = time_after(journal->j_trans_id, ci->ci_last_trans); spin_unlock(&trans_inc_lock); return ret; } -/* convenience function to check if an inode is still new (has never - * hit disk) Will do you a favor and set created_trans = 0 when you've - * been checkpointed. returns '1' if the inode is still new. */ -static inline int ocfs2_inode_is_new(struct inode *inode) +/* convenience function to check if an object backed by struct + * ocfs2_caching_info is still new (has never hit disk) Will do you a + * favor and set created_trans = 0 when you've + * been checkpointed. returns '1' if the ci is still new. */ +static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci) { int ret; + struct ocfs2_journal *journal = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal; + spin_lock(&trans_inc_lock); + ret = !(time_after(journal->j_trans_id, ci->ci_created_trans)); + if (!ret) + ci->ci_created_trans = 0; + spin_unlock(&trans_inc_lock); + return ret; +} + +/* Wrapper for inodes so we can check system files */ +static inline int ocfs2_inode_is_new(struct inode *inode) +{ /* System files are never "new" as they're written out by * mkfs. This helps us early during mount, before we have the * journal open and j_trans_id could be junk. */ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) return 0; - spin_lock(&trans_inc_lock); - ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id, - OCFS2_I(inode)->ip_created_trans)); - if (!ret) - OCFS2_I(inode)->ip_created_trans = 0; - spin_unlock(&trans_inc_lock); - return ret; + + return ocfs2_ci_is_new(INODE_CACHE(inode)); } -static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, - struct inode *inode) +static inline void ocfs2_ci_set_new(struct ocfs2_super *osb, + struct ocfs2_caching_info *ci) { spin_lock(&trans_inc_lock); - OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id; + ci->ci_created_trans = osb->journal->j_trans_id; spin_unlock(&trans_inc_lock); } @@ -200,7 +210,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) if (ocfs2_mount_local(osb)) return; - if (!ocfs2_inode_fully_checkpointed(inode)) { + if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) { /* WARNING: This only kicks off a single * checkpoint. If someone races you and adds more * metadata to the journal, you won't know, and will @@ -210,7 +220,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) ocfs2_start_checkpoint(osb); wait_event(osb->journal->j_checkpointed, - ocfs2_inode_fully_checkpointed(inode)); + ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))); } } @@ -266,31 +276,34 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks); /* ocfs2_inode */ -int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_extent_block */ -int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_refcount_block */ +int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_group_desc */ -int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_xattr_block */ -int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* quota blocks */ -int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* dirblock */ -int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_dx_root_block */ -int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_dx_leaf */ -int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* Anything that has no ecc */ -int ocfs2_journal_access(handle_t *handle, struct inode *inode, +int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* @@ -477,6 +490,23 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb) return credits; } +/* inode update, new refcount block and its allocation credits. */ +#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \ + + OCFS2_SUBALLOC_ALLOC) + +/* inode and the refcount block update. */ +#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* + * inode and the refcount block update. + * It doesn't include the credits for sub alloc change. + * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added. + */ +#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* 2 metadata alloc, 2 new blocks and root refcount block */ +#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3) + /* * Please note that the caller must make sure that root_el is the root * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index bac7e6a..ac10f83 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -297,8 +297,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) } memcpy(alloc_copy, alloc, bh->b_size); - status = ocfs2_journal_access_di(handle, local_alloc_inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode), + bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_commit; @@ -392,7 +392,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, ocfs2_clear_local_alloc(alloc); ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check); - status = ocfs2_write_block(osb, alloc_bh, inode); + status = ocfs2_write_block(osb, alloc_bh, INODE_CACHE(inode)); if (status < 0) mlog_errno(status); @@ -678,7 +678,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, * delete bits from it! */ *num_bits = bits_wanted; - status = ocfs2_journal_access_di(handle, local_alloc_inode, + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), osb->local_alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { @@ -1156,7 +1157,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, } memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); - status = ocfs2_journal_access_di(handle, local_alloc_inode, + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), osb->local_alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index b606496..3973761 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -202,7 +202,7 @@ out: return ret; } -static struct vm_operations_struct ocfs2_file_vm_ops = { +static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, }; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8601f93..f010b22 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -69,7 +69,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, struct inode *dir, struct inode *inode, - struct dentry *dentry, dev_t dev, struct buffer_head **new_fe_bh, struct buffer_head *parent_fe_bh, @@ -78,7 +77,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, - struct inode *inode, + u64 blkno, char *name, struct ocfs2_dir_lookup_result *lookup); @@ -358,8 +357,12 @@ static int ocfs2_mknod(struct inode *dir, } did_quota_inode = 1; + mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, + inode->i_mode, (unsigned long)dev, dentry->d_name.len, + dentry->d_name.name); + /* do the real work now. */ - status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev, + status = ocfs2_mknod_locked(osb, dir, inode, dev, &new_fe_bh, parent_fe_bh, handle, inode_ac); if (status < 0) { @@ -375,7 +378,8 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = ocfs2_journal_access_di(handle, dir, parent_fe_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(dir), + parent_fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -465,7 +469,6 @@ leave: static int ocfs2_mknod_locked(struct ocfs2_super *osb, struct inode *dir, struct inode *inode, - struct dentry *dentry, dev_t dev, struct buffer_head **new_fe_bh, struct buffer_head *parent_fe_bh, @@ -479,10 +482,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, u16 suballoc_bit; u16 feat; - mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, - inode->i_mode, (unsigned long)dev, dentry->d_name.len, - dentry->d_name.name); - *new_fe_bh = NULL; status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, @@ -507,9 +506,10 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, mlog_errno(status); goto leave; } - ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), *new_fe_bh); - status = ocfs2_journal_access_di(handle, inode, *new_fe_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + *new_fe_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -565,7 +565,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, } ocfs2_populate_inode(inode, fe, 1); - ocfs2_inode_set_new(osb, inode); + ocfs2_ci_set_new(osb, INODE_CACHE(inode)); if (!ocfs2_mount_local(osb)) { status = ocfs2_create_new_inode_locks(inode); if (status < 0) @@ -682,7 +682,7 @@ static int ocfs2_link(struct dentry *old_dentry, goto out_unlock_inode; } - err = ocfs2_journal_access_di(handle, inode, fe_bh, + err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (err < 0) { mlog_errno(err); @@ -850,7 +850,8 @@ static int ocfs2_unlink(struct inode *dir, } if (inode_is_unlinkable(inode)) { - status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, + OCFS2_I(inode)->ip_blkno, orphan_name, &orphan_insert); if (status < 0) { mlog_errno(status); @@ -866,7 +867,7 @@ static int ocfs2_unlink(struct inode *dir, goto leave; } - status = ocfs2_journal_access_di(handle, inode, fe_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1241,9 +1242,8 @@ static int ocfs2_rename(struct inode *old_dir, if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, - new_inode, - orphan_name, - &orphan_insert); + OCFS2_I(new_inode)->ip_blkno, + orphan_name, &orphan_insert); if (status < 0) { mlog_errno(status); goto bail; @@ -1284,7 +1284,8 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } } - status = ocfs2_journal_access_di(handle, new_inode, newfe_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(new_inode), + newfe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1331,7 +1332,8 @@ static int ocfs2_rename(struct inode *old_dir, old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); - status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode), + old_inode_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status >= 0) { old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; @@ -1407,9 +1409,10 @@ static int ocfs2_rename(struct inode *old_dir, (int)old_dir_nlink, old_dir->i_nlink); } else { struct ocfs2_dinode *fe; - status = ocfs2_journal_access_di(handle, old_dir, - old_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, + INODE_CACHE(old_dir), + old_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); fe = (struct ocfs2_dinode *) old_dir_bh->b_data; ocfs2_set_links_count(fe, old_dir->i_nlink); status = ocfs2_journal_dirty(handle, old_dir_bh); @@ -1527,9 +1530,11 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, mlog_errno(status); goto bail; } - ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), + bhs[virtual]); - status = ocfs2_journal_access(handle, inode, bhs[virtual], + status = ocfs2_journal_access(handle, INODE_CACHE(inode), + bhs[virtual], OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1692,7 +1697,11 @@ static int ocfs2_symlink(struct inode *dir, } did_quota_inode = 1; - status = ocfs2_mknod_locked(osb, dir, inode, dentry, + mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, + inode->i_mode, dentry->d_name.len, + dentry->d_name.name); + + status = ocfs2_mknod_locked(osb, dir, inode, 0, &new_fe_bh, parent_fe_bh, handle, inode_ac); if (status < 0) { @@ -1842,7 +1851,7 @@ bail: static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, - struct inode *inode, + u64 blkno, char *name, struct ocfs2_dir_lookup_result *lookup) { @@ -1850,7 +1859,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh = NULL; int status = 0; - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + status = ocfs2_blkno_stringify(blkno, name); if (status < 0) { mlog_errno(status); return status; @@ -1917,7 +1926,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto leave; } - status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh, + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -2002,7 +2013,9 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, goto leave; } - status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh, + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -2028,6 +2041,274 @@ leave: return status; } +int ocfs2_create_inode_in_orphan(struct inode *dir, + int mode, + struct inode **new_inode) +{ + int status, did_quota_inode = 0; + struct inode *inode = NULL; + struct inode *orphan_dir = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *di = NULL; + handle_t *handle = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *parent_di_bh = NULL; + struct buffer_head *new_di_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + + status = ocfs2_inode_lock(dir, &parent_di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + /* + * We give the orphan dir the root blkno to fake an orphan name, + * and allocate enough space for our insertion. + */ + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, + osb->root_blkno, + orphan_name, &orphan_insert); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* reserve an inode spot */ + status = ocfs2_reserve_new_inode(osb, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + inode = ocfs2_get_init_inode(dir, mode); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + /* We don't use standard VFS wrapper because we don't want vfs_dq_init + * to be called. */ + if (sb_any_quota_active(osb->sb) && + osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { + status = -EDQUOT; + goto leave; + } + did_quota_inode = 1; + + /* do the real work now. */ + status = ocfs2_mknod_locked(osb, dir, inode, + 0, &new_di_bh, parent_di_bh, handle, + inode_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + di = (struct ocfs2_dinode *)new_di_bh->b_data; + status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* get open lock so that only nodes can't remove it from orphan dir. */ + status = ocfs2_open_lock(inode); + if (status < 0) + mlog_errno(status); + +leave: + if (status < 0 && did_quota_inode) + vfs_dq_free_inode(inode); + if (handle) + ocfs2_commit_trans(osb, handle); + + if (orphan_dir) { + /* This was locked for us in ocfs2_prepare_orphan_dir() */ + ocfs2_inode_unlock(orphan_dir, 1); + mutex_unlock(&orphan_dir->i_mutex); + iput(orphan_dir); + } + + if (status == -ENOSPC) + mlog(0, "Disk is full\n"); + + if ((status < 0) && inode) { + clear_nlink(inode); + iput(inode); + } + + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + brelse(new_di_bh); + + if (!status) + *new_inode = inode; + + ocfs2_free_dir_lookup_result(&orphan_insert); + + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + return status; +} + +int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct inode *inode, + struct dentry *dentry) +{ + int status = 0; + struct buffer_head *parent_di_bh = NULL; + handle_t *handle = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *dir_di, *di; + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry, + dentry->d_name.len, dentry->d_name.name); + + status = ocfs2_inode_lock(dir, &parent_di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + if (!dir_di->i_links_count) { + /* can't make a file in a deleted directory. */ + status = -ENOENT; + goto leave; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto leave; + + /* get a spot inside the dir. */ + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + status = -EEXIST; + mlog_errno(status); + goto leave; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mlog_errno(status); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + goto leave; + } + + status = ocfs2_read_inode_block(inode, &di_bh); + if (status < 0) { + mlog_errno(status); + goto orphan_unlock; + } + + handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto orphan_unlock; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + di_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); + di->i_orphaned_slot = 0; + ocfs2_journal_dirty(handle, di_bh); + + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_di_bh, + &lookup); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto out_commit; + } + + insert_inode_hash(inode); + dentry->d_op = &ocfs2_dentry_ops; + d_instantiate(dentry, inode); + status = 0; +out_commit: + ocfs2_commit_trans(osb, handle); +orphan_unlock: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); +leave: + + ocfs2_inode_unlock(dir, 1); + + brelse(di_bh); + brelse(parent_di_bh); + brelse(orphan_dir_bh); + + ocfs2_free_dir_lookup_result(&lookup); + + mlog_exit(status); + + return status; +} + const struct inode_operations ocfs2_dir_iops = { .create = ocfs2_create, .lookup = ocfs2_lookup, diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 688aef6..e5d059d 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -35,5 +35,11 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct inode *orphan_dir_inode, struct inode *inode, struct buffer_head *orphan_dir_bh); +int ocfs2_create_inode_in_orphan(struct inode *dir, + int mode, + struct inode **new_inode); +int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct inode *new_inode, + struct dentry *new_dentry); #endif /* OCFS2_NAMEI_H */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 39e1d5a..eae4046 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -51,20 +51,51 @@ /* For struct ocfs2_blockcheck_stats */ #include "blockcheck.h" + +/* Caching of metadata buffers */ + /* Most user visible OCFS2 inodes will have very few pieces of * metadata, but larger files (including bitmaps, etc) must be taken * into account when designing an access scheme. We allow a small * amount of inlined blocks to be stored on an array and grow the * structure into a rb tree when necessary. */ -#define OCFS2_INODE_MAX_CACHE_ARRAY 2 +#define OCFS2_CACHE_INFO_MAX_ARRAY 2 + +/* Flags for ocfs2_caching_info */ + +enum ocfs2_caching_info_flags { + /* Indicates that the metadata cache is using the inline array */ + OCFS2_CACHE_FL_INLINE = 1<<1, +}; +struct ocfs2_caching_operations; struct ocfs2_caching_info { + /* + * The parent structure provides the locks, but because the + * parent structure can differ, it provides locking operations + * to struct ocfs2_caching_info. + */ + const struct ocfs2_caching_operations *ci_ops; + + /* next two are protected by trans_inc_lock */ + /* which transaction were we created on? Zero if none. */ + unsigned long ci_created_trans; + /* last transaction we were a part of. */ + unsigned long ci_last_trans; + + /* Cache structures */ + unsigned int ci_flags; unsigned int ci_num_cached; union { - sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY]; + sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY]; struct rb_root ci_tree; } ci_cache; }; +/* + * Need this prototype here instead of in uptodate.h because journal.h + * uses it. + */ +struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci); /* this limits us to 256 nodes * if we need more, we can do a kmalloc for the map */ @@ -377,12 +408,17 @@ struct ocfs2_super /* the group we used to allocate inodes. */ u64 osb_inode_alloc_group; + + /* rb tree root for refcount lock. */ + struct rb_root osb_rf_lock_tree; + struct ocfs2_refcount_tree *osb_ref_tree_lru; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) /* Useful typedef for passing around journal access functions */ -typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode, +typedef int (*ocfs2_journal_access_func)(handle_t *handle, + struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); static inline int ocfs2_should_order_data(struct inode *inode) @@ -480,6 +516,13 @@ static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n) ocfs2_set_links_count(di, links); } +static inline int ocfs2_refcount_tree(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock @@ -578,6 +621,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) #define OCFS2_IS_VALID_DX_LEAF(ptr) \ (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) +#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \ + (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE)) + static inline unsigned long ino_from_blkno(struct super_block *sb, u64 blkno) { diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 7ab6e9e..e9431e4 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -68,6 +68,7 @@ #define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" #define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" #define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" +#define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ @@ -98,7 +99,8 @@ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ | OCFS2_FEATURE_INCOMPAT_XATTR \ | OCFS2_FEATURE_INCOMPAT_META_ECC \ - | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) + | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ + | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) @@ -160,6 +162,9 @@ /* Metadata checksum and error correction */ #define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 +/* Refcount tree support */ +#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -223,6 +228,7 @@ #define OCFS2_HAS_XATTR_FL (0x0002) #define OCFS2_INLINE_XATTR_FL (0x0004) #define OCFS2_INDEXED_DIR_FL (0x0008) +#define OCFS2_HAS_REFCOUNT_FL (0x0010) /* Inode attributes, keep in sync with EXT2 */ #define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ @@ -241,8 +247,11 @@ /* * Extent record flags (e_node.leaf.flags) */ -#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but - * unwritten */ +#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but + * unwritten */ +#define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference + * counted in an associated + * refcount tree */ /* * ioctl commands @@ -292,6 +301,15 @@ struct ocfs2_new_group_input { #define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input) #define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input) +/* Used to pass 2 file names to reflink. */ +struct reflink_arguments { + __u64 old_path; + __u64 new_path; + __u64 preserve; +}; +#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) + + /* * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) */ @@ -717,7 +735,8 @@ struct ocfs2_dinode { __le64 i_xattr_loc; /*80*/ struct ocfs2_block_check i_check; /* Error checking */ /*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ - __le64 i_reserved2[5]; +/*90*/ __le64 i_refcount_loc; + __le64 i_reserved2[4]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -901,6 +920,60 @@ struct ocfs2_group_desc /*40*/ __u8 bg_bitmap[0]; }; +struct ocfs2_refcount_rec { +/*00*/ __le64 r_cpos; /* Physical offset, in clusters */ + __le32 r_clusters; /* Clusters covered by this extent */ + __le32 r_refcount; /* Reference count of this extent */ +/*10*/ +}; +#define OCFS2_32BIT_POS_MASK (0xffffffffULL) + +#define OCFS2_REFCOUNT_LEAF_FL (0x00000001) +#define OCFS2_REFCOUNT_TREE_FL (0x00000002) + +struct ocfs2_refcount_list { +/*00*/ __le16 rl_count; /* Maximum number of entries possible + in rl_records */ + __le16 rl_used; /* Current number of used records */ + __le32 rl_reserved2; + __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */ +/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */ +}; + + +struct ocfs2_refcount_block { +/*00*/ __u8 rf_signature[8]; /* Signature for verification */ + __le16 rf_suballoc_slot; /* Slot suballocator this block + belongs to */ + __le16 rf_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 rf_fs_generation; /* Must match superblock */ +/*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */ + __le64 rf_parent; /* Parent block, only valid if + OCFS2_REFCOUNT_LEAF_FL is set in + rf_flags */ +/*20*/ struct ocfs2_block_check rf_check; /* Error checking */ + __le64 rf_last_eb_blk; /* Pointer to last extent block */ +/*30*/ __le32 rf_count; /* Number of inodes sharing this + refcount tree */ + __le32 rf_flags; /* See the flags above */ + __le32 rf_clusters; /* clusters covered by refcount tree. */ + __le32 rf_cpos; /* cluster offset in refcount tree.*/ +/*40*/ __le32 rf_generation; /* generation number. all be the same + * for the same refcount tree. */ + __le32 rf_reserved0; + __le64 rf_reserved1[7]; +/*80*/ union { + struct ocfs2_refcount_list rf_records; /* List of refcount + records */ + struct ocfs2_extent_list rf_list; /* Extent record list, + only valid if + OCFS2_REFCOUNT_TREE_FL + is set in rf_flags */ + }; +/* Actual on-disk size is one block */ +}; + /* * On disk extended attribute structure for OCFS2. */ @@ -1312,6 +1385,32 @@ static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) return size / sizeof(struct ocfs2_extent_rec); } + +static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_records.rl_recs); + + return size / sizeof(struct ocfs2_refcount_rec); +} + +static inline u32 +ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec) +{ + return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; +} #else static inline int ocfs2_fast_symlink_chars(int blocksize) { diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index c212cf5..d277aab 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -49,6 +49,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_QINFO, OCFS2_LOCK_TYPE_NFS_SYNC, OCFS2_LOCK_TYPE_ORPHAN_SCAN, + OCFS2_LOCK_TYPE_REFCOUNT, OCFS2_NUM_LOCK_TYPES }; @@ -89,6 +90,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_ORPHAN_SCAN: c = 'P'; break; + case OCFS2_LOCK_TYPE_REFCOUNT: + c = 'T'; + break; default: c = '\0'; } @@ -110,6 +114,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_QINFO] = "Quota", [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", + [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 3fb96fcd..e5df9d1 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -109,7 +109,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); int ocfs2_read_quota_block(struct inode *inode, u64 v_block, struct buffer_head **bh); -extern struct dquot_operations ocfs2_quota_operations; +extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; int ocfs2_quota_setup(void); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 44f2a5e..b437dc0 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -154,7 +154,7 @@ static int ocfs2_get_quota_block(struct inode *inode, int block, err = -EIO; mlog_errno(err); } - return err;; + return err; } /* Read data from global quotafile - avoid pagecache and such because we cannot @@ -253,8 +253,9 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, flush_dcache_page(bh->b_page); set_buffer_uptodate(bh); unlock_buffer(bh); - ocfs2_set_buffer_uptodate(gqinode, bh); - err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type); + ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh); + err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh, + ja_type); if (err < 0) { brelse(bh); goto out; @@ -849,7 +850,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) kmem_cache_free(ocfs2_dquot_cachep, dquot); } -struct dquot_operations ocfs2_quota_operations = { +const struct dquot_operations ocfs2_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, .alloc_space = dquot_alloc_space, diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index bdb09cb..1a2c50a 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -108,7 +108,7 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, mlog_errno(status); return status; } - status = ocfs2_journal_access_dq(handle, inode, bh, + status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -510,7 +510,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, goto out_commit; } /* Release local quota file entry */ - status = ocfs2_journal_access_dq(handle, lqinode, + status = ocfs2_journal_access_dq(handle, + INODE_CACHE(lqinode), qbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -619,7 +620,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, mlog_errno(status); goto out_bh; } - status = ocfs2_journal_access_dq(handle, lqinode, bh, + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), + bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -993,8 +995,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( goto out_trans; } dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; - ocfs2_set_new_buffer_uptodate(lqinode, bh); - status = ocfs2_journal_access_dq(handle, lqinode, bh, + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh); + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1027,8 +1029,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out_trans; } - ocfs2_set_new_buffer_uptodate(lqinode, dbh); - status = ocfs2_journal_access_dq(handle, lqinode, dbh, + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh); + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1131,7 +1133,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( mlog_errno(status); goto out; } - ocfs2_set_new_buffer_uptodate(lqinode, bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh); /* Local quota info, chunk header and the new block we initialize */ handle = ocfs2_start_trans(OCFS2_SB(sb), @@ -1143,7 +1145,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( goto out; } /* Zero created block */ - status = ocfs2_journal_access_dq(handle, lqinode, bh, + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1158,7 +1160,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( goto out_trans; } /* Update chunk header */ - status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), + chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1292,7 +1295,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot) goto out; } - status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type], + status = ocfs2_journal_access_dq(handle, + INODE_CACHE(sb_dqopt(sb)->files[type]), od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c new file mode 100644 index 0000000..60287fc --- /dev/null +++ b/fs/ocfs2/refcounttree.c @@ -0,0 +1,4313 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * refcounttree.c + * + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/sort.h> +#define MLOG_MASK_PREFIX ML_REFCOUNT +#include <cluster/masklog.h> +#include "ocfs2.h" +#include "inode.h" +#include "alloc.h" +#include "suballoc.h" +#include "journal.h" +#include "uptodate.h" +#include "super.h" +#include "buffer_head_io.h" +#include "blockcheck.h" +#include "refcounttree.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "aops.h" +#include "xattr.h" +#include "namei.h" + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include <linux/swap.h> +#include <linux/security.h> +#include <linux/fsnotify.h> +#include <linux/quotaops.h> +#include <linux/namei.h> +#include <linux/mount.h> + +struct ocfs2_cow_context { + struct inode *inode; + u32 cow_start; + u32 cow_len; + struct ocfs2_extent_tree data_et; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_root_bh; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; + void *cow_object; + struct ocfs2_post_refcount *post_refcount; + int extra_credits; + int (*get_clusters)(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags); + int (*cow_duplicate_clusters)(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +}; + +static inline struct ocfs2_refcount_tree * +cache_info_to_refcount(struct ocfs2_caching_info *ci) +{ + return container_of(ci, struct ocfs2_refcount_tree, rf_ci); +} + +static int ocfs2_validate_refcount_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)bh->b_data; + + mlog(0, "Validating refcount block %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for refcount block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + + if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { + ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + return -EINVAL; + } + + if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid " + "rf_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, + u64 rb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(ci, rb_blkno, &tmp, + ocfs2_validate_refcount_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + return rf->rf_blkno; +} + +static struct super_block * +ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + return rf->rf_sb; +} + +static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + spin_lock(&rf->rf_lock); +} + +static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + spin_unlock(&rf->rf_lock); +} + +static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + mutex_lock(&rf->rf_io_mutex); +} + +static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + mutex_unlock(&rf->rf_io_mutex); +} + +static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = { + .co_owner = ocfs2_refcount_cache_owner, + .co_get_super = ocfs2_refcount_cache_get_super, + .co_cache_lock = ocfs2_refcount_cache_lock, + .co_cache_unlock = ocfs2_refcount_cache_unlock, + .co_io_lock = ocfs2_refcount_cache_io_lock, + .co_io_unlock = ocfs2_refcount_cache_io_unlock, +}; + +static struct ocfs2_refcount_tree * +ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno) +{ + struct rb_node *n = osb->osb_rf_lock_tree.rb_node; + struct ocfs2_refcount_tree *tree = NULL; + + while (n) { + tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node); + + if (blkno < tree->rf_blkno) + n = n->rb_left; + else if (blkno > tree->rf_blkno) + n = n->rb_right; + else + return tree; + } + + return NULL; +} + +/* osb_lock is already locked. */ +static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *new) +{ + u64 rf_blkno = new->rf_blkno; + struct rb_node *parent = NULL; + struct rb_node **p = &osb->osb_rf_lock_tree.rb_node; + struct ocfs2_refcount_tree *tmp; + + while (*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_refcount_tree, + rf_node); + + if (rf_blkno < tmp->rf_blkno) + p = &(*p)->rb_left; + else if (rf_blkno > tmp->rf_blkno) + p = &(*p)->rb_right; + else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate refcount block %llu found!\n", + (unsigned long long)rf_blkno); + BUG(); + } + } + + rb_link_node(&new->rf_node, parent, p); + rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree); +} + +static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree) +{ + ocfs2_metadata_cache_exit(&tree->rf_ci); + ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres); + ocfs2_lock_res_free(&tree->rf_lockres); + kfree(tree); +} + +static inline void +ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree); + if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree) + osb->osb_ref_tree_lru = NULL; +} + +static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + spin_lock(&osb->osb_lock); + ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); + spin_unlock(&osb->osb_lock); +} + +void ocfs2_kref_remove_refcount_tree(struct kref *kref) +{ + struct ocfs2_refcount_tree *tree = + container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); + + ocfs2_free_refcount_tree(tree); +} + +static inline void +ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree) +{ + kref_get(&tree->rf_getcnt); +} + +static inline void +ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree) +{ + kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree); +} + +static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new, + struct super_block *sb) +{ + ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops); + mutex_init(&new->rf_io_mutex); + new->rf_sb = sb; + spin_lock_init(&new->rf_lock); +} + +static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *new, + u64 rf_blkno, u32 generation) +{ + init_rwsem(&new->rf_sem); + ocfs2_refcount_lock_res_init(&new->rf_lockres, osb, + rf_blkno, generation); +} + +static struct ocfs2_refcount_tree* +ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno) +{ + struct ocfs2_refcount_tree *new; + + new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS); + if (!new) + return NULL; + + new->rf_blkno = rf_blkno; + kref_init(&new->rf_getcnt); + ocfs2_init_refcount_tree_ci(new, osb->sb); + + return new; +} + +static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, + struct ocfs2_refcount_tree **ret_tree) +{ + int ret = 0; + struct ocfs2_refcount_tree *tree, *new = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *ref_rb; + + spin_lock(&osb->osb_lock); + if (osb->osb_ref_tree_lru && + osb->osb_ref_tree_lru->rf_blkno == rf_blkno) + tree = osb->osb_ref_tree_lru; + else + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + spin_unlock(&osb->osb_lock); + + new = ocfs2_allocate_refcount_tree(osb, rf_blkno); + if (!new) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + /* + * We need the generation to create the refcount tree lock and since + * it isn't changed during the tree modification, we are safe here to + * read without protection. + * We also have to purge the cache after we create the lock since the + * refcount block may have the stale data. It can only be trusted when + * we hold the refcount lock. + */ + ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh); + if (ret) { + mlog_errno(ret); + ocfs2_metadata_cache_exit(&new->rf_ci); + kfree(new); + return ret; + } + + ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + new->rf_generation = le32_to_cpu(ref_rb->rf_generation); + ocfs2_init_refcount_tree_lock(osb, new, rf_blkno, + new->rf_generation); + ocfs2_metadata_cache_purge(&new->rf_ci); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + ocfs2_insert_refcount_tree(osb, new); + + tree = new; + new = NULL; + +out: + *ret_tree = tree; + + osb->osb_ref_tree_lru = tree; + + spin_unlock(&osb->osb_lock); + + if (new) + ocfs2_free_refcount_tree(new); + + brelse(ref_root_bh); + return ret; +} + +static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + di = (struct ocfs2_dinode *)di_bh->b_data; + *ref_blkno = le64_to_cpu(di->i_refcount_loc); + brelse(di_bh); +out: + return ret; +} + +static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + int ret; + + ret = ocfs2_refcount_lock(tree, rw); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rw) + down_write(&tree->rf_sem); + else + down_read(&tree->rf_sem); + +out: + return ret; +} + +/* + * Lock the refcount tree pointed by ref_blkno and return the tree. + * In most case, we lock the tree and read the refcount block. + * So read it here if the caller really needs it. + * + * If the tree has been re-created by other node, it will free the + * old one and re-create it. + */ +int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + u64 ref_blkno, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret, delete_tree = 0; + struct ocfs2_refcount_tree *tree = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *rb; + +again: + ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_refcount_tree_get(tree); + + ret = __ocfs2_lock_refcount_tree(osb, tree, rw); + if (ret) { + mlog_errno(ret); + ocfs2_refcount_tree_put(tree); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + ocfs2_unlock_refcount_tree(osb, tree, rw); + ocfs2_refcount_tree_put(tree); + goto out; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + /* + * If the refcount block has been freed and re-created, we may need + * to recreate the refcount tree also. + * + * Here we just remove the tree from the rb-tree, and the last + * kref holder will unlock and delete this refcount_tree. + * Then we goto "again" and ocfs2_get_refcount_tree will create + * the new refcount tree for us. + */ + if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) { + if (!tree->rf_removed) { + ocfs2_erase_refcount_tree_from_list(osb, tree); + tree->rf_removed = 1; + delete_tree = 1; + } + + ocfs2_unlock_refcount_tree(osb, tree, rw); + /* + * We get an extra reference when we create the refcount + * tree, so another put will destroy it. + */ + if (delete_tree) + ocfs2_refcount_tree_put(tree); + brelse(ref_root_bh); + ref_root_bh = NULL; + goto again; + } + + *ret_tree = tree; + if (ref_bh) { + *ref_bh = ref_root_bh; + ref_root_bh = NULL; + } +out: + brelse(ref_root_bh); + return ret; +} + +int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret; + u64 ref_blkno; + + ret = ocfs2_get_refcount_block(inode, &ref_blkno); + if (ret) { + mlog_errno(ret); + return ret; + } + + return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, + rw, ret_tree, ref_bh); +} + +void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + if (rw) + up_write(&tree->rf_sem); + else + up_read(&tree->rf_sem); + + ocfs2_refcount_unlock(tree, rw); + ocfs2_refcount_tree_put(tree); +} + +void ocfs2_purge_refcount_trees(struct ocfs2_super *osb) +{ + struct rb_node *node; + struct ocfs2_refcount_tree *tree; + struct rb_root *root = &osb->osb_rf_lock_tree; + + while ((node = rb_last(root)) != NULL) { + tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); + + mlog(0, "Purge tree %llu\n", + (unsigned long long) tree->rf_blkno); + + rb_erase(&tree->rf_node, root); + ocfs2_free_refcount_tree(tree); + } +} + +/* + * Create a refcount tree for an inode. + * We take for granted that the inode is already locked. + */ +static int ocfs2_create_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + + mlog(0, "create tree for inode %lu\n", inode->i_ino); + + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &first_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno); + if (!new_tree) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* Initialize ocfs2_refcount_block. */ + rb = (struct ocfs2_refcount_block *)new_bh->b_data; + memset(rb, 0, inode->i_sb->s_blocksize); + strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); + rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); + rb->rf_blkno = cpu_to_le64(first_blkno); + rb->rf_count = cpu_to_le32(1); + rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); + spin_lock(&osb->osb_lock); + rb->rf_generation = osb->s_next_generation++; + spin_unlock(&osb->osb_lock); + + ocfs2_journal_dirty(handle, new_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(first_blkno); + spin_unlock(&oi->ip_lock); + + mlog(0, "created tree for inode %lu, refblock %llu\n", + inode->i_ino, (unsigned long long)first_blkno); + + ocfs2_journal_dirty(handle, di_bh); + + /* + * We have to init the tree lock here since it will use + * the generation number to create it. + */ + new_tree->rf_generation = le32_to_cpu(rb->rf_generation); + ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno, + new_tree->rf_generation); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, first_blkno); + + /* + * We've just created a new refcount tree in this block. If + * we found a refcount tree on the ocfs2_super, it must be + * one we just deleted. We free the old tree before + * inserting the new tree. + */ + BUG_ON(tree && tree->rf_generation == new_tree->rf_generation); + if (tree) + ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); + ocfs2_insert_refcount_tree(osb, new_tree); + spin_unlock(&osb->osb_lock); + new_tree = NULL; + if (tree) + ocfs2_refcount_tree_put(tree); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (new_tree) { + ocfs2_metadata_cache_exit(&new_tree->rf_ci); + kfree(new_tree); + } + + brelse(new_bh); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +static int ocfs2_set_refcount_tree(struct inode *inode, + struct buffer_head *di_bh, + u64 refcount_loc) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *ref_tree; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + le32_add_cpu(&rb->rf_count, 1); + + ocfs2_journal_dirty(handle, ref_root_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(refcount_loc); + spin_unlock(&oi->ip_lock); + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + + return ret; +} + +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) +{ + int ret, delete_tree = 0; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_block *rb; + struct inode *alloc_inode = NULL; + struct buffer_head *alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS; + u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); + u16 bit = 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + return 0; + + BUG_ON(!ref_blkno); + ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + rb = (struct ocfs2_refcount_block *)blk_bh->b_data; + + /* + * If we are the last user, we need to free the block. + * So lock the allocator ahead. + */ + if (le32_to_cpu(rb->rf_count) == 1) { + blk = le64_to_cpu(rb->rf_blkno); + bit = le16_to_cpu(rb->rf_suballoc_bit); + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(rb->rf_suballoc_slot)); + if (!alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + credits += OCFS2_SUBALLOC_FREE; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = 0; + spin_unlock(&oi->ip_lock); + ocfs2_journal_dirty(handle, di_bh); + + le32_add_cpu(&rb->rf_count , -1); + ocfs2_journal_dirty(handle, blk_bh); + + if (!rb->rf_count) { + delete_tree = 1; + ocfs2_erase_refcount_tree_from_list(osb, ref_tree); + ret = ocfs2_free_suballoc_bits(handle, alloc_inode, + alloc_bh, bit, bg_blkno, 1); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + if (alloc_inode) { + ocfs2_inode_unlock(alloc_inode, 1); + brelse(alloc_bh); + } +out_mutex: + if (alloc_inode) { + mutex_unlock(&alloc_inode->i_mutex); + iput(alloc_inode); + } +out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + if (delete_tree) + ocfs2_refcount_tree_put(ref_tree); + brelse(blk_bh); + + return ret; +} + +static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci, + struct buffer_head *ref_leaf_bh, + u64 cpos, unsigned int len, + struct ocfs2_refcount_rec *ret_rec, + int *index) +{ + int i = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_rec *rec = NULL; + + for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) { + rec = &rb->rf_records.rl_recs[i]; + + if (le64_to_cpu(rec->r_cpos) + + le32_to_cpu(rec->r_clusters) <= cpos) + continue; + else if (le64_to_cpu(rec->r_cpos) > cpos) + break; + + /* ok, cpos fail in this rec. Just return. */ + if (ret_rec) + *ret_rec = *rec; + goto out; + } + + if (ret_rec) { + /* We meet with a hole here, so fake the rec. */ + ret_rec->r_cpos = cpu_to_le64(cpos); + ret_rec->r_refcount = 0; + if (i < le16_to_cpu(rb->rf_records.rl_used) && + le64_to_cpu(rec->r_cpos) < cpos + len) + ret_rec->r_clusters = + cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos); + else + ret_rec->r_clusters = cpu_to_le32(len); + } + +out: + *index = i; +} + +/* + * Try to remove refcount tree. The mechanism is: + * 1) Check whether i_clusters == 0, if no, exit. + * 2) check whether we have i_xattr_loc in dinode. if yes, exit. + * 3) Check whether we have inline xattr stored outside, if yes, exit. + * 4) Remove the tree. + */ +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&oi->ip_xattr_sem); + down_write(&oi->ip_alloc_sem); + + if (oi->ip_clusters) + goto out; + + if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc) + goto out; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL && + ocfs2_has_inline_xattr_value_outside(inode, di)) + goto out; + + ret = ocfs2_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); +out: + up_write(&oi->ip_alloc_sem); + up_write(&oi->ip_xattr_sem); + return 0; +} + +/* + * Given a cpos and len, try to find the refcount record which contains cpos. + * 1. If cpos can be found in one refcount record, return the record. + * 2. If cpos can't be found, return a fake record which start from cpos + * and end at a small value between cpos+len and start of the next record. + * This fake record has r_refcount = 0. + */ +static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, unsigned int len, + struct ocfs2_refcount_rec *ret_rec, + int *index, + struct buffer_head **ret_bh) +{ + int ret = 0, i, found; + u32 low_cpos; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *tmp, *rec = NULL; + struct ocfs2_extent_block *eb; + struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) { + ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len, + ret_rec, index); + *ret_bh = ref_root_bh; + get_bh(ref_root_bh); + return 0; + } + + el = &rb->rf_list; + low_cpos = cpos & OCFS2_32BIT_POS_MASK; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(sb, + "refcount tree %llu has non zero tree " + "depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + found = 0; + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= low_cpos) { + found = 1; + break; + } + } + + /* adjust len when we have ocfs2_extent_rec after it. */ + if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { + tmp = &el->l_recs[i+1]; + + if (le32_to_cpu(tmp->e_cpos) < cpos + len) + len = le32_to_cpu(tmp->e_cpos) - cpos; + } + + ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len, + ret_rec, index); + *ret_bh = ref_leaf_bh; +out: + brelse(eb_bh); + return ret; +} + +enum ocfs2_ref_rec_contig { + REF_CONTIG_NONE = 0, + REF_CONTIG_LEFT, + REF_CONTIG_RIGHT, + REF_CONTIG_LEFTRIGHT, +}; + +static enum ocfs2_ref_rec_contig + ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb, + int index) +{ + if ((rb->rf_records.rl_recs[index].r_refcount == + rb->rf_records.rl_recs[index + 1].r_refcount) && + (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) + + le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) == + le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos))) + return REF_CONTIG_RIGHT; + + return REF_CONTIG_NONE; +} + +static enum ocfs2_ref_rec_contig + ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb, + int index) +{ + enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE; + + if (index < le16_to_cpu(rb->rf_records.rl_used) - 1) + ret = ocfs2_refcount_rec_adjacent(rb, index); + + if (index > 0) { + enum ocfs2_ref_rec_contig tmp; + + tmp = ocfs2_refcount_rec_adjacent(rb, index - 1); + + if (tmp == REF_CONTIG_RIGHT) { + if (ret == REF_CONTIG_RIGHT) + ret = REF_CONTIG_LEFTRIGHT; + else + ret = REF_CONTIG_LEFT; + } + } + + return ret; +} + +static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb, + int index) +{ + BUG_ON(rb->rf_records.rl_recs[index].r_refcount != + rb->rf_records.rl_recs[index+1].r_refcount); + + le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters, + le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters)); + + if (index < le16_to_cpu(rb->rf_records.rl_used) - 2) + memmove(&rb->rf_records.rl_recs[index + 1], + &rb->rf_records.rl_recs[index + 2], + sizeof(struct ocfs2_refcount_rec) * + (le16_to_cpu(rb->rf_records.rl_used) - index - 2)); + + memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1], + 0, sizeof(struct ocfs2_refcount_rec)); + le16_add_cpu(&rb->rf_records.rl_used, -1); +} + +/* + * Merge the refcount rec if we are contiguous with the adjacent recs. + */ +static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb, + int index) +{ + enum ocfs2_ref_rec_contig contig = + ocfs2_refcount_rec_contig(rb, index); + + if (contig == REF_CONTIG_NONE) + return; + + if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) { + BUG_ON(index == 0); + index--; + } + + ocfs2_rotate_refcount_rec_left(rb, index); + + if (contig == REF_CONTIG_LEFTRIGHT) + ocfs2_rotate_refcount_rec_left(rb, index); +} + +/* + * Change the refcount indexed by "index" in ref_bh. + * If refcount reaches 0, remove it. + */ +static int ocfs2_change_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_leaf_bh, + int index, int merge, int change) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rl = &rb->rf_records; + struct ocfs2_refcount_rec *rec = &rl->rl_recs[index]; + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "change index %d, old count %u, change %d\n", index, + le32_to_cpu(rec->r_refcount), change); + le32_add_cpu(&rec->r_refcount, change); + + if (!rec->r_refcount) { + if (index != le16_to_cpu(rl->rl_used) - 1) { + memmove(rec, rec + 1, + (le16_to_cpu(rl->rl_used) - index - 1) * + sizeof(struct ocfs2_refcount_rec)); + memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1], + 0, sizeof(struct ocfs2_refcount_rec)); + } + + le16_add_cpu(&rl->rl_used, -1); + } else if (merge) + ocfs2_refcount_rec_merge(rb, index); + + ret = ocfs2_journal_dirty(handle, ref_leaf_bh); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +static int ocfs2_expand_inline_ref_root(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head **ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *new_rb; + struct ocfs2_refcount_block *root_rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_bh = sb_getblk(sb, blkno); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Initialize ocfs2_refcount_block. + * It should contain the same information as the old root. + * so just memcpy it and change the corresponding field. + */ + memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); + + new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; + new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); + new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + new_rb->rf_blkno = cpu_to_le64(blkno); + new_rb->rf_cpos = cpu_to_le32(0); + new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); + new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); + ocfs2_journal_dirty(handle, new_bh); + + /* Now change the root. */ + memset(&root_rb->rf_list, 0, sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_list)); + root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb)); + root_rb->rf_clusters = cpu_to_le32(1); + root_rb->rf_list.l_next_free_rec = cpu_to_le16(1); + root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno); + root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); + root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL); + + ocfs2_journal_dirty(handle, ref_root_bh); + + mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno, + le16_to_cpu(new_rb->rf_records.rl_used)); + + *ref_leaf_bh = new_bh; + new_bh = NULL; +out: + brelse(new_bh); + return ret; +} + +static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev, + struct ocfs2_refcount_rec *next) +{ + if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <= + ocfs2_get_ref_rec_low_cpos(next)) + return 1; + + return 0; +} + +static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b) +{ + const struct ocfs2_refcount_rec *l = a, *r = b; + u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l); + u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r); + + if (l_cpos > r_cpos) + return 1; + if (l_cpos < r_cpos) + return -1; + return 0; +} + +static int cmp_refcount_rec_by_cpos(const void *a, const void *b) +{ + const struct ocfs2_refcount_rec *l = a, *r = b; + u64 l_cpos = le64_to_cpu(l->r_cpos); + u64 r_cpos = le64_to_cpu(r->r_cpos); + + if (l_cpos > r_cpos) + return 1; + if (l_cpos < r_cpos) + return -1; + return 0; +} + +static void swap_refcount_rec(void *a, void *b, int size) +{ + struct ocfs2_refcount_rec *l = a, *r = b, tmp; + + tmp = *(struct ocfs2_refcount_rec *)l; + *(struct ocfs2_refcount_rec *)l = + *(struct ocfs2_refcount_rec *)r; + *(struct ocfs2_refcount_rec *)r = tmp; +} + +/* + * The refcount cpos are ordered by their 64bit cpos, + * But we will use the low 32 bit to be the e_cpos in the b-tree. + * So we need to make sure that this pos isn't intersected with others. + * + * Note: The refcount block is already sorted by their low 32 bit cpos, + * So just try the middle pos first, and we will exit when we find + * the good position. + */ +static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl, + u32 *split_pos, int *split_index) +{ + int num_used = le16_to_cpu(rl->rl_used); + int delta, middle = num_used / 2; + + for (delta = 0; delta < middle; delta++) { + /* Let's check delta earlier than middle */ + if (ocfs2_refcount_rec_no_intersect( + &rl->rl_recs[middle - delta - 1], + &rl->rl_recs[middle - delta])) { + *split_index = middle - delta; + break; + } + + /* For even counts, don't walk off the end */ + if ((middle + delta + 1) == num_used) + continue; + + /* Now try delta past middle */ + if (ocfs2_refcount_rec_no_intersect( + &rl->rl_recs[middle + delta], + &rl->rl_recs[middle + delta + 1])) { + *split_index = middle + delta + 1; + break; + } + } + + if (delta >= middle) + return -ENOSPC; + + *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]); + return 0; +} + +static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, + struct buffer_head *new_bh, + u32 *split_cpos) +{ + int split_index = 0, num_moved, ret; + u32 cpos = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rl = &rb->rf_records; + struct ocfs2_refcount_block *new_rb = + (struct ocfs2_refcount_block *)new_bh->b_data; + struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; + + mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n", + (unsigned long long)ref_leaf_bh->b_blocknr, + le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used)); + + /* + * XXX: Improvement later. + * If we know all the high 32 bit cpos is the same, no need to sort. + * + * In order to make the whole process safe, we do: + * 1. sort the entries by their low 32 bit cpos first so that we can + * find the split cpos easily. + * 2. call ocfs2_insert_extent to insert the new refcount block. + * 3. move the refcount rec to the new block. + * 4. sort the entries by their 64 bit cpos. + * 5. dirty the new_rb and rb. + */ + sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_low_cpos, swap_refcount_rec); + + ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index); + if (ret) { + mlog_errno(ret); + return ret; + } + + new_rb->rf_cpos = cpu_to_le32(cpos); + + /* move refcount records starting from split_index to the new block. */ + num_moved = le16_to_cpu(rl->rl_used) - split_index; + memcpy(new_rl->rl_recs, &rl->rl_recs[split_index], + num_moved * sizeof(struct ocfs2_refcount_rec)); + + /*ok, remove the entries we just moved over to the other block. */ + memset(&rl->rl_recs[split_index], 0, + num_moved * sizeof(struct ocfs2_refcount_rec)); + + /* change old and new rl_used accordingly. */ + le16_add_cpu(&rl->rl_used, -num_moved); + new_rl->rl_used = cpu_to_le32(num_moved); + + sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_cpos, swap_refcount_rec); + + sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_cpos, swap_refcount_rec); + + *split_cpos = cpos; + return 0; +} + +static int ocfs2_new_leaf_refcount_block(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got, new_cpos; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *root_rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *new_rb; + struct ocfs2_extent_tree ref_et; + + BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)); + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_bh = sb_getblk(sb, blkno); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Initialize ocfs2_refcount_block. */ + new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; + memset(new_rb, 0, sb->s_blocksize); + strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); + new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + new_rb->rf_blkno = cpu_to_le64(blkno); + new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); + new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); + new_rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); + new_rb->rf_generation = root_rb->rf_generation; + + ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_journal_dirty(handle, ref_leaf_bh); + ocfs2_journal_dirty(handle, new_bh); + + ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh); + + mlog(0, "insert new leaf block %llu at %u\n", + (unsigned long long)new_bh->b_blocknr, new_cpos); + + /* Insert the new leaf block with the specific offset cpos. */ + ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr, + 1, 0, meta_ac); + if (ret) + mlog_errno(ret); + +out: + brelse(new_bh); + return ret; +} + +static int ocfs2_expand_refcount_tree(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct buffer_head *expand_bh = NULL; + + if (ref_root_bh == ref_leaf_bh) { + /* + * the old root bh hasn't been expanded to a b-tree, + * so expand it first. + */ + ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh, + &expand_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + expand_bh = ref_leaf_bh; + get_bh(expand_bh); + } + + + /* Now add a new refcount block into the tree.*/ + ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh, + expand_bh, meta_ac); + if (ret) + mlog_errno(ret); +out: + brelse(expand_bh); + return ret; +} + +/* + * Adjust the extent rec in b-tree representing ref_leaf_bh. + * + * Only called when we have inserted a new refcount rec at index 0 + * which means ocfs2_extent_rec.e_cpos may need some change. + */ +static int ocfs2_adjust_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *rec) +{ + int ret = 0, i; + u32 new_cpos, old_cpos; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + struct ocfs2_extent_list *el; + + if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) + goto out; + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + old_cpos = le32_to_cpu(rb->rf_cpos); + new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; + if (old_cpos <= new_cpos) + goto out; + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + path = ocfs2_new_path_from_et(&et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, path, old_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * 2 more credits, one for the leaf refcount block, one for + * the extent block contains the extent rec. + */ + ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path), + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /* change the leaf extent block first. */ + el = path_leaf_el(path); + + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) + if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos) + break; + + BUG_ON(i == le16_to_cpu(el->l_next_free_rec)); + + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); + + /* change the r_cpos in the leaf block. */ + rb->rf_cpos = cpu_to_le32(new_cpos); + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); + ocfs2_journal_dirty(handle, ref_leaf_bh); + +out: + ocfs2_free_path(path); + return ret; +} + +static int ocfs2_insert_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *rec, + int index, int merge, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rf_list = &rb->rf_records; + struct buffer_head *new_bh = NULL; + + BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); + + if (rf_list->rl_used == rf_list->rl_count) { + u64 cpos = le64_to_cpu(rec->r_cpos); + u32 len = le32_to_cpu(rec->r_clusters); + + ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, NULL, &index, + &new_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ref_leaf_bh = new_bh; + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + rf_list = &rb->rf_records; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (index < le16_to_cpu(rf_list->rl_used)) + memmove(&rf_list->rl_recs[index + 1], + &rf_list->rl_recs[index], + (le16_to_cpu(rf_list->rl_used) - index) * + sizeof(struct ocfs2_refcount_rec)); + + mlog(0, "insert refcount record start %llu, len %u, count %u " + "to leaf block %llu at index %d\n", + (unsigned long long)le64_to_cpu(rec->r_cpos), + le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount), + (unsigned long long)ref_leaf_bh->b_blocknr, index); + + rf_list->rl_recs[index] = *rec; + + le16_add_cpu(&rf_list->rl_used, 1); + + if (merge) + ocfs2_refcount_rec_merge(rb, index); + + ret = ocfs2_journal_dirty(handle, ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (index == 0) { + ret = ocfs2_adjust_refcount_rec(handle, ci, + ref_root_bh, + ref_leaf_bh, rec); + if (ret) + mlog_errno(ret); + } +out: + brelse(new_bh); + return ret; +} + +/* + * Split the refcount_rec indexed by "index" in ref_leaf_bh. + * This is much simple than our b-tree code. + * split_rec is the new refcount rec we want to insert. + * If split_rec->r_refcount > 0, we are changing the refcount(in case we + * increase refcount or decrease a refcount to non-zero). + * If split_rec->r_refcount == 0, we are punching a hole in current refcount + * rec( in case we decrease a refcount to zero). + */ +static int ocfs2_split_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *split_rec, + int index, int merge, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, recs_need; + u32 len; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rf_list = &rb->rf_records; + struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index]; + struct ocfs2_refcount_rec *tail_rec = NULL; + struct buffer_head *new_bh = NULL; + + BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); + + mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n", + le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters), + le64_to_cpu(split_rec->r_cpos), + le32_to_cpu(split_rec->r_clusters)); + + /* + * If we just need to split the header or tail clusters, + * no more recs are needed, just split is OK. + * Otherwise we at least need one new recs. + */ + if (!split_rec->r_refcount && + (split_rec->r_cpos == orig_rec->r_cpos || + le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters) == + le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) + recs_need = 0; + else + recs_need = 1; + + /* + * We need one more rec if we split in the middle and the new rec have + * some refcount in it. + */ + if (split_rec->r_refcount && + (split_rec->r_cpos != orig_rec->r_cpos && + le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters) != + le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) + recs_need++; + + /* If the leaf block don't have enough record, expand it. */ + if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) { + struct ocfs2_refcount_rec tmp_rec; + u64 cpos = le64_to_cpu(orig_rec->r_cpos); + len = le32_to_cpu(orig_rec->r_clusters); + ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We have to re-get it since now cpos may be moved to + * another leaf block. + */ + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &tmp_rec, &index, + &new_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ref_leaf_bh = new_bh; + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + rf_list = &rb->rf_records; + orig_rec = &rf_list->rl_recs[index]; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We have calculated out how many new records we need and store + * in recs_need, so spare enough space first by moving the records + * after "index" to the end. + */ + if (index != le16_to_cpu(rf_list->rl_used) - 1) + memmove(&rf_list->rl_recs[index + 1 + recs_need], + &rf_list->rl_recs[index + 1], + (le16_to_cpu(rf_list->rl_used) - index - 1) * + sizeof(struct ocfs2_refcount_rec)); + + len = (le64_to_cpu(orig_rec->r_cpos) + + le32_to_cpu(orig_rec->r_clusters)) - + (le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters)); + + /* + * If we have "len", the we will split in the tail and move it + * to the end of the space we have just spared. + */ + if (len) { + tail_rec = &rf_list->rl_recs[index + recs_need]; + + memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); + le64_add_cpu(&tail_rec->r_cpos, + le32_to_cpu(tail_rec->r_clusters) - len); + tail_rec->r_clusters = le32_to_cpu(len); + } + + /* + * If the split pos isn't the same as the original one, we need to + * split in the head. + * + * Note: We have the chance that split_rec.r_refcount = 0, + * recs_need = 0 and len > 0, which means we just cut the head from + * the orig_rec and in that case we have done some modification in + * orig_rec above, so the check for r_cpos is faked. + */ + if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) { + len = le64_to_cpu(split_rec->r_cpos) - + le64_to_cpu(orig_rec->r_cpos); + orig_rec->r_clusters = cpu_to_le32(len); + index++; + } + + le16_add_cpu(&rf_list->rl_used, recs_need); + + if (split_rec->r_refcount) { + rf_list->rl_recs[index] = *split_rec; + mlog(0, "insert refcount record start %llu, len %u, count %u " + "to leaf block %llu at index %d\n", + (unsigned long long)le64_to_cpu(split_rec->r_cpos), + le32_to_cpu(split_rec->r_clusters), + le32_to_cpu(split_rec->r_refcount), + (unsigned long long)ref_leaf_bh->b_blocknr, index); + + if (merge) + ocfs2_refcount_rec_merge(rb, index); + } + + ret = ocfs2_journal_dirty(handle, ref_leaf_bh); + if (ret) + mlog_errno(ret); + +out: + brelse(new_bh); + return ret; +} + +static int __ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, int merge, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0, index; + struct buffer_head *ref_leaf_bh = NULL; + struct ocfs2_refcount_rec rec; + unsigned int set_len = 0; + + mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len); + + while (len) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + set_len = le32_to_cpu(rec.r_clusters); + + /* + * Here we may meet with 3 situations: + * + * 1. If we find an already existing record, and the length + * is the same, cool, we just need to increase the r_refcount + * and it is OK. + * 2. If we find a hole, just insert it with r_refcount = 1. + * 3. If we are in the middle of one extent record, split + * it. + */ + if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos && + set_len <= len) { + mlog(0, "increase refcount rec, start %llu, len %u, " + "count %u\n", (unsigned long long)cpos, set_len, + le32_to_cpu(rec.r_refcount)); + ret = ocfs2_change_refcount_rec(handle, ci, + ref_leaf_bh, index, + merge, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (!rec.r_refcount) { + rec.r_refcount = cpu_to_le32(1); + + mlog(0, "insert refcount rec, start %llu, len %u\n", + (unsigned long long)le64_to_cpu(rec.r_cpos), + set_len); + ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, + ref_leaf_bh, + &rec, index, + merge, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + set_len = min((u64)(cpos + len), + le64_to_cpu(rec.r_cpos) + set_len) - cpos; + rec.r_cpos = cpu_to_le64(cpos); + rec.r_clusters = cpu_to_le32(set_len); + le32_add_cpu(&rec.r_refcount, 1); + + mlog(0, "split refcount rec, start %llu, " + "len %u, count %u\n", + (unsigned long long)le64_to_cpu(rec.r_cpos), + set_len, le32_to_cpu(rec.r_refcount)); + ret = ocfs2_split_refcount_rec(handle, ci, + ref_root_bh, ref_leaf_bh, + &rec, index, merge, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += set_len; + len -= set_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +static int ocfs2_remove_refcount_extent(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_extent_tree et; + + BUG_ON(rb->rf_records.rl_used); + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos), + 1, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_from_cache(ci, ref_leaf_bh); + + /* + * add the freed block to the dealloc so that it will be freed + * when we run dealloc. + */ + ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(rb->rf_suballoc_slot), + le64_to_cpu(rb->rf_blkno), + le16_to_cpu(rb->rf_suballoc_bit)); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + le32_add_cpu(&rb->rf_clusters, -1); + + /* + * check whether we need to restore the root refcount block if + * there is no leaf extent block at atll. + */ + if (!rb->rf_list.l_next_free_rec) { + BUG_ON(rb->rf_clusters); + + mlog(0, "reset refcount tree root %llu to be a record block.\n", + (unsigned long long)ref_root_bh->b_blocknr); + + rb->rf_flags = 0; + rb->rf_parent = 0; + rb->rf_cpos = 0; + memset(&rb->rf_records, 0, sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_records)); + rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); + } + + ocfs2_journal_dirty(handle, ref_root_bh); + +out: + return ret; +} + +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + return __ocfs2_increase_refcount(handle, ci, ref_root_bh, + cpos, len, 1, + meta_ac, dealloc); +} + +static int ocfs2_decrease_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + int index, u64 cpos, unsigned int len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index]; + + BUG_ON(cpos < le64_to_cpu(rec->r_cpos)); + BUG_ON(cpos + len > + le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters)); + + if (cpos == le64_to_cpu(rec->r_cpos) && + len == le32_to_cpu(rec->r_clusters)) + ret = ocfs2_change_refcount_rec(handle, ci, + ref_leaf_bh, index, 1, -1); + else { + struct ocfs2_refcount_rec split = *rec; + split.r_cpos = cpu_to_le64(cpos); + split.r_clusters = cpu_to_le32(len); + + le32_add_cpu(&split.r_refcount, -1); + + mlog(0, "split refcount rec, start %llu, " + "len %u, count %u, original start %llu, len %u\n", + (unsigned long long)le64_to_cpu(split.r_cpos), + len, le32_to_cpu(split.r_refcount), + (unsigned long long)le64_to_cpu(rec->r_cpos), + le32_to_cpu(rec->r_clusters)); + ret = ocfs2_split_refcount_rec(handle, ci, + ref_root_bh, ref_leaf_bh, + &split, index, 1, + meta_ac, dealloc); + } + + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Remove the leaf refcount block if it contains no refcount record. */ + if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) { + ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac, + dealloc); + if (ret) + mlog_errno(ret); + } + +out: + return ret; +} + +static int __ocfs2_decrease_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) +{ + int ret = 0, index = 0; + struct ocfs2_refcount_rec rec; + unsigned int r_count = 0, r_len; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct buffer_head *ref_leaf_bh = NULL; + + mlog(0, "Tree owner %llu, decrease refcount start %llu, " + "len %u, delete %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len, delete); + + while (len) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + r_count = le32_to_cpu(rec.r_refcount); + BUG_ON(r_count == 0); + if (!delete) + BUG_ON(r_count > 1); + + r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - cpos; + + ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh, + ref_leaf_bh, index, + cpos, r_len, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le32_to_cpu(rec.r_refcount) == 1 && delete) { + ret = ocfs2_cache_cluster_dealloc(dealloc, + ocfs2_clusters_to_blocks(sb, cpos), + r_len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += r_len; + len -= r_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +/* Caller must hold refcount tree lock. */ +int ocfs2_decrease_refcount(struct inode *inode, + handle_t *handle, u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) +{ + int ret; + u64 ref_blkno; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *tree; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_get_refcount_block(inode, &ref_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh, + cpos, len, meta_ac, dealloc, delete); + if (ret) + mlog_errno(ret); +out: + brelse(ref_root_bh); + return ret; +} + +/* + * Mark the already-existing extent at cpos as refcounted for len clusters. + * This adds the refcount extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. + */ +static int ocfs2_mark_extent_refcounted(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, + u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n", + inode->i_ino, cpos, len, phys); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + OCFS2_EXT_REFCOUNTED, 0); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +/* + * Given some contiguous physical clusters, calculate what we need + * for modifying their refcount. + */ +static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 start_cpos, + u32 clusters, + int *meta_add, + int *credits) +{ + int ret = 0, index, ref_blocks = 0, recs_add = 0; + u64 cpos = start_cpos; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_rec rec; + struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; + u32 len; + + mlog(0, "start_cpos %llu, clusters %u\n", + (unsigned long long)start_cpos, clusters); + while (clusters) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, clusters, &rec, + &index, &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (ref_leaf_bh != prev_bh) { + /* + * Now we encounter a new leaf block, so calculate + * whether we need to extend the old leaf. + */ + if (prev_bh) { + rb = (struct ocfs2_refcount_block *) + prev_bh->b_data; + + if (le64_to_cpu(rb->rf_records.rl_used) + + recs_add > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + } + + recs_add = 0; + *credits += 1; + brelse(prev_bh); + prev_bh = ref_leaf_bh; + get_bh(prev_bh); + } + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + + mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu," + "rec->r_clusters %u, rec->r_refcount %u, index %d\n", + recs_add, (unsigned long long)cpos, clusters, + (unsigned long long)le64_to_cpu(rec.r_cpos), + le32_to_cpu(rec.r_clusters), + le32_to_cpu(rec.r_refcount), index); + + len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - cpos; + /* + * If the refcount rec already exist, cool. We just need + * to check whether there is a split. Otherwise we just need + * to increase the refcount. + * If we will insert one, increases recs_add. + * + * We record all the records which will be inserted to the + * same refcount block, so that we can tell exactly whether + * we need a new refcount block or not. + */ + if (rec.r_refcount) { + /* Check whether we need a split at the beginning. */ + if (cpos == start_cpos && + cpos != le64_to_cpu(rec.r_cpos)) + recs_add++; + + /* Check whether we need a split in the end. */ + if (cpos + clusters < le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) + recs_add++; + } else + recs_add++; + + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + clusters -= len; + cpos += len; + } + + if (prev_bh) { + rb = (struct ocfs2_refcount_block *)prev_bh->b_data; + + if (le64_to_cpu(rb->rf_records.rl_used) + recs_add > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + + *credits += 1; + } + + if (!ref_blocks) + goto out; + + mlog(0, "we need ref_blocks %d\n", ref_blocks); + *meta_add += ref_blocks; + *credits += ref_blocks; + + /* + * So we may need ref_blocks to insert into the tree. + * That also means we need to change the b-tree and add that number + * of records since we never merge them. + * We need one more block for expansion since the new created leaf + * block is also full and needs split. + */ + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + *meta_add += ocfs2_extend_meta_needed(et.et_root_el); + *credits += ocfs2_calc_extend_credits(sb, + et.et_root_el, + ref_blocks); + } else { + *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + *meta_add += 1; + } + +out: + brelse(ref_leaf_bh); + brelse(prev_bh); + return ret; +} + +/* + * For refcount tree, we will decrease some contiguous clusters + * refcount count, so just go through it to see how many blocks + * we gonna touch and whether we need to create new blocks. + * + * Normally the refcount blocks store these refcount should be + * continguous also, so that we can get the number easily. + * As for meta_ac, we will at most add split 2 refcount record and + * 2 more refcount block, so just check it in a rough way. + * + * Caller must hold refcount tree lock. + */ +int ocfs2_prepare_refcount_change_for_del(struct inode *inode, + struct buffer_head *di_bh, + u64 phys_blkno, + u32 clusters, + int *credits, + struct ocfs2_alloc_context **meta_ac) +{ + int ret, ref_blocks = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *tree; + u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_refcount_loc), &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, + le64_to_cpu(di->i_refcount_loc), + &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, + &tree->rf_ci, + ref_root_bh, + start_cpos, clusters, + &ref_blocks, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "reserve new metadata %d, credits = %d\n", + ref_blocks, *credits); + + if (ref_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + ref_blocks, meta_ac); + if (ret) + mlog_errno(ret); + } + +out: + brelse(ref_root_bh); + return ret; +} + +#define MAX_CONTIG_BYTES 1048576 + +static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb) +{ + return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES); +} + +static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb) +{ + return ~(ocfs2_cow_contig_clusters(sb) - 1); +} + +/* + * Given an extent that starts at 'start' and an I/O that starts at 'cpos', + * find an offset (start + (n * contig_clusters)) that is closest to cpos + * while still being less than or equal to it. + * + * The goal is to break the extent at a multiple of contig_clusters. + */ +static inline unsigned int ocfs2_cow_align_start(struct super_block *sb, + unsigned int start, + unsigned int cpos) +{ + BUG_ON(start > cpos); + + return start + ((cpos - start) & ocfs2_cow_contig_mask(sb)); +} + +/* + * Given a cluster count of len, pad it out so that it is a multiple + * of contig_clusters. + */ +static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, + unsigned int len) +{ + unsigned int padded = + (len + (ocfs2_cow_contig_clusters(sb) - 1)) & + ocfs2_cow_contig_mask(sb); + + /* Did we wrap? */ + if (padded < len) + padded = UINT_MAX; + + return padded; +} + +/* + * Calculate out the start and number of virtual clusters we need to to CoW. + * + * cpos is vitual start cluster position we want to do CoW in a + * file and write_len is the cluster length. + * max_cpos is the place where we want to stop CoW intentionally. + * + * Normal we will start CoW from the beginning of extent record cotaining cpos. + * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we + * get good I/O from the resulting extent tree. + */ +static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, + struct ocfs2_extent_list *el, + u32 cpos, + u32 write_len, + u32 max_cpos, + u32 *cow_start, + u32 *cow_len) +{ + int ret = 0; + int tree_height = le16_to_cpu(el->l_tree_depth), i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb = NULL; + struct ocfs2_extent_rec *rec; + unsigned int want_clusters, rec_end = 0; + int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb); + int leaf_clusters; + + BUG_ON(cpos + write_len > max_cpos); + + if (tree_height > 0) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + *cow_len = 0; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + if (ocfs2_is_empty_extent(rec)) { + mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " + "index %d\n", inode->i_ino, i); + continue; + } + + if (le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters) <= cpos) + continue; + + if (*cow_len == 0) { + /* + * We should find a refcounted record in the + * first pass. + */ + BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED)); + *cow_start = le32_to_cpu(rec->e_cpos); + } + + /* + * If we encounter a hole, a non-refcounted record or + * pass the max_cpos, stop the search. + */ + if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) || + (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) || + (max_cpos <= le32_to_cpu(rec->e_cpos))) + break; + + leaf_clusters = le16_to_cpu(rec->e_leaf_clusters); + rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters; + if (rec_end > max_cpos) { + rec_end = max_cpos; + leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos); + } + + /* + * How many clusters do we actually need from + * this extent? First we see how many we actually + * need to complete the write. If that's smaller + * than contig_clusters, we try for contig_clusters. + */ + if (!*cow_len) + want_clusters = write_len; + else + want_clusters = (cpos + write_len) - + (*cow_start + *cow_len); + if (want_clusters < contig_clusters) + want_clusters = contig_clusters; + + /* + * If the write does not cover the whole extent, we + * need to calculate how we're going to split the extent. + * We try to do it on contig_clusters boundaries. + * + * Any extent smaller than contig_clusters will be + * CoWed in its entirety. + */ + if (leaf_clusters <= contig_clusters) + *cow_len += leaf_clusters; + else if (*cow_len || (*cow_start == cpos)) { + /* + * This extent needs to be CoW'd from its + * beginning, so all we have to do is compute + * how many clusters to grab. We align + * want_clusters to the edge of contig_clusters + * to get better I/O. + */ + want_clusters = ocfs2_cow_align_length(inode->i_sb, + want_clusters); + + if (leaf_clusters < want_clusters) + *cow_len += leaf_clusters; + else + *cow_len += want_clusters; + } else if ((*cow_start + contig_clusters) >= + (cpos + write_len)) { + /* + * Breaking off contig_clusters at the front + * of the extent will cover our write. That's + * easy. + */ + *cow_len = contig_clusters; + } else if ((rec_end - cpos) <= contig_clusters) { + /* + * Breaking off contig_clusters at the tail of + * this extent will cover cpos. + */ + *cow_start = rec_end - contig_clusters; + *cow_len = contig_clusters; + } else if ((rec_end - cpos) <= want_clusters) { + /* + * While we can't fit the entire write in this + * extent, we know that the write goes from cpos + * to the end of the extent. Break that off. + * We try to break it at some multiple of + * contig_clusters from the front of the extent. + * Failing that (ie, cpos is within + * contig_clusters of the front), we'll CoW the + * entire extent. + */ + *cow_start = ocfs2_cow_align_start(inode->i_sb, + *cow_start, cpos); + *cow_len = rec_end - *cow_start; + } else { + /* + * Ok, the entire write lives in the middle of + * this extent. Let's try to slice the extent up + * nicely. Optimally, our CoW region starts at + * m*contig_clusters from the beginning of the + * extent and goes for n*contig_clusters, + * covering the entire write. + */ + *cow_start = ocfs2_cow_align_start(inode->i_sb, + *cow_start, cpos); + + want_clusters = (cpos + write_len) - *cow_start; + want_clusters = ocfs2_cow_align_length(inode->i_sb, + want_clusters); + if (*cow_start + want_clusters <= rec_end) + *cow_len = want_clusters; + else + *cow_len = rec_end - *cow_start; + } + + /* Have we covered our entire write yet? */ + if ((*cow_start + *cow_len) >= (cpos + write_len)) + break; + + /* + * If we reach the end of the extent block and don't get enough + * clusters, continue with the next extent block if possible. + */ + if (i + 1 == le16_to_cpu(el->l_next_free_rec) && + eb && eb->h_next_leaf_blk) { + brelse(eb_bh); + eb_bh = NULL; + + ret = ocfs2_read_extent_block(INODE_CACHE(inode), + le64_to_cpu(eb->h_next_leaf_blk), + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + i = -1; + } + } + +out: + brelse(eb_bh); + return ret; +} + +/* + * Prepare meta_ac, data_ac and calculate credits when we want to add some + * num_clusters in data_tree "et" and change the refcount for the old + * clusters(starting form p_cluster) in the refcount tree. + * + * Note: + * 1. since we may split the old tree, so we at most will need num_clusters + 2 + * more new leaf records. + * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so + * just give data_ac = NULL. + */ +static int ocfs2_lock_refcount_allocators(struct super_block *sb, + u32 p_cluster, u32 num_clusters, + struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac, + int *credits) +{ + int ret = 0, meta_add = 0; + int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); + + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < num_clusters + 2) + meta_add = + ocfs2_extend_meta_needed(et->et_root_el); + + *credits += ocfs2_calc_extend_credits(sb, et->et_root_el, + num_clusters + 2); + + ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, + p_cluster, num_clusters, + &meta_add, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n", + meta_add, num_clusters, *credits); + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) { + ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters, + data_ac); + if (ret) + mlog_errno(ret); + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUG_ON(buffer_dirty(bh)); + + clear_buffer_mapped(bh); + + return 0; +} + +static int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0, partial; + struct ocfs2_caching_info *ci = context->data_et.et_ci; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct page *page; + pgoff_t page_index; + unsigned int from, to; + loff_t offset, end, map_end; + struct address_space *mapping = context->inode->i_mapping; + + mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, + new_cluster, new_len, cpos); + + offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); + + while (offset < end) { + page_index = offset >> PAGE_CACHE_SHIFT; + map_end = (page_index + 1) << PAGE_CACHE_SHIFT; + if (map_end > end) + map_end = end; + + /* from, to is the offset within the page. */ + from = offset & (PAGE_CACHE_SIZE - 1); + to = PAGE_CACHE_SIZE; + if (map_end & (PAGE_CACHE_SIZE - 1)) + to = map_end & (PAGE_CACHE_SIZE - 1); + + page = grab_cache_page(mapping, page_index); + + /* This page can't be dirtied before we CoW it out. */ + BUG_ON(PageDirty(page)); + + if (!PageUptodate(page)) { + ret = block_read_full_page(page, ocfs2_get_block); + if (ret) { + mlog_errno(ret); + goto unlock; + } + lock_page(page); + } + + if (page_has_buffers(page)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_clear_cow_buffer); + if (ret) { + mlog_errno(ret); + goto unlock; + } + } + + ocfs2_map_and_dirty_page(context->inode, + handle, from, to, + page, 0, &new_block); + mark_page_accessed(page); +unlock: + unlock_page(page); + page_cache_release(page); + page = NULL; + offset = map_end; + if (ret) + break; + } + + return ret; +} + +static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0; + struct super_block *sb = context->inode->i_sb; + struct ocfs2_caching_info *ci = context->data_et.et_ci; + int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); + u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); + u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct ocfs2_super *osb = OCFS2_SB(sb); + struct buffer_head *old_bh = NULL; + struct buffer_head *new_bh = NULL; + + mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster, + new_cluster, new_len); + + for (i = 0; i < blocks; i++, old_block++, new_block++) { + new_bh = sb_getblk(osb->sb, new_block); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + break; + } + + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_read_block(ci, old_block, &old_bh, NULL); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_journal_access(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + break; + } + + memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); + ret = ocfs2_journal_dirty(handle, new_bh); + if (ret) { + mlog_errno(ret); + break; + } + + brelse(new_bh); + brelse(old_bh); + new_bh = NULL; + old_bh = NULL; + } + + brelse(new_bh); + brelse(old_bh); + return ret; +} + +static int ocfs2_clear_ext_refcount(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 p_cluster, u32 len, + unsigned int ext_flags, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, index; + struct ocfs2_extent_rec replace_rec; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 ino = ocfs2_metadata_cache_owner(et->et_ci); + + mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n", + (unsigned long long)ino, cpos, len, p_cluster, ext_flags); + + memset(&replace_rec, 0, sizeof(replace_rec)); + replace_rec.e_cpos = cpu_to_le32(cpos); + replace_rec.e_leaf_clusters = cpu_to_le16(len); + replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb, + p_cluster)); + replace_rec.e_flags = ext_flags; + replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED; + + path = ocfs2_new_path_from_et(et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)ino, cpos); + ret = -EROFS; + goto out; + } + + ret = ocfs2_split_extent(handle, et, path, index, + &replace_rec, meta_ac, dealloc); + if (ret) + mlog_errno(ret); + +out: + ocfs2_free_path(path); + return ret; +} + +static int ocfs2_replace_clusters(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old, + u32 new, u32 len, + unsigned int ext_flags) +{ + int ret; + struct ocfs2_caching_info *ci = context->data_et.et_ci; + u64 ino = ocfs2_metadata_cache_owner(ci); + + mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n", + (unsigned long long)ino, cpos, old, new, len, ext_flags); + + /*If the old clusters is unwritten, no need to duplicate. */ + if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { + ret = context->cow_duplicate_clusters(handle, context, cpos, + old, new, len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_clear_ext_refcount(handle, &context->data_et, + cpos, new, len, ext_flags, + context->meta_ac, &context->dealloc); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +static int ocfs2_cow_sync_writeback(struct super_block *sb, + struct ocfs2_cow_context *context, + u32 cpos, u32 num_clusters) +{ + int ret = 0; + loff_t offset, end, map_end; + pgoff_t page_index; + struct page *page; + + if (ocfs2_should_order_data(context->inode)) + return 0; + + offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); + + ret = filemap_fdatawrite_range(context->inode->i_mapping, + offset, end - 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + while (offset < end) { + page_index = offset >> PAGE_CACHE_SHIFT; + map_end = (page_index + 1) << PAGE_CACHE_SHIFT; + if (map_end > end) + map_end = end; + + page = grab_cache_page(context->inode->i_mapping, page_index); + BUG_ON(!page); + + wait_on_page_writeback(page); + if (PageError(page)) { + ret = -EIO; + mlog_errno(ret); + } else + mark_page_accessed(page); + + unlock_page(page); + page_cache_release(page); + page = NULL; + offset = map_end; + if (ret) + break; + } + + return ret; +} + +static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + return ocfs2_get_clusters(context->inode, v_cluster, p_cluster, + num_clusters, extent_flags); +} + +static int ocfs2_make_clusters_writable(struct super_block *sb, + struct ocfs2_cow_context *context, + u32 cpos, u32 p_cluster, + u32 num_clusters, unsigned int e_flags) +{ + int ret, delete, index, credits = 0; + u32 new_bit, new_len; + unsigned int set_len; + struct ocfs2_super *osb = OCFS2_SB(sb); + handle_t *handle; + struct buffer_head *ref_leaf_bh = NULL; + struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; + struct ocfs2_refcount_rec rec; + + mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n", + cpos, p_cluster, num_clusters, e_flags); + + ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, + &context->data_et, + ref_ci, + context->ref_root_bh, + &context->meta_ac, + &context->data_ac, &credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (context->post_refcount) + credits += context->post_refcount->credits; + + credits += context->extra_credits; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + while (num_clusters) { + ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, + p_cluster, num_clusters, + &rec, &index, &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + BUG_ON(!rec.r_refcount); + set_len = min((u64)p_cluster + num_clusters, + le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - p_cluster; + + /* + * There are many different situation here. + * 1. If refcount == 1, remove the flag and don't COW. + * 2. If refcount > 1, allocate clusters. + * Here we may not allocate r_len once at a time, so continue + * until we reach num_clusters. + */ + if (le32_to_cpu(rec.r_refcount) == 1) { + delete = 0; + ret = ocfs2_clear_ext_refcount(handle, + &context->data_et, + cpos, p_cluster, + set_len, e_flags, + context->meta_ac, + &context->dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } else { + delete = 1; + + ret = __ocfs2_claim_clusters(osb, handle, + context->data_ac, + 1, set_len, + &new_bit, &new_len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_replace_clusters(handle, context, + cpos, p_cluster, new_bit, + new_len, e_flags); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + set_len = new_len; + } + + ret = __ocfs2_decrease_refcount(handle, ref_ci, + context->ref_root_bh, + p_cluster, set_len, + context->meta_ac, + &context->dealloc, delete); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + cpos += set_len; + p_cluster += set_len; + num_clusters -= set_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + + /* handle any post_cow action. */ + if (context->post_refcount && context->post_refcount->func) { + ret = context->post_refcount->func(context->inode, handle, + context->post_refcount->para); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + if (context->get_clusters == ocfs2_di_get_clusters) { + ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (context->data_ac) { + ocfs2_free_alloc_context(context->data_ac); + context->data_ac = NULL; + } + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + brelse(ref_leaf_bh); + + return ret; +} + +static int ocfs2_replace_cow(struct ocfs2_cow_context *context) +{ + int ret = 0; + struct inode *inode = context->inode; + u32 cow_start = context->cow_start, cow_len = context->cow_len; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + return -EROFS; + } + + ocfs2_init_dealloc_ctxt(&context->dealloc); + + while (cow_len) { + ret = context->get_clusters(context, cow_start, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); + + if (cow_len < num_clusters) + num_clusters = cow_len; + + ret = ocfs2_make_clusters_writable(inode->i_sb, context, + cow_start, p_cluster, + num_clusters, ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + cow_len -= num_clusters; + cow_start += num_clusters; + } + + if (ocfs2_dealloc_has_cluster(&context->dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &context->dealloc); + } + + return ret; +} + +/* + * Starting at cpos, try to CoW write_len clusters. Don't CoW + * past max_cpos. This will stop when it runs into a hole or an + * unrefcounted extent. + */ +static int ocfs2_refcount_cow_hunk(struct inode *inode, + struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos) +{ + int ret; + u32 cow_start = 0, cow_len = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + struct ocfs2_cow_context *context = NULL; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, + cpos, write_len, max_cpos, + &cow_start, &cow_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, " + "cow_len %u\n", inode->i_ino, + cpos, write_len, cow_start, cow_len); + + BUG_ON(cow_len == 0); + + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); + if (!context) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + context->inode = inode; + context->cow_start = cow_start; + context->cow_len = cow_len; + context->ref_tree = ref_tree; + context->ref_root_bh = ref_root_bh; + context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; + context->get_clusters = ocfs2_di_get_clusters; + + ocfs2_init_dinode_extent_tree(&context->data_et, + INODE_CACHE(inode), di_bh); + + ret = ocfs2_replace_cow(context); + if (ret) + mlog_errno(ret); + + /* + * truncate the extent map here since no matter whether we meet with + * any error during the action, we shouldn't trust cached extent map + * any more. + */ + ocfs2_extent_map_trunc(inode, cow_start); + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +out: + kfree(context); + return ret; +} + +/* + * CoW any and all clusters between cpos and cpos+write_len. + * Don't CoW past max_cpos. If this returns successfully, all + * clusters between cpos and cpos+write_len are safe to modify. + */ +int ocfs2_refcount_cow(struct inode *inode, + struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos) +{ + int ret = 0; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + + while (write_len) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + if (write_len < num_clusters) + num_clusters = write_len; + + if (ext_flags & OCFS2_EXT_REFCOUNTED) { + ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, + num_clusters, max_cpos); + if (ret) { + mlog_errno(ret); + break; + } + } + + write_len -= num_clusters; + cpos += num_clusters; + } + + return ret; +} + +static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + struct inode *inode = context->inode; + struct ocfs2_xattr_value_root *xv = context->cow_object; + + return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster, + num_clusters, &xv->xr_list, + extent_flags); +} + +/* + * Given a xattr value root, calculate the most meta/credits we need for + * refcount tree change if we truncate it to 0. + */ +int ocfs2_refcounted_xattr_delete_need(struct inode *inode, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_xattr_value_root *xv, + int *meta_add, int *credits) +{ + int ret = 0, index, ref_blocks = 0; + u32 p_cluster, num_clusters; + u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters); + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_rec rec; + struct buffer_head *ref_leaf_bh = NULL; + + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &xv->xr_list, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos += num_clusters; + + while (num_clusters) { + ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh, + p_cluster, num_clusters, + &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!rec.r_refcount); + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + + /* + * We really don't know whether the other clusters is in + * this refcount block or not, so just take the worst + * case that all the clusters are in this block and each + * one will split a refcount rec, so totally we need + * clusters * 2 new refcount rec. + */ + if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + + *credits += 1; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + + if (num_clusters <= le32_to_cpu(rec.r_clusters)) + break; + else + num_clusters -= le32_to_cpu(rec.r_clusters); + p_cluster += num_clusters; + } + } + + *meta_add += ref_blocks; + if (!ref_blocks) + goto out; + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + else { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh); + *credits += ocfs2_calc_extend_credits(inode->i_sb, + et.et_root_el, + ref_blocks); + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +/* + * Do CoW for xattr. + */ +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_root_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post) +{ + int ret; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_cow_context *context = NULL; + u32 cow_start, cow_len; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, + cpos, write_len, UINT_MAX, + &cow_start, &cow_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(cow_len == 0); + + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); + if (!context) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + context->inode = inode; + context->cow_start = cow_start; + context->cow_len = cow_len; + context->ref_tree = ref_tree; + context->ref_root_bh = ref_root_bh;; + context->cow_object = xv; + + context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; + /* We need the extra credits for duplicate_clusters by jbd. */ + context->extra_credits = + ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len; + context->get_clusters = ocfs2_xattr_value_get_clusters; + context->post_refcount = post; + + ocfs2_init_xattr_value_extent_tree(&context->data_et, + INODE_CACHE(inode), vb); + + ret = ocfs2_replace_cow(context); + if (ret) + mlog_errno(ret); + +out: + kfree(context); + return ret; +} + +/* + * Insert a new extent into refcount tree and mark a extent rec + * as refcounted in the dinode tree. + */ +int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *data_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *post) +{ + int ret; + handle_t *handle; + int credits = 1, ref_blocks = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, + ref_ci, ref_root_bh, + p_cluster, num_clusters, + &ref_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "reserve new metadata %d, credits = %d\n", + ref_blocks, credits); + + if (ref_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + ref_blocks, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (post) + credits += post->credits; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_mark_extent_refcounted(inode, data_et, handle, + cpos, num_clusters, p_cluster, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, + p_cluster, num_clusters, 0, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (post && post->func) { + ret = post->func(inode, handle, post->para); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_change_ctime(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_attach_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret, data_changed = 0; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_tree *ref_tree; + unsigned int ext_flags; + loff_t size; + u32 cpos, num_clusters, clusters, p_cluster; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree di_et; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { + ret = ocfs2_create_refcount_tree(inode, di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + BUG_ON(!di->i_refcount_loc); + ret = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(di->i_refcount_loc), 1, + &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh); + + size = i_size_read(inode); + clusters = ocfs2_clusters_for_bytes(inode->i_sb, size); + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + + if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { + ret = ocfs2_add_refcount_flag(inode, &di_et, + &ref_tree->rf_ci, + ref_root_bh, cpos, + p_cluster, num_clusters, + &dealloc, NULL); + if (ret) { + mlog_errno(ret); + goto unlock; + } + + data_changed = 1; + } + cpos += num_clusters; + } + + if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh, + &ref_tree->rf_ci, + ref_root_bh, + &dealloc); + if (ret) { + mlog_errno(ret); + goto unlock; + } + } + + if (data_changed) { + ret = ocfs2_change_ctime(inode, di_bh); + if (ret) + mlog_errno(ret); + } + +unlock: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + + if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } +out: + /* + * Empty the extent map so that we may get the right extent + * record from the disk. + */ + ocfs2_extent_map_trunc(inode, 0); + + return ret; +} + +static int ocfs2_add_refcounted_extent(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + unsigned int ext_flags, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + handle_t *handle; + int credits = 0; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_lock_refcount_allocators(inode->i_sb, + p_cluster, num_clusters, + et, ref_ci, + ref_root_bh, &meta_ac, + NULL, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_insert_extent(handle, et, cpos, + cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, + p_cluster)), + num_clusters, ext_flags, meta_ac); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, + p_cluster, num_clusters, + meta_ac, dealloc); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_duplicate_extent_list(struct inode *s_inode, + struct inode *t_inode, + struct buffer_head *t_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + u32 p_cluster, num_clusters, clusters, cpos; + loff_t size; + unsigned int ext_flags; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh); + + size = i_size_read(s_inode); + clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size); + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + + if (p_cluster) { + ret = ocfs2_add_refcounted_extent(t_inode, &et, + ref_ci, ref_root_bh, + cpos, p_cluster, + num_clusters, + ext_flags, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += num_clusters; + } + +out: + return ret; +} + +/* + * change the new file's attributes to the src. + * + * reflink creates a snapshot of a file, that means the attributes + * must be identical except for three exceptions - nlink, ino, and ctime. + */ +static int ocfs2_complete_reflink(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh, + bool preserve) +{ + int ret; + handle_t *handle; + struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data; + loff_t size = i_size_read(s_inode); + + handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters; + OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr; + OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + i_size_write(t_inode, size); + + di->i_xattr_inline_size = s_di->i_xattr_inline_size; + di->i_clusters = s_di->i_clusters; + di->i_size = s_di->i_size; + di->i_dyn_features = s_di->i_dyn_features; + di->i_attr = s_di->i_attr; + + if (preserve) { + di->i_uid = s_di->i_uid; + di->i_gid = s_di->i_gid; + di->i_mode = s_di->i_mode; + + /* + * update time. + * we want mtime to appear identical to the source and + * update ctime. + */ + t_inode->i_ctime = CURRENT_TIME; + + di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); + + t_inode->i_mtime = s_inode->i_mtime; + di->i_mtime = s_di->i_mtime; + di->i_mtime_nsec = s_di->i_mtime_nsec; + } + + ocfs2_journal_dirty(handle, t_bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle); + return ret; +} + +static int ocfs2_create_reflink_node(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh, + bool preserve) +{ + int ret; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); + struct ocfs2_refcount_block *rb; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_refcount_tree *ref_tree; + + ocfs2_init_dealloc_ctxt(&dealloc); + + ret = ocfs2_set_refcount_tree(t_inode, t_bh, + le64_to_cpu(di->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, + &ref_tree->rf_ci, ref_root_bh, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + + ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh, preserve); + if (ret) + mlog_errno(ret); + +out_unlock_refcount: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +out: + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } + + return ret; +} + +static int __ocfs2_reflink(struct dentry *old_dentry, + struct buffer_head *old_bh, + struct inode *new_inode, + bool preserve) +{ + int ret; + struct inode *inode = old_dentry->d_inode; + struct buffer_head *new_bh = NULL; + + ret = filemap_fdatawrite(inode->i_mapping); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_attach_refcount_tree(inode, old_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + mutex_lock(&new_inode->i_mutex); + ret = ocfs2_inode_lock(new_inode, &new_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_create_reflink_node(inode, old_bh, + new_inode, new_bh, preserve); + if (ret) { + mlog_errno(ret); + goto inode_unlock; + } + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_reflink_xattrs(inode, old_bh, + new_inode, new_bh, + preserve); + if (ret) + mlog_errno(ret); + } +inode_unlock: + ocfs2_inode_unlock(new_inode, 1); + brelse(new_bh); +out_unlock: + mutex_unlock(&new_inode->i_mutex); +out: + if (!ret) { + ret = filemap_fdatawait(inode->i_mapping); + if (ret) + mlog_errno(ret); + } + return ret; +} + +static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) +{ + int error; + struct inode *inode = old_dentry->d_inode; + struct buffer_head *old_bh = NULL; + struct inode *new_orphan_inode = NULL; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + &new_orphan_inode); + if (error) { + mlog_errno(error); + goto out; + } + + error = ocfs2_inode_lock(inode, &old_bh, 1); + if (error) { + mlog_errno(error); + goto out; + } + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + down_write(&OCFS2_I(inode)->ip_alloc_sem); + error = __ocfs2_reflink(old_dentry, old_bh, + new_orphan_inode, preserve); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + up_write(&OCFS2_I(inode)->ip_xattr_sem); + + ocfs2_inode_unlock(inode, 1); + brelse(old_bh); + + if (error) { + mlog_errno(error); + goto out; + } + + /* If the security isn't preserved, we need to re-initialize them. */ + if (!preserve) { + error = ocfs2_init_security_and_acl(dir, new_orphan_inode); + if (error) + mlog_errno(error); + } +out: + if (!error) { + error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + new_dentry); + if (error) + mlog_errno(error); + } + + if (new_orphan_inode) { + /* + * We need to open_unlock the inode no matter whether we + * succeed or not, so that other nodes can delete it later. + */ + ocfs2_open_unlock(new_orphan_inode); + if (error) + iput(new_orphan_inode); + } + + return error; +} + +/* + * Below here are the bits used by OCFS2_IOC_REFLINK() to fake + * sys_reflink(). This will go away when vfs_reflink() exists in + * fs/namei.c. + */ + +/* copied from may_create in VFS. */ +static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* copied from user_path_parent. */ +static int ocfs2_user_path_parent(const char __user *path, + struct nameidata *nd, char **name) +{ + char *s = getname(path); + int error; + + if (IS_ERR(s)) + return PTR_ERR(s); + + error = path_lookup(s, LOOKUP_PARENT, nd); + if (error) + putname(s); + else + *name = s; + + return error; +} + +/** + * ocfs2_vfs_reflink - Create a reference-counted link + * + * @old_dentry: source dentry + inode + * @dir: directory to create the target + * @new_dentry: target dentry + * @preserve: if true, preserve all file attributes + */ +int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) +{ + struct inode *inode = old_dentry->d_inode; + int error; + + if (!inode) + return -ENOENT; + + error = ocfs2_may_create(dir, new_dentry); + if (error) + return error; + + if (dir->i_sb != inode->i_sb) + return -EXDEV; + + /* + * A reflink to an append-only or immutable file cannot be created. + */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + /* Only regular files can be reflinked. */ + if (!S_ISREG(inode->i_mode)) + return -EPERM; + + /* + * If the caller wants to preserve ownership, they require the + * rights to do so. + */ + if (preserve) { + if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN)) + return -EPERM; + if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN)) + return -EPERM; + } + + /* + * If the caller is modifying any aspect of the attributes, they + * are not creating a snapshot. They need read permission on the + * file. + */ + if (!preserve) { + error = inode_permission(inode, MAY_READ); + if (error) + return error; + } + + mutex_lock(&inode->i_mutex); + vfs_dq_init(dir); + error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); + mutex_unlock(&inode->i_mutex); + if (!error) + fsnotify_create(dir, new_dentry); + return error; +} +/* + * Most codes are copied from sys_linkat. + */ +int ocfs2_reflink_ioctl(struct inode *inode, + const char __user *oldname, + const char __user *newname, + bool preserve) +{ + struct dentry *new_dentry; + struct nameidata nd; + struct path old_path; + int error; + char *to = NULL; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + error = user_path_at(AT_FDCWD, oldname, 0, &old_path); + if (error) { + mlog_errno(error); + return error; + } + + error = ocfs2_user_path_parent(newname, &nd, &to); + if (error) { + mlog_errno(error); + goto out; + } + + error = -EXDEV; + if (old_path.mnt != nd.path.mnt) + goto out_release; + new_dentry = lookup_create(&nd, 0); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) { + mlog_errno(error); + goto out_unlock; + } + + error = mnt_want_write(nd.path.mnt); + if (error) { + mlog_errno(error); + goto out_dput; + } + + error = ocfs2_vfs_reflink(old_path.dentry, + nd.path.dentry->d_inode, + new_dentry, preserve); + mnt_drop_write(nd.path.mnt); +out_dput: + dput(new_dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +out_release: + path_put(&nd.path); + putname(to); +out: + path_put(&old_path); + + return error; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h new file mode 100644 index 0000000..c1d19b1 --- /dev/null +++ b/fs/ocfs2/refcounttree.h @@ -0,0 +1,106 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * refcounttree.h + * + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef OCFS2_REFCOUNTTREE_H +#define OCFS2_REFCOUNTTREE_H + +struct ocfs2_refcount_tree { + struct rb_node rf_node; + u64 rf_blkno; + u32 rf_generation; + struct rw_semaphore rf_sem; + struct ocfs2_lock_res rf_lockres; + struct kref rf_getcnt; + int rf_removed; + + /* the following 4 fields are used by caching_info. */ + struct ocfs2_caching_info rf_ci; + spinlock_t rf_lock; + struct mutex rf_io_mutex; + struct super_block *rf_sb; +}; + +void ocfs2_purge_refcount_trees(struct ocfs2_super *osb); +int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw, + struct ocfs2_refcount_tree **tree, + struct buffer_head **ref_bh); +void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, + int rw); + +int ocfs2_decrease_refcount(struct inode *inode, + handle_t *handle, u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete); +int ocfs2_prepare_refcount_change_for_del(struct inode *inode, + struct buffer_head *di_bh, + u64 phys_blkno, + u32 clusters, + int *credits, + struct ocfs2_alloc_context **meta_ac); +int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos); + +typedef int (ocfs2_post_refcount_func)(struct inode *inode, + handle_t *handle, + void *para); +/* + * Some refcount caller need to do more work after we modify the data b-tree + * during refcount operation(including CoW and add refcount flag), and make the + * transaction complete. So it must give us this structure so that we can do it + * within our transaction. + * + */ +struct ocfs2_post_refcount { + int credits; /* credits it need for journal. */ + ocfs2_post_refcount_func *func; /* real function. */ + void *para; +}; + +int ocfs2_refcounted_xattr_delete_need(struct inode *inode, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_xattr_value_root *xv, + int *meta_add, int *credits); +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_root_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post); +int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *data_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *post); +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh); +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh); +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_reflink_ioctl(struct inode *inode, + const char __user *oldname, + const char __user *newname, + bool preserve); +#endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 424adaa..3c3d673 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", new_clusters, first_new_cluster); - ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode), + group_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; @@ -141,7 +141,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, } /* update the inode accordingly. */ - ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); @@ -514,7 +514,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out_unlock; } - ocfs2_set_new_buffer_uptodate(inode, group_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh); ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh); if (ret) { @@ -536,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) cl = &fe->id2.i_chain; cr = &cl->cl_recs[input->chain]; - ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode), + group_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_commit; @@ -552,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out_commit; } - ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), + main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_commit; diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 40661e7..bfbd7e9 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If * this is not true, the read of -1 (UINT64_MAX) will fail. */ - ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, - OCFS2_BH_IGNORE_CACHE, NULL); + ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks, + si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL); if (ret == 0) { spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); @@ -213,7 +213,7 @@ static int ocfs2_update_disk_slot(struct ocfs2_super *osb, ocfs2_update_disk_slot_old(si, slot_num, &bh); spin_unlock(&osb->osb_lock); - status = ocfs2_write_block(osb, bh, si->si_inode); + status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode)); if (status < 0) mlog_errno(status); @@ -404,8 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, (unsigned long long)blkno); bh = NULL; /* Acquire a fresh bh */ - status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, - OCFS2_BH_IGNORE_CACHE, NULL); + status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno, + 1, &bh, OCFS2_BH_IGNORE_CACHE, NULL); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 73a16d4..c30b644 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -310,7 +310,7 @@ int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, int rc; struct buffer_head *tmp = *bh; - rc = ocfs2_read_block(inode, gd_blkno, &tmp, + rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, ocfs2_validate_group_descriptor); if (rc) goto out; @@ -352,7 +352,7 @@ static int ocfs2_block_group_fill(handle_t *handle, } status = ocfs2_journal_access_gd(handle, - alloc_inode, + INODE_CACHE(alloc_inode), bg_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { @@ -476,7 +476,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, mlog_errno(status); goto bail; } - ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); status = ocfs2_block_group_fill(handle, alloc_inode, @@ -491,7 +491,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, bg = (struct ocfs2_group_desc *) bg_bh->b_data; - status = ocfs2_journal_access_di(handle, alloc_inode, + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1033,7 +1033,7 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, journal_type = OCFS2_JOURNAL_ACCESS_UNDO; status = ocfs2_journal_access_gd(handle, - alloc_inode, + INODE_CACHE(alloc_inode), group_bh, journal_type); if (status < 0) { @@ -1106,7 +1106,8 @@ static int ocfs2_relink_block_group(handle_t *handle, bg_ptr = le64_to_cpu(bg->bg_next_group); prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); - status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh, + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + prev_bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1121,8 +1122,8 @@ static int ocfs2_relink_block_group(handle_t *handle, goto out_rollback; } - status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_rollback; @@ -1136,8 +1137,8 @@ static int ocfs2_relink_block_group(handle_t *handle, goto out_rollback; } - status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_rollback; @@ -1288,7 +1289,7 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode, struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; - ret = ocfs2_journal_access_di(handle, inode, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); @@ -1461,7 +1462,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, /* Ok, claim our bits now: set the info on dinode, chainlist * and then the group */ status = ocfs2_journal_access_di(handle, - alloc_inode, + INODE_CACHE(alloc_inode), ac->ac_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { @@ -1907,8 +1908,8 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle, if (ocfs2_is_cluster_bitmap(alloc_inode)) journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh, - journal_type); + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + group_bh, journal_type); if (status < 0) { mlog_errno(status); goto bail; @@ -1993,8 +1994,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle, goto bail; } - status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -2151,7 +2152,7 @@ int ocfs2_lock_allocators(struct inode *inode, BUG_ON(clusters_to_add != 0 && data_ac == NULL); - num_free_extents = ocfs2_num_free_extents(osb, inode, et); + num_free_extents = ocfs2_num_free_extents(osb, et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a3f8871..c0e48ae 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -28,7 +28,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/random.h> #include <linux/statfs.h> @@ -69,6 +68,7 @@ #include "ver.h" #include "xattr.h" #include "quota.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -373,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, } #endif /* CONFIG_DEBUG_FS */ -static struct file_operations ocfs2_osb_debug_fops = { +static const struct file_operations ocfs2_osb_debug_fops = { .open = ocfs2_osb_debug_open, .release = ocfs2_debug_release, .read = ocfs2_debug_read, @@ -965,7 +965,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type, int remount) return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED); } -static struct quotactl_ops ocfs2_quotactl_ops = { +static const struct quotactl_ops ocfs2_quotactl_ops = { .quota_on = ocfs2_quota_on, .quota_off = ocfs2_quota_off, .quota_sync = vfs_quota_sync, @@ -1668,8 +1668,6 @@ static void ocfs2_inode_init_once(void *data) spin_lock_init(&oi->ip_lock); ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); - oi->ip_created_trans = 0; - oi->ip_last_trans = 0; oi->ip_dir_start_lookup = 0; init_rwsem(&oi->ip_alloc_sem); @@ -1683,7 +1681,8 @@ static void ocfs2_inode_init_once(void *data) ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); - ocfs2_metadata_cache_init(&oi->vfs_inode); + ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), + &ocfs2_inode_caching_ops); inode_init_once(&oi->vfs_inode); } @@ -1859,6 +1858,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_sync_blockdev(sb); + ocfs2_purge_refcount_trees(osb); + /* No cluster connection means we've failed during mount, so skip * all the steps which depended on that to complete. */ if (osb->cconn) { @@ -2065,6 +2066,8 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } + osb->osb_rf_lock_tree = RB_ROOT; + osb->s_feature_compat = le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); osb->s_feature_ro_compat = @@ -2490,7 +2493,8 @@ void __ocfs2_abort(struct super_block* sb, /* Force a panic(). This stinks, but it's better than letting * things continue without having a proper hard readonly * here. */ - OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + if (!ocfs2_mount_local(OCFS2_SB(sb))) + OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; ocfs2_handle_error(sb); } diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 579dd1b..e342103 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -38,7 +38,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/pagemap.h> -#include <linux/utsname.h> #include <linux/namei.h> #define MLOG_MASK_PREFIX ML_NAMEI diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 187b99f..b6284f2 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -75,15 +75,77 @@ struct ocfs2_meta_cache_item { static struct kmem_cache *ocfs2_uptodate_cachep = NULL; -void ocfs2_metadata_cache_init(struct inode *inode) +u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci) { - struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + BUG_ON(!ci || !ci->ci_ops); - oi->ip_flags |= OCFS2_INODE_CACHE_INLINE; + return ci->ci_ops->co_owner(ci); +} + +struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + return ci->ci_ops->co_get_super(ci); +} + +static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_cache_lock(ci); +} + +static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_cache_unlock(ci); +} + +void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_io_lock(ci); +} + +void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_io_unlock(ci); +} + + +static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci, + int clear) +{ + ci->ci_flags |= OCFS2_CACHE_FL_INLINE; ci->ci_num_cached = 0; + + if (clear) { + ci->ci_created_trans = 0; + ci->ci_last_trans = 0; + } +} + +void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci, + const struct ocfs2_caching_operations *ops) +{ + BUG_ON(!ops); + + ci->ci_ops = ops; + ocfs2_metadata_cache_reset(ci, 1); } +void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci) +{ + ocfs2_metadata_cache_purge(ci); + ocfs2_metadata_cache_reset(ci, 1); +} + + /* No lock taken here as 'root' is not expected to be visible to other * processes. */ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) @@ -112,19 +174,20 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) * This function is a few more lines longer than necessary due to some * accounting done here, but I think it's worth tracking down those * bugs sooner -- Mark */ -void ocfs2_metadata_cache_purge(struct inode *inode) +void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci) { - struct ocfs2_inode_info *oi = OCFS2_I(inode); unsigned int tree, to_purge, purged; - struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; struct rb_root root = RB_ROOT; - spin_lock(&oi->ip_lock); - tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE); + BUG_ON(!ci || !ci->ci_ops); + + ocfs2_metadata_cache_lock(ci); + tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE); to_purge = ci->ci_num_cached; - mlog(0, "Purge %u %s items from Inode %llu\n", to_purge, - tree ? "array" : "tree", (unsigned long long)oi->ip_blkno); + mlog(0, "Purge %u %s items from Owner %llu\n", to_purge, + tree ? "array" : "tree", + (unsigned long long)ocfs2_metadata_cache_owner(ci)); /* If we're a tree, save off the root so that we can safely * initialize the cache. We do the work to free tree members @@ -132,16 +195,17 @@ void ocfs2_metadata_cache_purge(struct inode *inode) if (tree) root = ci->ci_cache.ci_tree; - ocfs2_metadata_cache_init(inode); - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_reset(ci, 0); + ocfs2_metadata_cache_unlock(ci); purged = ocfs2_purge_copied_metadata_tree(&root); /* If possible, track the number wiped so that we can more * easily detect counting errors. Unfortunately, this is only * meaningful for trees. */ if (tree && purged != to_purge) - mlog(ML_ERROR, "Inode %llu, count = %u, purged = %u\n", - (unsigned long long)oi->ip_blkno, to_purge, purged); + mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + to_purge, purged); } /* Returns the index in the cache array, -1 if not found. @@ -182,27 +246,25 @@ ocfs2_search_cache_tree(struct ocfs2_caching_info *ci, return NULL; } -static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, +static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci, struct buffer_head *bh) { int index = -1; struct ocfs2_meta_cache_item *item = NULL; - spin_lock(&oi->ip_lock); + ocfs2_metadata_cache_lock(ci); - mlog(0, "Inode %llu, query block %llu (inline = %u)\n", - (unsigned long long)oi->ip_blkno, + mlog(0, "Owner %llu, query block %llu (inline = %u)\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long) bh->b_blocknr, - !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)); + !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE)); - if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) - index = ocfs2_search_cache_array(&oi->ip_metadata_cache, - bh->b_blocknr); + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) + index = ocfs2_search_cache_array(ci, bh->b_blocknr); else - item = ocfs2_search_cache_tree(&oi->ip_metadata_cache, - bh->b_blocknr); + item = ocfs2_search_cache_tree(ci, bh->b_blocknr); - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_unlock(ci); mlog(0, "index = %d, item = %p\n", index, item); @@ -214,7 +276,7 @@ static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, * * This can be called under lock_buffer() */ -int ocfs2_buffer_uptodate(struct inode *inode, +int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh) { /* Doesn't matter if the bh is in our cache or not -- if it's @@ -230,24 +292,24 @@ int ocfs2_buffer_uptodate(struct inode *inode, /* Ok, locally the buffer is marked as up to date, now search * our cache to see if we can trust that. */ - return ocfs2_buffer_cached(OCFS2_I(inode), bh); + return ocfs2_buffer_cached(ci, bh); } -/* +/* * Determine whether a buffer is currently out on a read-ahead request. - * ip_io_sem should be held to serialize submitters with the logic here. + * ci_io_sem should be held to serialize submitters with the logic here. */ -int ocfs2_buffer_read_ahead(struct inode *inode, +int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci, struct buffer_head *bh) { - return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh); + return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh); } /* Requires ip_lock */ static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, sector_t block) { - BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY); + BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY); mlog(0, "block %llu takes position %u\n", (unsigned long long) block, ci->ci_num_cached); @@ -292,66 +354,64 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci, ci->ci_num_cached++; } -static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi, - struct ocfs2_caching_info *ci) +/* co_cache_lock() must be held */ +static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci) { - assert_spin_locked(&oi->ip_lock); - - return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) && - (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY); + return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) && + (ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY); } -/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the +/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the * pointers in tree after we use them - this allows caller to detect - * when to free in case of error. */ -static void ocfs2_expand_cache(struct ocfs2_inode_info *oi, + * when to free in case of error. + * + * The co_cache_lock() must be held. */ +static void ocfs2_expand_cache(struct ocfs2_caching_info *ci, struct ocfs2_meta_cache_item **tree) { int i; - struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; - mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY, - "Inode %llu, num cached = %u, should be %u\n", - (unsigned long long)oi->ip_blkno, ci->ci_num_cached, - OCFS2_INODE_MAX_CACHE_ARRAY); - mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), - "Inode %llu not marked as inline anymore!\n", - (unsigned long long)oi->ip_blkno); - assert_spin_locked(&oi->ip_lock); + mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY, + "Owner %llu, num cached = %u, should be %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY); + mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE), + "Owner %llu not marked as inline anymore!\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci)); /* Be careful to initialize the tree members *first* because * once the ci_tree is used, the array is junk... */ - for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) tree[i]->c_block = ci->ci_cache.ci_array[i]; - oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE; + ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE; ci->ci_cache.ci_tree = RB_ROOT; /* this will be set again by __ocfs2_insert_cache_tree */ ci->ci_num_cached = 0; - for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) { __ocfs2_insert_cache_tree(ci, tree[i]); tree[i] = NULL; } mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n", - (unsigned long long)oi->ip_blkno, oi->ip_flags, ci->ci_num_cached); + (unsigned long long)ocfs2_metadata_cache_owner(ci), + ci->ci_flags, ci->ci_num_cached); } /* Slow path function - memory allocation is necessary. See the * comment above ocfs2_set_buffer_uptodate for more information. */ -static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, +static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, sector_t block, int expand_tree) { int i; - struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; struct ocfs2_meta_cache_item *new = NULL; - struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] = + struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] = { NULL, }; - mlog(0, "Inode %llu, block %llu, expand = %d\n", - (unsigned long long)oi->ip_blkno, + mlog(0, "Owner %llu, block %llu, expand = %d\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)block, expand_tree); new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); @@ -364,7 +424,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, if (expand_tree) { /* Do *not* allocate an array here - the removal code * has no way of tracking that. */ - for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) { tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); if (!tree[i]) { @@ -376,21 +436,21 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, } } - spin_lock(&oi->ip_lock); - if (ocfs2_insert_can_use_array(oi, ci)) { + ocfs2_metadata_cache_lock(ci); + if (ocfs2_insert_can_use_array(ci)) { mlog(0, "Someone cleared the tree underneath us\n"); /* Ok, items were removed from the cache in between * locks. Detect this and revert back to the fast path */ ocfs2_append_cache_array(ci, block); - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_unlock(ci); goto out_free; } if (expand_tree) - ocfs2_expand_cache(oi, tree); + ocfs2_expand_cache(ci, tree); __ocfs2_insert_cache_tree(ci, new); - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_unlock(ci); new = NULL; out_free: @@ -400,14 +460,14 @@ out_free: /* If these were used, then ocfs2_expand_cache re-set them to * NULL for us. */ if (tree[0]) { - for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) if (tree[i]) kmem_cache_free(ocfs2_uptodate_cachep, tree[i]); } } -/* Item insertion is guarded by ip_io_mutex, so the insertion path takes +/* Item insertion is guarded by co_io_lock(), so the insertion path takes * advantage of this by not rechecking for a duplicate insert during * the slow case. Additionally, if the cache needs to be bumped up to * a tree, the code will not recheck after acquiring the lock -- @@ -425,59 +485,55 @@ out_free: * Readahead buffers can be passed in here before the I/O request is * completed. */ -void ocfs2_set_buffer_uptodate(struct inode *inode, +void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh) { int expand; - struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; /* The block may very well exist in our cache already, so avoid * doing any more work in that case. */ - if (ocfs2_buffer_cached(oi, bh)) + if (ocfs2_buffer_cached(ci, bh)) return; - mlog(0, "Inode %llu, inserting block %llu\n", - (unsigned long long)oi->ip_blkno, + mlog(0, "Owner %llu, inserting block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)bh->b_blocknr); /* No need to recheck under spinlock - insertion is guarded by - * ip_io_mutex */ - spin_lock(&oi->ip_lock); - if (ocfs2_insert_can_use_array(oi, ci)) { + * co_io_lock() */ + ocfs2_metadata_cache_lock(ci); + if (ocfs2_insert_can_use_array(ci)) { /* Fast case - it's an array and there's a free * spot. */ ocfs2_append_cache_array(ci, bh->b_blocknr); - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_unlock(ci); return; } expand = 0; - if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { /* We need to bump things up to a tree. */ expand = 1; } - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_unlock(ci); - __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand); + __ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand); } /* Called against a newly allocated buffer. Most likely nobody should * be able to read this sort of metadata while it's still being - * allocated, but this is careful to take ip_io_mutex anyway. */ -void ocfs2_set_new_buffer_uptodate(struct inode *inode, + * allocated, but this is careful to take co_io_lock() anyway. */ +void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh) { - struct ocfs2_inode_info *oi = OCFS2_I(inode); - /* This should definitely *not* exist in our cache */ - BUG_ON(ocfs2_buffer_cached(oi, bh)); + BUG_ON(ocfs2_buffer_cached(ci, bh)); set_buffer_uptodate(bh); - mutex_lock(&oi->ip_io_mutex); - ocfs2_set_buffer_uptodate(inode, bh); - mutex_unlock(&oi->ip_io_mutex); + ocfs2_metadata_cache_io_lock(ci); + ocfs2_set_buffer_uptodate(ci, bh); + ocfs2_metadata_cache_io_unlock(ci); } /* Requires ip_lock. */ @@ -487,7 +543,7 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, sector_t *array = ci->ci_cache.ci_array; int bytes; - BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY); + BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY); BUG_ON(index >= ci->ci_num_cached); BUG_ON(!ci->ci_num_cached); @@ -515,21 +571,19 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, ci->ci_num_cached--; } -static void ocfs2_remove_block_from_cache(struct inode *inode, +static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci, sector_t block) { int index; struct ocfs2_meta_cache_item *item = NULL; - struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; - spin_lock(&oi->ip_lock); - mlog(0, "Inode %llu, remove %llu, items = %u, array = %u\n", - (unsigned long long)oi->ip_blkno, + ocfs2_metadata_cache_lock(ci); + mlog(0, "Owner %llu, remove %llu, items = %u, array = %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long) block, ci->ci_num_cached, - oi->ip_flags & OCFS2_INODE_CACHE_INLINE); + ci->ci_flags & OCFS2_CACHE_FL_INLINE); - if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { index = ocfs2_search_cache_array(ci, block); if (index != -1) ocfs2_remove_metadata_array(ci, index); @@ -538,7 +592,7 @@ static void ocfs2_remove_block_from_cache(struct inode *inode, if (item) ocfs2_remove_metadata_tree(ci, item); } - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_unlock(ci); if (item) kmem_cache_free(ocfs2_uptodate_cachep, item); @@ -549,23 +603,24 @@ static void ocfs2_remove_block_from_cache(struct inode *inode, * bother reverting things to an inlined array in the case of a remove * which moves us back under the limit. */ -void ocfs2_remove_from_cache(struct inode *inode, +void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci, struct buffer_head *bh) { sector_t block = bh->b_blocknr; - ocfs2_remove_block_from_cache(inode, block); + ocfs2_remove_block_from_cache(ci, block); } /* Called when we remove xattr clusters from an inode. */ -void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, +void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci, sector_t block, u32 c_len) { - unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len; for (i = 0; i < b_len; i++, block++) - ocfs2_remove_block_from_cache(inode, block); + ocfs2_remove_block_from_cache(ci, block); } int __init init_ocfs2_uptodate_cache(void) @@ -577,7 +632,7 @@ int __init init_ocfs2_uptodate_cache(void) return -ENOMEM; mlog(0, "%u inlined cache items per inode.\n", - OCFS2_INODE_MAX_CACHE_ARRAY); + OCFS2_CACHE_INFO_MAX_ARRAY); return 0; } diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index 531b4b3..0d826fe 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h @@ -26,24 +26,59 @@ #ifndef OCFS2_UPTODATE_H #define OCFS2_UPTODATE_H +/* + * The caching code relies on locking provided by the user of + * struct ocfs2_caching_info. These operations connect that up. + */ +struct ocfs2_caching_operations { + /* + * A u64 representing the owning structure. Usually this + * is the block number (i_blkno or whatnot). This is used so + * that caching log messages can identify the owning structure. + */ + u64 (*co_owner)(struct ocfs2_caching_info *ci); + + /* The superblock is needed during I/O. */ + struct super_block *(*co_get_super)(struct ocfs2_caching_info *ci); + /* + * Lock and unlock the caching data. These will not sleep, and + * should probably be spinlocks. + */ + void (*co_cache_lock)(struct ocfs2_caching_info *ci); + void (*co_cache_unlock)(struct ocfs2_caching_info *ci); + + /* + * Lock and unlock for disk I/O. These will sleep, and should + * be mutexes. + */ + void (*co_io_lock)(struct ocfs2_caching_info *ci); + void (*co_io_unlock)(struct ocfs2_caching_info *ci); +}; + int __init init_ocfs2_uptodate_cache(void); void exit_ocfs2_uptodate_cache(void); -void ocfs2_metadata_cache_init(struct inode *inode); -void ocfs2_metadata_cache_purge(struct inode *inode); +void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci, + const struct ocfs2_caching_operations *ops); +void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci); + +u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci); -int ocfs2_buffer_uptodate(struct inode *inode, +int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh); -void ocfs2_set_buffer_uptodate(struct inode *inode, +void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh); -void ocfs2_set_new_buffer_uptodate(struct inode *inode, +void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh); -void ocfs2_remove_from_cache(struct inode *inode, +void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci, struct buffer_head *bh); -void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, +void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci, sector_t block, u32 c_len); -int ocfs2_buffer_read_ahead(struct inode *inode, +int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci, struct buffer_head *bh); #endif /* OCFS2_UPTODATE_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d1a27cd..fe34190 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -55,7 +55,8 @@ #include "buffer_head_io.h" #include "super.h" #include "xattr.h" - +#include "refcounttree.h" +#include "acl.h" struct ocfs2_xattr_def_value_root { struct ocfs2_xattr_value_root xv; @@ -140,7 +141,7 @@ struct ocfs2_xattr_search { int not_found; }; -static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, +static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, struct ocfs2_xattr_header *xh, int index, int *block_off, @@ -157,7 +158,7 @@ static int ocfs2_xattr_index_block_find(struct inode *inode, struct ocfs2_xattr_search *xs); static int ocfs2_xattr_tree_list_index_block(struct inode *inode, - struct ocfs2_xattr_tree_root *xt, + struct buffer_head *blk_bh, char *buffer, size_t buffer_size); @@ -170,12 +171,42 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode, struct ocfs2_xattr_search *xs, struct ocfs2_xattr_set_ctxt *ctxt); -static int ocfs2_delete_xattr_index_block(struct inode *inode, - struct buffer_head *xb_bh); +typedef int (xattr_tree_rec_func)(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para); +static int ocfs2_iterate_xattr_index_block(struct inode *inode, + struct buffer_head *root_bh, + xattr_tree_rec_func *rec_func, + void *para); +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para); + static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, u64 src_blk, u64 last_blk, u64 to_blk, unsigned int start_bucket, u32 *first_hash); +static int ocfs2_prepare_refcount_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_refcount_tree **ref_tree, + int *meta_need, + int *credits); +static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, + struct ocfs2_xattr_bucket *bucket, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **bh); +static int ocfs2_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags); static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) { @@ -254,9 +285,9 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, break; } - if (!ocfs2_buffer_uptodate(bucket->bu_inode, + if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), bucket->bu_bhs[i])) - ocfs2_set_new_buffer_uptodate(bucket->bu_inode, + ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), bucket->bu_bhs[i]); } @@ -271,7 +302,7 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket, { int rc; - rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno, + rc = ocfs2_read_blocks(INODE_CACHE(bucket->bu_inode), xb_blkno, bucket->bu_blocks, bucket->bu_bhs, 0, NULL); if (!rc) { @@ -297,7 +328,8 @@ static int ocfs2_xattr_bucket_journal_access(handle_t *handle, int i, rc = 0; for (i = 0; i < bucket->bu_blocks; i++) { - rc = ocfs2_journal_access(handle, bucket->bu_inode, + rc = ocfs2_journal_access(handle, + INODE_CACHE(bucket->bu_inode), bucket->bu_bhs[i], type); if (rc) { mlog_errno(rc); @@ -399,7 +431,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno, int rc; struct buffer_head *tmp = *bh; - rc = ocfs2_read_block(inode, xb_blkno, &tmp, + rc = ocfs2_read_block(INODE_CACHE(inode), xb_blkno, &tmp, ocfs2_validate_xattr_block); /* If ocfs2_read_block() got us a new bh, pass it up. */ @@ -596,15 +628,14 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, int status = 0; handle_t *handle = ctxt->handle; enum ocfs2_alloc_restarted why; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); struct ocfs2_extent_tree et; mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); - ocfs2_init_xattr_value_extent_tree(&et, inode, vb); + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); - status = vb->vb_access(handle, inode, vb->vb_bh, + status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -612,13 +643,11 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, } prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); - status = ocfs2_add_clusters_in_btree(osb, - inode, + status = ocfs2_add_clusters_in_btree(handle, + &et, &logical_start, clusters_to_add, 0, - &et, - handle, ctxt->data_ac, ctxt->meta_ac, &why); @@ -649,6 +678,7 @@ leave: static int __ocfs2_remove_xattr_range(struct inode *inode, struct ocfs2_xattr_value_buf *vb, u32 cpos, u32 phys_cpos, u32 len, + unsigned int ext_flags, struct ocfs2_xattr_set_ctxt *ctxt) { int ret; @@ -656,16 +686,16 @@ static int __ocfs2_remove_xattr_range(struct inode *inode, handle_t *handle = ctxt->handle; struct ocfs2_extent_tree et; - ocfs2_init_xattr_value_extent_tree(&et, inode, vb); + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); - ret = vb->vb_access(handle, inode, vb->vb_bh, + ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac, + ret = ocfs2_remove_extent(handle, &et, cpos, len, ctxt->meta_ac, &ctxt->dealloc); if (ret) { mlog_errno(ret); @@ -680,7 +710,14 @@ static int __ocfs2_remove_xattr_range(struct inode *inode, goto out; } - ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len); + if (ext_flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(inode->i_sb, + phys_blkno), + len, ctxt->meta_ac, &ctxt->dealloc, 1); + else + ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, + phys_blkno, len); if (ret) mlog_errno(ret); @@ -695,6 +732,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, struct ocfs2_xattr_set_ctxt *ctxt) { int ret = 0; + unsigned int ext_flags; u32 trunc_len, cpos, phys_cpos, alloc_size; u64 block; @@ -706,7 +744,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, while (trunc_len) { ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, &alloc_size, - &vb->vb_xv->xr_list); + &vb->vb_xv->xr_list, &ext_flags); if (ret) { mlog_errno(ret); goto out; @@ -717,15 +755,15 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, ret = __ocfs2_remove_xattr_range(inode, vb, cpos, phys_cpos, alloc_size, - ctxt); + ext_flags, ctxt); if (ret) { mlog_errno(ret); goto out; } block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); - ocfs2_remove_xattr_clusters_from_cache(inode, block, - alloc_size); + ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), + block, alloc_size); cpos += alloc_size; trunc_len -= alloc_size; } @@ -810,6 +848,23 @@ static int ocfs2_xattr_list_entries(struct inode *inode, return result; } +int ocfs2_has_inline_xattr_value_outside(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_xattr_header *xh; + int i; + + xh = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) + if (!ocfs2_xattr_is_local(&xh->xh_entries[i])) + return 1; + + return 0; +} + static int ocfs2_xattr_ibody_list(struct inode *inode, struct ocfs2_dinode *di, char *buffer, @@ -855,11 +910,9 @@ static int ocfs2_xattr_block_list(struct inode *inode, struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size); - } else { - struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; - ret = ocfs2_xattr_tree_list_index_block(inode, xt, + } else + ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh, buffer, buffer_size); - } brelse(blk_bh); @@ -961,7 +1014,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode, cpos = 0; while (cpos < clusters) { ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, - &num_clusters, el); + &num_clusters, el, NULL); if (ret) { mlog_errno(ret); goto out; @@ -970,7 +1023,8 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode, blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); /* Copy ocfs2_xattr_value */ for (i = 0; i < num_clusters * bpc; i++, blkno++) { - ret = ocfs2_read_block(inode, blkno, &bh, NULL); + ret = ocfs2_read_block(INODE_CACHE(inode), blkno, + &bh, NULL); if (ret) { mlog_errno(ret); goto out; @@ -1085,7 +1139,7 @@ static int ocfs2_xattr_block_get(struct inode *inode, i = xs->here - xs->header->xh_entries; if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { - ret = ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, bucket_xh(xs->bucket), i, &block_off, @@ -1183,7 +1237,7 @@ static int ocfs2_xattr_get(struct inode *inode, static int __ocfs2_xattr_set_value_outside(struct inode *inode, handle_t *handle, - struct ocfs2_xattr_value_root *xv, + struct ocfs2_xattr_value_buf *vb, const void *value, int value_len) { @@ -1194,28 +1248,34 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); u64 blkno; struct buffer_head *bh = NULL; + unsigned int ext_flags; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); while (cpos < clusters) { ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, - &num_clusters, &xv->xr_list); + &num_clusters, &xv->xr_list, + &ext_flags); if (ret) { mlog_errno(ret); goto out; } + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); for (i = 0; i < num_clusters * bpc; i++, blkno++) { - ret = ocfs2_read_block(inode, blkno, &bh, NULL); + ret = ocfs2_read_block(INODE_CACHE(inode), blkno, + &bh, NULL); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_journal_access(handle, - inode, + INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { @@ -1266,7 +1326,7 @@ static int ocfs2_xattr_cleanup(struct inode *inode, void *val = xs->base + offs; size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; - ret = vb->vb_access(handle, inode, vb->vb_bh, + ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1294,7 +1354,7 @@ static int ocfs2_xattr_update_entry(struct inode *inode, { int ret; - ret = vb->vb_access(handle, inode, vb->vb_bh, + ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1355,7 +1415,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, mlog_errno(ret); return ret; } - ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv, + ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb, xi->value, xi->value_len); if (ret < 0) mlog_errno(ret); @@ -1594,7 +1654,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, ret = __ocfs2_xattr_set_value_outside(inode, handle, - vb.vb_xv, + &vb, xi->value, xi->value_len); if (ret < 0) @@ -1615,7 +1675,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, } } - ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1623,7 +1683,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, } if (!(flag & OCFS2_INLINE_XATTR_FL)) { - ret = vb.vb_access(handle, inode, vb.vb_bh, + ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1700,51 +1760,112 @@ out: return ret; } +/* + * In xattr remove, if it is stored outside and refcounted, we may have + * the chance to split the refcount tree. So need the allocators. + */ +static int ocfs2_lock_xattr_remove_allocators(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_alloc_context **meta_ac, + int *ref_credits) +{ + int ret, meta_add = 0; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + + *ref_credits = 0; + ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster, + &num_clusters, + &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci, + ref_root_bh, xv, + &meta_add, ref_credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + meta_add, meta_ac); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + static int ocfs2_remove_value_outside(struct inode*inode, struct ocfs2_xattr_value_buf *vb, - struct ocfs2_xattr_header *header) + struct ocfs2_xattr_header *header, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) { - int ret = 0, i; + int ret = 0, i, ref_credits; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + void *val; ocfs2_init_dealloc_ctxt(&ctxt.dealloc); - ctxt.handle = ocfs2_start_trans(osb, - ocfs2_remove_extent_credits(osb->sb)); - if (IS_ERR(ctxt.handle)) { - ret = PTR_ERR(ctxt.handle); - mlog_errno(ret); - goto out; - } - for (i = 0; i < le16_to_cpu(header->xh_count); i++) { struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; - if (!ocfs2_xattr_is_local(entry)) { - void *val; + if (ocfs2_xattr_is_local(entry)) + continue; - val = (void *)header + - le16_to_cpu(entry->xe_name_offset); - vb->vb_xv = (struct ocfs2_xattr_value_root *) - (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); - ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); - if (ret < 0) { - mlog_errno(ret); - break; - } + val = (void *)header + + le16_to_cpu(entry->xe_name_offset); + vb->vb_xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); + + ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv, + ref_ci, ref_root_bh, + &ctxt.meta_ac, + &ref_credits); + + ctxt.handle = ocfs2_start_trans(osb, ref_credits + + ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); + if (ret < 0) { + mlog_errno(ret); + break; + } + + ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) { + ocfs2_free_alloc_context(ctxt.meta_ac); + ctxt.meta_ac = NULL; } } - ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &ctxt.dealloc); -out: return ret; } static int ocfs2_xattr_ibody_remove(struct inode *inode, - struct buffer_head *di_bh) + struct buffer_head *di_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) { struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; @@ -1759,13 +1880,21 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode, ((void *)di + inode->i_sb->s_blocksize - le16_to_cpu(di->i_xattr_inline_size)); - ret = ocfs2_remove_value_outside(inode, &vb, header); + ret = ocfs2_remove_value_outside(inode, &vb, header, + ref_ci, ref_root_bh); return ret; } +struct ocfs2_rm_xattr_bucket_para { + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; +}; + static int ocfs2_xattr_block_remove(struct inode *inode, - struct buffer_head *blk_bh) + struct buffer_head *blk_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) { struct ocfs2_xattr_block *xb; int ret = 0; @@ -1773,19 +1902,29 @@ static int ocfs2_xattr_block_remove(struct inode *inode, .vb_bh = blk_bh, .vb_access = ocfs2_journal_access_xb, }; + struct ocfs2_rm_xattr_bucket_para args = { + .ref_ci = ref_ci, + .ref_root_bh = ref_root_bh, + }; xb = (struct ocfs2_xattr_block *)blk_bh->b_data; if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); - ret = ocfs2_remove_value_outside(inode, &vb, header); + ret = ocfs2_remove_value_outside(inode, &vb, header, + ref_ci, ref_root_bh); } else - ret = ocfs2_delete_xattr_index_block(inode, blk_bh); + ret = ocfs2_iterate_xattr_index_block(inode, + blk_bh, + ocfs2_rm_xattr_cluster, + &args); return ret; } static int ocfs2_xattr_free_block(struct inode *inode, - u64 block) + u64 block, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) { struct inode *xb_alloc_inode; struct buffer_head *xb_alloc_bh = NULL; @@ -1803,7 +1942,7 @@ static int ocfs2_xattr_free_block(struct inode *inode, goto out; } - ret = ocfs2_xattr_block_remove(inode, blk_bh); + ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1863,6 +2002,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) { struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_refcount_tree *ref_tree = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_caching_info *ref_ci = NULL; handle_t *handle; int ret; @@ -1872,8 +2014,21 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) return 0; + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + ref_ci = &ref_tree->rf_ci; + + } + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { - ret = ocfs2_xattr_ibody_remove(inode, di_bh); + ret = ocfs2_xattr_ibody_remove(inode, di_bh, + ref_ci, ref_root_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1882,7 +2037,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) if (di->i_xattr_loc) { ret = ocfs2_xattr_free_block(inode, - le64_to_cpu(di->i_xattr_loc)); + le64_to_cpu(di->i_xattr_loc), + ref_ci, ref_root_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1896,7 +2052,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) mlog_errno(ret); goto out; } - ret = ocfs2_journal_access_di(handle, inode, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1916,6 +2072,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: + if (ref_tree) + ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1); + brelse(ref_root_bh); return ret; } @@ -2083,6 +2242,84 @@ cleanup: return ret; } +static int ocfs2_create_xattr_block(handle_t *handle, + struct inode *inode, + struct buffer_head *inode_bh, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_bh, + int indexed) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *new_bh = NULL; + struct ocfs2_xattr_block *xblk; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &first_blkno); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), + new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + /* Initialize ocfs2_xattr_block */ + xblk = (struct ocfs2_xattr_block *)new_bh->b_data; + memset(xblk, 0, inode->i_sb->s_blocksize); + strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); + xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); + xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); + xblk->xb_blkno = cpu_to_le64(first_blkno); + + if (indexed) { + struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16( + ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); + } + + ret = ocfs2_journal_dirty(handle, new_bh); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + di->i_xattr_loc = cpu_to_le64(first_blkno); + ocfs2_journal_dirty(handle, inode_bh); + + *ret_bh = new_bh; + new_bh = NULL; + +end: + brelse(new_bh); + return ret; +} + /* * ocfs2_xattr_block_set() * @@ -2095,63 +2332,24 @@ static int ocfs2_xattr_block_set(struct inode *inode, struct ocfs2_xattr_set_ctxt *ctxt) { struct buffer_head *new_bh = NULL; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; handle_t *handle = ctxt->handle; struct ocfs2_xattr_block *xblk = NULL; - u16 suballoc_bit_start; - u32 num_got; - u64 first_blkno; int ret; if (!xs->xattr_bh) { - ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, - OCFS2_JOURNAL_ACCESS_CREATE); - if (ret < 0) { - mlog_errno(ret); - goto end; - } - - ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1, - &suballoc_bit_start, &num_got, - &first_blkno); - if (ret < 0) { - mlog_errno(ret); - goto end; - } - - new_bh = sb_getblk(inode->i_sb, first_blkno); - ocfs2_set_new_buffer_uptodate(inode, new_bh); - - ret = ocfs2_journal_access_xb(handle, inode, new_bh, - OCFS2_JOURNAL_ACCESS_CREATE); - if (ret < 0) { + ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh, + ctxt->meta_ac, &new_bh, 0); + if (ret) { mlog_errno(ret); goto end; } - /* Initialize ocfs2_xattr_block */ xs->xattr_bh = new_bh; - xblk = (struct ocfs2_xattr_block *)new_bh->b_data; - memset(xblk, 0, inode->i_sb->s_blocksize); - strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); - xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); - xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); - xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); - xblk->xb_blkno = cpu_to_le64(first_blkno); - + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; xs->header = &xblk->xb_attrs.xb_header; xs->base = (void *)xs->header; xs->end = (void *)xblk + inode->i_sb->s_blocksize; xs->here = xs->header->xh_entries; - - ret = ocfs2_journal_dirty(handle, new_bh); - if (ret < 0) { - mlog_errno(ret); - goto end; - } - di->i_xattr_loc = cpu_to_le64(first_blkno); - ocfs2_journal_dirty(handle, xs->inode_bh); } else xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; @@ -2273,7 +2471,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, old_in_xb = 1; if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { - ret = ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, bucket_xh(xbs->bucket), i, &block_off, &name_offset); @@ -2428,6 +2626,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode, struct ocfs2_xattr_search *xis, struct ocfs2_xattr_search *xbs, struct ocfs2_xattr_set_ctxt *ctxt, + int extra_meta, int *credits) { int clusters_add, meta_add, ret; @@ -2444,6 +2643,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode, return ret; } + meta_add += extra_meta; mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " "credits = %d\n", xi->name, meta_add, clusters_add, *credits); @@ -2598,7 +2798,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, if (!ret) { /* Update inode ctime. */ - ret = ocfs2_journal_access_di(ctxt->handle, inode, + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), xis->inode_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -2711,10 +2911,11 @@ int ocfs2_xattr_set(struct inode *inode, { struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; - int ret, credits; + int ret, credits, ref_meta = 0, ref_credits = 0; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + struct ocfs2_refcount_tree *ref_tree = NULL; struct ocfs2_xattr_info xi = { .name_index = name_index, @@ -2779,6 +2980,17 @@ int ocfs2_xattr_set(struct inode *inode, goto cleanup; } + /* Check whether the value is refcounted and do some prepartion. */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && + (!xis.not_found || !xbs.not_found)) { + ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, + &xis, &xbs, &ref_tree, + &ref_meta, &ref_credits); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } mutex_lock(&tl_inode->i_mutex); @@ -2793,7 +3005,7 @@ int ocfs2_xattr_set(struct inode *inode, mutex_unlock(&tl_inode->i_mutex); ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, - &xbs, &ctxt, &credits); + &xbs, &ctxt, ref_meta, &credits); if (ret) { mlog_errno(ret); goto cleanup; @@ -2801,7 +3013,7 @@ int ocfs2_xattr_set(struct inode *inode, /* we need to update inode's ctime field, so add credit for it. */ credits += OCFS2_INODE_UPDATE_CREDITS; - ctxt.handle = ocfs2_start_trans(osb, credits); + ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits); if (IS_ERR(ctxt.handle)) { ret = PTR_ERR(ctxt.handle); mlog_errno(ret); @@ -2819,8 +3031,16 @@ int ocfs2_xattr_set(struct inode *inode, if (ocfs2_dealloc_has_cluster(&ctxt.dealloc)) ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &ctxt.dealloc); + cleanup: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); up_write(&OCFS2_I(inode)->ip_xattr_sem); + if (!value && !ret) { + ret = ocfs2_try_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); + } ocfs2_inode_unlock(inode, 1); cleanup_nolock: brelse(di_bh); @@ -2849,7 +3069,8 @@ static int ocfs2_xattr_get_rec(struct inode *inode, u64 e_blkno = 0; if (el->l_tree_depth) { - ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh); + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, name_hash, + &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -2931,7 +3152,7 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode, if (cmp) continue; - ret = ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh, i, &block_off, @@ -3175,7 +3396,7 @@ struct ocfs2_xattr_tree_list { size_t result; }; -static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, +static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, struct ocfs2_xattr_header *xh, int index, int *block_off, @@ -3188,8 +3409,8 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); - *block_off = name_offset >> inode->i_sb->s_blocksize_bits; - *new_offset = name_offset % inode->i_sb->s_blocksize; + *block_off = name_offset >> sb->s_blocksize_bits; + *new_offset = name_offset % sb->s_blocksize; return 0; } @@ -3209,7 +3430,7 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, prefix = ocfs2_xattr_prefix(type); if (prefix) { - ret = ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, bucket_xh(bucket), i, &block_off, @@ -3232,22 +3453,19 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, return ret; } -static int ocfs2_xattr_tree_list_index_block(struct inode *inode, - struct ocfs2_xattr_tree_root *xt, - char *buffer, - size_t buffer_size) +static int ocfs2_iterate_xattr_index_block(struct inode *inode, + struct buffer_head *blk_bh, + xattr_tree_rec_func *rec_func, + void *para) { - struct ocfs2_extent_list *el = &xt->xt_list; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; int ret = 0; u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; u64 p_blkno = 0; - struct ocfs2_xattr_tree_list xl = { - .buffer = buffer, - .buffer_size = buffer_size, - .result = 0, - }; - if (le16_to_cpu(el->l_next_free_rec) == 0) + if (!el->l_next_free_rec || !rec_func) return 0; while (name_hash > 0) { @@ -3255,16 +3473,15 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode, &e_cpos, &num_clusters, el); if (ret) { mlog_errno(ret); - goto out; + break; } - ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, - ocfs2_list_xattr_bucket, - &xl); + ret = rec_func(inode, blk_bh, p_blkno, e_cpos, + num_clusters, para); if (ret) { if (ret != -ERANGE) mlog_errno(ret); - goto out; + break; } if (e_cpos == 0) @@ -3273,6 +3490,37 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode, name_hash = e_cpos - 1; } + return ret; + +} + +static int ocfs2_list_xattr_tree_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para) +{ + return ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_list_xattr_bucket, para); +} + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct buffer_head *blk_bh, + char *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_xattr_tree_list xl = { + .buffer = buffer, + .buffer_size = buffer_size, + .result = 0, + }; + + ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, + ocfs2_list_xattr_tree_rec, &xl); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = xl.result; out: return ret; @@ -3426,7 +3674,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, */ down_write(&oi->ip_alloc_sem); - ret = ocfs2_journal_access_xb(handle, inode, xb_bh, + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), xb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -4263,9 +4511,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, prev_cpos, (unsigned long long)bucket_blkno(first)); - ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); - ret = ocfs2_journal_access_xb(handle, inode, root_bh, + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); @@ -4319,7 +4567,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", num_bits, (unsigned long long)block, v_start); - ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, + ret = ocfs2_insert_extent(handle, &et, v_start, block, num_bits, 0, ctxt->meta_ac); if (ret < 0) { mlog_errno(ret); @@ -4798,10 +5046,13 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, struct ocfs2_xattr_entry *xe = xs->here; struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); void *base; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); - ret = ocfs2_xattr_bucket_get_name_value(inode, xh, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh, xe - xh->xh_entries, &block_off, &offset); @@ -4814,8 +5065,10 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, xv = (struct ocfs2_xattr_value_root *)(base + offset + OCFS2_XATTR_SIZE(xe->xe_name_len)); + vb.vb_xv = xv; + vb.vb_bh = xs->bucket->bu_bhs[block_off]; ret = __ocfs2_xattr_set_value_outside(inode, handle, - xv, val, value_len); + &vb, val, value_len); if (ret) mlog_errno(ret); out: @@ -4826,7 +5079,8 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, struct buffer_head *root_bh, u64 blkno, u32 cpos, - u32 len) + u32 len, + void *para) { int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -4838,14 +5092,22 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_extent_tree et; - ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); + ret = ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_delete_xattr_in_bucket, para); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); ocfs2_init_dealloc_ctxt(&dealloc); mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n", cpos, len, (unsigned long long)blkno); - ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len); + ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno, + len); ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); if (ret) { @@ -4870,14 +5132,14 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, goto out; } - ret = ocfs2_journal_access_xb(handle, inode, root_bh, + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } - ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, + ret = ocfs2_remove_extent(handle, &et, cpos, len, meta_ac, &dealloc); if (ret) { mlog_errno(ret); @@ -5220,7 +5482,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, struct ocfs2_xattr_bucket *bucket, void *para) { - int ret = 0; + int ret = 0, ref_credits; struct ocfs2_xattr_header *xh = bucket_xh(bucket); u16 i; struct ocfs2_xattr_entry *xe; @@ -5228,7 +5490,9 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,}; int credits = ocfs2_remove_extent_credits(osb->sb) + ocfs2_blocks_per_xattr_bucket(inode->i_sb); - + struct ocfs2_xattr_value_root *xv; + struct ocfs2_rm_xattr_bucket_para *args = + (struct ocfs2_rm_xattr_bucket_para *)para; ocfs2_init_dealloc_ctxt(&ctxt.dealloc); @@ -5237,7 +5501,16 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, if (ocfs2_xattr_is_local(xe)) continue; - ctxt.handle = ocfs2_start_trans(osb, credits); + ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, + i, &xv, NULL); + + ret = ocfs2_lock_xattr_remove_allocators(inode, xv, + args->ref_ci, + args->ref_root_bh, + &ctxt.meta_ac, + &ref_credits); + + ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits); if (IS_ERR(ctxt.handle)) { ret = PTR_ERR(ctxt.handle); mlog_errno(ret); @@ -5248,57 +5521,1439 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, i, 0, &ctxt); ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) { + ocfs2_free_alloc_context(ctxt.meta_ac); + ctxt.meta_ac = NULL; + } if (ret) { mlog_errno(ret); break; } } + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &ctxt.dealloc); return ret; } -static int ocfs2_delete_xattr_index_block(struct inode *inode, - struct buffer_head *xb_bh) +/* + * Whenever we modify a xattr value root in the bucket(e.g, CoW + * or change the extent record flag), we need to recalculate + * the metaecc for the whole bucket. So it is done here. + * + * Note: + * We have to give the extra credits for the caller. + */ +static int ocfs2_xattr_bucket_post_refcount(struct inode *inode, + handle_t *handle, + void *para) +{ + int ret; + struct ocfs2_xattr_bucket *bucket = + (struct ocfs2_xattr_bucket *)para; + + ret = ocfs2_xattr_bucket_journal_access(handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_xattr_bucket_journal_dirty(handle, bucket); + + return 0; +} + +/* + * Special action we need if the xattr value is refcounted. + * + * 1. If the xattr is refcounted, lock the tree. + * 2. CoW the xattr if we are setting the new value and the value + * will be stored outside. + * 3. In other case, decrease_refcount will work for us, so just + * lock the refcount tree, calculate the meta and credits is OK. + * + * We have to do CoW before ocfs2_init_xattr_set_ctxt since + * currently CoW is a completed transaction, while this function + * will also lock the allocators and let us deadlock. So we will + * CoW the whole xattr value. + */ +static int ocfs2_prepare_refcount_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_refcount_tree **ref_tree, + int *meta_add, + int *credits) { - struct ocfs2_xattr_block *xb = - (struct ocfs2_xattr_block *)xb_bh->b_data; - struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; int ret = 0; - u32 name_hash = UINT_MAX, e_cpos, num_clusters; - u64 p_blkno; + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_entry *xe; + char *base; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + int name_offset, name_len; + struct ocfs2_xattr_value_buf vb; + struct ocfs2_xattr_bucket *bucket = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_post_refcount refcount; + struct ocfs2_post_refcount *p = NULL; + struct buffer_head *ref_root_bh = NULL; - if (le16_to_cpu(el->l_next_free_rec) == 0) - return 0; + if (!xis->not_found) { + xe = xis->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + base = xis->base; + vb.vb_bh = xis->inode_bh; + vb.vb_access = ocfs2_journal_access_di; + } else { + int i, block_off = 0; + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + xe = xbs->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + i = xbs->here - xbs->header->xh_entries; - while (name_hash > 0) { - ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, - &e_cpos, &num_clusters, el); + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(xbs->bucket), + i, &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto out; + } + base = bucket_block(xbs->bucket, block_off); + vb.vb_bh = xbs->bucket->bu_bhs[block_off]; + vb.vb_access = ocfs2_journal_access; + + if (ocfs2_meta_ecc(osb)) { + /*create parameters for ocfs2_post_refcount. */ + bucket = xbs->bucket; + refcount.credits = bucket->bu_blocks; + refcount.para = bucket; + refcount.func = + ocfs2_xattr_bucket_post_refcount; + p = &refcount; + } + } else { + base = xbs->base; + vb.vb_bh = xbs->xattr_bh; + vb.vb_access = ocfs2_journal_access_xb; + } + } + + if (ocfs2_xattr_is_local(xe)) + goto out; + + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (base + name_offset + name_len); + + ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster, + &num_clusters, &vb.vb_xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We just need to check the 1st extent record, since we always + * CoW the whole xattr. So there shouldn't be a xattr with + * some REFCOUNT extent recs after the 1st one. + */ + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * If we are deleting the xattr or the new size will be stored inside, + * cool, leave it there, the xattr truncate process will remove them + * for us(it still needs the refcount tree lock and the meta, credits). + * And the worse case is that every cluster truncate will split the + * refcount tree, and make the original extent become 3. So we will need + * 2 * cluster more extent recs at most. + */ + if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) { + + ret = ocfs2_refcounted_xattr_delete_need(inode, + &(*ref_tree)->rf_ci, + ref_root_bh, vb.vb_xv, + meta_add, credits); + if (ret) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_refcount_cow_xattr(inode, di, &vb, + *ref_tree, ref_root_bh, 0, + le32_to_cpu(vb.vb_xv->xr_clusters), p); + if (ret) + mlog_errno(ret); + +out: + brelse(ref_root_bh); + return ret; +} + +/* + * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root. + * The physical clusters will be added to refcount tree. + */ +static int ocfs2_xattr_value_attach_refcount(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + struct ocfs2_extent_tree *value_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *refcount) +{ + int ret = 0; + u32 clusters = le32_to_cpu(xv->xr_clusters); + u32 cpos, p_cluster, num_clusters; + struct ocfs2_extent_list *el = &xv->xr_list; + unsigned int ext_flags; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el, &ext_flags); + + cpos += num_clusters; + if ((ext_flags & OCFS2_EXT_REFCOUNTED)) + continue; + + BUG_ON(!p_cluster); + + ret = ocfs2_add_refcount_flag(inode, value_et, + ref_ci, ref_root_bh, + cpos - num_clusters, + p_cluster, num_clusters, + dealloc, refcount); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +/* + * Given a normal ocfs2_xattr_header, refcount all the entries which + * have value stored outside. + * Used for xattrs stored in inode and ocfs2_xattr_block. + */ +static int ocfs2_xattr_attach_refcount_normal(struct inode *inode, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_header *header, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_extent_tree et; + int i, ret = 0; + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + xe = &header->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + xv = (struct ocfs2_xattr_value_root *)((void *)header + + le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + vb->vb_xv = xv; + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); + + ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et, + ref_ci, ref_root_bh, + dealloc, NULL); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static int ocfs2_xattr_inline_attach_refcount(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *) + (fe_bh->b_data + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + struct ocfs2_xattr_value_buf vb = { + .vb_bh = fe_bh, + .vb_access = ocfs2_journal_access_di, + }; + + return ocfs2_xattr_attach_refcount_normal(inode, &vb, header, + ref_ci, ref_root_bh, dealloc); +} + +struct ocfs2_xattr_tree_value_refcount_para { + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; + struct ocfs2_cached_dealloc_ctxt *dealloc; +}; + +static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, + struct ocfs2_xattr_bucket *bucket, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **bh) +{ + int ret, block_off, name_offset; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset]; + void *base; + + ret = ocfs2_xattr_bucket_get_name_value(sb, + bucket_xh(bucket), + offset, + &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto out; + } + + base = bucket_block(bucket, block_off); + + *xv = (struct ocfs2_xattr_value_root *)(base + name_offset + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + if (bh) + *bh = bucket->bu_bhs[block_off]; +out: + return ret; +} + +/* + * For a given xattr bucket, refcount all the entries which + * have value stored outside. + */ +static int ocfs2_xattr_bucket_value_refcount(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int i, ret = 0; + struct ocfs2_extent_tree et; + struct ocfs2_xattr_tree_value_refcount_para *ref = + (struct ocfs2_xattr_tree_value_refcount_para *)para; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + struct ocfs2_post_refcount refcount = { + .credits = bucket->bu_blocks, + .para = bucket, + .func = ocfs2_xattr_bucket_post_refcount, + }; + struct ocfs2_post_refcount *p = NULL; + + /* We only need post_refcount if we support metaecc. */ + if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb))) + p = &refcount; + + mlog(0, "refcount bucket %llu, count = %u\n", + (unsigned long long)bucket_blkno(bucket), + le16_to_cpu(xh->xh_count)); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i, + &vb.vb_xv, &vb.vb_bh); + if (ret) { + mlog_errno(ret); + break; + } + + ocfs2_init_xattr_value_extent_tree(&et, + INODE_CACHE(inode), &vb); + + ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv, + &et, ref->ref_ci, + ref->ref_root_bh, + ref->dealloc, p); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; + +} + +static int ocfs2_refcount_xattr_tree_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para) +{ + return ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_xattr_bucket_value_refcount, + para); +} + +static int ocfs2_xattr_block_attach_refcount(struct inode *inode, + struct buffer_head *blk_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + + ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header, + ref_ci, ref_root_bh, + dealloc); + } else { + struct ocfs2_xattr_tree_value_refcount_para para = { + .ref_ci = ref_ci, + .ref_root_bh = ref_root_bh, + .dealloc = dealloc, + }; + + ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, + ocfs2_refcount_xattr_tree_rec, + ¶); + } + + return ret; +} + +int ocfs2_xattr_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct buffer_head *blk_bh = NULL; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh, + ref_ci, ref_root_bh, + dealloc); if (ret) { mlog_errno(ret); goto out; } + } + + if (!di->i_xattr_loc) + goto out; + + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci, + ref_root_bh, dealloc); + if (ret) + mlog_errno(ret); + + brelse(blk_bh); +out: + + return ret; +} + +typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe); +/* + * Store the information we need in xattr reflink. + * old_bh and new_bh are inode bh for the old and new inode. + */ +struct ocfs2_xattr_reflink { + struct inode *old_inode; + struct inode *new_inode; + struct buffer_head *old_bh; + struct buffer_head *new_bh; + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; + struct ocfs2_cached_dealloc_ctxt *dealloc; + should_xattr_reflinked *xattr_reflinked; +}; + +/* + * Given a xattr header and xe offset, + * return the proper xv and the corresponding bh. + * xattr in inode, block and xattr tree have different implementaions. + */ +typedef int (get_xattr_value_root)(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para); + +/* + * Calculate all the xattr value root metadata stored in this xattr header and + * credits we need if we create them from the scratch. + * We use get_xattr_value_root so that all types of xattr container can use it. + */ +static int ocfs2_value_metas_in_xattr_header(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int *metas, int *credits, + int *num_recs, + get_xattr_value_root *func, + void *para) +{ + int i, ret = 0; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe; + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = func(sb, bh, xh, i, &xv, NULL, para); + if (ret) { + mlog_errno(ret); + break; + } + + *metas += le16_to_cpu(xv->xr_list.l_tree_depth) * + le16_to_cpu(xv->xr_list.l_next_free_rec); + + *credits += ocfs2_calc_extend_credits(sb, + &def_xv.xv.xr_list, + le32_to_cpu(xv->xr_clusters)); + + /* + * If the value is a tree with depth > 1, We don't go deep + * to the extent block, so just calculate a maximum record num. + */ + if (!xv->xr_list.l_tree_depth) + *num_recs += xv->xr_list.l_next_free_rec; + else + *num_recs += ocfs2_clusters_for_bytes(sb, + XATTR_SIZE_MAX); + } + + return ret; +} + +/* Used by xattr inode and block to return the right xv and buffer_head. */ +static int ocfs2_get_xattr_value_root(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset]; + + *xv = (struct ocfs2_xattr_value_root *)((void *)xh + + le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + if (ret_bh) + *ret_bh = bh; + + return 0; +} + +/* + * Lock the meta_ac and caculate how much credits we need for reflink xattrs. + * It is only used for inline xattr and xattr block. + */ +static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb, + struct ocfs2_xattr_header *xh, + struct buffer_head *ref_root_bh, + int *credits, + struct ocfs2_alloc_context **meta_ac) +{ + int ret, meta_add = 0, num_recs = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + *credits = 0; + + ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh, + &meta_add, credits, &num_recs, + ocfs2_get_xattr_value_root, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We need to add/modify num_recs in refcount tree, so just calculate + * an approximate number we need for refcount tree change. + * Sometimes we need to split the tree, and after split, half recs + * will be moved to the new block, and a new block can only provide + * half number of recs. So we multiple new blocks by 2. + */ + num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2; + meta_add += num_recs; + *credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += le16_to_cpu(rb->rf_list.l_tree_depth) * + le16_to_cpu(rb->rf_list.l_next_free_rec) + 1; + else + *credits += 1; + + ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac); + if (ret) + mlog_errno(ret); + +out: + return ret; +} - ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, - ocfs2_delete_xattr_in_bucket, - NULL); +/* + * Given a xattr header, reflink all the xattrs in this container. + * It can be used for inode, block and bucket. + * + * NOTE: + * Before we call this function, the caller has memcpy the xattr in + * old_xh to the new_xh. + * + * If args.xattr_reflinked is set, call it to decide whether the xe should + * be reflinked or not. If not, remove it from the new xattr header. + */ +static int ocfs2_reflink_xattr_header(handle_t *handle, + struct ocfs2_xattr_reflink *args, + struct buffer_head *old_bh, + struct ocfs2_xattr_header *xh, + struct buffer_head *new_bh, + struct ocfs2_xattr_header *new_xh, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_alloc_context *meta_ac, + get_xattr_value_root *func, + void *para) +{ + int ret = 0, i, j; + struct super_block *sb = args->old_inode->i_sb; + struct buffer_head *value_bh; + struct ocfs2_xattr_entry *xe, *last; + struct ocfs2_xattr_value_root *xv, *new_xv; + struct ocfs2_extent_tree data_et; + u32 clusters, cpos, p_cluster, num_clusters; + unsigned int ext_flags = 0; + + mlog(0, "reflink xattr in container %llu, count = %u\n", + (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count)); + + last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)]; + for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) { + xe = &xh->xh_entries[i]; + + if (args->xattr_reflinked && !args->xattr_reflinked(xe)) { + xe = &new_xh->xh_entries[j]; + + le16_add_cpu(&new_xh->xh_count, -1); + if (new_xh->xh_count) { + memmove(xe, xe + 1, + (void *)last - (void *)xe); + memset(last, 0, + sizeof(struct ocfs2_xattr_entry)); + } + + /* + * We don't want j to increase in the next round since + * it is already moved ahead. + */ + j--; + continue; + } + + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = func(sb, old_bh, xh, i, &xv, NULL, para); + if (ret) { + mlog_errno(ret); + break; + } + + ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * For the xattr which has l_tree_depth = 0, all the extent + * recs have already be copied to the new xh with the + * propriate OCFS2_EXT_REFCOUNTED flag we just need to + * increase the refount count int the refcount tree. + * + * For the xattr which has l_tree_depth > 0, we need + * to initialize it to the empty default value root, + * and then insert the extents one by one. + */ + if (xv->xr_list.l_tree_depth) { + memcpy(new_xv, &def_xv, sizeof(def_xv)); + vb->vb_xv = new_xv; + vb->vb_bh = value_bh; + ocfs2_init_xattr_value_extent_tree(&data_et, + INODE_CACHE(args->new_inode), vb); + } + + clusters = le32_to_cpu(xv->xr_clusters); + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(args->old_inode, + cpos, + &p_cluster, + &num_clusters, + &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!p_cluster); + + if (xv->xr_list.l_tree_depth) { + ret = ocfs2_insert_extent(handle, + &data_et, cpos, + ocfs2_clusters_to_blocks( + args->old_inode->i_sb, + p_cluster), + num_clusters, ext_flags, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_increase_refcount(handle, args->ref_ci, + args->ref_root_bh, + p_cluster, num_clusters, + meta_ac, args->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos += num_clusters; + } + } + +out: + return ret; +} + +static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args) +{ + int ret = 0, credits = 0; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data; + int inline_size = le16_to_cpu(di->i_xattr_inline_size); + int header_off = osb->sb->s_blocksize - inline_size; + struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *) + (args->old_bh->b_data + header_off); + struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *) + (args->new_bh->b_data + header_off); + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_inode_info *new_oi; + struct ocfs2_dinode *new_di; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = args->new_bh, + .vb_access = ocfs2_journal_access_di, + }; + + ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh, + &credits, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode), + args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(args->new_bh->b_data + header_off, + args->old_bh->b_data + header_off, inline_size); + + new_di = (struct ocfs2_dinode *)args->new_bh->b_data; + new_di->i_xattr_inline_size = cpu_to_le16(inline_size); + + ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh, + args->new_bh, new_xh, &vb, meta_ac, + ocfs2_get_xattr_value_root, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_oi = OCFS2_I(args->new_inode); + spin_lock(&new_oi->ip_lock); + new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; + new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); + spin_unlock(&new_oi->ip_lock); + + ocfs2_journal_dirty(handle, args->new_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_create_empty_xattr_block(struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **ret_bh, + int indexed) +{ + int ret; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + mlog(0, "create new xattr block for inode %llu, index = %d\n", + (unsigned long long)fe_bh->b_blocknr, indexed); + ret = ocfs2_create_xattr_block(handle, inode, fe_bh, + meta_ac, ret_bh, indexed); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); +out: + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh, + struct buffer_head *new_blk_bh) +{ + int ret = 0, credits = 0; + handle_t *handle; + struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode); + struct ocfs2_dinode *new_di; + struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb); + int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_block *new_xb = + (struct ocfs2_xattr_block *)new_blk_bh->b_data; + struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = new_blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + + ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh, + &credits, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* One more credits in case we need to add xattr flags in new inode. */ + handle = ocfs2_start_trans(osb, credits + 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) { + ret = ocfs2_journal_access_di(handle, + INODE_CACHE(args->new_inode), + args->new_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode), + new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off, + osb->sb->s_blocksize - header_off); + + ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh, + new_blk_bh, new_xh, &vb, meta_ac, + ocfs2_get_xattr_value_root, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_journal_dirty(handle, new_blk_bh); + + if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) { + new_di = (struct ocfs2_dinode *)args->new_bh->b_data; + spin_lock(&new_oi->ip_lock); + new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL; + new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); + spin_unlock(&new_oi->ip_lock); + + ocfs2_journal_dirty(handle, args->new_bh); + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +struct ocfs2_reflink_xattr_tree_args { + struct ocfs2_xattr_reflink *reflink; + struct buffer_head *old_blk_bh; + struct buffer_head *new_blk_bh; + struct ocfs2_xattr_bucket *old_bucket; + struct ocfs2_xattr_bucket *new_bucket; +}; + +/* + * NOTE: + * We have to handle the case that both old bucket and new bucket + * will call this function to get the right ret_bh. + * So The caller must give us the right bh. + */ +static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_reflink_xattr_tree_args *args = + (struct ocfs2_reflink_xattr_tree_args *)para; + struct ocfs2_xattr_bucket *bucket; + + if (bh == args->old_bucket->bu_bhs[0]) + bucket = args->old_bucket; + else + bucket = args->new_bucket; + + return ocfs2_get_xattr_tree_value_root(sb, bucket, offset, + xv, ret_bh); +} + +struct ocfs2_value_tree_metas { + int num_metas; + int credits; + int num_recs; +}; + +static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_xattr_bucket *bucket = + (struct ocfs2_xattr_bucket *)para; + + return ocfs2_get_xattr_tree_value_root(sb, bucket, offset, + xv, ret_bh); +} + +static int ocfs2_calc_value_tree_metas(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + struct ocfs2_value_tree_metas *metas = + (struct ocfs2_value_tree_metas *)para; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data; + + /* Add the credits for this bucket first. */ + metas->credits += bucket->bu_blocks; + return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0], + xh, &metas->num_metas, + &metas->credits, &metas->num_recs, + ocfs2_value_tree_metas_in_bucket, + bucket); +} + +/* + * Given a xattr extent rec starting from blkno and having len clusters, + * iterate all the buckets calculate how much metadata we need for reflinking + * all the ocfs2_xattr_value_root and lock the allocators accordingly. + */ +static int ocfs2_lock_reflink_xattr_rec_allocators( + struct ocfs2_reflink_xattr_tree_args *args, + struct ocfs2_extent_tree *xt_et, + u64 blkno, u32 len, int *credits, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac) +{ + int ret, num_free_extents; + struct ocfs2_value_tree_metas metas; + struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb); + struct ocfs2_refcount_block *rb; + + memset(&metas, 0, sizeof(metas)); + + ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len, + ocfs2_calc_value_tree_metas, &metas); + if (ret) { + mlog_errno(ret); + goto out; + } + + *credits = metas.credits; + + /* + * Calculate we need for refcount tree change. + * + * We need to add/modify num_recs in refcount tree, so just calculate + * an approximate number we need for refcount tree change. + * Sometimes we need to split the tree, and after split, half recs + * will be moved to the new block, and a new block can only provide + * half number of recs. So we multiple new blocks by 2. + * In the end, we have to add credits for modifying the already + * existed refcount block. + */ + rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data; + metas.num_recs = + (metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) / + ocfs2_refcount_recs_per_rb(osb->sb) * 2; + metas.num_metas += metas.num_recs; + *credits += metas.num_recs + + metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += le16_to_cpu(rb->rf_list.l_tree_depth) * + le16_to_cpu(rb->rf_list.l_next_free_rec) + 1; + else + *credits += 1; + + /* count in the xattr tree change. */ + num_free_extents = ocfs2_num_free_extents(osb, xt_et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < len) + metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el); + + *credits += ocfs2_calc_extend_credits(osb->sb, + xt_et->et_root_el, len); + + if (metas.num_metas) { + ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas, + meta_ac); if (ret) { mlog_errno(ret); goto out; } + } - ret = ocfs2_rm_xattr_cluster(inode, xb_bh, - p_blkno, e_cpos, num_clusters); + if (len) { + ret = ocfs2_reserve_clusters(osb, len, data_ac); + if (ret) + mlog_errno(ret); + } +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_reflink_xattr_buckets(handle_t *handle, + u64 blkno, u64 new_blkno, u32 clusters, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_reflink_xattr_tree_args *args) +{ + int i, j, ret = 0; + struct super_block *sb = args->reflink->old_inode->i_sb; + u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb)); + u32 num_buckets = clusters * bpc; + int bpb = args->old_bucket->bu_blocks; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + + for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) { + ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno); if (ret) { mlog_errno(ret); break; } - if (e_cpos == 0) + ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); + if (ret) { + mlog_errno(ret); break; + } - name_hash = e_cpos - 1; + /* + * The real bucket num in this series of blocks is stored + * in the 1st bucket. + */ + if (i == 0) + num_buckets = le16_to_cpu( + bucket_xh(args->old_bucket)->xh_num_buckets); + + ret = ocfs2_xattr_bucket_journal_access(handle, + args->new_bucket, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + break; + } + + for (j = 0; j < bpb; j++) + memcpy(bucket_block(args->new_bucket, j), + bucket_block(args->old_bucket, j), + sb->s_blocksize); + + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + + ret = ocfs2_reflink_xattr_header(handle, args->reflink, + args->old_bucket->bu_bhs[0], + bucket_xh(args->old_bucket), + args->new_bucket->bu_bhs[0], + bucket_xh(args->new_bucket), + &vb, meta_ac, + ocfs2_get_reflink_xattr_value_root, + args); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * Re-access and dirty the bucket to calculate metaecc. + * Because we may extend the transaction in reflink_xattr_header + * which will let the already accessed block gone. + */ + ret = ocfs2_xattr_bucket_journal_access(handle, + args->new_bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + break; + } + + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + ocfs2_xattr_bucket_relse(args->old_bucket); + ocfs2_xattr_bucket_relse(args->new_bucket); + } + + ocfs2_xattr_bucket_relse(args->old_bucket); + ocfs2_xattr_bucket_relse(args->new_bucket); + return ret; +} +/* + * Create the same xattr extent record in the new inode's xattr tree. + */ +static int ocfs2_reflink_xattr_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para) +{ + int ret, credits = 0; + u32 p_cluster, num_clusters; + u64 new_blkno; + handle_t *handle; + struct ocfs2_reflink_xattr_tree_args *args = + (struct ocfs2_reflink_xattr_tree_args *)para; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_tree_extent_tree(&et, + INODE_CACHE(args->reflink->new_inode), + args->new_blk_bh); + + ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno, + len, &credits, + &meta_ac, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_claim_clusters(osb, handle, data_ac, + len, &p_cluster, &num_clusters); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster); + + mlog(0, "reflink xattr buckets %llu to %llu, len %u\n", + (unsigned long long)blkno, (unsigned long long)new_blkno, len); + ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len, + meta_ac, data_ac, args); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + mlog(0, "insert new xattr extent rec start %llu len %u to %u\n", + (unsigned long long)new_blkno, len, cpos); + ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno, + len, 0, meta_ac); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + return ret; +} + +/* + * Create reflinked xattr buckets. + * We will add bucket one by one, and refcount all the xattrs in the bucket + * if they are stored outside. + */ +static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh, + struct buffer_head *new_blk_bh) +{ + int ret; + struct ocfs2_reflink_xattr_tree_args para; + + memset(¶, 0, sizeof(para)); + para.reflink = args; + para.old_blk_bh = blk_bh; + para.new_blk_bh = new_blk_bh; + + para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode); + if (!para.old_bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode); + if (!para.new_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh, + ocfs2_reflink_xattr_rec, + ¶); + if (ret) + mlog_errno(ret); + +out: + ocfs2_xattr_bucket_free(para.old_bucket); + ocfs2_xattr_bucket_free(para.new_bucket); + return ret; +} + +static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh) +{ + int ret, indexed = 0; + struct buffer_head *new_blk_bh = NULL; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) + indexed = 1; + + ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh, + &new_blk_bh, indexed); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) + ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); + else + ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); + if (ret) + mlog_errno(ret); + +out: + brelse(new_blk_bh); + return ret; +} + +static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe) +{ + int type = ocfs2_xattr_get_type(xe); + + return type != OCFS2_XATTR_INDEX_SECURITY && + type != OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS && + type != OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; +} + +int ocfs2_reflink_xattrs(struct inode *old_inode, + struct buffer_head *old_bh, + struct inode *new_inode, + struct buffer_head *new_bh, + bool preserve_security) +{ + int ret; + struct ocfs2_xattr_reflink args; + struct ocfs2_inode_info *oi = OCFS2_I(old_inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_root_bh = NULL; + + ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb), + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_init_dealloc_ctxt(&dealloc); + + args.old_inode = old_inode; + args.new_inode = new_inode; + args.old_bh = old_bh; + args.new_bh = new_bh; + args.ref_ci = &ref_tree->rf_ci; + args.ref_root_bh = ref_root_bh; + args.dealloc = &dealloc; + if (preserve_security) + args.xattr_reflinked = NULL; + else + args.xattr_reflinked = ocfs2_reflink_xattr_no_security; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_reflink_xattr_inline(&args); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + } + + if (!di->i_xattr_loc) + goto out_unlock; + + ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_reflink_xattr_in_block(&args, blk_bh); + if (ret) + mlog_errno(ret); + + brelse(blk_bh); + +out_unlock: + ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb), + ref_tree, 1); + brelse(ref_root_bh); + + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1); + ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc); } out: @@ -5306,6 +6961,51 @@ out: } /* + * Initialize security and acl for a already created inode. + * Used for reflink a non-preserve-security file. + * + * It uses common api like ocfs2_xattr_set, so the caller + * must not hold any lock expect i_mutex. + */ +int ocfs2_init_security_and_acl(struct inode *dir, + struct inode *inode) +{ + int ret = 0; + struct buffer_head *dir_bh = NULL; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + + ret = ocfs2_init_security_get(inode, dir, &si); + if (!ret) { + ret = ocfs2_xattr_security_set(inode, si.name, + si.value, si.value_len, + XATTR_CREATE); + if (ret) { + mlog_errno(ret); + goto leave; + } + } else if (ret != -EOPNOTSUPP) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_inode_lock(dir, &dir_bh, 0); + if (ret) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); + if (ret) + mlog_errno(ret); + + ocfs2_inode_unlock(dir, 0); + brelse(dir_bh); +leave: + return ret; +} +/* * 'security' attributes support */ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 1ca7e9a..08e3638 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -55,6 +55,8 @@ int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *, int, const char *, const void *, size_t, int, struct ocfs2_alloc_context *, struct ocfs2_alloc_context *); +int ocfs2_has_inline_xattr_value_outside(struct inode *inode, + struct ocfs2_dinode *di); int ocfs2_xattr_remove(struct inode *, struct buffer_head *); int ocfs2_init_security_get(struct inode *, struct inode *, struct ocfs2_security_xattr_info *); @@ -83,5 +85,16 @@ struct ocfs2_xattr_value_buf { struct ocfs2_xattr_value_root *vb_xv; }; - +int ocfs2_xattr_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_reflink_xattrs(struct inode *old_inode, + struct buffer_head *old_bh, + struct inode *new_inode, + struct buffer_head *new_bh, + bool preserve_security); +int ocfs2_init_security_and_acl(struct inode *dir, + struct inode *inode); #endif /* OCFS2_XATTR_H */ diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index c7275cf..b42d624 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -489,7 +489,7 @@ out: return ret; } -struct inode_operations omfs_dir_inops = { +const struct inode_operations omfs_dir_inops = { .lookup = omfs_lookup, .mkdir = omfs_mkdir, .rename = omfs_rename, @@ -498,7 +498,7 @@ struct inode_operations omfs_dir_inops = { .rmdir = omfs_rmdir, }; -struct file_operations omfs_dir_operations = { +const struct file_operations omfs_dir_operations = { .read = generic_read_dir, .readdir = omfs_readdir, .llseek = generic_file_llseek, diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d17e774e..399487c 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, omfs_get_block); } -struct file_operations omfs_file_operations = { +const struct file_operations omfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, @@ -333,11 +333,11 @@ struct file_operations omfs_file_operations = { .splice_read = generic_file_splice_read, }; -struct inode_operations omfs_file_inops = { +const struct inode_operations omfs_file_inops = { .truncate = omfs_truncate }; -struct address_space_operations omfs_aops = { +const struct address_space_operations omfs_aops = { .readpage = omfs_readpage, .readpages = omfs_readpages, .writepage = omfs_writepage, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 379ae5f..f3b7c15 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -278,7 +278,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static struct super_operations omfs_sops = { +static const struct super_operations omfs_sops = { .write_inode = omfs_write_inode, .delete_inode = omfs_delete_inode, .put_super = omfs_put_super, diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h index 2bc0f06..ebe2fdb 100644 --- a/fs/omfs/omfs.h +++ b/fs/omfs/omfs.h @@ -44,16 +44,16 @@ extern int omfs_allocate_range(struct super_block *sb, int min_request, extern int omfs_clear_range(struct super_block *sb, u64 block, int count); /* dir.c */ -extern struct file_operations omfs_dir_operations; -extern struct inode_operations omfs_dir_inops; +extern const struct file_operations omfs_dir_operations; +extern const struct inode_operations omfs_dir_inops; extern int omfs_make_empty(struct inode *inode, struct super_block *sb); extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, u64 fsblock); /* file.c */ -extern struct file_operations omfs_file_operations; -extern struct inode_operations omfs_file_inops; -extern struct address_space_operations omfs_aops; +extern const struct file_operations omfs_file_operations; +extern const struct inode_operations omfs_file_inops; +extern const struct address_space_operations omfs_aops; extern void omfs_make_empty_table(struct buffer_head *bh, int offset); extern int omfs_shrink_inode(struct inode *inode); @@ -199,7 +199,7 @@ out: int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { - int err; + int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ @@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, } /* Remove suid/sgid on truncate too */ - newattrs.ia_valid |= should_remove_suid(dentry); + ret = should_remove_suid(dentry); + if (ret) + newattrs.ia_valid |= ret | ATTR_FORCE; mutex_lock(&dentry->d_inode->i_mutex); - err = notify_change(dentry, &newattrs); + ret = notify_change(dentry, &newattrs); mutex_unlock(&dentry->d_inode->i_mutex); - return err; + return ret; } static long do_sys_truncate(const char __user *pathname, loff_t length) @@ -288,10 +290,9 @@ out: return error; } -SYSCALL_DEFINE2(truncate, const char __user *, path, unsigned long, length) +SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) { - /* on 32-bit boxen it will cut the range 2^31--2^32-1 off */ - return do_sys_truncate(path, (long)length); + return do_sys_truncate(path, length); } static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) @@ -957,6 +958,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, int error; struct file *f; + validate_creds(cred); + /* * We must always pass in a valid mount pointer. Historically * callers got away with not passing it, but we must enforce this at diff --git a/fs/partitions/check.c b/fs/partitions/check.c index ea4e6cb..7b685e1 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - p->in_flight, + part_in_flight(p), jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } +ssize_t part_inflight_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); +} + #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = { &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, + &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -302,7 +312,7 @@ static struct attribute_group part_attr_group = { .attrs = part_attrs, }; -static struct attribute_group *part_attr_groups[] = { +static const struct attribute_group *part_attr_groups[] = { &part_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, @@ -571,7 +581,7 @@ try_scan: } if (from + size > get_capacity(disk)) { - struct block_device_operations *bdops = disk->fops; + const struct block_device_operations *bdops = disk->fops; unsigned long long capacity; printk(KERN_WARNING @@ -777,36 +777,55 @@ pipe_rdwr_release(struct inode *inode, struct file *filp) static int pipe_read_open(struct inode *inode, struct file *filp) { - /* We could have perhaps used atomic_t, but this and friends - below are the only places. So it doesn't seem worthwhile. */ + int ret = -ENOENT; + mutex_lock(&inode->i_mutex); - inode->i_pipe->readers++; + + if (inode->i_pipe) { + ret = 0; + inode->i_pipe->readers++; + } + mutex_unlock(&inode->i_mutex); - return 0; + return ret; } static int pipe_write_open(struct inode *inode, struct file *filp) { + int ret = -ENOENT; + mutex_lock(&inode->i_mutex); - inode->i_pipe->writers++; + + if (inode->i_pipe) { + ret = 0; + inode->i_pipe->writers++; + } + mutex_unlock(&inode->i_mutex); - return 0; + return ret; } static int pipe_rdwr_open(struct inode *inode, struct file *filp) { + int ret = -ENOENT; + mutex_lock(&inode->i_mutex); - if (filp->f_mode & FMODE_READ) - inode->i_pipe->readers++; - if (filp->f_mode & FMODE_WRITE) - inode->i_pipe->writers++; + + if (inode->i_pipe) { + ret = 0; + if (filp->f_mode & FMODE_READ) + inode->i_pipe->readers++; + if (filp->f_mode & FMODE_WRITE) + inode->i_pipe->writers++; + } + mutex_unlock(&inode->i_mutex); - return 0; + return ret; } /* diff --git a/fs/proc/array.c b/fs/proc/array.c index 725a650..07f77a7 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -82,6 +82,7 @@ #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/tracehook.h> +#include <linux/swapops.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -321,6 +322,94 @@ static inline void task_context_switch_counts(struct seq_file *m, p->nivcsw); } +#ifdef CONFIG_MMU + +struct stack_stats { + struct vm_area_struct *vma; + unsigned long startpage; + unsigned long usage; +}; + +static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct stack_stats *ss = walk->private; + struct vm_area_struct *vma = ss->vma; + pte_t *pte, ptent; + spinlock_t *ptl; + int ret = 0; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + +#ifdef CONFIG_STACK_GROWSUP + if (pte_present(ptent) || is_swap_pte(ptent)) + ss->usage = addr - ss->startpage + PAGE_SIZE; +#else + if (pte_present(ptent) || is_swap_pte(ptent)) { + ss->usage = ss->startpage - addr + PAGE_SIZE; + pte++; + ret = 1; + break; + } +#endif + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return ret; +} + +static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma, + struct task_struct *task) +{ + struct stack_stats ss; + struct mm_walk stack_walk = { + .pmd_entry = stack_usage_pte_range, + .mm = vma->vm_mm, + .private = &ss, + }; + + if (!vma->vm_mm || is_vm_hugetlb_page(vma)) + return 0; + + ss.vma = vma; + ss.startpage = task->stack_start & PAGE_MASK; + ss.usage = 0; + +#ifdef CONFIG_STACK_GROWSUP + walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end, + &stack_walk); +#else + walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE, + &stack_walk); +#endif + return ss.usage; +} + +static inline void task_show_stack_usage(struct seq_file *m, + struct task_struct *task) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = get_task_mm(task); + + if (mm) { + down_read(&mm->mmap_sem); + vma = find_vma(mm, task->stack_start); + if (vma) + seq_printf(m, "Stack usage:\t%lu kB\n", + get_stack_usage_in_bytes(vma, task) >> 10); + + up_read(&mm->mmap_sem); + mmput(mm); + } +} +#else +static void task_show_stack_usage(struct seq_file *m, struct task_struct *task) +{ +} +#endif /* CONFIG_MMU */ + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -340,6 +429,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_show_regs(m, task); #endif task_context_switch_counts(m, task); + task_show_stack_usage(m, task); return 0; } @@ -481,7 +571,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, - (permitted && mm) ? mm->start_stack : 0, + (permitted) ? task->stack_start : 0, esp, eip, /* The signal information here is obsolete. diff --git a/fs/proc/base.c b/fs/proc/base.c index 6f742f6..837469a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -447,7 +447,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer) do_posix_clock_monotonic_gettime(&uptime); read_lock(&tasklist_lock); - points = badness(task, uptime.tv_sec); + points = badness(task->group_leader, uptime.tv_sec); read_unlock(&tasklist_lock); return sprintf(buffer, "%lu\n", points); } @@ -458,7 +458,7 @@ struct limit_names { }; static const struct limit_names lnames[RLIM_NLIMITS] = { - [RLIMIT_CPU] = {"Max cpu time", "ms"}, + [RLIMIT_CPU] = {"Max cpu time", "seconds"}, [RLIMIT_FSIZE] = {"Max file size", "bytes"}, [RLIMIT_DATA] = {"Max data size", "bytes"}, [RLIMIT_STACK] = {"Max stack size", "bytes"}, @@ -999,11 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); char buffer[PROC_NUMBUF]; size_t len; - int oom_adjust; + int oom_adjust = OOM_DISABLE; + unsigned long flags; if (!task) return -ESRCH; - oom_adjust = task->oomkilladj; + + if (lock_task_sighand(task, &flags)) { + oom_adjust = task->signal->oom_adj; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1015,32 +1021,44 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF], *end; - int oom_adjust; + char buffer[PROC_NUMBUF]; + long oom_adjust; + unsigned long flags; + int err; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - oom_adjust = simple_strtol(buffer, &end, 0); + + err = strict_strtol(strstrip(buffer), 0, &oom_adjust); + if (err) + return -EINVAL; if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && oom_adjust != OOM_DISABLE) return -EINVAL; - if (*end == '\n') - end++; + task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { + if (!lock_task_sighand(task, &flags)) { + put_task_struct(task); + return -ESRCH; + } + + if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { + unlock_task_sighand(task, &flags); put_task_struct(task); return -EACCES; } - task->oomkilladj = oom_adjust; + + task->signal->oom_adj = oom_adjust; + + unlock_task_sighand(task, &flags); put_task_struct(task); - if (end - buffer == 0) - return -EIO; - return end - buffer; + + return count; } static const struct file_operations proc_oom_adjust_operations = { @@ -1169,17 +1187,16 @@ static ssize_t proc_fault_inject_write(struct file * file, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - make_it_fail = simple_strtol(buffer, &end, 0); - if (*end == '\n') - end++; + make_it_fail = simple_strtol(strstrip(buffer), &end, 0); + if (*end) + return -EINVAL; task = get_proc_task(file->f_dentry->d_inode); if (!task) return -ESRCH; task->make_it_fail = make_it_fail; put_task_struct(task); - if (end - buffer == 0) - return -EIO; - return end - buffer; + + return count; } static const struct file_operations proc_fault_inject_operations = { @@ -2586,9 +2603,6 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) dput(dentry); } - if (tgid == 0) - goto out; - name.name = buf; name.len = snprintf(buf, sizeof(buf), "%d", tgid); leader = d_hash_and_lookup(mnt->mnt_root, &name); @@ -2645,17 +2659,16 @@ out: void proc_flush_task(struct task_struct *task) { int i; - struct pid *pid, *tgid = NULL; + struct pid *pid, *tgid; struct upid *upid; pid = task_pid(task); - if (thread_group_leader(task)) - tgid = task_tgid(task); + tgid = task_tgid(task); for (i = 0; i <= pid->level; i++) { upid = &pid->numbers[i]; proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, - tgid ? tgid->numbers[i].nr : 0); + tgid->numbers[i].nr); } upid = &pid->numbers[pid->level]; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 59b43a0..a44a789 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -17,9 +17,14 @@ #include <linux/elfcore.h> #include <linux/vmalloc.h> #include <linux/highmem.h> +#include <linux/bootmem.h> #include <linux/init.h> #include <asm/uaccess.h> #include <asm/io.h> +#include <linux/list.h> +#include <linux/ioport.h> +#include <linux/memory.h> +#include <asm/sections.h> #define CORE_STR "CORE" @@ -29,17 +34,6 @@ static struct proc_dir_entry *proc_root_kcore; -static int open_kcore(struct inode * inode, struct file * filp) -{ - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -} - -static ssize_t read_kcore(struct file *, char __user *, size_t, loff_t *); - -static const struct file_operations proc_kcore_operations = { - .read = read_kcore, - .open = open_kcore, -}; #ifndef kc_vaddr_to_offset #define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET) @@ -57,18 +51,19 @@ struct memelfnote void *data; }; -static struct kcore_list *kclist; +static LIST_HEAD(kclist_head); static DEFINE_RWLOCK(kclist_lock); +static int kcore_need_update = 1; void -kclist_add(struct kcore_list *new, void *addr, size_t size) +kclist_add(struct kcore_list *new, void *addr, size_t size, int type) { new->addr = (unsigned long)addr; new->size = size; + new->type = type; write_lock(&kclist_lock); - new->next = kclist; - kclist = new; + list_add_tail(&new->list, &kclist_head); write_unlock(&kclist_lock); } @@ -80,7 +75,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) *nphdr = 1; /* PT_NOTE */ size = 0; - for (m=kclist; m; m=m->next) { + list_for_each_entry(m, &kclist_head, list) { try = kc_vaddr_to_offset((size_t)m->addr + m->size); if (try > size) size = try; @@ -97,6 +92,177 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) return size + *elf_buflen; } +static void free_kclist_ents(struct list_head *head) +{ + struct kcore_list *tmp, *pos; + + list_for_each_entry_safe(pos, tmp, head, list) { + list_del(&pos->list); + kfree(pos); + } +} +/* + * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list. + */ +static void __kcore_update_ram(struct list_head *list) +{ + int nphdr; + size_t size; + struct kcore_list *tmp, *pos; + LIST_HEAD(garbage); + + write_lock(&kclist_lock); + if (kcore_need_update) { + list_for_each_entry_safe(pos, tmp, &kclist_head, list) { + if (pos->type == KCORE_RAM + || pos->type == KCORE_VMEMMAP) + list_move(&pos->list, &garbage); + } + list_splice_tail(list, &kclist_head); + } else + list_splice(list, &garbage); + kcore_need_update = 0; + proc_root_kcore->size = get_kcore_size(&nphdr, &size); + write_unlock(&kclist_lock); + + free_kclist_ents(&garbage); +} + + +#ifdef CONFIG_HIGHMEM +/* + * If no highmem, we can assume [0...max_low_pfn) continuous range of memory + * because memory hole is not as big as !HIGHMEM case. + * (HIGHMEM is special because part of memory is _invisible_ from the kernel.) + */ +static int kcore_update_ram(void) +{ + LIST_HEAD(head); + struct kcore_list *ent; + int ret = 0; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->addr = (unsigned long)__va(0); + ent->size = max_low_pfn << PAGE_SHIFT; + ent->type = KCORE_RAM; + list_add(&ent->list, &head); + __kcore_update_ram(&head); + return ret; +} + +#else /* !CONFIG_HIGHMEM */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* calculate vmemmap's address from given system ram pfn and register it */ +int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +{ + unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT; + unsigned long nr_pages = ent->size >> PAGE_SHIFT; + unsigned long start, end; + struct kcore_list *vmm, *tmp; + + + start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK; + end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1; + end = ALIGN(end, PAGE_SIZE); + /* overlap check (because we have to align page */ + list_for_each_entry(tmp, head, list) { + if (tmp->type != KCORE_VMEMMAP) + continue; + if (start < tmp->addr + tmp->size) + if (end > tmp->addr) + end = tmp->addr; + } + if (start < end) { + vmm = kmalloc(sizeof(*vmm), GFP_KERNEL); + if (!vmm) + return 0; + vmm->addr = start; + vmm->size = end - start; + vmm->type = KCORE_VMEMMAP; + list_add_tail(&vmm->list, head); + } + return 1; + +} +#else +int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +{ + return 1; +} + +#endif + +static int +kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) +{ + struct list_head *head = (struct list_head *)arg; + struct kcore_list *ent; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT)); + ent->size = nr_pages << PAGE_SHIFT; + + /* Sanity check: Can happen in 32bit arch...maybe */ + if (ent->addr < (unsigned long) __va(0)) + goto free_out; + + /* cut not-mapped area. ....from ppc-32 code. */ + if (ULONG_MAX - ent->addr < ent->size) + ent->size = ULONG_MAX - ent->addr; + + /* cut when vmalloc() area is higher than direct-map area */ + if (VMALLOC_START > (unsigned long)__va(0)) { + if (ent->addr > VMALLOC_START) + goto free_out; + if (VMALLOC_START - ent->addr < ent->size) + ent->size = VMALLOC_START - ent->addr; + } + + ent->type = KCORE_RAM; + list_add_tail(&ent->list, head); + + if (!get_sparsemem_vmemmap_info(ent, head)) { + list_del(&ent->list); + goto free_out; + } + + return 0; +free_out: + kfree(ent); + return 1; +} + +static int kcore_update_ram(void) +{ + int nid, ret; + unsigned long end_pfn; + LIST_HEAD(head); + + /* Not inialized....update now */ + /* find out "max pfn" */ + end_pfn = 0; + for_each_node_state(nid, N_HIGH_MEMORY) { + unsigned long node_end; + node_end = NODE_DATA(nid)->node_start_pfn + + NODE_DATA(nid)->node_spanned_pages; + if (end_pfn < node_end) + end_pfn = node_end; + } + /* scan 0 to max_pfn */ + ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private); + if (ret) { + free_kclist_ents(&head); + return -ENOMEM; + } + __kcore_update_ram(&head); + return ret; +} +#endif /* CONFIG_HIGHMEM */ /*****************************************************************************/ /* @@ -192,7 +358,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) nhdr->p_align = 0; /* setup ELF PT_LOAD program header for every area */ - for (m=kclist; m; m=m->next) { + list_for_each_entry(m, &kclist_head, list) { phdr = (struct elf_phdr *) bufp; bufp += sizeof(struct elf_phdr); offset += sizeof(struct elf_phdr); @@ -265,7 +431,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) unsigned long start; read_lock(&kclist_lock); - proc_root_kcore->size = size = get_kcore_size(&nphdr, &elf_buflen); + size = get_kcore_size(&nphdr, &elf_buflen); + if (buflen == 0 || *fpos >= size) { read_unlock(&kclist_lock); return 0; @@ -317,7 +484,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) struct kcore_list *m; read_lock(&kclist_lock); - for (m=kclist; m; m=m->next) { + list_for_each_entry(m, &kclist_head, list) { if (start >= m->addr && start < (m->addr+m->size)) break; } @@ -326,45 +493,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (m == NULL) { if (clear_user(buffer, tsz)) return -EFAULT; - } else if (is_vmalloc_addr((void *)start)) { + } else if (is_vmalloc_or_module_addr((void *)start)) { char * elf_buf; - struct vm_struct *m; - unsigned long curstart = start; - unsigned long cursize = tsz; elf_buf = kzalloc(tsz, GFP_KERNEL); if (!elf_buf) return -ENOMEM; - - read_lock(&vmlist_lock); - for (m=vmlist; m && cursize; m=m->next) { - unsigned long vmstart; - unsigned long vmsize; - unsigned long msize = m->size - PAGE_SIZE; - - if (((unsigned long)m->addr + msize) < - curstart) - continue; - if ((unsigned long)m->addr > (curstart + - cursize)) - break; - vmstart = (curstart < (unsigned long)m->addr ? - (unsigned long)m->addr : curstart); - if (((unsigned long)m->addr + msize) > - (curstart + cursize)) - vmsize = curstart + cursize - vmstart; - else - vmsize = (unsigned long)m->addr + - msize - vmstart; - curstart = vmstart + vmsize; - cursize -= vmsize; - /* don't dump ioremap'd stuff! (TA) */ - if (m->flags & VM_IOREMAP) - continue; - memcpy(elf_buf + (vmstart - start), - (char *)vmstart, vmsize); - } - read_unlock(&vmlist_lock); + vread(elf_buf, (char *)start, tsz); + /* we have to zero-fill user buffer even if no read */ if (copy_to_user(buffer, elf_buf, tsz)) { kfree(elf_buf); return -EFAULT; @@ -402,12 +538,96 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) return acc; } + +static int open_kcore(struct inode *inode, struct file *filp) +{ + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + if (kcore_need_update) + kcore_update_ram(); + if (i_size_read(inode) != proc_root_kcore->size) { + mutex_lock(&inode->i_mutex); + i_size_write(inode, proc_root_kcore->size); + mutex_unlock(&inode->i_mutex); + } + return 0; +} + + +static const struct file_operations proc_kcore_operations = { + .read = read_kcore, + .open = open_kcore, +}; + +#ifdef CONFIG_MEMORY_HOTPLUG +/* just remember that we have to update kcore */ +static int __meminit kcore_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_ONLINE: + case MEM_OFFLINE: + write_lock(&kclist_lock); + kcore_need_update = 1; + write_unlock(&kclist_lock); + } + return NOTIFY_OK; +} +#endif + + +static struct kcore_list kcore_vmalloc; + +#ifdef CONFIG_ARCH_PROC_KCORE_TEXT +static struct kcore_list kcore_text; +/* + * If defined, special segment is used for mapping kernel text instead of + * direct-map area. We need to create special TEXT section. + */ +static void __init proc_kcore_text_init(void) +{ + kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT); +} +#else +static void __init proc_kcore_text_init(void) +{ +} +#endif + +#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) +/* + * MODULES_VADDR has no intersection with VMALLOC_ADDR. + */ +struct kcore_list kcore_modules; +static void __init add_modules_range(void) +{ + kclist_add(&kcore_modules, (void *)MODULES_VADDR, + MODULES_END - MODULES_VADDR, KCORE_VMALLOC); +} +#else +static void __init add_modules_range(void) +{ +} +#endif + static int __init proc_kcore_init(void) { - proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations); - if (proc_root_kcore) - proc_root_kcore->size = - (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; + proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, + &proc_kcore_operations); + if (!proc_root_kcore) { + printk(KERN_ERR "couldn't create /proc/kcore\n"); + return 0; /* Always returns 0. */ + } + /* Store text area if it's special */ + proc_kcore_text_init(); + /* Store vmalloc area */ + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + VMALLOC_END - VMALLOC_START, KCORE_VMALLOC); + add_modules_range(); + /* Store direct-map area from physical memory map */ + kcore_update_ram(); + hotplug_memory_notifier(kcore_callback, 0); + return 0; } module_init(proc_kcore_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index d5c410d..a65239c 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -81,9 +81,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "Writeback: %8lu kB\n" "AnonPages: %8lu kB\n" "Mapped: %8lu kB\n" + "Shmem: %8lu kB\n" "Slab: %8lu kB\n" "SReclaimable: %8lu kB\n" "SUnreclaim: %8lu kB\n" + "KernelStack: %8lu kB\n" "PageTables: %8lu kB\n" #ifdef CONFIG_QUICKLIST "Quicklists: %8lu kB\n" @@ -95,7 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "Committed_AS: %8lu kB\n" "VmallocTotal: %8lu kB\n" "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n", + "VmallocChunk: %8lu kB\n" +#ifdef CONFIG_MEMORY_FAILURE + "HardwareCorrupted: %5lu kB\n" +#endif + , K(i.totalram), K(i.freeram), K(i.bufferram), @@ -124,10 +130,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(global_page_state(NR_WRITEBACK)), K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), + K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_SLAB_RECLAIMABLE)), K(global_page_state(NR_SLAB_UNRECLAIMABLE)), + global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, K(global_page_state(NR_PAGETABLE)), #ifdef CONFIG_QUICKLIST K(quicklist_total_size()), @@ -140,6 +148,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) (unsigned long)VMALLOC_TOTAL >> 10, vmi.used >> 10, vmi.largest_chunk >> 10 +#ifdef CONFIG_MEMORY_FAILURE + ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) +#endif ); hugetlb_report_meminfo(m); diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 7e14d1a..9fe7d7e 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -109,7 +109,7 @@ static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) return rb_next((struct rb_node *) v); } -static struct seq_operations proc_nommu_region_list_seqop = { +static const struct seq_operations proc_nommu_region_list_seqop = { .start = nommu_region_list_start, .next = nommu_region_list_next, .stop = nommu_region_list_stop, diff --git a/fs/proc/page.c b/fs/proc/page.c index 2707c6c..5033ce0 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -2,6 +2,7 @@ #include <linux/compiler.h> #include <linux/fs.h> #include <linux/init.h> +#include <linux/ksm.h> #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/proc_fs.h> @@ -93,8 +94,11 @@ static const struct file_operations proc_kpagecount_operations = { #define KPF_COMPOUND_TAIL 16 #define KPF_HUGE 17 #define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 #define KPF_NOPAGE 20 +#define KPF_KSM 21 + /* kernel hacking assistances * WARNING: subject to change, never rely on them! */ @@ -137,6 +141,8 @@ static u64 get_uflags(struct page *page) u |= 1 << KPF_MMAP; if (PageAnon(page)) u |= 1 << KPF_ANON; + if (PageKsm(page)) + u |= 1 << KPF_KSM; /* * compound pages: export both head/tail info @@ -175,6 +181,10 @@ static u64 get_uflags(struct page *page) u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); +#ifdef CONFIG_MEMORY_FAILURE + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); +#endif + #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); #endif diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 9b1e4e9..f667e8a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -153,7 +153,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, /* careful: calling conventions are nasty here */ res = count; - error = table->proc_handler(table, write, filp, buf, &res, ppos); + error = table->proc_handler(table, write, buf, &res, ppos); if (!error) error = res; out: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9bd8be1..2a1bef9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -243,6 +243,25 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) } else if (vma->vm_start <= mm->start_stack && vma->vm_end >= mm->start_stack) { name = "[stack]"; + } else { + unsigned long stack_start; + struct proc_maps_private *pmp; + + pmp = m->private; + stack_start = pmp->task->stack_start; + + if (vma->vm_start <= stack_start && + vma->vm_end >= stack_start) { + pad_len_spaces(m, len); + seq_printf(m, + "[threadstack:%08lx]", +#ifdef CONFIG_STACK_GROWSUP + vma->vm_end - stack_start +#else + stack_start - vma->vm_start +#endif + ); + } } } else { name = "[vdso]"; @@ -465,23 +484,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } +#define CLEAR_REFS_ALL 1 +#define CLEAR_REFS_ANON 2 +#define CLEAR_REFS_MAPPED 3 + static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF], *end; + char buffer[PROC_NUMBUF]; struct mm_struct *mm; struct vm_area_struct *vma; + long type; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - if (!simple_strtol(buffer, &end, 0)) + if (strict_strtol(strstrip(buffer), 10, &type)) + return -EINVAL; + if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) return -EINVAL; - if (*end == '\n') - end++; task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; @@ -494,18 +518,31 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { clear_refs_walk.private = vma; - if (!is_vm_hugetlb_page(vma)) - walk_page_range(vma->vm_start, vma->vm_end, - &clear_refs_walk); + if (is_vm_hugetlb_page(vma)) + continue; + /* + * Writing 1 to /proc/pid/clear_refs affects all pages. + * + * Writing 2 to /proc/pid/clear_refs only affects + * Anonymous pages. + * + * Writing 3 to /proc/pid/clear_refs only affects file + * mapped pages. + */ + if (type == CLEAR_REFS_ANON && vma->vm_file) + continue; + if (type == CLEAR_REFS_MAPPED && !vma->vm_file) + continue; + walk_page_range(vma->vm_start, vma->vm_end, + &clear_refs_walk); } flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); } put_task_struct(task); - if (end - buffer == 0) - return -EIO; - return end - buffer; + + return count; } const struct file_operations proc_clear_refs_operations = { diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 0c10a0b..766b1d4 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -4,13 +4,18 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/time.h> +#include <linux/kernel_stat.h> #include <asm/cputime.h> static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec uptime; struct timespec idle; - cputime_t idletime = cputime_add(init_task.utime, init_task.stime); + int i; + cputime_t idletime = cputime_zero; + + for_each_possible_cpu(i) + idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle); do_posix_clock_monotonic_gettime(&uptime); monotonic_to_bootbased(&uptime); diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig index be8e0e1..5f60899 100644 --- a/fs/qnx4/Kconfig +++ b/fs/qnx4/Kconfig @@ -6,20 +6,9 @@ config QNX4FS_FS QNX 4 and QNX 6 (the latter is also called QNX RTP). Further information is available at <http://www.qnx.com/>. Say Y if you intend to mount QNX hard disks or floppies. - Unless you say Y to "QNX4FS read-write support" below, you will - only be able to read these file systems. To compile this file system support as a module, choose M here: the module will be called qnx4. If you don't know whether you need it, then you don't need it: answer N. - -config QNX4FS_RW - bool "QNX4FS write support (DANGEROUS)" - depends on QNX4FS_FS && EXPERIMENTAL && BROKEN - help - Say Y if you want to test write support for QNX4 file systems. - - It's currently broken, so for now: - answer N. diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile index e4d408c..4a283b3 100644 --- a/fs/qnx4/Makefile +++ b/fs/qnx4/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_QNX4FS_FS) += qnx4.o -qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o +qnx4-objs := inode.o dir.o namei.o bitmap.o diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c index e1cd061..0afba06 100644 --- a/fs/qnx4/bitmap.c +++ b/fs/qnx4/bitmap.c @@ -78,84 +78,3 @@ unsigned long qnx4_count_free_blocks(struct super_block *sb) return total_free; } - -#ifdef CONFIG_QNX4FS_RW - -int qnx4_is_free(struct super_block *sb, long block) -{ - int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1; - int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size); - struct buffer_head *bh; - const char *g; - int ret = -EIO; - - start += block / (QNX4_BLOCK_SIZE * 8); - QNX4DEBUG(("qnx4: is_free requesting block [%lu], bitmap in block [%lu]\n", - (unsigned long) block, (unsigned long) start)); - (void) size; /* CHECKME */ - bh = sb_bread(sb, start); - if (bh == NULL) { - return -EIO; - } - g = bh->b_data + (block % QNX4_BLOCK_SIZE); - if (((*g) & (1 << (block % 8))) == 0) { - QNX4DEBUG(("qnx4: is_free -> block is free\n")); - ret = 1; - } else { - QNX4DEBUG(("qnx4: is_free -> block is busy\n")); - ret = 0; - } - brelse(bh); - - return ret; -} - -int qnx4_set_bitmap(struct super_block *sb, long block, int busy) -{ - int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1; - int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size); - struct buffer_head *bh; - char *g; - - start += block / (QNX4_BLOCK_SIZE * 8); - QNX4DEBUG(("qnx4: set_bitmap requesting block [%lu], bitmap in block [%lu]\n", - (unsigned long) block, (unsigned long) start)); - (void) size; /* CHECKME */ - bh = sb_bread(sb, start); - if (bh == NULL) { - return -EIO; - } - g = bh->b_data + (block % QNX4_BLOCK_SIZE); - if (busy == 0) { - (*g) &= ~(1 << (block % 8)); - } else { - (*g) |= (1 << (block % 8)); - } - mark_buffer_dirty(bh); - brelse(bh); - - return 0; -} - -static void qnx4_clear_inode(struct inode *inode) -{ - struct qnx4_inode_entry *qnx4_ino = qnx4_raw_inode(inode); - /* What for? */ - memset(qnx4_ino->di_fname, 0, sizeof qnx4_ino->di_fname); - qnx4_ino->di_size = 0; - qnx4_ino->di_num_xtnts = 0; - qnx4_ino->di_mode = 0; - qnx4_ino->di_status = 0; -} - -void qnx4_free_inode(struct inode *inode) -{ - if (inode->i_ino < 1) { - printk("free_inode: inode 0 or nonexistent inode\n"); - return; - } - qnx4_clear_inode(inode); - clear_inode(inode); -} - -#endif diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 003c68f..86cc39c 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -85,9 +85,4 @@ const struct file_operations qnx4_dir_operations = const struct inode_operations qnx4_dir_inode_operations = { .lookup = qnx4_lookup, -#ifdef CONFIG_QNX4FS_RW - .create = qnx4_create, - .unlink = qnx4_unlink, - .rmdir = qnx4_rmdir, -#endif }; diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c deleted file mode 100644 index 09b170a..0000000 --- a/fs/qnx4/file.c +++ /dev/null @@ -1,40 +0,0 @@ -/* - * QNX4 file system, Linux implementation. - * - * Version : 0.2.1 - * - * Using parts of the xiafs filesystem. - * - * History : - * - * 25-05-1998 by Richard Frowijn : first release. - * 21-06-1998 by Frank Denis : wrote qnx4_readpage to use generic_file_read. - * 27-06-1998 by Frank Denis : file overwriting. - */ - -#include "qnx4.h" - -/* - * We have mostly NULL's here: the current defaults are ok for - * the qnx4 filesystem. - */ -const struct file_operations qnx4_file_operations = -{ - .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .mmap = generic_file_mmap, - .splice_read = generic_file_splice_read, -#ifdef CONFIG_QNX4FS_RW - .write = do_sync_write, - .aio_write = generic_file_aio_write, - .fsync = simple_fsync, -#endif -}; - -const struct inode_operations qnx4_file_inode_operations = -{ -#ifdef CONFIG_QNX4FS_RW - .truncate = qnx4_truncate, -#endif -}; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 681df5f..d2cd179 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -28,73 +28,6 @@ static const struct super_operations qnx4_sops; -#ifdef CONFIG_QNX4FS_RW - -static void qnx4_delete_inode(struct inode *inode) -{ - QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino)); - truncate_inode_pages(&inode->i_data, 0); - inode->i_size = 0; - qnx4_truncate(inode); - lock_kernel(); - qnx4_free_inode(inode); - unlock_kernel(); -} - -static int qnx4_write_inode(struct inode *inode, int do_sync) -{ - struct qnx4_inode_entry *raw_inode; - int block, ino; - struct buffer_head *bh; - ino = inode->i_ino; - - QNX4DEBUG(("qnx4: write inode 1.\n")); - if (inode->i_nlink == 0) { - return 0; - } - if (!ino) { - printk("qnx4: bad inode number on dev %s: %d is out of range\n", - inode->i_sb->s_id, ino); - return -EIO; - } - QNX4DEBUG(("qnx4: write inode 2.\n")); - block = ino / QNX4_INODES_PER_BLOCK; - lock_kernel(); - if (!(bh = sb_bread(inode->i_sb, block))) { - printk("qnx4: major problem: unable to read inode from dev " - "%s\n", inode->i_sb->s_id); - unlock_kernel(); - return -EIO; - } - raw_inode = ((struct qnx4_inode_entry *) bh->b_data) + - (ino % QNX4_INODES_PER_BLOCK); - raw_inode->di_mode = cpu_to_le16(inode->i_mode); - raw_inode->di_uid = cpu_to_le16(fs_high2lowuid(inode->i_uid)); - raw_inode->di_gid = cpu_to_le16(fs_high2lowgid(inode->i_gid)); - raw_inode->di_nlink = cpu_to_le16(inode->i_nlink); - raw_inode->di_size = cpu_to_le32(inode->i_size); - raw_inode->di_mtime = cpu_to_le32(inode->i_mtime.tv_sec); - raw_inode->di_atime = cpu_to_le32(inode->i_atime.tv_sec); - raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec); - raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks); - mark_buffer_dirty(bh); - if (do_sync) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - printk("qnx4: IO error syncing inode [%s:%08x]\n", - inode->i_sb->s_id, ino); - brelse(bh); - unlock_kernel(); - return -EIO; - } - } - brelse(bh); - unlock_kernel(); - return 0; -} - -#endif - static void qnx4_put_super(struct super_block *sb); static struct inode *qnx4_alloc_inode(struct super_block *sb); static void qnx4_destroy_inode(struct inode *inode); @@ -108,10 +41,6 @@ static const struct super_operations qnx4_sops = .put_super = qnx4_put_super, .statfs = qnx4_statfs, .remount_fs = qnx4_remount, -#ifdef CONFIG_QNX4FS_RW - .write_inode = qnx4_write_inode, - .delete_inode = qnx4_delete_inode, -#endif }; static int qnx4_remount(struct super_block *sb, int *flags, char *data) @@ -120,15 +49,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data) qs = qnx4_sb(sb); qs->Version = QNX4_VERSION; -#ifndef CONFIG_QNX4FS_RW *flags |= MS_RDONLY; -#endif - if (*flags & MS_RDONLY) { - return 0; - } - - mark_buffer_dirty(qs->sb_buf); - return 0; } @@ -354,9 +275,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) } s->s_op = &qnx4_sops; s->s_magic = QNX4_SUPER_MAGIC; -#ifndef CONFIG_QNX4FS_RW s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ -#endif qnx4_sb(s)->sb_buf = bh; qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data; @@ -489,8 +408,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE); if (S_ISREG(inode->i_mode)) { - inode->i_op = &qnx4_file_inode_operations; - inode->i_fop = &qnx4_file_operations; + inode->i_fop = &generic_ro_fops; inode->i_mapping->a_ops = &qnx4_aops; qnx4_i(inode)->mmu_private = inode->i_size; } else if (S_ISDIR(inode->i_mode)) { diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index 5972ed2..ae1e7ed 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -134,108 +134,3 @@ out: return NULL; } - -#ifdef CONFIG_QNX4FS_RW -int qnx4_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) -{ - QNX4DEBUG(("qnx4: qnx4_create\n")); - if (dir == NULL) { - return -ENOENT; - } - return -ENOSPC; -} - -int qnx4_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct buffer_head *bh; - struct qnx4_inode_entry *de; - struct inode *inode; - int retval; - int ino; - - QNX4DEBUG(("qnx4: qnx4_rmdir [%s]\n", dentry->d_name.name)); - lock_kernel(); - bh = qnx4_find_entry(dentry->d_name.len, dir, dentry->d_name.name, - &de, &ino); - if (bh == NULL) { - unlock_kernel(); - return -ENOENT; - } - inode = dentry->d_inode; - if (inode->i_ino != ino) { - retval = -EIO; - goto end_rmdir; - } -#if 0 - if (!empty_dir(inode)) { - retval = -ENOTEMPTY; - goto end_rmdir; - } -#endif - if (inode->i_nlink != 2) { - QNX4DEBUG(("empty directory has nlink!=2 (%d)\n", inode->i_nlink)); - } - QNX4DEBUG(("qnx4: deleting directory\n")); - de->di_status = 0; - memset(de->di_fname, 0, sizeof de->di_fname); - de->di_mode = 0; - mark_buffer_dirty_inode(bh, dir); - clear_nlink(inode); - mark_inode_dirty(inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - inode_dec_link_count(dir); - retval = 0; - - end_rmdir: - brelse(bh); - - unlock_kernel(); - return retval; -} - -int qnx4_unlink(struct inode *dir, struct dentry *dentry) -{ - struct buffer_head *bh; - struct qnx4_inode_entry *de; - struct inode *inode; - int retval; - int ino; - - QNX4DEBUG(("qnx4: qnx4_unlink [%s]\n", dentry->d_name.name)); - lock_kernel(); - bh = qnx4_find_entry(dentry->d_name.len, dir, dentry->d_name.name, - &de, &ino); - if (bh == NULL) { - unlock_kernel(); - return -ENOENT; - } - inode = dentry->d_inode; - if (inode->i_ino != ino) { - retval = -EIO; - goto end_unlink; - } - retval = -EPERM; - if (!inode->i_nlink) { - QNX4DEBUG(("Deleting nonexistent file (%s:%lu), %d\n", - inode->i_sb->s_id, - inode->i_ino, inode->i_nlink)); - inode->i_nlink = 1; - } - de->di_status = 0; - memset(de->di_fname, 0, sizeof de->di_fname); - de->di_mode = 0; - mark_buffer_dirty_inode(bh, dir); - dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - mark_inode_dirty(dir); - inode->i_ctime = dir->i_ctime; - inode_dec_link_count(inode); - retval = 0; - -end_unlink: - unlock_kernel(); - brelse(bh); - - return retval; -} -#endif diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h index 9efc089..33a6085 100644 --- a/fs/qnx4/qnx4.h +++ b/fs/qnx4/qnx4.h @@ -29,17 +29,9 @@ extern unsigned long qnx4_block_map(struct inode *inode, long iblock); extern struct buffer_head *qnx4_bread(struct inode *, int, int); -extern const struct inode_operations qnx4_file_inode_operations; extern const struct inode_operations qnx4_dir_inode_operations; -extern const struct file_operations qnx4_file_operations; extern const struct file_operations qnx4_dir_operations; extern int qnx4_is_free(struct super_block *sb, long block); -extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy); -extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd); -extern void qnx4_truncate(struct inode *inode); -extern void qnx4_free_inode(struct inode *inode); -extern int qnx4_unlink(struct inode *dir, struct dentry *dentry); -extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry); static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) { diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c deleted file mode 100644 index d94d9ee..0000000 --- a/fs/qnx4/truncate.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - * QNX4 file system, Linux implementation. - * - * Version : 0.1 - * - * Using parts of the xiafs filesystem. - * - * History : - * - * 30-06-1998 by Frank DENIS : ugly filler. - */ - -#include <linux/smp_lock.h> -#include "qnx4.h" - -#ifdef CONFIG_QNX4FS_RW - -void qnx4_truncate(struct inode *inode) -{ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) { - return; - } - lock_kernel(); - if (!(S_ISDIR(inode->i_mode))) { - /* TODO */ - } - QNX4DEBUG(("qnx4: qnx4_truncate called\n")); - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - unlock_kernel(); -} - -#endif diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 38f7bd5..39b49c4 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1839,7 +1839,7 @@ EXPORT_SYMBOL(dquot_commit_info); /* * Definitions of diskquota operations. */ -struct dquot_operations dquot_operations = { +const struct dquot_operations dquot_operations = { .initialize = dquot_initialize, .drop = dquot_drop, .alloc_space = dquot_alloc_space, @@ -2461,7 +2461,7 @@ out: } EXPORT_SYMBOL(vfs_set_dqinfo); -struct quotactl_ops vfs_quotactl_ops = { +const struct quotactl_ops vfs_quotactl_ops = { .quota_on = vfs_quota_on, .quota_off = vfs_quota_off, .quota_sync = vfs_quota_sync, diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 11f0c06..32fae40 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -69,14 +69,11 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) /* make various checks */ order = get_order(newsize); if (unlikely(order >= MAX_ORDER)) - goto too_big; + return -EFBIG; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && newsize > limit) - goto fsize_exceeded; - - if (newsize > inode->i_sb->s_maxbytes) - goto too_big; + ret = inode_newsize_ok(inode, newsize); + if (ret) + return ret; i_size_write(inode, newsize); @@ -118,12 +115,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) return 0; - fsize_exceeded: - send_sig(SIGXFSZ, current, 0); - too_big: - return -EFBIG; - - add_error: +add_error: while (loop < npages) __free_page(pages + loop++); return ret; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 0ff7566..a6090aa 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -34,18 +34,17 @@ #include <linux/ramfs.h> #include <linux/sched.h> #include <linux/parser.h> +#include <linux/magic.h> #include <asm/uaccess.h> #include "internal.h" -/* some random number */ -#define RAMFS_MAGIC 0x858458f6 - #define RAMFS_DEFAULT_MODE 0755 static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; static struct backing_dev_info ramfs_backing_dev_info = { + .name = "ramfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | diff --git a/fs/read_write.c b/fs/read_write.c index 6c8c55d..3ac2898 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -839,9 +839,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); pos = *ppos; - retval = -EINVAL; - if (unlikely(pos < 0)) - goto fput_out; if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 7adea74..f0ad05f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -612,7 +612,7 @@ static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); static int reiserfs_quota_on(struct super_block *, int, int, char *, int); -static struct dquot_operations reiserfs_quota_operations = { +static const struct dquot_operations reiserfs_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, .alloc_space = dquot_alloc_space, @@ -629,7 +629,7 @@ static struct dquot_operations reiserfs_quota_operations = { .destroy_dquot = dquot_destroy, }; -static struct quotactl_ops reiserfs_qctl_operations = { +static const struct quotactl_ops reiserfs_qctl_operations = { .quota_on = reiserfs_quota_on, .quota_off = vfs_quota_off, .quota_sync = vfs_quota_sync, diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c index b3208ad..71e2b4d 100644 --- a/fs/romfs/storage.c +++ b/fs/romfs/storage.c @@ -253,11 +253,11 @@ ssize_t romfs_dev_strnlen(struct super_block *sb, #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) - return romfs_mtd_strnlen(sb, pos, limit); + return romfs_mtd_strnlen(sb, pos, maxlen); #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) - return romfs_blk_strnlen(sb, pos, limit); + return romfs_blk_strnlen(sb, pos, maxlen); #endif return -EIO; } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 4ab3c03..c117fa8 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -284,7 +284,7 @@ static const struct file_operations romfs_dir_operations = { .readdir = romfs_readdir, }; -static struct inode_operations romfs_dir_inode_operations = { +static const struct inode_operations romfs_dir_inode_operations = { .lookup = romfs_lookup, }; @@ -528,7 +528,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; root = romfs_iget(sb, pos); - if (!root) + if (IS_ERR(root)) goto error; sb->s_root = d_alloc_root(root); diff --git a/fs/select.c b/fs/select.c index 8084834..fd38ce2 100644 --- a/fs/select.c +++ b/fs/select.c @@ -15,6 +15,7 @@ */ #include <linux/kernel.h> +#include <linux/sched.h> #include <linux/syscalls.h> #include <linux/module.h> #include <linux/slab.h> @@ -41,22 +42,28 @@ * better solutions.. */ +#define MAX_SLACK (100 * NSEC_PER_MSEC) + static long __estimate_accuracy(struct timespec *tv) { long slack; int divfactor = 1000; + if (tv->tv_sec < 0) + return 0; + if (task_nice(current) > 0) divfactor = divfactor / 5; + if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) + return MAX_SLACK; + slack = tv->tv_nsec / divfactor; slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); - if (slack > 100 * NSEC_PER_MSEC) - slack = 100 * NSEC_PER_MSEC; + if (slack > MAX_SLACK) + return MAX_SLACK; - if (slack < 0) - slack = 0; return slack; } diff --git a/fs/seq_file.c b/fs/seq_file.c index 6c95927..eae7d9d 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -429,20 +429,21 @@ EXPORT_SYMBOL(mangle_path); */ int seq_path(struct seq_file *m, struct path *path, char *esc) { - if (m->count < m->size) { - char *s = m->buf + m->count; - char *p = d_path(path, s, m->size - m->count); + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -1; + + if (size) { + char *p = d_path(path, buf, size); if (!IS_ERR(p)) { - s = mangle_path(s, p, esc); - if (s) { - p = m->buf + m->count; - m->count = s - m->buf; - return s - p; - } + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; } } - m->count = m->size; - return -1; + seq_commit(m, res); + + return res; } EXPORT_SYMBOL(seq_path); @@ -454,26 +455,28 @@ EXPORT_SYMBOL(seq_path); int seq_path_root(struct seq_file *m, struct path *path, struct path *root, char *esc) { - int err = -ENAMETOOLONG; - if (m->count < m->size) { - char *s = m->buf + m->count; + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -ENAMETOOLONG; + + if (size) { char *p; spin_lock(&dcache_lock); - p = __d_path(path, root, s, m->size - m->count); + p = __d_path(path, root, buf, size); spin_unlock(&dcache_lock); - err = PTR_ERR(p); + res = PTR_ERR(p); if (!IS_ERR(p)) { - s = mangle_path(s, p, esc); - if (s) { - p = m->buf + m->count; - m->count = s - m->buf; - return 0; - } + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; + else + res = -ENAMETOOLONG; } } - m->count = m->size; - return err; + seq_commit(m, res); + + return res < 0 ? res : 0; } /* @@ -481,20 +484,21 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, */ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) { - if (m->count < m->size) { - char *s = m->buf + m->count; - char *p = dentry_path(dentry, s, m->size - m->count); + char *buf; + size_t size = seq_get_buf(m, &buf); + int res = -1; + + if (size) { + char *p = dentry_path(dentry, buf, size); if (!IS_ERR(p)) { - s = mangle_path(s, p, esc); - if (s) { - p = m->buf + m->count; - m->count = s - m->buf; - return s - p; - } + char *end = mangle_path(buf, p, esc); + if (end) + res = end - buf; } } - m->count = m->size; - return -1; + seq_commit(m, res); + + return res; } int seq_bitmap(struct seq_file *m, const unsigned long *bits, diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 1402d2d..1c4c8f0 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -459,14 +459,8 @@ smb_show_options(struct seq_file *s, struct vfsmount *m) static void smb_unload_nls(struct smb_sb_info *server) { - if (server->remote_nls) { - unload_nls(server->remote_nls); - server->remote_nls = NULL; - } - if (server->local_nls) { - unload_nls(server->local_nls); - server->local_nls = NULL; - } + unload_nls(server->remote_nls); + unload_nls(server->local_nls); } static void diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index 9468168..71c29b6 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -509,7 +509,7 @@ date_unix2dos(struct smb_sb_info *server, month = 2; } else { nl_day = (year & 3) || day <= 59 ? day : day - 1; - for (month = 0; month < 12; month++) + for (month = 1; month < 12; month++) if (day_n[month] > nl_day) break; } diff --git a/fs/splice.c b/fs/splice.c index 73766d2..7394e9e 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, len = left; ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - if (ret > 0) + if (ret > 0) { *ppos += ret; + file_accessed(in); + } return ret; } @@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); ret = file_remove_suid(out); - if (!ret) + if (!ret) { + file_update_time(out); ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); + } mutex_unlock(&inode->i_mutex); } while (ret > 0); splice_from_pipe_end(pipe, &sd); @@ -976,25 +980,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, if (ret > 0) { unsigned long nr_pages; + int err; - *ppos += ret; nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); - - if (err) - ret = err; - } + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index cb5fc57..6c197ef 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -44,7 +44,7 @@ #include "squashfs.h" static struct file_system_type squashfs_fs_type; -static struct super_operations squashfs_super_ops; +static const struct super_operations squashfs_super_ops; static int supported_squashfs_filesystem(short major, short minor, short comp) { @@ -444,7 +444,7 @@ static struct file_system_type squashfs_fs_type = { .fs_flags = FS_REQUIRES_DEV }; -static struct super_operations squashfs_super_ops = { +static const struct super_operations squashfs_super_ops = { .alloc_inode = squashfs_alloc_inode, .destroy_inode = squashfs_destroy_inode, .statfs = squashfs_statfs, @@ -54,7 +54,7 @@ DEFINE_SPINLOCK(sb_lock); static struct super_block *alloc_super(struct file_system_type *type) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); - static struct super_operations default_op; + static const struct super_operations default_op; if (s) { if (security_sb_alloc(s)) { @@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type) s = NULL; goto out; } - INIT_LIST_HEAD(&s->s_dirty); - INIT_LIST_HEAD(&s->s_io); - INIT_LIST_HEAD(&s->s_more_io); INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); @@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb) * Drops a temporary reference, frees superblock if there's no * references left. */ -static void put_super(struct super_block *sb) +void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); @@ -468,6 +465,48 @@ rescan: } EXPORT_SYMBOL(get_super); + +/** + * get_active_super - get an active reference to the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. Returns the superblock with an active + * reference and s_umount held exclusively or %NULL if none was found. + */ +struct super_block *get_active_super(struct block_device *bdev) +{ + struct super_block *sb; + + if (!bdev) + return NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_bdev != bdev) + continue; + + sb->s_count++; + spin_unlock(&sb_lock); + down_write(&sb->s_umount); + if (sb->s_root) { + spin_lock(&sb_lock); + if (sb->s_count > S_BIAS) { + atomic_inc(&sb->s_active); + sb->s_count--; + spin_unlock(&sb_lock); + return sb; + } + spin_unlock(&sb_lock); + } + up_write(&sb->s_umount); + put_super(sb); + yield(); + spin_lock(&sb_lock); + } + spin_unlock(&sb_lock); + return NULL; +} struct super_block * user_get_super(dev_t dev) { @@ -530,11 +569,15 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) { int retval; int remount_rw; - + + if (sb->s_frozen != SB_UNFROZEN) + return -EBUSY; + #ifdef CONFIG_BLOCK if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev)) return -EACCES; #endif + if (flags & MS_RDONLY) acct_auto_close(sb); shrink_dcache_sb(sb); @@ -710,6 +753,12 @@ static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; + + /* + * We set the bdi here to the queue backing, file systems can + * overwrite this in ->fill_super() + */ + s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; return 0; } @@ -740,9 +789,14 @@ int get_sb_bdev(struct file_system_type *fs_type, * will protect the lockfs code from trying to start a snapshot * while we are mounting */ - down(&bdev->bd_mount_sem); + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = -EBUSY; + goto error_bdev; + } s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); - up(&bdev->bd_mount_sem); + mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) goto error_s; @@ -889,6 +943,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (error) goto out_sb; + /* + * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE + * but s_maxbytes was an unsigned long long for many releases. Throw + * this warning for a little while to try and catch filesystems that + * violate this rule. This warning should be either removed or + * converted to a BUG() in 2.6.34. + */ + WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " + "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes); + mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt; up_write(&mnt->mnt_sb->s_umount); @@ -19,20 +19,29 @@ SYNC_FILE_RANGE_WAIT_AFTER) /* - * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) - * just dirties buffers with inodes so we have to submit IO for these buffers - * via __sync_blockdev(). This also speeds up the wait == 1 case since in that - * case write_inode() functions do sync_dirty_buffer() and thus effectively - * write one block at a time. + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to + * submit IO for these buffers via __sync_blockdev(). This also speeds up the + * wait == 1 case since in that case write_inode() functions do + * sync_dirty_buffer() and thus effectively write one block at a time. */ static int __sync_filesystem(struct super_block *sb, int wait) { + /* + * This should be safe, as we require bdi backing to actually + * write out data in the first place + */ + if (!sb->s_bdi) + return 0; + /* Avoid doing twice syncing and cache pruning for quota sync */ - if (!wait) + if (!wait) { writeout_quota_sb(sb, -1); - else + writeback_inodes_sb(sb); + } else { sync_quota_sb(sb, -1); - sync_inodes_sb(sb, wait); + sync_inodes_sb(sb); + } if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); return __sync_blockdev(sb->s_bdev, wait); @@ -99,7 +108,7 @@ restart: spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (!(sb->s_flags & MS_RDONLY) && sb->s_root) + if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi) __sync_filesystem(sb, wait); up_read(&sb->s_umount); @@ -118,7 +127,7 @@ restart: */ SYSCALL_DEFINE0(sync) { - wakeup_pdflush(0); + wakeup_flusher_threads(0); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) @@ -174,21 +183,26 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) ret = err; return ret; } +EXPORT_SYMBOL(file_fsync); /** - * vfs_fsync - perform a fsync or fdatasync on a file + * vfs_fsync_range - helper to sync a range of data & metadata to disk * @file: file to sync * @dentry: dentry of @file - * @data: only perform a fdatasync operation + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) + * @datasync: perform only datasync * - * Write back data and metadata for @file to disk. If @datasync is - * set only metadata needed to access modified file data is written. + * Write back data in range @start..@end and metadata for @file to disk. If + * @datasync is set only metadata needed to access modified file data is + * written. * * In case this function is called from nfsd @file may be %NULL and * only @dentry is set. This can only happen when the filesystem * implements the export_operations API. */ -int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, + loff_t end, int datasync) { const struct file_operations *fop; struct address_space *mapping; @@ -212,7 +226,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) goto out; } - ret = filemap_fdatawrite(mapping); + ret = filemap_write_and_wait_range(mapping, start, end); /* * We need to protect against concurrent writers, which could cause @@ -223,12 +237,29 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); - err = filemap_fdatawait(mapping); - if (!ret) - ret = err; + out: return ret; } +EXPORT_SYMBOL(vfs_fsync_range); + +/** + * vfs_fsync - perform a fsync or fdatasync on a file + * @file: file to sync + * @dentry: dentry of @file + * @datasync: only perform a fdatasync operation + * + * Write back data and metadata for @file to disk. If @datasync is + * set only metadata needed to access modified file data is written. + * + * In case this function is called from nfsd @file may be %NULL and + * only @dentry is set. This can only happen when the filesystem + * implements the export_operations API. + */ +int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); +} EXPORT_SYMBOL(vfs_fsync); static int do_fsync(unsigned int fd, int datasync) @@ -254,6 +285,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) return do_fsync(fd, 1); } +/** + * generic_write_sync - perform syncing after a write if file / inode is sync + * @file: file to which the write happened + * @pos: offset where the write started + * @count: length of the write + * + * This is just a simple wrapper about our general syncing function. + */ +int generic_write_sync(struct file *file, loff_t pos, loff_t count) +{ + if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host)) + return 0; + return vfs_fsync_range(file, file->f_path.dentry, pos, + pos + count - 1, 1); +} +EXPORT_SYMBOL(generic_write_sync); + /* * sys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 2524714..60c702b 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -40,7 +40,7 @@ struct bin_buffer { struct mutex mutex; void *buffer; int mmapped; - struct vm_operations_struct *vm_ops; + const struct vm_operations_struct *vm_ops; struct file *file; struct hlist_node list; }; @@ -331,7 +331,7 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, } #endif -static struct vm_operations_struct bin_vm_ops = { +static const struct vm_operations_struct bin_vm_ops = { .open = bin_vma_open, .close = bin_vma_close, .fault = bin_fault, diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 14f2d71..5fad489 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, const struct inode_operations sysfs_dir_inode_operations = { .lookup = sysfs_lookup, .setattr = sysfs_setattr, + .setxattr = sysfs_setxattr, }; static void remove_dir(struct sysfs_dirent *sd) @@ -893,7 +894,8 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) mutex_lock(&sysfs_rename_mutex); BUG_ON(!sd->s_parent); - new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; + new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ? + new_parent_kobj->sd : &sysfs_root; error = 0; if (sd->s_parent == new_parent_sd) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 561a9c0..f5ea468 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -268,7 +268,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, struct sysfs_open_dirent *od, *new_od = NULL; retry: - spin_lock(&sysfs_open_dirent_lock); + spin_lock_irq(&sysfs_open_dirent_lock); if (!sd->s_attr.open && new_od) { sd->s_attr.open = new_od; @@ -281,7 +281,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, list_add_tail(&buffer->list, &od->buffers); } - spin_unlock(&sysfs_open_dirent_lock); + spin_unlock_irq(&sysfs_open_dirent_lock); if (od) { kfree(new_od); @@ -315,8 +315,9 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, struct sysfs_buffer *buffer) { struct sysfs_open_dirent *od = sd->s_attr.open; + unsigned long flags; - spin_lock(&sysfs_open_dirent_lock); + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); list_del(&buffer->list); if (atomic_dec_and_test(&od->refcnt)) @@ -324,7 +325,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, else od = NULL; - spin_unlock(&sysfs_open_dirent_lock); + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); kfree(od); } @@ -456,8 +457,9 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) void sysfs_notify_dirent(struct sysfs_dirent *sd) { struct sysfs_open_dirent *od; + unsigned long flags; - spin_lock(&sysfs_open_dirent_lock); + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); od = sd->s_attr.open; if (od) { @@ -465,7 +467,7 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd) wake_up_interruptible(&od->poll); } - spin_unlock(&sysfs_open_dirent_lock); + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); } EXPORT_SYMBOL_GPL(sysfs_notify_dirent); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 555f0ff..e28cecf 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -18,6 +18,8 @@ #include <linux/capability.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/xattr.h> +#include <linux/security.h> #include "sysfs.h" extern struct super_block * sysfs_sb; @@ -29,12 +31,14 @@ static const struct address_space_operations sysfs_aops = { }; static struct backing_dev_info sysfs_backing_dev_info = { + .name = "sysfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static const struct inode_operations sysfs_inode_operations ={ .setattr = sysfs_setattr, + .setxattr = sysfs_setxattr, }; int __init sysfs_inode_init(void) @@ -42,18 +46,37 @@ int __init sysfs_inode_init(void) return bdi_init(&sysfs_backing_dev_info); } +struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *attrs; + struct iattr *iattrs; + + attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); + if (!attrs) + return NULL; + iattrs = &attrs->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = sd->s_mode; + iattrs->ia_uid = 0; + iattrs->ia_gid = 0; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + return attrs; +} int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) { struct inode * inode = dentry->d_inode; struct sysfs_dirent * sd = dentry->d_fsdata; - struct iattr * sd_iattr; + struct sysfs_inode_attrs *sd_attrs; + struct iattr *iattrs; unsigned int ia_valid = iattr->ia_valid; int error; if (!sd) return -EINVAL; - sd_iattr = sd->s_iattr; + sd_attrs = sd->s_iattr; error = inode_change_ok(inode, iattr); if (error) @@ -65,42 +88,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) if (error) return error; - if (!sd_iattr) { + if (!sd_attrs) { /* setting attributes for the first time, allocate now */ - sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); - if (!sd_iattr) + sd_attrs = sysfs_init_inode_attrs(sd); + if (!sd_attrs) return -ENOMEM; - /* assign default attributes */ - sd_iattr->ia_mode = sd->s_mode; - sd_iattr->ia_uid = 0; - sd_iattr->ia_gid = 0; - sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; - sd->s_iattr = sd_iattr; + sd->s_iattr = sd_attrs; + } else { + /* attributes were changed at least once in past */ + iattrs = &sd_attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + iattrs->ia_mode = sd->s_mode = mode; + } } + return error; +} - /* attributes were changed atleast once in past */ - - if (ia_valid & ATTR_UID) - sd_iattr->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - sd_iattr->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MTIME) - sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_CTIME) - sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - sd_iattr->ia_mode = sd->s_mode = mode; - } +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_inode_attrs *iattrs; + void *secdata; + int error; + u32 secdata_len = 0; + + if (!sd) + return -EINVAL; + if (!sd->s_iattr) + sd->s_iattr = sysfs_init_inode_attrs(sd); + if (!sd->s_iattr) + return -ENOMEM; + + iattrs = sd->s_iattr; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(dentry->d_inode, suffix, + value, size, flags); + if (error) + goto out; + error = security_inode_getsecctx(dentry->d_inode, + &secdata, &secdata_len); + if (error) + goto out; + if (iattrs->ia_secdata) + security_release_secctx(iattrs->ia_secdata, + iattrs->ia_secdata_len); + iattrs->ia_secdata = secdata; + iattrs->ia_secdata_len = secdata_len; + } else + return -EINVAL; +out: return error; } @@ -146,6 +204,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd) static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { struct bin_attribute *bin_attr; + struct sysfs_inode_attrs *iattrs; inode->i_private = sysfs_get(sd); inode->i_mapping->a_ops = &sysfs_aops; @@ -154,16 +213,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) inode->i_ino = sd->s_ino; lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); - if (sd->s_iattr) { + iattrs = sd->s_iattr; + if (iattrs) { /* sysfs_dirent has non-default attributes * get them for the new inode from persistent copy * in sysfs_dirent */ - set_inode_attr(inode, sd->s_iattr); + set_inode_attr(inode, &iattrs->ia_iattr); + if (iattrs->ia_secdata) + security_inode_notifysecctx(inode, + iattrs->ia_secdata, + iattrs->ia_secdata_len); } else set_default_inode_attr(inode, sd->s_mode); - /* initialize inode according to type */ switch (sysfs_type(sd)) { case SYSFS_DIR: diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 1d897ad..c5081ad 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -16,6 +16,7 @@ #include <linux/kobject.h> #include <linux/namei.h> #include <linux/mutex.h> +#include <linux/security.h> #include "sysfs.h" @@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co } const struct inode_operations sysfs_symlink_inode_operations = { + .setxattr = sysfs_setxattr, .readlink = generic_readlink, .follow_link = sysfs_follow_link, .put_link = sysfs_put_link, diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3fa0d984..af4c4e7 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,6 +8,8 @@ * This file is released under the GPLv2. */ +#include <linux/fs.h> + struct sysfs_open_dirent; /* type-specific structures for sysfs_dirent->s_* union members */ @@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr { struct hlist_head buffers; }; +struct sysfs_inode_attrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; +}; + /* * sysfs_dirent - the building block of sysfs hierarchy. Each and * every sysfs node is represented by single sysfs_dirent. @@ -56,7 +64,7 @@ struct sysfs_dirent { unsigned int s_flags; ino_t s_ino; umode_t s_mode; - struct iattr *s_iattr; + struct sysfs_inode_attrs *s_iattr; }; #define SD_DEACTIVATED_BIAS INT_MIN @@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) struct inode *sysfs_get_inode(struct sysfs_dirent *sd); void sysfs_delete_inode(struct inode *inode); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); int sysfs_inode_init(void); diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index eaf6d89..076ca50 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -54,41 +54,15 @@ * @nr_to_write: how many dirty pages to write-back * * This function shrinks UBIFS liability by means of writing back some amount - * of dirty inodes and their pages. Returns the amount of pages which were - * written back. The returned value does not include dirty inodes which were - * synchronized. + * of dirty inodes and their pages. * * Note, this function synchronizes even VFS inodes which are locked * (@i_mutex) by the caller of the budgeting function, because write-back does * not touch @i_mutex. */ -static int shrink_liability(struct ubifs_info *c, int nr_to_write) +static void shrink_liability(struct ubifs_info *c, int nr_to_write) { - int nr_written; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .range_end = LLONG_MAX, - .nr_to_write = nr_to_write, - }; - - generic_sync_sb_inodes(c->vfs_sb, &wbc); - nr_written = nr_to_write - wbc.nr_to_write; - - if (!nr_written) { - /* - * Re-try again but wait on pages/inodes which are being - * written-back concurrently (e.g., by pdflush). - */ - memset(&wbc, 0, sizeof(struct writeback_control)); - wbc.sync_mode = WB_SYNC_ALL; - wbc.range_end = LLONG_MAX; - wbc.nr_to_write = nr_to_write; - generic_sync_sb_inodes(c->vfs_sb, &wbc); - nr_written = nr_to_write - wbc.nr_to_write; - } - - dbg_budg("%d pages were written back", nr_written); - return nr_written; + writeback_inodes_sb(c->vfs_sb); } /** @@ -741,7 +715,7 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c) * ubifs_get_free_space - return amount of free space. * @c: UBIFS file-system description object * - * This function calculates and retuns amount of free space to report to + * This function calculates and returns amount of free space to report to * user-space. */ long long ubifs_get_free_space(struct ubifs_info *c) diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index f3a7945..4775af4 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -510,7 +510,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; int first = 1, iip; struct ubifs_debug_info *d = c->dbg; - union ubifs_key lower_key, upper_key, l_key, u_key; + union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key; unsigned long long uninitialized_var(last_sqnum); struct ubifs_idx_node *idx; struct list_head list; diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index ce2cd83..dbc093a 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -210,6 +210,20 @@ const char *dbg_cstate(int cmt_state) } } +const char *dbg_jhead(int jhead) +{ + switch (jhead) { + case GCHD: + return "0 (GC)"; + case BASEHD: + return "1 (base)"; + case DATAHD: + return "2 (data)"; + default: + return "unknown journal head"; + } +} + static void dump_ch(const struct ubifs_ch *ch) { printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic)); @@ -623,8 +637,9 @@ void dbg_dump_budg(struct ubifs_info *c) /* If we are in R/O mode, journal heads do not exist */ if (c->jheads) for (i = 0; i < c->jhead_cnt; i++) - printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", - c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); + printk(KERN_DEBUG "\tjhead %s\t LEB %d\n", + dbg_jhead(c->jheads[i].wbuf.jhead), + c->jheads[i].wbuf.lnum); for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { bud = rb_entry(rb, struct ubifs_bud, rb); printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); @@ -648,9 +663,90 @@ void dbg_dump_budg(struct ubifs_info *c) void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) { - printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), " - "flags %#x\n", lp->lnum, lp->free, lp->dirty, - c->leb_size - lp->free - lp->dirty, lp->flags); + int i, spc, dark = 0, dead = 0; + struct rb_node *rb; + struct ubifs_bud *bud; + + spc = lp->free + lp->dirty; + if (spc < c->dead_wm) + dead = spc; + else + dark = ubifs_calc_dark(c, spc); + + if (lp->flags & LPROPS_INDEX) + printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " + "free + dirty %-8d flags %#x (", lp->lnum, lp->free, + lp->dirty, c->leb_size - spc, spc, lp->flags); + else + printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " + "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d " + "flags %#-4x (", lp->lnum, lp->free, lp->dirty, + c->leb_size - spc, spc, dark, dead, + (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags); + + if (lp->flags & LPROPS_TAKEN) { + if (lp->flags & LPROPS_INDEX) + printk(KERN_CONT "index, taken"); + else + printk(KERN_CONT "taken"); + } else { + const char *s; + + if (lp->flags & LPROPS_INDEX) { + switch (lp->flags & LPROPS_CAT_MASK) { + case LPROPS_DIRTY_IDX: + s = "dirty index"; + break; + case LPROPS_FRDI_IDX: + s = "freeable index"; + break; + default: + s = "index"; + } + } else { + switch (lp->flags & LPROPS_CAT_MASK) { + case LPROPS_UNCAT: + s = "not categorized"; + break; + case LPROPS_DIRTY: + s = "dirty"; + break; + case LPROPS_FREE: + s = "free"; + break; + case LPROPS_EMPTY: + s = "empty"; + break; + case LPROPS_FREEABLE: + s = "freeable"; + break; + default: + s = NULL; + break; + } + } + printk(KERN_CONT "%s", s); + } + + for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) { + bud = rb_entry(rb, struct ubifs_bud, rb); + if (bud->lnum == lp->lnum) { + int head = 0; + for (i = 0; i < c->jhead_cnt; i++) { + if (lp->lnum == c->jheads[i].wbuf.lnum) { + printk(KERN_CONT ", jhead %s", + dbg_jhead(i)); + head = 1; + } + } + if (!head) + printk(KERN_CONT ", bud of jhead %s", + dbg_jhead(bud->jhead)); + } + } + if (lp->lnum == c->gc_lnum) + printk(KERN_CONT ", GC LEB"); + printk(KERN_CONT ")\n"); } void dbg_dump_lprops(struct ubifs_info *c) @@ -724,7 +820,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", current->pid, lnum); - sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); + sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); if (IS_ERR(sleb)) { ubifs_err("scan error %d", (int)PTR_ERR(sleb)); return; @@ -909,8 +1005,10 @@ out: ubifs_msg("saved lprops statistics dump"); dbg_dump_lstats(&d->saved_lst); ubifs_get_lp_stats(c, &lst); + ubifs_msg("current lprops statistics dump"); - dbg_dump_lstats(&d->saved_lst); + dbg_dump_lstats(&lst); + spin_lock(&c->space_lock); dbg_dump_budg(c); spin_unlock(&c->space_lock); diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index c1cd73b..29d9601 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -271,6 +271,7 @@ void ubifs_debugging_exit(struct ubifs_info *c); /* Dump functions */ const char *dbg_ntype(int type); const char *dbg_cstate(int cmt_state); +const char *dbg_jhead(int jhead); const char *dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key); void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); @@ -321,6 +322,8 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, int dbg_check_lprops(struct ubifs_info *c); int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, int row, int col); +int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, + loff_t size); /* Force the use of in-the-gaps method for testing */ @@ -425,6 +428,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); #define dbg_ntype(type) "" #define dbg_cstate(cmt_state) "" +#define dbg_jhead(jhead) "" #define dbg_get_key_dump(c, key) ({}) #define dbg_dump_inode(c, inode) ({}) #define dbg_dump_node(c, node) ({}) @@ -460,6 +464,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); #define dbg_check_heap(c, heap, cat, add_pos) ({}) #define dbg_check_lprops(c) 0 #define dbg_check_lpt_nodes(c, cnode, row, col) 0 +#define dbg_check_inode_size(c, inode, size) 0 #define dbg_force_in_the_gaps_enabled 0 #define dbg_force_in_the_gaps() 0 #define dbg_failure_mode 0 diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 6d34dc7..1009adc 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -21,34 +21,32 @@ */ /* - * This file implements VFS file and inode operations of regular files, device + * This file implements VFS file and inode operations for regular files, device * nodes and symlinks as well as address space operations. * - * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the - * page is dirty and is used for budgeting purposes - dirty pages should not be - * budgeted. The PG_checked flag is set if full budgeting is required for the - * page e.g., when it corresponds to a file hole or it is just beyond the file - * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to - * fail in this function, and the budget is released in 'ubifs_write_end()'. So - * the PG_private and PG_checked flags carry the information about how the page - * was budgeted, to make it possible to release the budget properly. + * UBIFS uses 2 page flags: @PG_private and @PG_checked. @PG_private is set if + * the page is dirty and is used for optimization purposes - dirty pages are + * not budgeted so the flag shows that 'ubifs_write_end()' should not release + * the budget for this page. The @PG_checked flag is set if full budgeting is + * required for the page e.g., when it corresponds to a file hole or it is + * beyond the file size. The budgeting is done in 'ubifs_write_begin()', because + * it is OK to fail in this function, and the budget is released in + * 'ubifs_write_end()'. So the @PG_private and @PG_checked flags carry + * information about how the page was budgeted, to make it possible to release + * the budget properly. * - * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations - * we implement. However, this is not true for '->writepage()', which might be - * called with 'i_mutex' unlocked. For example, when pdflush is performing - * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the - * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is - * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim - * path'. So, in '->writepage()' we are only guaranteed that the page is - * locked. + * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we + * implement. However, this is not true for 'ubifs_writepage()', which may be + * called with @i_mutex unlocked. For example, when pdflush is doing background + * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal" + * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the + * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()' + * we are only guaranteed that the page is locked. * - * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g., - * readahead path does not have it locked ("sys_read -> generic_file_aio_read - * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is - * not set as well. However, UBIFS disables readahead. - * - * This, for example means that there might be 2 concurrent '->writepage()' - * calls for the same inode, but different inode dirty pages. + * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the + * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> + * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not + * set as well. However, UBIFS disables readahead. */ #include "ubifs.h" @@ -449,9 +447,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, /* * We change whole page so no need to load it. But we * have to set the @PG_checked flag to make the further - * code the page is new. This might be not true, but it - * is better to budget more that to read the page from - * the media. + * code know that the page is new. This might be not + * true, but it is better to budget more than to read + * the page from the media. */ SetPageChecked(page); skipped_read = 1; @@ -497,8 +495,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, } /* - * Whee, we aquired budgeting quickly - without involving - * garbage-collection, committing or forceing write-back. We return + * Whee, we acquired budgeting quickly - without involving + * garbage-collection, committing or forcing write-back. We return * with @ui->ui_mutex locked if we are appending pages, and unlocked * otherwise. This is an optimization (slightly hacky though). */ @@ -562,7 +560,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, /* * Return 0 to force VFS to repeat the whole operation, or the - * error code if 'do_readpage()' failes. + * error code if 'do_readpage()' fails. */ copied = do_readpage(page); goto out; @@ -1175,11 +1173,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, ui->ui_size = inode->i_size; /* Truncation changes inode [mc]time */ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); - /* The other attributes may be changed at the same time as well */ + /* Other attributes may be changed at the same time as well */ do_attr_changes(inode, attr); - err = ubifs_jnl_truncate(c, inode, old_size, new_size); mutex_unlock(&ui->ui_mutex); + out_budg: if (budgeted) ubifs_release_budget(c, &req); @@ -1536,7 +1534,7 @@ out_unlock: return err; } -static struct vm_operations_struct ubifs_file_vm_ops = { +static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ubifs_vm_page_mkwrite, }; diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index f0f5f15..618c270 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -529,7 +529,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) * We scan the entire LEB even though we only really need to scan up to * (c->leb_size - lp->free). */ - sleb = ubifs_scan(c, lnum, 0, c->sbuf); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0); if (IS_ERR(sleb)) return PTR_ERR(sleb); diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 762a7d6..e589fed 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -297,7 +297,7 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer) { struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer); - dbg_io("jhead %d", wbuf->jhead); + dbg_io("jhead %s", dbg_jhead(wbuf->jhead)); wbuf->need_sync = 1; wbuf->c->need_wbuf_sync = 1; ubifs_wake_up_bgt(wbuf->c); @@ -314,7 +314,8 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) if (wbuf->no_timer) return; - dbg_io("set timer for jhead %d, %llu-%llu millisecs", wbuf->jhead, + dbg_io("set timer for jhead %s, %llu-%llu millisecs", + dbg_jhead(wbuf->jhead), div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC), div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta, USEC_PER_SEC)); @@ -351,8 +352,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) /* Write-buffer is empty or not seeked */ return 0; - dbg_io("LEB %d:%d, %d bytes, jhead %d", - wbuf->lnum, wbuf->offs, wbuf->used, wbuf->jhead); + dbg_io("LEB %d:%d, %d bytes, jhead %s", + wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); ubifs_assert(!(wbuf->avail & 7)); ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); @@ -401,7 +402,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, { const struct ubifs_info *c = wbuf->c; - dbg_io("LEB %d:%d, jhead %d", lnum, offs, wbuf->jhead); + dbg_io("LEB %d:%d, jhead %s", lnum, offs, dbg_jhead(wbuf->jhead)); ubifs_assert(lnum >= 0 && lnum < c->leb_cnt); ubifs_assert(offs >= 0 && offs <= c->leb_size); ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); @@ -508,9 +509,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) struct ubifs_info *c = wbuf->c; int err, written, n, aligned_len = ALIGN(len, 8), offs; - dbg_io("%d bytes (%s) to jhead %d wbuf at LEB %d:%d", len, - dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->jhead, - wbuf->lnum, wbuf->offs + wbuf->used); + dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, + dbg_ntype(((struct ubifs_ch *)buf)->node_type), + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs + wbuf->used); ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); @@ -535,8 +536,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) memcpy(wbuf->buf + wbuf->used, buf, len); if (aligned_len == wbuf->avail) { - dbg_io("flush jhead %d wbuf to LEB %d:%d", - wbuf->jhead, wbuf->lnum, wbuf->offs); + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, c->min_io_size, wbuf->dtype); @@ -564,8 +565,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) * minimal I/O unit. We have to fill and flush write-buffer and switch * to the next min. I/O unit. */ - dbg_io("flush jhead %d wbuf to LEB %d:%d", - wbuf->jhead, wbuf->lnum, wbuf->offs); + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, c->min_io_size, wbuf->dtype); @@ -698,8 +699,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, int err, rlen, overlap; struct ubifs_ch *ch = buf; - dbg_io("LEB %d:%d, %s, length %d, jhead %d", lnum, offs, - dbg_ntype(type), len, wbuf->jhead); + dbg_io("LEB %d:%d, %s, length %d, jhead %s", lnum, offs, + dbg_ntype(type), len, dbg_jhead(wbuf->jhead)); ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); ubifs_assert(!(offs & 7) && offs < c->leb_size); ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 64b5f3a..d321bae 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -158,7 +158,7 @@ again: * some. But the write-buffer mutex has to be unlocked because * GC also takes it. */ - dbg_jnl("no free space jhead %d, run GC", jhead); + dbg_jnl("no free space in jhead %s, run GC", dbg_jhead(jhead)); mutex_unlock(&wbuf->io_mutex); lnum = ubifs_garbage_collect(c, 0); @@ -173,7 +173,8 @@ again: * because we dropped @wbuf->io_mutex, so try once * again. */ - dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead); + dbg_jnl("GC couldn't make a free LEB for jhead %s", + dbg_jhead(jhead)); if (retries++ < 2) { dbg_jnl("retry (%d)", retries); goto again; @@ -184,7 +185,7 @@ again: } mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); - dbg_jnl("got LEB %d for jhead %d", lnum, jhead); + dbg_jnl("got LEB %d for jhead %s", lnum, dbg_jhead(jhead)); avail = c->leb_size - wbuf->offs - wbuf->used; if (wbuf->lnum != -1 && avail >= len) { @@ -255,7 +256,8 @@ static int write_node(struct ubifs_info *c, int jhead, void *node, int len, *lnum = c->jheads[jhead].wbuf.lnum; *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; - dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); + dbg_jnl("jhead %s, LEB %d:%d, len %d", + dbg_jhead(jhead), *lnum, *offs, len); ubifs_prepare_node(c, node, len, 0); return ubifs_wbuf_write_nolock(wbuf, node, len); @@ -285,7 +287,8 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, *lnum = c->jheads[jhead].wbuf.lnum; *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; - dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); + dbg_jnl("jhead %s, LEB %d:%d, len %d", + dbg_jhead(jhead), *lnum, *offs, len); err = ubifs_wbuf_write_nolock(wbuf, buf, len); if (err) diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index 5fa27ea..0f530c6 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -229,23 +229,6 @@ static inline void xent_key_init(const struct ubifs_info *c, } /** - * xent_key_init_hash - initialize extended attribute entry key without - * re-calculating hash function. - * @c: UBIFS file-system description object - * @key: key to initialize - * @inum: host inode number - * @hash: extended attribute entry name hash - */ -static inline void xent_key_init_hash(const struct ubifs_info *c, - union ubifs_key *key, ino_t inum, - uint32_t hash) -{ - ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); - key->u32[0] = inum; - key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS); -} - -/** * xent_key_init_flash - initialize on-flash extended attribute entry key. * @c: UBIFS file-system description object * @k: key to initialize @@ -295,22 +278,15 @@ static inline void data_key_init(const struct ubifs_info *c, } /** - * data_key_init_flash - initialize on-flash data key. + * highest_data_key - get the highest possible data key for an inode. * @c: UBIFS file-system description object - * @k: key to initialize + * @key: key to initialize * @inum: inode number - * @block: block number */ -static inline void data_key_init_flash(const struct ubifs_info *c, void *k, - ino_t inum, unsigned int block) +static inline void highest_data_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) { - union ubifs_key *key = k; - - ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK)); - key->j32[0] = cpu_to_le32(inum); - key->j32[1] = cpu_to_le32(block | - (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS)); - memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); + data_key_init(c, key, inum, UBIFS_S_KEY_BLOCK_MASK); } /** @@ -554,4 +530,5 @@ static inline unsigned long long key_max_inode_size(const struct ubifs_info *c) return 0; } } + #endif /* !__UBIFS_KEY_H__ */ diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 56e3377..c345e12 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -169,8 +169,8 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud) */ c->bud_bytes += c->leb_size - bud->start; - dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum, - bud->start, bud->jhead, c->bud_bytes); + dbg_log("LEB %d:%d, jhead %s, bud_bytes %lld", bud->lnum, + bud->start, dbg_jhead(bud->jhead), c->bud_bytes); spin_unlock(&c->buds_lock); } @@ -355,16 +355,16 @@ static void remove_buds(struct ubifs_info *c) * heads (non-closed buds). */ c->cmt_bud_bytes += wbuf->offs - bud->start; - dbg_log("preserve %d:%d, jhead %d, bud bytes %d, " + dbg_log("preserve %d:%d, jhead %s, bud bytes %d, " "cmt_bud_bytes %lld", bud->lnum, bud->start, - bud->jhead, wbuf->offs - bud->start, + dbg_jhead(bud->jhead), wbuf->offs - bud->start, c->cmt_bud_bytes); bud->start = wbuf->offs; } else { c->cmt_bud_bytes += c->leb_size - bud->start; - dbg_log("remove %d:%d, jhead %d, bud bytes %d, " + dbg_log("remove %d:%d, jhead %s, bud bytes %d, " "cmt_bud_bytes %lld", bud->lnum, bud->start, - bud->jhead, c->leb_size - bud->start, + dbg_jhead(bud->jhead), c->leb_size - bud->start, c->cmt_bud_bytes); rb_erase(p1, &c->buds); /* @@ -429,7 +429,8 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) if (lnum == -1 || offs == c->leb_size) continue; - dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i); + dbg_log("add ref to LEB %d:%d for jhead %s", + lnum, offs, dbg_jhead(i)); ref = buf + len; ref->ch.node_type = UBIFS_REF_NODE; ref->lnum = cpu_to_le32(lnum); @@ -695,7 +696,7 @@ int ubifs_consolidate_log(struct ubifs_info *c) lnum = c->ltail_lnum; write_lnum = lnum; while (1) { - sleb = ubifs_scan(c, lnum, 0, c->sbuf); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); goto out_free; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 4cdd284..4d4ca38 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -281,7 +281,7 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, case LPROPS_FREE: if (add_to_lpt_heap(c, lprops, cat)) break; - /* No more room on heap so make it uncategorized */ + /* No more room on heap so make it un-categorized */ cat = LPROPS_UNCAT; /* Fall through */ case LPROPS_UNCAT: @@ -375,8 +375,8 @@ void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, * @lprops: LEB properties * * A LEB may have fallen off of the bottom of a heap, and ended up as - * uncategorized even though it has enough space for us now. If that is the case - * this function will put the LEB back onto a heap. + * un-categorized even though it has enough space for us now. If that is the + * case this function will put the LEB back onto a heap. */ void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops) { @@ -436,10 +436,10 @@ int ubifs_categorize_lprops(const struct ubifs_info *c, /** * change_category - change LEB properties category. * @c: UBIFS file-system description object - * @lprops: LEB properties to recategorize + * @lprops: LEB properties to re-categorize * * LEB properties are categorized to enable fast find operations. When the LEB - * properties change they must be recategorized. + * properties change they must be re-categorized. */ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) { @@ -461,21 +461,18 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) } /** - * calc_dark - calculate LEB dark space size. + * ubifs_calc_dark - calculate LEB dark space size. * @c: the UBIFS file-system description object * @spc: amount of free and dirty space in the LEB * - * This function calculates amount of dark space in an LEB which has @spc bytes - * of free and dirty space. Returns the calculations result. + * This function calculates and returns amount of dark space in an LEB which + * has @spc bytes of free and dirty space. * - * Dark space is the space which is not always usable - it depends on which - * nodes are written in which order. E.g., if an LEB has only 512 free bytes, - * it is dark space, because it cannot fit a large data node. So UBIFS cannot - * count on this LEB and treat these 512 bytes as usable because it is not true - * if, for example, only big chunks of uncompressible data will be written to - * the FS. + * UBIFS is trying to account the space which might not be usable, and this + * space is called "dark space". For example, if an LEB has only %512 free + * bytes, it is dark space, because it cannot fit a large data node. */ -static int calc_dark(struct ubifs_info *c, int spc) +int ubifs_calc_dark(const struct ubifs_info *c, int spc) { ubifs_assert(!(spc & 7)); @@ -518,7 +515,7 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops) * @free: new free space amount * @dirty: new dirty space amount * @flags: new flags - * @idx_gc_cnt: change to the count of idx_gc list + * @idx_gc_cnt: change to the count of @idx_gc list * * This function changes LEB properties (@free, @dirty or @flag). However, the * property which has the %LPROPS_NC value is not changed. Returns a pointer to @@ -535,7 +532,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, { /* * This is the only function that is allowed to change lprops, so we - * discard the const qualifier. + * discard the "const" qualifier. */ struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp; @@ -575,7 +572,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, if (old_spc < c->dead_wm) c->lst.total_dead -= old_spc; else - c->lst.total_dark -= calc_dark(c, old_spc); + c->lst.total_dark -= ubifs_calc_dark(c, old_spc); c->lst.total_used -= c->leb_size - old_spc; } @@ -616,7 +613,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, if (new_spc < c->dead_wm) c->lst.total_dead += new_spc; else - c->lst.total_dark += calc_dark(c, new_spc); + c->lst.total_dark += ubifs_calc_dark(c, new_spc); c->lst.total_used += c->leb_size - new_spc; } @@ -1096,7 +1093,7 @@ static int scan_check_cb(struct ubifs_info *c, } } - sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); + sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); if (IS_ERR(sleb)) { /* * After an unclean unmount, empty and freeable LEBs @@ -1107,7 +1104,7 @@ static int scan_check_cb(struct ubifs_info *c, "- continuing checking"); lst->empty_lebs += 1; lst->total_free += c->leb_size; - lst->total_dark += calc_dark(c, c->leb_size); + lst->total_dark += ubifs_calc_dark(c, c->leb_size); return LPT_SCAN_CONTINUE; } @@ -1117,7 +1114,7 @@ static int scan_check_cb(struct ubifs_info *c, "- continuing checking"); lst->total_free += lp->free; lst->total_dirty += lp->dirty; - lst->total_dark += calc_dark(c, c->leb_size); + lst->total_dark += ubifs_calc_dark(c, c->leb_size); return LPT_SCAN_CONTINUE; } data->err = PTR_ERR(sleb); @@ -1235,7 +1232,7 @@ static int scan_check_cb(struct ubifs_info *c, if (spc < c->dead_wm) lst->total_dead += spc; else - lst->total_dark += calc_dark(c, spc); + lst->total_dark += ubifs_calc_dark(c, spc); } ubifs_scan_destroy(sleb); diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index a88f338..28beaee 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -29,7 +29,8 @@ * @c: UBIFS file-system description object * * This function scans the master node LEBs and search for the latest master - * node. Returns zero in case of success and a negative error code in case of + * node. Returns zero in case of success, %-EUCLEAN if there master area is + * corrupted and requires recovery, and a negative error code in case of * failure. */ static int scan_for_master(struct ubifs_info *c) @@ -40,7 +41,7 @@ static int scan_for_master(struct ubifs_info *c) lnum = UBIFS_MST_LNUM; - sleb = ubifs_scan(c, lnum, 0, c->sbuf); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); if (IS_ERR(sleb)) return PTR_ERR(sleb); nodes_cnt = sleb->nodes_cnt; @@ -48,7 +49,7 @@ static int scan_for_master(struct ubifs_info *c) snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); if (snod->type != UBIFS_MST_NODE) - goto out; + goto out_dump; memcpy(c->mst_node, snod->node, snod->len); offs = snod->offs; } @@ -56,7 +57,7 @@ static int scan_for_master(struct ubifs_info *c) lnum += 1; - sleb = ubifs_scan(c, lnum, 0, c->sbuf); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); if (IS_ERR(sleb)) return PTR_ERR(sleb); if (sleb->nodes_cnt != nodes_cnt) @@ -65,7 +66,7 @@ static int scan_for_master(struct ubifs_info *c) goto out; snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); if (snod->type != UBIFS_MST_NODE) - goto out; + goto out_dump; if (snod->offs != offs) goto out; if (memcmp((void *)c->mst_node + UBIFS_CH_SZ, @@ -78,6 +79,12 @@ static int scan_for_master(struct ubifs_info *c) out: ubifs_scan_destroy(sleb); + return -EUCLEAN; + +out_dump: + ubifs_err("unexpected node type %d master LEB %d:%d", + snod->type, lnum, snod->offs); + ubifs_scan_destroy(sleb); return -EINVAL; } @@ -256,7 +263,8 @@ int ubifs_read_master(struct ubifs_info *c) err = scan_for_master(c); if (err) { - err = ubifs_recover_master_node(c); + if (err == -EUCLEAN) + err = ubifs_recover_master_node(c); if (err) /* * Note, we do not free 'c->mst_node' here because the diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 152a7b3..82009c7 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -670,9 +670,10 @@ static int kill_orphans(struct ubifs_info *c) struct ubifs_scan_leb *sleb; dbg_rcvry("LEB %d", lnum); - sleb = ubifs_scan(c, lnum, 0, c->sbuf); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); if (IS_ERR(sleb)) { - sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); + if (PTR_ERR(sleb) == -EUCLEAN) + sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); break; @@ -899,7 +900,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { struct ubifs_scan_leb *sleb; - sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); + sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); break; diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index e5f6cf8..f94ddf7 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -286,7 +286,7 @@ int ubifs_recover_master_node(struct ubifs_info *c) mst = mst2; } - dbg_rcvry("recovered master node from LEB %d", + ubifs_msg("recovered master node from LEB %d", (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1)); memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); @@ -790,7 +790,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, * We can only recover at the end of the log, so check that the * next log LEB is empty or out of date. */ - sleb = ubifs_scan(c, next_lnum, 0, sbuf); + sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0); if (IS_ERR(sleb)) return sleb; if (sleb->nodes_cnt) { diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 2970500..5c2d6d7 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -506,7 +506,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, if (c->need_recovery) sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); else - sleb = ubifs_scan(c, lnum, offs, c->sbuf); + sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); if (IS_ERR(sleb)) return PTR_ERR(sleb); @@ -836,8 +836,8 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) const struct ubifs_cs_node *node; dbg_mnt("replay log LEB %d:%d", lnum, offs); - sleb = ubifs_scan(c, lnum, offs, sbuf); - if (IS_ERR(sleb) ) { + sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery); + if (IS_ERR(sleb)) { if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) return PTR_ERR(sleb); sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 892ebfe..96c5253 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -108,10 +108,9 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, /* Make the node pads to 8-byte boundary */ if ((node_len + pad_len) & 7) { - if (!quiet) { + if (!quiet) dbg_err("bad padding length %d - %d", offs, offs + node_len + pad_len); - } return SCANNED_A_BAD_PAD_NODE; } @@ -253,15 +252,19 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, * @c: UBIFS file-system description object * @lnum: logical eraseblock number * @offs: offset to start at (usually zero) - * @sbuf: scan buffer (must be c->leb_size) + * @sbuf: scan buffer (must be of @c->leb_size bytes in size) + * @quiet: print no messages * * This function scans LEB number @lnum and returns complete information about * its contents. Returns the scaned information in case of success and, * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case * of failure. + * + * If @quiet is non-zero, this function does not print large and scary + * error messages and flash dumps in case of errors. */ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, - int offs, void *sbuf) + int offs, void *sbuf, int quiet) { void *buf = sbuf + offs; int err, len = c->leb_size - offs; @@ -280,7 +283,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, cond_resched(); - ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); if (ret > 0) { /* Padding bytes or a valid padding node */ offs += ret; @@ -320,7 +323,9 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, } if (offs % c->min_io_size) { - ubifs_err("empty space starts at non-aligned offset %d", offs); + if (!quiet) + ubifs_err("empty space starts at non-aligned offset %d", + offs); goto corrupted;; } @@ -331,18 +336,25 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, break; for (; len; offs++, buf++, len--) if (*(uint8_t *)buf != 0xff) { - ubifs_err("corrupt empty space at LEB %d:%d", - lnum, offs); + if (!quiet) + ubifs_err("corrupt empty space at LEB %d:%d", + lnum, offs); goto corrupted; } return sleb; corrupted: - ubifs_scanned_corruption(c, lnum, offs, buf); + if (!quiet) { + ubifs_scanned_corruption(c, lnum, offs, buf); + ubifs_err("LEB %d scanning failed", lnum); + } err = -EUCLEAN; + ubifs_scan_destroy(sleb); + return ERR_PTR(err); + error: - ubifs_err("LEB %d scanning failed", lnum); + ubifs_err("LEB %d scanning failed, error %d", lnum, err); ubifs_scan_destroy(sleb); return ERR_PTR(err); } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 26d2e0d..333e181e 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -36,7 +36,6 @@ #include <linux/mount.h> #include <linux/math64.h> #include <linux/writeback.h> -#include <linux/smp_lock.h> #include "ubifs.h" /* @@ -318,6 +317,8 @@ static int ubifs_write_inode(struct inode *inode, int wait) if (err) ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); + else + err = dbg_check_inode_size(c, inode, ui->ui_size); } ui->dirty = 0; @@ -438,12 +439,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) { int i, err; struct ubifs_info *c = sb->s_fs_info; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .range_start = 0, - .range_end = LLONG_MAX, - .nr_to_write = LONG_MAX, - }; /* * Zero @wait is just an advisory thing to help the file system shove @@ -454,17 +449,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) return 0; /* - * VFS calls '->sync_fs()' before synchronizing all dirty inodes and - * pages, so synchronize them first, then commit the journal. Strictly - * speaking, it is not necessary to commit the journal here, - * synchronizing write-buffers would be enough. But committing makes - * UBIFS free space predictions much more accurate, so we want to let - * the user be able to get more accurate results of 'statfs()' after - * they synchronize the file system. - */ - generic_sync_sb_inodes(sb, &wbc); - - /* * Synchronize write buffers, because 'ubifs_run_commit()' does not * do this if it waits for an already running commit. */ @@ -474,6 +458,13 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) return err; } + /* + * Strictly speaking, it is not necessary to commit the journal here, + * synchronizing write-buffers would be enough. But committing makes + * UBIFS free space predictions much more accurate, so we want to let + * the user be able to get more accurate results of 'statfs()' after + * they synchronize the file system. + */ err = ubifs_run_commit(c); if (err) return err; @@ -1726,8 +1717,6 @@ static void ubifs_put_super(struct super_block *sb) ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, c->vi.vol_id); - lock_kernel(); - /* * The following asserts are only valid if there has not been a failure * of the media. For example, there will be dirty inodes if we failed @@ -1792,8 +1781,6 @@ static void ubifs_put_super(struct super_block *sb) ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); kfree(c); - - unlock_kernel(); } static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) @@ -1809,22 +1796,17 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) return err; } - lock_kernel(); if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { if (c->ro_media) { ubifs_msg("cannot re-mount due to prior errors"); - unlock_kernel(); return -EROFS; } err = ubifs_remount_rw(c); - if (err) { - unlock_kernel(); + if (err) return err; - } } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { if (c->ro_media) { ubifs_msg("cannot re-mount due to prior errors"); - unlock_kernel(); return -EROFS; } ubifs_remount_ro(c); @@ -1839,7 +1821,6 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) } ubifs_assert(c->lst.taken_empty_lebs > 0); - unlock_kernel(); return 0; } @@ -1971,6 +1952,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) * * Read-ahead will be disabled because @c->bdi.ra_pages is 0. */ + c->bdi.name = "ubifs", c->bdi.capabilities = BDI_CAP_MAP_COPY; c->bdi.unplug_io_fn = default_unplug_io_fn; err = bdi_init(&c->bdi); @@ -1985,6 +1967,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_bdi; + sb->s_bdi = &c->bdi; sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; sb->s_blocksize = UBIFS_BLOCK_SIZE; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index f249f7b..e5b1a7d 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -1159,8 +1159,8 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, * o exact match, i.e. the found zero-level znode contains key @key, then %1 * is returned and slot number of the matched branch is stored in @n; * o not exact match, which means that zero-level znode does not contain - * @key, then %0 is returned and slot number of the closed branch is stored - * in @n; + * @key, then %0 is returned and slot number of the closest branch is stored + * in @n; * o @key is so small that it is even less than the lowest key of the * leftmost zero-level node, then %0 is returned and %0 is stored in @n. * @@ -1433,7 +1433,7 @@ static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1) * @lnum: LEB number is returned here * @offs: offset is returned here * - * This function look up and reads node with key @key. The caller has to make + * This function looks up and reads node with key @key. The caller has to make * sure the @node buffer is large enough to fit the node. Returns zero in case * of success, %-ENOENT if the node was not found, and a negative error code in * case of failure. The node location can be returned in @lnum and @offs. @@ -3268,3 +3268,73 @@ out_unlock: mutex_unlock(&c->tnc_mutex); return err; } + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_check_inode_size - check if inode size is correct. + * @c: UBIFS file-system description object + * @inum: inode number + * @size: inode size + * + * This function makes sure that the inode size (@size) is correct and it does + * not have any pages beyond @size. Returns zero if the inode is OK, %-EINVAL + * if it has a data page beyond @size, and other negative error code in case of + * other errors. + */ +int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, + loff_t size) +{ + int err, n; + union ubifs_key from_key, to_key, *key; + struct ubifs_znode *znode; + unsigned int block; + + if (!S_ISREG(inode->i_mode)) + return 0; + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + + block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; + data_key_init(c, &from_key, inode->i_ino, block); + highest_data_key(c, &to_key, inode->i_ino); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, &from_key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) { + err = -EINVAL; + key = &from_key; + goto out_dump; + } + + err = tnc_next(c, &znode, &n); + if (err == -ENOENT) { + err = 0; + goto out_unlock; + } + if (err < 0) + goto out_unlock; + + ubifs_assert(err == 0); + key = &znode->zbranch[n].key; + if (!key_in_range(c, key, &from_key, &to_key)) + goto out_unlock; + +out_dump: + block = key_block(c, key); + ubifs_err("inode %lu has size %lld, but there are data at offset %lld " + "(data key %s)", (unsigned long)inode->i_ino, size, + ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key)); + dbg_dump_inode(c, inode); + dbg_dump_stack(); + err = -EINVAL; + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index fde8d12..53288e5 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -245,7 +245,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p) * it is more comprehensive and less efficient than is needed for this * purpose. */ - sleb = ubifs_scan(c, lnum, 0, c->ileb_buf); + sleb = ubifs_scan(c, lnum, 0, c->ileb_buf, 0); c->ileb_len = 0; if (IS_ERR(sleb)) return PTR_ERR(sleb); diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index 3eee07e..191ca78 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -135,6 +135,13 @@ /* The key is always at the same position in all keyed nodes */ #define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key) +/* Garbage collector journal head number */ +#define UBIFS_GC_HEAD 0 +/* Base journal head number */ +#define UBIFS_BASE_HEAD 1 +/* Data journal head number */ +#define UBIFS_DATA_HEAD 2 + /* * LEB Properties Tree node types. * diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index a293490..b2d9763 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -105,12 +105,10 @@ /* Number of non-data journal heads */ #define NONDATA_JHEADS_CNT 2 -/* Garbage collector head */ -#define GCHD 0 -/* Base journal head number */ -#define BASEHD 1 -/* First "general purpose" journal head */ -#define DATAHD 2 +/* Shorter names for journal head numbers for internal usage */ +#define GCHD UBIFS_GC_HEAD +#define BASEHD UBIFS_BASE_HEAD +#define DATAHD UBIFS_DATA_HEAD /* 'No change' value for 'ubifs_change_lp()' */ #define LPROPS_NC 0x80000001 @@ -1451,7 +1449,7 @@ int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode); /* scan.c */ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, - int offs, void *sbuf); + int offs, void *sbuf, int quiet); void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, int offs, int quiet); @@ -1676,6 +1674,7 @@ const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c); const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c); const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c); const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); +int ubifs_calc_dark(const struct ubifs_info *c, int spc); /* file.c */ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index adafcf5..195830f 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -78,9 +78,9 @@ enum { SECURITY_XATTR, }; -static struct inode_operations none_inode_operations; -static struct address_space_operations none_address_operations; -static struct file_operations none_file_operations; +static const struct inode_operations none_inode_operations; +static const struct address_space_operations none_address_operations; +static const struct file_operations none_file_operations; /** * create_xattr - create an extended attribute. diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 1d2c570..2ffdb67 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -18,59 +18,6 @@ #include <linux/string.h> #include <linux/buffer_head.h> -#if 0 -static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad, - uint8_t ad_size, struct kernel_lb_addr fe_loc, - int *pos, int *offset, struct buffer_head **bh, - int *error) -{ - int loffset = *offset; - int block; - uint8_t *ad; - int remainder; - - *error = 0; - - ad = (uint8_t *)(*bh)->b_data + *offset; - *offset += ad_size; - - if (!ad) { - brelse(*bh); - *error = 1; - return NULL; - } - - if (*offset == dir->i_sb->s_blocksize) { - brelse(*bh); - block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos); - if (!block) - return NULL; - *bh = udf_tread(dir->i_sb, block); - if (!*bh) - return NULL; - } else if (*offset > dir->i_sb->s_blocksize) { - ad = tmpad; - - remainder = dir->i_sb->s_blocksize - loffset; - memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder); - - brelse(*bh); - block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos); - if (!block) - return NULL; - (*bh) = udf_tread(dir->i_sb, block); - if (!*bh) - return NULL; - - memcpy((uint8_t *)ad + remainder, (*bh)->b_data, - ad_size - remainder); - *offset = ad_size - remainder; - } - - return ad; -} -#endif - struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, struct udf_fileident_bh *fibh, struct fileIdentDesc *cfi, @@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) return fi; } -#if 0 -static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) -{ - struct extent_ad *ext; - struct fileEntry *fe; - uint8_t *ptr; - - if ((!buffer) || (!offset)) { - printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n"); - return NULL; - } - - fe = (struct fileEntry *)buffer; - - if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) { - udf_debug("0x%x != TAG_IDENT_FE\n", - le16_to_cpu(fe->descTag.tagIdent)); - return NULL; - } - - ptr = (uint8_t *)(fe->extendedAttr) + - le32_to_cpu(fe->lengthExtendedAttr); - - if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs))) - ptr += *offset; - - ext = (struct extent_ad *)ptr; - - *offset = *offset + sizeof(struct extent_ad); - return ext; -} -#endif - struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) { diff --git a/fs/udf/file.c b/fs/udf/file.c index 7464305..b80cbd7 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, static int udf_release_file(struct inode *inode, struct file *filp) { if (filp->f_mode & FMODE_WRITE) { + mutex_lock(&inode->i_mutex); lock_kernel(); udf_discard_prealloc(inode); unlock_kernel(); + mutex_unlock(&inode->i_mutex); } return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e7533f7..6d24c2c 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -90,19 +90,16 @@ no_delete: } /* - * If we are going to release inode from memory, we discard preallocation and - * truncate last inode extent to proper length. We could use drop_inode() but - * it's called under inode_lock and thus we cannot mark inode dirty there. We - * use clear_inode() but we have to make sure to write inode as it's not written - * automatically. + * If we are going to release inode from memory, we truncate last inode extent + * to proper length. We could use drop_inode() but it's called under inode_lock + * and thus we cannot mark inode dirty there. We use clear_inode() but we have + * to make sure to write inode as it's not written automatically. */ void udf_clear_inode(struct inode *inode) { struct udf_inode_info *iinfo; if (!(inode->i_sb->s_flags & MS_RDONLY)) { lock_kernel(); - /* Discard preallocation for directories, symlinks, etc. */ - udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); unlock_kernel(); write_inode_now(inode, 0); @@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); #ifdef UDF_PREALLOCATE - /* preallocate blocks */ - udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); + /* We preallocate blocks only for regular files. It also makes sense + * for directories but there's a problem when to drop the + * preallocation. We might use some delayed work for that but I feel + * it's overengineering for a filesystem like UDF. */ + if (S_ISREG(inode->i_mode)) + udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); #endif /* merge any continuous blocks in laarr */ diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 1b88fd5..43e24a3 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb) ms_info.addr_format = CDROM_LBA; i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); -#define WE_OBEY_THE_WRITTEN_STANDARDS 1 - if (i == 0) { udf_debug("XA disk: %s, vol_desc_start=%d\n", (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); -#if WE_OBEY_THE_WRITTEN_STANDARDS if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ -#endif vol_desc_start = ms_info.addr.lba; } else { udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 6a29fa34..21dad8c 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, pc->componentType = 1; pc->lengthComponentIdent = 0; pc->componentFileVersionNum = 0; - pc += sizeof(struct pathComponent); elen += sizeof(struct pathComponent); } @@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask) return inode_permission(inode, mask); } -int -vfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +/** + * __vfs_setxattr_noperm - perform setxattr operation without performing + * permission checks. + * + * @dentry - object to perform setxattr on + * @name - xattr name to set + * @value - value to set @name to + * @size - size of @value + * @flags - flags to pass into filesystem operations + * + * returns the result of the internal setxattr or setsecurity operations. + * + * This function requires the caller to lock the inode's i_mutex before it + * is executed. It also assumes that the caller will make the appropriate + * permission checks. + */ +int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; - int error; - - error = xattr_permission(inode, name, MAY_WRITE); - if (error) - return error; + int error = -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); - error = security_inode_setxattr(dentry, name, value, size, flags); - if (error) - goto out; - error = -EOPNOTSUPP; if (inode->i_op->setxattr) { error = inode->i_op->setxattr(dentry, name, value, size, flags); if (!error) { @@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!error) fsnotify_xattr(dentry); } + + return error; +} + + +int +vfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = security_inode_setxattr(dentry, name, value, size, flags); + if (error) + goto out; + + error = __vfs_setxattr_noperm(dentry, name, value, size, flags); + out: mutex_unlock(&inode->i_mutex); return error; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 6a2910e..c2e30ee 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1653,4 +1653,5 @@ const struct address_space_operations xfs_address_space_operations = { .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 6d65baf..eff61e2 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -42,7 +42,7 @@ #include <linux/dcache.h> -static struct vm_operations_struct xfs_file_vm_ops; +static const struct vm_operations_struct xfs_file_vm_ops; STATIC ssize_t xfs_file_aio_read( @@ -273,7 +273,7 @@ const struct file_operations xfs_dir_file_operations = { .fsync = xfs_file_fsync, }; -static struct vm_operations_struct xfs_file_vm_ops = { +static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = xfs_vm_page_mkwrite, }; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index cdf7114..cd42ef7 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -475,14 +475,6 @@ xfs_vn_put_link( } STATIC int -xfs_vn_permission( - struct inode *inode, - int mask) -{ - return generic_permission(inode, mask, xfs_check_acl); -} - -STATIC int xfs_vn_getattr( struct vfsmount *mnt, struct dentry *dentry, @@ -684,7 +676,7 @@ xfs_vn_fiemap( } static const struct inode_operations xfs_inode_operations = { - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .truncate = xfs_vn_truncate, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -712,7 +704,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -737,7 +729,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -750,7 +742,7 @@ static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, .put_link = xfs_vn_put_link, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index cb6e2cc..9e41f91 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -150,7 +150,7 @@ xfs_fs_set_xquota( return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); } -struct quotactl_ops xfs_quotactl_operations = { +const struct quotactl_ops xfs_quotactl_operations = { .quota_sync = xfs_fs_quota_sync, .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index eefea0d..18a4b8e 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -67,7 +67,7 @@ #include <linux/freezer.h> #include <linux/parser.h> -static struct super_operations xfs_super_operations; +static const struct super_operations xfs_super_operations; static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; @@ -1558,7 +1558,7 @@ xfs_fs_get_sb( mnt); } -static struct super_operations xfs_super_operations = { +static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, .dirty_inode = xfs_fs_dirty_inode, diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 5a2ea3a..18175eb 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -93,7 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern const struct export_operations xfs_export_operations; extern struct xattr_handler *xfs_xattr_handlers[]; -extern struct quotactl_ops xfs_quotactl_operations; +extern const struct quotactl_ops xfs_quotactl_operations; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index 916c0ff..c5bc67c 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -26,7 +26,6 @@ STATIC int xfs_stats_clear_proc_handler( ctl_table *ctl, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -34,7 +33,7 @@ xfs_stats_clear_proc_handler( int c, ret, *valp = ctl->data; __uint32_t vn_active; - ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); if (!ret && write && *valp) { printk("XFS Clearing xfsstats\n"); diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index c4ea51b..f52ac27 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -117,7 +117,7 @@ struct getbmapx { #define BMV_IF_VALID \ (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) -/* bmv_oflags values - returned for for each non-header segment */ +/* bmv_oflags values - returned for each non-header segment */ #define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ #define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ #define BMV_OF_LAST 0x4 /* segment is the last in the file */ |