diff options
Diffstat (limited to 'fs')
298 files changed, 6263 insertions, 4354 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index 0a93dc1cb..55abfd6 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -11,8 +11,7 @@ config 9P_FS if 9P_FS config 9P_FSCACHE - bool "Enable 9P client caching support (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "Enable 9P client caching support" depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y help Choose Y here to enable persistent, read-only local diff --git a/fs/9p/fid.c b/fs/9p/fid.c index da8eefb..afd4724 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -74,19 +74,20 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) * */ -static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any) +static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) { struct v9fs_dentry *dent; struct p9_fid *fid, *ret; p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", - dentry->d_name.name, dentry, uid, any); + dentry->d_name.name, dentry, from_kuid(&init_user_ns, uid), + any); dent = (struct v9fs_dentry *) dentry->d_fsdata; ret = NULL; if (dent) { spin_lock(&dent->lock); list_for_each_entry(fid, &dent->fidlist, dlist) { - if (any || fid->uid == uid) { + if (any || uid_eq(fid->uid, uid)) { ret = fid; break; } @@ -126,7 +127,7 @@ err_out: } static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, - uid_t uid, int any) + kuid_t uid, int any) { struct dentry *ds; char **wnames, *uname; @@ -233,7 +234,7 @@ err_out: struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) { - uid_t uid; + kuid_t uid; int any, access; struct v9fs_session_info *v9ses; @@ -253,7 +254,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) break; default: - uid = ~0; + uid = INVALID_UID; any = 0; break; } @@ -272,7 +273,7 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry) return ret; } -static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid) +static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, kuid_t uid) { struct p9_fid *fid, *ret; @@ -289,7 +290,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry) int err; struct p9_fid *fid; - fid = v9fs_fid_clone_with_uid(dentry, 0); + fid = v9fs_fid_clone_with_uid(dentry, GLOBAL_ROOT_UID); if (IS_ERR(fid)) goto error_out; /* diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index d934f04e7..58e6cbc 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -161,7 +161,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) ret = r; continue; } - v9ses->dfltuid = option; + v9ses->dfltuid = make_kuid(current_user_ns(), option); + if (!uid_valid(v9ses->dfltuid)) { + p9_debug(P9_DEBUG_ERROR, + "uid field, but not a uid?\n"); + ret = -EINVAL; + continue; + } break; case Opt_dfltgid: r = match_int(&args[0], &option); @@ -171,7 +177,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) ret = r; continue; } - v9ses->dfltgid = option; + v9ses->dfltgid = make_kgid(current_user_ns(), option); + if (!gid_valid(v9ses->dfltgid)) { + p9_debug(P9_DEBUG_ERROR, + "gid field, but not a gid?\n"); + ret = -EINVAL; + continue; + } break; case Opt_afid: r = match_int(&args[0], &option); @@ -248,8 +260,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) else if (strcmp(s, "client") == 0) { v9ses->flags |= V9FS_ACCESS_CLIENT; } else { + uid_t uid; v9ses->flags |= V9FS_ACCESS_SINGLE; - v9ses->uid = simple_strtoul(s, &e, 10); + uid = simple_strtoul(s, &e, 10); if (*e != '\0') { ret = -EINVAL; pr_info("Unknown access argument %s\n", @@ -257,6 +270,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) kfree(s); goto free_and_return; } + v9ses->uid = make_kuid(current_user_ns(), uid); + if (!uid_valid(v9ses->uid)) { + ret = -EINVAL; + pr_info("Uknown uid %s\n", s); + kfree(s); + goto free_and_return; + } } kfree(s); @@ -319,7 +339,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); - v9ses->uid = ~0; + v9ses->uid = INVALID_UID; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; @@ -364,7 +384,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->flags &= ~V9FS_ACCESS_MASK; v9ses->flags |= V9FS_ACCESS_ANY; - v9ses->uid = ~0; + v9ses->uid = INVALID_UID; } if (!v9fs_proto_dotl(v9ses) || !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { @@ -375,7 +395,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->flags &= ~V9FS_ACL_MASK; } - fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0, + fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID, v9ses->aname); if (IS_ERR(fid)) { retval = PTR_ERR(fid); @@ -387,7 +407,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE) fid->uid = v9ses->uid; else - fid->uid = ~0; + fid->uid = INVALID_UID; #ifdef CONFIG_9P_FSCACHE /* register the session for caching */ diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 34c59f1..a8e127c 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -109,9 +109,9 @@ struct v9fs_session_info { char *uname; /* user name to mount as */ char *aname; /* name of remote hierarchy being mounted */ unsigned int maxdata; /* max data for client interface */ - unsigned int dfltuid; /* default uid/muid for legacy support */ - unsigned int dfltgid; /* default gid for legacy support */ - u32 uid; /* if ACCESS_SINGLE, the uid that has access */ + kuid_t dfltuid; /* default uid/muid for legacy support */ + kgid_t dfltgid; /* default gid for legacy support */ + kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */ struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct backing_dev_info bdi; @@ -165,8 +165,8 @@ extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, #define V9FS_PORT 564 #define V9FS_DEFUSER "nobody" #define V9FS_DEFANAME "" -#define V9FS_DEFUID (-2) -#define V9FS_DEFGID (-2) +#define V9FS_DEFUID KUIDT_INIT(-2) +#define V9FS_DEFGID KGIDT_INIT(-2) static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) { diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index ff911e7..be1e34a 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -52,10 +52,9 @@ */ struct p9_rdir { - struct mutex mutex; int head; int tail; - uint8_t *buf; + uint8_t buf[]; }; /** @@ -93,33 +92,12 @@ static void p9stat_init(struct p9_wstat *stbuf) * */ -static int v9fs_alloc_rdir_buf(struct file *filp, int buflen) +static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen) { - struct p9_rdir *rdir; - struct p9_fid *fid; - int err = 0; - - fid = filp->private_data; - if (!fid->rdir) { - rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); - - if (rdir == NULL) { - err = -ENOMEM; - goto exit; - } - spin_lock(&filp->f_dentry->d_lock); - if (!fid->rdir) { - rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir); - mutex_init(&rdir->mutex); - rdir->head = rdir->tail = 0; - fid->rdir = (void *) rdir; - rdir = NULL; - } - spin_unlock(&filp->f_dentry->d_lock); - kfree(rdir); - } -exit: - return err; + struct p9_fid *fid = filp->private_data; + if (!fid->rdir) + fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); + return fid->rdir; } /** @@ -145,20 +123,16 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) buflen = fid->clnt->msize - P9_IOHDRSZ; - err = v9fs_alloc_rdir_buf(filp, buflen); - if (err) - goto exit; - rdir = (struct p9_rdir *) fid->rdir; + rdir = v9fs_alloc_rdir_buf(filp, buflen); + if (!rdir) + return -ENOMEM; - err = mutex_lock_interruptible(&rdir->mutex); - if (err) - return err; - while (err == 0) { + while (1) { if (rdir->tail == rdir->head) { err = v9fs_file_readn(filp, rdir->buf, NULL, buflen, filp->f_pos); if (err <= 0) - goto unlock_and_exit; + return err; rdir->head = 0; rdir->tail = err; @@ -169,9 +143,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) rdir->tail - rdir->head, &st); if (err) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); - err = -EIO; p9stat_free(&st); - goto unlock_and_exit; + return -EIO; } reclen = st.size+2; @@ -180,19 +153,13 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) p9stat_free(&st); - if (over) { - err = 0; - goto unlock_and_exit; - } + if (over) + return 0; + rdir->head += reclen; filp->f_pos += reclen; } } - -unlock_and_exit: - mutex_unlock(&rdir->mutex); -exit: - return err; } /** @@ -218,21 +185,16 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, buflen = fid->clnt->msize - P9_READDIRHDRSZ; - err = v9fs_alloc_rdir_buf(filp, buflen); - if (err) - goto exit; - rdir = (struct p9_rdir *) fid->rdir; + rdir = v9fs_alloc_rdir_buf(filp, buflen); + if (!rdir) + return -ENOMEM; - err = mutex_lock_interruptible(&rdir->mutex); - if (err) - return err; - - while (err == 0) { + while (1) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, filp->f_pos); if (err <= 0) - goto unlock_and_exit; + return err; rdir->head = 0; rdir->tail = err; @@ -245,8 +207,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, &curdirent); if (err < 0) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); - err = -EIO; - goto unlock_and_exit; + return -EIO; } /* d_off in dirent structure tracks the offset into @@ -261,20 +222,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, curdirent.d_type); oldoffset = curdirent.d_off; - if (over) { - err = 0; - goto unlock_and_exit; - } + if (over) + return 0; filp->f_pos = curdirent.d_off; rdir->head += err; } } - -unlock_and_exit: - mutex_unlock(&rdir->mutex); -exit: - return err; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3356e3e..d384a8b 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -80,10 +80,6 @@ int v9fs_file_open(struct inode *inode, struct file *file) p9_client_clunk(fid); return err; } - if (file->f_flags & O_TRUNC) { - i_size_write(inode, 0); - inode->i_blocks = 0; - } if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) generic_file_llseek(file, 0, SEEK_END); @@ -620,6 +616,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) lock_page(page); if (page->mapping != inode->i_mapping) goto out_unlock; + wait_for_stable_page(page); return VM_FAULT_LOCKED; out_unlock: diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 890bed5..b5340c8 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -192,9 +192,6 @@ int v9fs_uflags2omode(int uflags, int extended) break; } - if (uflags & O_TRUNC) - ret |= P9_OTRUNC; - if (extended) { if (uflags & O_EXCL) ret |= P9_OEXCL; @@ -228,9 +225,9 @@ v9fs_blank_wstat(struct p9_wstat *wstat) wstat->uid = NULL; wstat->gid = NULL; wstat->muid = NULL; - wstat->n_uid = ~0; - wstat->n_gid = ~0; - wstat->n_muid = ~0; + wstat->n_uid = INVALID_UID; + wstat->n_gid = INVALID_GID; + wstat->n_muid = INVALID_UID; wstat->extension = NULL; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 7c29558..61e4fa7 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -57,7 +57,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, * group of the new file system object. */ -static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) +static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) { BUG_ON(dir_inode == NULL); @@ -186,7 +186,6 @@ static int v9fs_mapped_dotl_flags(int flags) { O_CREAT, P9_DOTL_CREATE }, { O_EXCL, P9_DOTL_EXCL }, { O_NOCTTY, P9_DOTL_NOCTTY }, - { O_TRUNC, P9_DOTL_TRUNC }, { O_APPEND, P9_DOTL_APPEND }, { O_NONBLOCK, P9_DOTL_NONBLOCK }, { O_DSYNC, P9_DOTL_DSYNC }, @@ -246,7 +245,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, int *opened) { int err = 0; - gid_t gid; + kgid_t gid; umode_t mode; char *name = NULL; struct p9_qid qid; @@ -268,8 +267,14 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, } /* Only creates */ - if (!(flags & O_CREAT) || dentry->d_inode) - return finish_no_open(file, res); + if (!(flags & O_CREAT)) + return finish_no_open(file, res); + else if (dentry->d_inode) { + if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + return -EEXIST; + else + return finish_no_open(file, res); + } v9ses = v9fs_inode2v9ses(dir); @@ -391,7 +396,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, int err; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; - gid_t gid; + kgid_t gid; char *name; umode_t mode; struct inode *inode; @@ -693,7 +698,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, const char *symname) { int err; - gid_t gid; + kgid_t gid; char *name; struct p9_qid qid; struct inode *inode; @@ -833,7 +838,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev) { int err; - gid_t gid; + kgid_t gid; char *name; umode_t mode; struct v9fs_session_info *v9ses; @@ -68,16 +68,6 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" -config CUSE - tristate "Character device in Userspace support" - depends on FUSE_FS - help - This FUSE extension allows character devices to be - implemented in userspace. - - If you want to develop or use userspace character device - based on CUSE, answer Y or M. - config GENERIC_ACL bool select FS_POSIX_ACL diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index e55182a..c5a7787 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -1,6 +1,6 @@ config ADFS_FS - tristate "ADFS file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + tristate "ADFS file system support" + depends on BLOCK help The Acorn Disc Filing System is the standard file system of the RiscOS operating system which runs on Acorn's ARM-based Risc PC diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index cfad9af..a04d9e8 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -1,6 +1,6 @@ config AFFS_FS - tristate "Amiga FFS file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + tristate "Amiga FFS file system support" + depends on BLOCK help The Fast File System (FFS) is the common file system used on hard disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 8f975f2..ebba3b1 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -1,6 +1,6 @@ config AFS_FS - tristate "Andrew File System support (AFS) (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL + tristate "Andrew File System support (AFS)" + depends on INET select AF_RXRPC select DNS_RESOLVER help @@ -22,8 +22,7 @@ config AFS_DEBUG If unsure, say N. config AFS_FSCACHE - bool "Provide AFS client caching support (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "Provide AFS client caching support" depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y help Say Y here if you want AFS data to be cached locally on disk through diff --git a/fs/afs/afs.h b/fs/afs/afs.h index c548aa3..3c462ff 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -119,8 +119,8 @@ struct afs_file_status { u64 size; /* file size */ afs_dataversion_t data_version; /* current data version */ u32 author; /* author ID */ - u32 owner; /* owner ID */ - u32 group; /* group ID */ + kuid_t owner; /* owner ID */ + kgid_t group; /* group ID */ afs_access_t caller_access; /* access rights for authenticated caller */ afs_access_t anon_access; /* access rights for unauthenticated caller */ umode_t mode; /* UNIX mode */ @@ -133,13 +133,6 @@ struct afs_file_status { /* * AFS file status change request */ -struct afs_store_status { - u32 mask; /* which bits of the struct are set */ - u32 mtime_client; /* last time client changed data */ - u32 owner; /* owner ID */ - u32 group; /* group ID */ - umode_t mode; /* UNIX mode */ -}; #define AFS_SET_MTIME 0x01 /* set the mtime */ #define AFS_SET_OWNER 0x02 /* set the owner ID */ diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index b960ff0..c2e930e 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -42,6 +42,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, umode_t mode; u64 data_version, size; u32 changed = 0; /* becomes non-zero if ctime-type changes seen */ + kuid_t owner; + kgid_t group; #define EXTRACT(DST) \ do { \ @@ -56,7 +58,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, size = ntohl(*bp++); data_version = ntohl(*bp++); EXTRACT(status->author); - EXTRACT(status->owner); + owner = make_kuid(&init_user_ns, ntohl(*bp++)); + changed |= !uid_eq(owner, status->owner); + status->owner = owner; EXTRACT(status->caller_access); /* call ticket dependent */ EXTRACT(status->anon_access); EXTRACT(status->mode); @@ -65,7 +69,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, bp++; /* seg size */ status->mtime_client = ntohl(*bp++); status->mtime_server = ntohl(*bp++); - EXTRACT(status->group); + group = make_kgid(&init_user_ns, ntohl(*bp++)); + changed |= !gid_eq(group, status->group); + status->group = group; bp++; /* sync counter */ data_version |= (u64) ntohl(*bp++) << 32; EXTRACT(status->lock_count); @@ -181,12 +187,12 @@ static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr) if (attr->ia_valid & ATTR_UID) { mask |= AFS_SET_OWNER; - owner = attr->ia_uid; + owner = from_kuid(&init_user_ns, attr->ia_uid); } if (attr->ia_valid & ATTR_GID) { mask |= AFS_SET_GROUP; - group = attr->ia_gid; + group = from_kgid(&init_user_ns, attr->ia_gid); } if (attr->ia_valid & ATTR_MODE) { diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 95cffd3..789bc25 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -69,7 +69,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) set_nlink(inode, vnode->status.nlink); inode->i_uid = vnode->status.owner; - inode->i_gid = 0; + inode->i_gid = GLOBAL_ROOT_GID; inode->i_size = vnode->status.size; inode->i_ctime.tv_sec = vnode->status.mtime_server; inode->i_ctime.tv_nsec = 0; @@ -175,8 +175,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_op = &afs_autocell_inode_operations; set_nlink(inode, 2); - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; inode->i_ctime.tv_sec = get_seconds(); inode->i_ctime.tv_nsec = 0; inode->i_atime = inode->i_mtime = inode->i_ctime; diff --git a/fs/afs/super.c b/fs/afs/super.c index 4316500..7c31ec3 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -24,6 +24,8 @@ #include <linux/parser.h> #include <linux/statfs.h> #include <linux/sched.h> +#include <linux/nsproxy.h> +#include <net/net_namespace.h> #include "internal.h" #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ @@ -363,6 +365,10 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, memset(¶ms, 0, sizeof(params)); + ret = -EINVAL; + if (current->nsproxy->net_ns != &init_net) + goto error; + /* parse the options and device name */ if (options) { ret = afs_parse_options(¶ms, options, &dev_name); @@ -101,7 +101,7 @@ static int aio_setup_ring(struct kioctx *ctx) struct aio_ring *ring; struct aio_ring_info *info = &ctx->ring_info; unsigned nr_events = ctx->max_reqs; - unsigned long size; + unsigned long size, populate; int nr_pages; /* Compensate for the ring buffer's head/tail overlap entry */ @@ -129,7 +129,8 @@ static int aio_setup_ring(struct kioctx *ctx) down_write(&ctx->mm->mmap_sem); info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0); + MAP_ANONYMOUS|MAP_PRIVATE, 0, + &populate); if (IS_ERR((void *)info->mmap_base)) { up_write(&ctx->mm->mmap_sem); info->mmap_size = 0; @@ -147,6 +148,8 @@ static int aio_setup_ring(struct kioctx *ctx) aio_free_ring(ctx); return -EAGAIN; } + if (populate) + mm_populate(info->mmap_base, populate); ctx->user_id = info->mmap_base; diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig index 7835d30..edc5cc2 100644 --- a/fs/befs/Kconfig +++ b/fs/befs/Kconfig @@ -1,6 +1,6 @@ config BEFS_FS - tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + tristate "BeOS file system (BeFS) support (read only)" + depends on BLOCK select NLS help The BeOS File System (BeFS) is the native file system of Be, Inc's diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index c2336c6..3728a64 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -1,6 +1,6 @@ config BFS_FS - tristate "BFS file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + tristate "BFS file system support" + depends on BLOCK help Boot File System (BFS) is a file system used under SCO UnixWare to allow the bootloader access to the kernel image and other important diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 11e078a..a5702d7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -33,6 +33,7 @@ #include <linux/elf.h> #include <linux/utsname.h> #include <linux/coredump.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include <asm/param.h> #include <asm/page.h> @@ -1248,7 +1249,7 @@ static int writenote(struct memelfnote *men, struct file *file, #undef DUMP_WRITE static void fill_elf_header(struct elfhdr *elf, int segs, - u16 machine, u32 flags, u8 osabi) + u16 machine, u32 flags) { memset(elf, 0, sizeof(*elf)); @@ -1320,8 +1321,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus, cputime_to_timeval(cputime.utime, &prstatus->pr_utime); cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { - cputime_to_timeval(p->utime, &prstatus->pr_utime); - cputime_to_timeval(p->stime, &prstatus->pr_stime); + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + cputime_to_timeval(utime, &prstatus->pr_utime); + cputime_to_timeval(stime, &prstatus->pr_stime); } cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); @@ -1630,7 +1634,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, * Initialize the ELF file header. */ fill_elf_header(elf, phdrs, - view->e_machine, view->e_flags, view->ei_osabi); + view->e_machine, view->e_flags); /* * Allocate a structure for each thread. @@ -1870,7 +1874,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, elf_core_copy_regs(&info->prstatus->pr_reg, regs); /* Set up header */ - fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI); + fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); /* * Set up the notes in similar form to SVR4 core dumps made diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 30de01c..9c13e02 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1375,8 +1375,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus, cputime_to_timeval(cputime.utime, &prstatus->pr_utime); cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { - cputime_to_timeval(p->utime, &prstatus->pr_utime); - cputime_to_timeval(p->stime, &prstatus->pr_stime); + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + cputime_to_timeval(utime, &prstatus->pr_utime); + cputime_to_timeval(stime, &prstatus->pr_stime); } cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); diff --git a/fs/block_dev.c b/fs/block_dev.c index 7d6bdfc..53f5fae 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -994,6 +994,7 @@ int revalidate_disk(struct gendisk *disk) mutex_lock(&bdev->bd_mutex); check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; mutex_unlock(&bdev->bd_mutex); bdput(bdev); return ret; diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index d33f01c..ccd25ba 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,6 +1,5 @@ config BTRFS_FS - tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" - depends on EXPERIMENTAL + tristate "Btrfs filesystem Unstable disk format" select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 521e9d4..1e59ed5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3997,7 +3997,7 @@ again: * We make the other tasks wait for the flush only when we can flush * all things. */ - if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; } @@ -4534,7 +4534,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) unsigned nr_extents = 0; int extra_reserve = 0; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; - int ret; + int ret = 0; bool delalloc_lock = true; /* If we are a free space inode we need to not flush since we will be in @@ -4579,20 +4579,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); - if (root->fs_info->quota_enabled) { + if (root->fs_info->quota_enabled) ret = btrfs_qgroup_reserve(root, num_bytes + nr_extents * root->leafsize); - if (ret) { - spin_lock(&BTRFS_I(inode)->lock); - calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); - if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - return ret; - } - } - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + /* + * ret != 0 here means the qgroup reservation failed, we go straight to + * the shared error handling then. + */ + if (ret == 0) + ret = reserve_metadata_bytes(root, block_rsv, + to_reserve, flush); + if (ret) { u64 to_free = 0; unsigned dropped; @@ -5560,7 +5558,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int empty_cluster = 2 * 1024 * 1024; struct btrfs_space_info *space_info; int loop = 0; - int index = 0; + int index = __get_raid_index(data); int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; @@ -6524,7 +6522,7 @@ reada: } /* - * hepler to process tree block while walking down the tree. + * helper to process tree block while walking down the tree. * * when wc->stage == UPDATE_BACKREF, this function updates * back refs for pointers in the block. @@ -6599,7 +6597,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, } /* - * hepler to process tree block pointer. + * helper to process tree block pointer. * * when wc->stage == DROP_REFERENCE, this function checks * reference count of the block pointed to. if the block @@ -6737,7 +6735,7 @@ skip: } /* - * hepler to process tree block while walking up the tree. + * helper to process tree block while walking up the tree. * * when wc->stage == DROP_REFERENCE, this function drops * reference count on the block. @@ -6788,11 +6786,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, &wc->flags[level]); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return ret; } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return 1; } } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f169d6b..fdb7a8d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) return 0; + if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || + test_bit(EXTENT_FLAG_LOGGING, &next->flags)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -255,7 +259,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, if (!em) goto out; - list_move(&em->list, &tree->modified_extents); + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); em->mod_start = em->start; @@ -280,6 +285,13 @@ out: } +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +{ + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + if (em->in_tree) + try_merge_map(tree, em); +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 922943c..c6598c8 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index bd38cef..94aa53b 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, if (!contig) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - if (!contig && (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset)) { + if (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset) { unsigned long bytes_left; sums->len = this_sum_bytes; this_sum_bytes = 0; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4118e0b..4b241fe 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; + int index; + int ret; /* get the inode */ key.objectid = defrag->root; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(inode_root)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode_root); + ret = PTR_ERR(inode_root); + goto cleanup; + } + if (btrfs_root_refs(&inode_root->root_item) == 0) { + ret = -ENOENT; + goto cleanup; } key.objectid = defrag->ino; @@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode); + ret = PTR_ERR(inode); + goto cleanup; } + srcu_read_unlock(&fs_info->subvol_srcu, index); /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, iput(inode); return 0; +cleanup: + srcu_read_unlock(&fs_info->subvol_srcu, index); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; } /* @@ -1594,9 +1608,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (err < 0 && num_written > 0) num_written = err; } -out: + if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); +out: sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; @@ -2241,6 +2256,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) if (lockend <= lockstart) lockend = lockstart + root->sectorsize; + lockend--; len = lockend - lockstart + 1; len = max_t(u64, len, root->sectorsize); @@ -2307,9 +2323,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) } } - *offset = start; - free_extent_map(em); - break; + if (!test_bit(EXTENT_FLAG_PREALLOC, + &em->flags)) { + *offset = start; + free_extent_map(em); + break; + } } } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 59ea2e4..0be7a87 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1862,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; - int ret = 0; + int ret; + bool re_search = false; spin_lock(&ctl->tree_lock); again: + ret = 0; if (!bytes) goto out_lock; @@ -1879,17 +1881,17 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - /* the tree logging code might be calling us before we - * have fully loaded the free space rbtree for this - * block group. So it is possible the entry won't - * be in the rbtree yet at all. The caching code - * will make sure not to put it in the rbtree if - * the logging code has pinned it. + /* + * If we found a partial bit of our free space in a + * bitmap but then couldn't find the other part this may + * be a problem, so WARN about it. */ + WARN_ON(re_search); goto out_lock; } } + re_search = false; if (!info->bitmap) { unlink_free_space(ctl, info); if (offset == info->offset) { @@ -1935,8 +1937,10 @@ again: } ret = remove_from_bitmap(ctl, info, &offset, &bytes); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + re_search = true; goto again; + } BUG_ON(ret); /* logic error */ out_lock: spin_unlock(&ctl->tree_lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 02d946a..55c07b6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, }; -static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, @@ -2478,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; + + /* 1 for the orphan item deletion. */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + goto out; + ret = btrfs_truncate(inode); } else { nr_unlink++; @@ -3665,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) block_end - cur_offset, 0); if (IS_ERR(em)) { err = PTR_ERR(em); + em = NULL; break; } last_byte = min(extent_map_end(em), block_end); @@ -3748,16 +3761,27 @@ next: return err; } -static int btrfs_setsize(struct inode *inode, loff_t newsize) +static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); + loff_t newsize = attr->ia_size; + int mask = attr->ia_valid; int ret; if (newsize == oldsize) return 0; + /* + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) + inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize > oldsize) { truncate_pagecache(inode, oldsize, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); @@ -3783,9 +3807,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags); + /* + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion. + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * We need to do this in case we fail at _any_ point during the + * actual truncate. Once we do the truncate_setsize we could + * invalidate pages which forces any outstanding ordered io to + * be instantly completed which will give us extents that need + * to be truncated. If we fail to get an orphan inode down we + * could have left over extents that were never meant to live, + * so we need to garuntee from this point on that everything + * will be consistent. + */ + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); ret = btrfs_truncate(inode); + if (ret && inode->i_nlink) + btrfs_orphan_del(NULL, inode); } return ret; @@ -3805,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setsize(inode, attr->ia_size); + err = btrfs_setsize(inode, attr); if (err) return err; } @@ -5572,10 +5621,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag return em; if (em) { /* - * if our em maps to a hole, there might - * actually be delalloc bytes behind it + * if our em maps to + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. */ - if (em->block_start != EXTENT_MAP_HOLE) + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) return em; else hole_em = em; @@ -5657,6 +5709,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag */ em->block_start = hole_em->block_start; em->block_len = hole_len; + if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); } else { em->start = range_start; em->len = found; @@ -6915,11 +6969,9 @@ static int btrfs_truncate(struct inode *inode) /* * 1 for the truncate slack space - * 1 for the orphan item we're going to add - * 1 for the orphan item deletion * 1 for updating the inode. */ - trans = btrfs_start_transaction(root, 4); + trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; @@ -6930,12 +6982,6 @@ static int btrfs_truncate(struct inode *inode) min_size); BUG_ON(ret); - ret = btrfs_orphan_add(trans, inode); - if (ret) { - btrfs_end_transaction(trans, root); - goto out; - } - /* * setattr is responsible for setting the ordered_data_close flag, * but that is only tested during the last file release. That @@ -7004,12 +7050,6 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(trans, inode); if (ret) err = ret; - } else if (ret && inode->i_nlink > 0) { - /* - * Failed to do the truncate, remove us from the in memory - * orphan list. - */ - ret = btrfs_orphan_del(NULL, inode); } if (trans) { @@ -7531,41 +7571,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) */ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { - struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; struct list_head works; + struct list_head splice; int ret = 0; if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; INIT_LIST_HEAD(&works); - + INIT_LIST_HEAD(&splice); +again: spin_lock(&root->fs_info->delalloc_lock); - while (!list_empty(head)) { - binode = list_entry(head->next, struct btrfs_inode, + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + while (!list_empty(&splice)) { + binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); + + list_del_init(&binode->delalloc_inodes); + inode = igrab(&binode->vfs_inode); if (!inode) - list_del_init(&binode->delalloc_inodes); + continue; + + list_add_tail(&binode->delalloc_inodes, + &root->fs_info->delalloc_inodes); spin_unlock(&root->fs_info->delalloc_lock); - if (inode) { - work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); - if (!work) { - ret = -ENOMEM; - goto out; - } - list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); + + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + if (unlikely(!work)) { + ret = -ENOMEM; + goto out; } + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &work->work); + cond_resched(); spin_lock(&root->fs_info->delalloc_lock); } spin_unlock(&root->fs_info->delalloc_lock); + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + + spin_lock(&root->fs_info->delalloc_lock); + if (!list_empty(&root->fs_info->delalloc_inodes)) { + spin_unlock(&root->fs_info->delalloc_lock); + goto again; + } + spin_unlock(&root->fs_info->delalloc_lock); + /* the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return @@ -7578,11 +7638,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); + return 0; out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } + + if (!list_empty_careful(&splice)) { + spin_lock(&root->fs_info->delalloc_lock); + list_splice_tail(&splice, &root->fs_info->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + } return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 61045ad..c3f09f7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -515,7 +515,6 @@ static noinline int create_subvol(struct btrfs_root *root, BUG_ON(ret); - d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: if (async_transid) { *async_transid = trans->transid; @@ -525,6 +524,10 @@ fail: } if (err && !ret) ret = err; + + if (!ret) + d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); + return ret; } @@ -1339,7 +1342,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + mnt_drop_write_file(file); + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -1362,6 +1366,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", @@ -1369,9 +1374,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - if (device->fs_devices && device->fs_devices->seeding) { + + if (!device->writeable) { printk(KERN_INFO "btrfs: resizer unable to apply on " - "seeding device %llu\n", + "readonly device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_free; @@ -1443,8 +1449,8 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } @@ -2095,13 +2101,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = inode_permission(inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; - - /* check if subvolume may be deleted by a non-root user */ - err = btrfs_may_delete(dir, dentry, 1); - if (err) - goto out_dput; } + /* check if subvolume may be deleted by a user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out_dput; @@ -2183,19 +2189,20 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) struct btrfs_ioctl_defrag_range_args *range; int ret; - if (btrfs_root_readonly(root)) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + mnt_drop_write_file(file); + return -EINVAL; } - ret = mnt_want_write_file(file); - if (ret) { - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); - return ret; + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; } switch (inode->i_mode & S_IFMT) { @@ -2247,8 +2254,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } @@ -2263,7 +2270,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2300,7 +2307,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2316,8 +2323,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } @@ -3437,8 +3444,8 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; + bool need_unlock; /* for mut. excl. ops lock */ int ret; - int need_to_clear_lock = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3447,14 +3454,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&fs_info->volume_mutex); +again: + if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + need_unlock = true; + goto locked; + } + + /* + * mut. excl. ops lock is locked. Three possibilites: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* this is either (2) or (3) */ + if (!atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + if (!mutex_trylock(&fs_info->volume_mutex)) + goto again; + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !atomic_read(&fs_info->balance_running)) { + /* this is (3) */ + need_unlock = false; + goto locked; + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + goto again; + } else { + /* this is (2) */ + mutex_unlock(&fs_info->balance_mutex); + ret = -EINPROGRESS; + goto out; + } + } else { + /* this is (1) */ + mutex_unlock(&fs_info->balance_mutex); + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + ret = -EINVAL; + goto out; + } + +locked: + BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); if (IS_ERR(bargs)) { ret = PTR_ERR(bargs); - goto out; + goto out_unlock; } if (bargs->flags & BTRFS_BALANCE_RESUME) { @@ -3474,13 +3528,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bargs = NULL; } - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + if (fs_info->balance_ctl) { ret = -EINPROGRESS; goto out_bargs; } - need_to_clear_lock = 1; bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { @@ -3501,11 +3552,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) } do_balance: - ret = btrfs_balance(bctl, bargs); /* - * bctl is freed in __cancel_balance or in free_fs_info if - * restriper was paused all the way until unmount + * Ownership of bctl and mutually_exclusive_operation_running + * goes to to btrfs_balance. bctl is freed in __cancel_balance, + * or, if restriper was paused all the way until unmount, in + * free_fs_info. mutually_exclusive_operation_running is + * cleared in __cancel_balance. */ + need_unlock = false; + + ret = btrfs_balance(bctl, bargs); + if (arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; @@ -3513,12 +3570,12 @@ do_balance: out_bargs: kfree(bargs); -out: - if (need_to_clear_lock) - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); +out_unlock: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); + if (need_unlock) + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +out: mnt_drop_write_file(file); return ret; } @@ -3698,6 +3755,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto drop_write; } + if (!sa->qgroupid) { + ret = -EINVAL; + goto out; + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f107312..e5ed567 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -836,9 +836,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, * if the disk i_size is already at the inode->i_size, or * this ordered extent is inside the disk i_size, we're done */ - if (disk_i_size == i_size || offset <= disk_i_size) { + if (disk_i_size == i_size) + goto out; + + /* + * We still need to update disk_i_size if outstanding_isize is greater + * than disk_i_size. + */ + if (offset <= disk_i_size && + (!ordered || ordered->outstanding_isize <= disk_i_size)) goto out; - } /* * walk backward from this ordered extent to disk_i_size. @@ -870,7 +877,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, break; if (test->file_offset >= i_size) break; - if (test->file_offset >= disk_i_size) { + if (entry_end(test) > disk_i_size) { /* * we don't update disk_i_size now, so record this * undealt i_size. Or we will not know the real diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fe9d02c..a5c8562 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -379,6 +379,13 @@ next1: ret = add_relation_rb(fs_info, found_key.objectid, found_key.offset); + if (ret == -ENOENT) { + printk(KERN_WARNING + "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", + (unsigned long long)found_key.objectid, + (unsigned long long)found_key.offset); + ret = 0; /* ignore the error */ + } if (ret) goto out; next2: @@ -956,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; int ret = 0; quota_root = fs_info->quota_root; if (!quota_root) return -EINVAL; + /* check if there are no relations to this qgroup */ + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (qgroup) { + if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) { + spin_unlock(&fs_info->qgroup_lock); + return -EBUSY; + } + } + spin_unlock(&fs_info->qgroup_lock); + ret = del_qgroup_item(trans, quota_root, qgroupid); spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); - spin_unlock(&fs_info->qgroup_lock); return ret; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 300e09a..17c306b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3472,7 +3472,7 @@ out: } /* - * hepler to find all tree blocks that reference a given data extent + * helper to find all tree blocks that reference a given data extent */ static noinline_for_stack int add_data_references(struct reloc_control *rc, @@ -3566,7 +3566,7 @@ int add_data_references(struct reloc_control *rc, } /* - * hepler to find next unprocessed extent + * helper to find next unprocessed extent */ static noinline_for_stack int find_next_extent(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bdbb94f..67783e0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -580,20 +580,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) int corrected = 0; struct btrfs_key key; struct inode *inode = NULL; + struct btrfs_fs_info *fs_info; u64 end = offset + PAGE_SIZE - 1; struct btrfs_root *local_root; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); - if (IS_ERR(local_root)) + + fs_info = fixup->root->fs_info; + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; - inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -606,7 +615,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) } if (PageUptodate(page)) { - struct btrfs_fs_info *fs_info; if (PageDirty(page)) { /* * we need to write the data to the defect sector. the @@ -3180,18 +3188,25 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) u64 physical_for_dev_replace; u64 len; struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + local_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(local_root)) + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f80df6b..f4ab7a9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx, (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); - if (!nce_head) + if (!nce_head) { + kfree(nce); return -ENOMEM; + } INIT_LIST_HEAD(nce_head); ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 99545df..d8982e9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -267,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, function, line, errstr); return; } - trans->transaction->aborted = errno; + ACCESS_ONCE(trans->transaction->aborted) = errno; __btrfs_std_error(root->fs_info, function, line, errno, NULL); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 87fac9a..4c0067c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -112,7 +112,6 @@ loop: * to redo the trans_no_join checks above */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); - cur_trans = fs_info->running_transaction; goto loop; } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { spin_unlock(&fs_info->trans_lock); @@ -333,12 +332,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, &root->fs_info->trans_block_rsv, num_bytes, flush); if (ret) - return ERR_PTR(ret); + goto reserve_fail; } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); - if (!h) - return ERR_PTR(-ENOMEM); + if (!h) { + ret = -ENOMEM; + goto alloc_fail; + } /* * If we are JOIN_NOLOCK we're already committing a transaction and @@ -365,11 +366,7 @@ again: if (ret < 0) { /* We must get the transaction if we are JOIN_NOLOCK. */ BUG_ON(type == TRANS_JOIN_NOLOCK); - - if (type < TRANS_JOIN_NOLOCK) - sb_end_intwrite(root->fs_info->sb); - kmem_cache_free(btrfs_trans_handle_cachep, h); - return ERR_PTR(ret); + goto join_fail; } cur_trans = root->fs_info->running_transaction; @@ -410,6 +407,19 @@ got_it: if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; return h; + +join_fail: + if (type < TRANS_JOIN_NOLOCK) + sb_end_intwrite(root->fs_info->sb); + kmem_cache_free(btrfs_trans_handle_cachep, h); +alloc_fail: + if (num_bytes) + btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, + num_bytes); +reserve_fail: + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); + return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, @@ -1468,7 +1478,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } - if (cur_trans->aborted) { + /* Stop the commit early if ->aborted is set */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; goto cleanup_transaction; } @@ -1574,6 +1585,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); + /* ->aborted might be set after the previous check, so check it */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + goto cleanup_transaction; + } /* * the reloc mutex makes sure that we stop * the balancing code from coming in and moving @@ -1657,6 +1673,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } + /* + * The tasks which save the space cache and inode cache may also + * update ->aborted, check it. + */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } + btrfs_prepare_extent_commit(trans, root); cur_trans = root->fs_info->running_transaction; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 83186c7..9027bb1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3357,6 +3357,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (skip_csum) return 0; + if (em->compress_type) { + csum_offset = 0; + csum_len = block_len; + } + /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, em->block_start + csum_offset, @@ -3410,13 +3415,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em = list_entry(extents.next, struct extent_map, list); list_del_init(&em->list); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); /* * If we had an error we just need to delete everybody from our * private list. */ if (ret) { + clear_em_logging(tree, em); free_extent_map(em); continue; } @@ -3424,8 +3429,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); ret = log_one_extent(trans, inode, root, em, path); - free_extent_map(em); write_lock(&tree->lock); + clear_em_logging(tree, em); + free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cce6aa..5cbb7f4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1431,7 +1431,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } } else { ret = btrfs_get_bdev_and_sb(device_path, - FMODE_READ | FMODE_EXCL, + FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder, 0, &bdev, &bh); if (ret) @@ -1556,7 +1556,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = 0; /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + if (bdev) + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); error_brelse: brelse(bh); @@ -2614,7 +2615,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = btrfs_block_group_used(&cache->item); - user_thresh = div_factor_fine(cache->key.offset, bargs->usage); + if (bargs->usage == 0) + user_thresh = 0; + else if (bargs->usage > 100) + user_thresh = cache->key.offset; + else + user_thresh = div_factor_fine(cache->key.offset, + bargs->usage); + if (chunk_used < user_thresh) ret = 0; @@ -2959,6 +2967,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); BUG_ON(ret); + + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, @@ -3138,8 +3148,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, out: if (bctl->flags & BTRFS_BALANCE_RESUME) __cancel_balance(fs_info); - else + else { kfree(bctl); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + } return ret; } @@ -3156,7 +3168,6 @@ static int balance_kthread(void *data) ret = btrfs_balance(fs_info->balance_ctl, NULL); } - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); @@ -3179,7 +3190,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } - WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); if (IS_ERR(tsk)) return PTR_ERR(tsk); @@ -3233,6 +3243,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); @@ -3496,7 +3508,7 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { { 1, 1, 2, 2, 2, 2 /* raid1 */ }, { 1, 2, 1, 1, 1, 2 /* dup */ }, { 1, 1, 0, 2, 1, 1 /* raid0 */ }, - { 1, 1, 0, 1, 1, 1 /* single */ }, + { 1, 1, 1, 1, 1, 1 /* single */ }, }; static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, diff --git a/fs/buffer.c b/fs/buffer.c index b8a8b4d..8e18281 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2359,7 +2359,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (unlikely(ret < 0)) goto out_unlock; set_page_dirty(page); - wait_on_page_writeback(page); + wait_for_stable_page(page); return 0; out_unlock: unlock_page(page); @@ -3227,7 +3227,7 @@ static struct kmem_cache *bh_cachep __read_mostly; * Once the number of bh's in the machine exceeds this level, we start * stripping them in writeback. */ -static int max_buffer_heads; +static unsigned long max_buffer_heads; int buffer_heads_over_limit; @@ -3343,7 +3343,7 @@ EXPORT_SYMBOL(bh_submit_read); void __init buffer_init(void) { - int nrpages; + unsigned long nrpages; bh_cachep = kmem_cache_create("buffer_head", sizeof(struct buffer_head), 0, diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 9eb134e..49bc782 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -1,6 +1,6 @@ config CEPH_FS - tristate "Ceph distributed file system (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL + tristate "Ceph distributed file system" + depends on INET select CEPH_LIB select LIBCRC32C select CRYPTO_AES diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a1d9bb3..ae2be69 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -930,7 +930,7 @@ static int send_cap_msg(struct ceph_mds_session *session, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, - uid_t uid, gid_t gid, umode_t mode, + kuid_t uid, kgid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, u64 follows) @@ -974,8 +974,8 @@ static int send_cap_msg(struct ceph_mds_session *session, ceph_encode_timespec(&fc->atime, atime); fc->time_warp_seq = cpu_to_le32(time_warp_seq); - fc->uid = cpu_to_le32(uid); - fc->gid = cpu_to_le32(gid); + fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); + fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); fc->mode = cpu_to_le32(mode); fc->xattr_version = cpu_to_le64(xattr_version); @@ -1081,8 +1081,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, struct timespec mtime, atime; int wake = 0; umode_t mode; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; struct ceph_mds_session *session; u64 xattr_version = 0; struct ceph_buffer *xattr_blob = NULL; @@ -2359,10 +2359,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(grant->mode); - inode->i_uid = le32_to_cpu(grant->uid); - inode->i_gid = le32_to_cpu(grant->gid); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - inode->i_uid, inode->i_gid); + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); } if ((issued & CEPH_CAP_LINK_EXCL) == 0) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4bc086a..851814d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -612,10 +612,11 @@ static int fill_inode(struct inode *inode, if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(info->mode); - inode->i_uid = le32_to_cpu(info->uid); - inode->i_gid = le32_to_cpu(info->gid); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - inode->i_uid, inode->i_gid); + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); } if ((issued & CEPH_CAP_LINK_EXCL) == 0) @@ -1601,26 +1602,30 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_UID) { dout("setattr %p uid %d -> %d\n", inode, - inode->i_uid, attr->ia_uid); + from_kuid(&init_user_ns, inode->i_uid), + from_kuid(&init_user_ns, attr->ia_uid)); if (issued & CEPH_CAP_AUTH_EXCL) { inode->i_uid = attr->ia_uid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || - attr->ia_uid != inode->i_uid) { - req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid); + !uid_eq(attr->ia_uid, inode->i_uid)) { + req->r_args.setattr.uid = cpu_to_le32( + from_kuid(&init_user_ns, attr->ia_uid)); mask |= CEPH_SETATTR_UID; release |= CEPH_CAP_AUTH_SHARED; } } if (ia_valid & ATTR_GID) { dout("setattr %p gid %d -> %d\n", inode, - inode->i_gid, attr->ia_gid); + from_kgid(&init_user_ns, inode->i_gid), + from_kgid(&init_user_ns, attr->ia_gid)); if (issued & CEPH_CAP_AUTH_EXCL) { inode->i_gid = attr->ia_gid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || - attr->ia_gid != inode->i_gid) { - req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid); + !gid_eq(attr->ia_gid, inode->i_gid)) { + req->r_args.setattr.gid = cpu_to_le32( + from_kgid(&init_user_ns, attr->ia_gid)); mask |= CEPH_SETATTR_GID; release |= CEPH_CAP_AUTH_SHARED; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9165eb8..7a3dfe0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1658,8 +1658,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(req->r_uid); - head->caller_gid = cpu_to_le32(req->r_gid); + head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); + head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index dd26846..ff4188b 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -184,8 +184,8 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ - uid_t r_uid; - gid_t r_gid; + kuid_t r_uid; + kgid_t r_gid; /* for choosing which mds to send this request to */ int r_direct_mode; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 66ebe72..f053bbd 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -138,8 +138,8 @@ struct ceph_cap_snap { struct ceph_snap_context *context; umode_t mode; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; struct ceph_buffer *xattr_blob; u64 xattr_version; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 21ff76c..2906ee2 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -155,14 +155,14 @@ config CIFS_DFS_UPCALL points. If unsure, say N. config CIFS_NFSD_EXPORT - bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)" - depends on CIFS && EXPERIMENTAL && BROKEN + bool "Allow nfsd to export CIFS file system" + depends on CIFS && BROKEN help Allows NFS server to export a CIFS mounted share (nfsd over cifs) config CIFS_SMB2 - bool "SMB2 network file system support (EXPERIMENTAL)" - depends on CIFS && EXPERIMENTAL && INET + bool "SMB2 network file system support" + depends on CIFS && INET select NLS select KEYS select FSCACHE diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ce5cbd7..210fce2 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -226,6 +226,8 @@ compose_mount_options_out: compose_mount_options_err: kfree(mountdata); mountdata = ERR_PTR(rc); + kfree(*devname); + *devname = NULL; goto compose_mount_options_out; } diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index c865bfd..37e4a72 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -55,10 +55,10 @@ struct cifs_sb_info { unsigned int wsize; unsigned long actimeo; /* attribute cache timeout (jiffies) */ atomic_t active; - uid_t mnt_uid; - gid_t mnt_gid; - uid_t mnt_backupuid; - gid_t mnt_backupgid; + kuid_t mnt_uid; + kgid_t mnt_gid; + kuid_t mnt_backupuid; + kgid_t mnt_backupgid; umode_t mnt_file_mode; umode_t mnt_dir_mode; unsigned int mnt_cifs_flags; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 086f381..10e7747 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -149,10 +149,12 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) goto out; dp = description + strlen(description); - sprintf(dp, ";uid=0x%x", sesInfo->linux_uid); + sprintf(dp, ";uid=0x%x", + from_kuid_munged(&init_user_ns, sesInfo->linux_uid)); dp = description + strlen(description); - sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); + sprintf(dp, ";creduid=0x%x", + from_kuid_munged(&init_user_ns, sesInfo->cred_uid)); if (sesInfo->user_name) { dp = description + strlen(description); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 5cbd00e..f1e3f25 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -266,8 +266,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct key *sidkey; char *sidstr; const struct cred *saved_cred; - uid_t fuid = cifs_sb->mnt_uid; - gid_t fgid = cifs_sb->mnt_gid; + kuid_t fuid = cifs_sb->mnt_uid; + kgid_t fgid = cifs_sb->mnt_gid; /* * If we have too many subauthorities, then something is really wrong. @@ -297,6 +297,7 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, * probably a safe assumption but might be better to check based on * sidtype. */ + BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); if (sidkey->datalen != sizeof(uid_t)) { rc = -EIO; cFYI(1, "%s: Downcall contained malformed key " @@ -305,10 +306,21 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, goto out_key_put; } - if (sidtype == SIDOWNER) - memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t)); - else - memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t)); + if (sidtype == SIDOWNER) { + kuid_t uid; + uid_t id; + memcpy(&id, &sidkey->payload.value, sizeof(uid_t)); + uid = make_kuid(&init_user_ns, id); + if (uid_valid(uid)) + fuid = uid; + } else { + kgid_t gid; + gid_t id; + memcpy(&id, &sidkey->payload.value, sizeof(gid_t)); + gid = make_kgid(&init_user_ns, id); + if (gid_valid(gid)) + fgid = gid; + } out_key_put: key_put(sidkey); @@ -346,7 +358,8 @@ init_cifs_idmap(void) if (!cred) return -ENOMEM; - keyring = keyring_alloc(".cifs_idmap", 0, 0, cred, + keyring = keyring_alloc(".cifs_idmap", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL); @@ -774,7 +787,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, - __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag) + __u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid, int *aclflag) { int rc = 0; __u32 dacloffset; @@ -806,17 +819,19 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, *aclflag = CIFS_ACL_DACL; } else { memcpy(pnntsd, pntsd, secdesclen); - if (uid != NO_CHANGE_32) { /* chown */ + if (uid_valid(uid)) { /* chown */ + uid_t id; owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + le32_to_cpu(pnntsd->osidoffset)); nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), GFP_KERNEL); if (!nowner_sid_ptr) return -ENOMEM; - rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr); + id = from_kuid(&init_user_ns, uid); + rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr); if (rc) { cFYI(1, "%s: Mapping error %d for owner id %d", - __func__, rc, uid); + __func__, rc, id); kfree(nowner_sid_ptr); return rc; } @@ -824,17 +839,19 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, kfree(nowner_sid_ptr); *aclflag = CIFS_ACL_OWNER; } - if (gid != NO_CHANGE_32) { /* chgrp */ + if (gid_valid(gid)) { /* chgrp */ + gid_t id; group_sid_ptr = (struct cifs_sid *)((char *)pnntsd + le32_to_cpu(pnntsd->gsidoffset)); ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), GFP_KERNEL); if (!ngroup_sid_ptr) return -ENOMEM; - rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr); + id = from_kgid(&init_user_ns, gid); + rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr); if (rc) { cFYI(1, "%s: Mapping error %d for group id %d", - __func__, rc, gid); + __func__, rc, id); kfree(ngroup_sid_ptr); return rc; } @@ -1002,7 +1019,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, /* Convert mode bits to an ACL so we can update the ACL on the server */ int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, - uid_t uid, gid_t gid) + kuid_t uid, kgid_t gid) { int rc = 0; int aclflag = CIFS_ACL_DACL; /* default flag to set */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8b35365..4bad7b1 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -375,13 +375,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root) (int)(srcaddr->sa_family)); } - seq_printf(s, ",uid=%u", cifs_sb->mnt_uid); + seq_printf(s, ",uid=%u", + from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid)); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) seq_printf(s, ",forceuid"); else seq_printf(s, ",noforceuid"); - seq_printf(s, ",gid=%u", cifs_sb->mnt_gid); + seq_printf(s, ",gid=%u", + from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid)); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) seq_printf(s, ",forcegid"); else @@ -436,9 +438,13 @@ cifs_show_options(struct seq_file *s, struct dentry *root) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) seq_printf(s, ",noperm"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) - seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid); + seq_printf(s, ",backupuid=%u", + from_kuid_munged(&init_user_ns, + cifs_sb->mnt_backupuid)); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) - seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid); + seq_printf(s, ",backupgid=%u", + from_kgid_munged(&init_user_ns, + cifs_sb->mnt_backupgid)); seq_printf(s, ",rsize=%u", cifs_sb->rsize); seq_printf(s, ",wsize=%u", cifs_sb->wsize); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index e6899ce..4f07f6f 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -400,11 +400,11 @@ struct smb_vol { char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ - uid_t cred_uid; - uid_t linux_uid; - gid_t linux_gid; - uid_t backupuid; - gid_t backupgid; + kuid_t cred_uid; + kuid_t linux_uid; + kgid_t linux_gid; + kuid_t backupuid; + kgid_t backupgid; umode_t file_mode; umode_t dir_mode; unsigned secFlg; @@ -703,8 +703,8 @@ struct cifs_ses { char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ __u64 Suid; /* remote smb uid */ - uid_t linux_uid; /* overriding owner of files on the mount */ - uid_t cred_uid; /* owner of credentials */ + kuid_t linux_uid; /* overriding owner of files on the mount */ + kuid_t cred_uid; /* owner of credentials */ unsigned int capabilities; char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for TCP names - will ipv6 and sctp addresses fit? */ @@ -838,7 +838,7 @@ struct cifs_tcon { */ struct tcon_link { struct rb_node tl_rbnode; - uid_t tl_uid; + kuid_t tl_uid; unsigned long tl_flags; #define TCON_LINK_MASTER 0 #define TCON_LINK_PENDING 1 @@ -931,7 +931,7 @@ struct cifsFileInfo { struct list_head tlist; /* pointer to next fid owned by tcon */ struct list_head flist; /* next fid (file instance) for this inode */ struct cifs_fid_locks *llist; /* brlocks held by this fid */ - unsigned int uid; /* allows finding which FileInfo structure */ + kuid_t uid; /* allows finding which FileInfo structure */ __u32 pid; /* process id who opened file */ struct cifs_fid fid; /* file id from remote */ /* BB add lock scope info here if needed */ ; @@ -1245,8 +1245,8 @@ struct cifs_fattr { u64 cf_eof; u64 cf_bytes; u64 cf_createtime; - uid_t cf_uid; - gid_t cf_gid; + kuid_t cf_uid; + kgid_t cf_gid; umode_t cf_mode; dev_t cf_rdev; unsigned int cf_nlink; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index b9d59a9..e996ff6 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -277,7 +277,6 @@ #define CIFS_NO_HANDLE 0xFFFF #define NO_CHANGE_64 0xFFFFFFFFFFFFFFFFULL -#define NO_CHANGE_32 0xFFFFFFFFUL /* IPC$ in ASCII */ #define CIFS_IPC_RESOURCE "\x49\x50\x43\x24" diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1988c1b..f450f06 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -46,7 +46,8 @@ extern void _free_xid(unsigned int); ({ \ unsigned int __xid = _get_xid(); \ cFYI(1, "CIFS VFS: in %s as Xid: %u with uid: %d", \ - __func__, __xid, current_fsuid()); \ + __func__, __xid, \ + from_kuid(&init_user_ns, current_fsuid())); \ __xid; \ }) @@ -161,7 +162,7 @@ extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid); extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64, - uid_t, gid_t); + kuid_t, kgid_t); extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, @@ -304,8 +305,8 @@ struct cifs_unix_set_info_args { __u64 atime; __u64 mtime; __u64 mode; - __u64 uid; - __u64 gid; + kuid_t uid; + kgid_t gid; dev_t device; }; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 76d0d29..00e12f2 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -5819,8 +5819,14 @@ static void cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset, const struct cifs_unix_set_info_args *args) { + u64 uid = NO_CHANGE_64, gid = NO_CHANGE_64; u64 mode = args->mode; + if (uid_valid(args->uid)) + uid = from_kuid(&init_user_ns, args->uid); + if (gid_valid(args->gid)) + gid = from_kgid(&init_user_ns, args->gid); + /* * Samba server ignores set of file size to zero due to bugs in some * older clients, but we should be precise - we use SetFileSize to @@ -5833,8 +5839,8 @@ cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset, data_offset->LastStatusChange = cpu_to_le64(args->ctime); data_offset->LastAccessTime = cpu_to_le64(args->atime); data_offset->LastModificationTime = cpu_to_le64(args->mtime); - data_offset->Uid = cpu_to_le64(args->uid); - data_offset->Gid = cpu_to_le64(args->gid); + data_offset->Uid = cpu_to_le64(uid); + data_offset->Gid = cpu_to_le64(gid); /* better to leave device as zero when it is */ data_offset->DevMajor = cpu_to_le64(MAJOR(args->device)); data_offset->DevMinor = cpu_to_le64(MINOR(args->device)); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 17c3643..4474a57 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -987,6 +987,41 @@ static int get_option_ul(substring_t args[], unsigned long *option) return rc; } +static int get_option_uid(substring_t args[], kuid_t *result) +{ + unsigned long value; + kuid_t uid; + int rc; + + rc = get_option_ul(args, &value); + if (rc) + return rc; + + uid = make_kuid(current_user_ns(), value); + if (!uid_valid(uid)) + return -EINVAL; + + *result = uid; + return 0; +} + +static int get_option_gid(substring_t args[], kgid_t *result) +{ + unsigned long value; + kgid_t gid; + int rc; + + rc = get_option_ul(args, &value); + if (rc) + return rc; + + gid = make_kgid(current_user_ns(), value); + if (!gid_valid(gid)) + return -EINVAL; + + *result = gid; + return 0; +} static int cifs_parse_security_flavors(char *value, struct smb_vol *vol) @@ -1424,47 +1459,42 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* Numeric Values */ case Opt_backupuid: - if (get_option_ul(args, &option)) { + if (get_option_uid(args, &vol->backupuid)) { cERROR(1, "%s: Invalid backupuid value", __func__); goto cifs_parse_mount_err; } - vol->backupuid = option; vol->backupuid_specified = true; break; case Opt_backupgid: - if (get_option_ul(args, &option)) { + if (get_option_gid(args, &vol->backupgid)) { cERROR(1, "%s: Invalid backupgid value", __func__); goto cifs_parse_mount_err; } - vol->backupgid = option; vol->backupgid_specified = true; break; case Opt_uid: - if (get_option_ul(args, &option)) { + if (get_option_uid(args, &vol->linux_uid)) { cERROR(1, "%s: Invalid uid value", __func__); goto cifs_parse_mount_err; } - vol->linux_uid = option; uid_specified = true; break; case Opt_cruid: - if (get_option_ul(args, &option)) { + if (get_option_uid(args, &vol->cred_uid)) { cERROR(1, "%s: Invalid cruid value", __func__); goto cifs_parse_mount_err; } - vol->cred_uid = option; break; case Opt_gid: - if (get_option_ul(args, &option)) { + if (get_option_gid(args, &vol->linux_gid)) { cERROR(1, "%s: Invalid gid value", __func__); goto cifs_parse_mount_err; } - vol->linux_gid = option; gid_specified = true; break; case Opt_file_mode: @@ -1917,7 +1947,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) } case AF_INET6: { struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; - struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs; + struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs; return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr); } default: @@ -2241,7 +2271,7 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol) { switch (ses->server->secType) { case Kerberos: - if (vol->cred_uid != ses->cred_uid) + if (!uid_eq(vol->cred_uid, ses->cred_uid)) return 0; break; default: @@ -2713,7 +2743,7 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) if (new->rsize && new->rsize < old->rsize) return 0; - if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) + if (!uid_eq(old->mnt_uid, new->mnt_uid) || !gid_eq(old->mnt_gid, new->mnt_gid)) return 0; if (old->mnt_file_mode != new->mnt_file_mode || @@ -3919,7 +3949,7 @@ cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses) } static struct cifs_tcon * -cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) +cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) { int rc; struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb); @@ -3989,7 +4019,7 @@ cifs_sb_tcon_pending_wait(void *unused) /* find and return a tlink with given uid */ static struct tcon_link * -tlink_rb_search(struct rb_root *root, uid_t uid) +tlink_rb_search(struct rb_root *root, kuid_t uid) { struct rb_node *node = root->rb_node; struct tcon_link *tlink; @@ -3997,9 +4027,9 @@ tlink_rb_search(struct rb_root *root, uid_t uid) while (node) { tlink = rb_entry(node, struct tcon_link, tl_rbnode); - if (tlink->tl_uid > uid) + if (uid_gt(tlink->tl_uid, uid)) node = node->rb_left; - else if (tlink->tl_uid < uid) + else if (uid_lt(tlink->tl_uid, uid)) node = node->rb_right; else return tlink; @@ -4018,7 +4048,7 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) tlink = rb_entry(*new, struct tcon_link, tl_rbnode); parent = *new; - if (tlink->tl_uid > new_tlink->tl_uid) + if (uid_gt(tlink->tl_uid, new_tlink->tl_uid)) new = &((*new)->rb_left); else new = &((*new)->rb_right); @@ -4048,7 +4078,7 @@ struct tcon_link * cifs_sb_tlink(struct cifs_sb_info *cifs_sb) { int ret; - uid_t fsuid = current_fsuid(); + kuid_t fsuid = current_fsuid(); struct tcon_link *tlink, *newtlink; if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 8719bbe..1cd0162 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -342,14 +342,14 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, *created |= FILE_CREATED; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = (__u64) current_fsuid(); + args.uid = current_fsuid(); if (inode->i_mode & S_ISGID) - args.gid = (__u64) inode->i_gid; + args.gid = inode->i_gid; else - args.gid = (__u64) current_fsgid(); + args.gid = current_fsgid(); } else { - args.uid = NO_CHANGE_64; - args.gid = NO_CHANGE_64; + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ } CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid->netfid, current->tgid); @@ -588,11 +588,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, .device = device_number, }; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = (__u64) current_fsuid(); - args.gid = (__u64) current_fsgid(); + args.uid = current_fsuid(); + args.gid = current_fsgid(); } else { - args.uid = NO_CHANGE_64; - args.gid = NO_CHANGE_64; + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ } rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args, cifs_sb->local_nls, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1a5c291..c16d2a0 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -515,8 +515,8 @@ int cifs_open(struct inode *inode, struct file *file) */ struct cifs_unix_set_info_args args = { .mode = inode->i_mode, - .uid = NO_CHANGE_64, - .gid = NO_CHANGE_64, + .uid = INVALID_UID, /* no change */ + .gid = INVALID_GID, /* no change */ .ctime = NO_CHANGE_64, .atime = NO_CHANGE_64, .mtime = NO_CHANGE_64, @@ -1693,7 +1693,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, are always at the end of the list but since the first entry might have a close pending, we go through the whole list */ list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { - if (fsuid_only && open_file->uid != current_fsuid()) + if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) { if (!open_file->invalidHandle) { @@ -1746,7 +1746,7 @@ refind_writable: list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (!any_available && open_file->pid != current->tgid) continue; - if (fsuid_only && open_file->uid != current_fsuid()) + if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 1fc864b..d2a8339 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -244,15 +244,25 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, break; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) - fattr->cf_uid = cifs_sb->mnt_uid; - else - fattr->cf_uid = le64_to_cpu(info->Uid); - - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) - fattr->cf_gid = cifs_sb->mnt_gid; - else - fattr->cf_gid = le64_to_cpu(info->Gid); + fattr->cf_uid = cifs_sb->mnt_uid; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) { + u64 id = le64_to_cpu(info->Uid); + if (id < ((uid_t)-1)) { + kuid_t uid = make_kuid(&init_user_ns, id); + if (uid_valid(uid)) + fattr->cf_uid = uid; + } + } + + fattr->cf_gid = cifs_sb->mnt_gid; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) { + u64 id = le64_to_cpu(info->Gid); + if (id < ((gid_t)-1)) { + kgid_t gid = make_kgid(&init_user_ns, id); + if (gid_valid(gid)) + fattr->cf_gid = gid; + } + } fattr->cf_nlink = le64_to_cpu(info->Nlinks); } @@ -1245,14 +1255,14 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, .device = 0, }; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = (__u64)current_fsuid(); + args.uid = current_fsuid(); if (parent->i_mode & S_ISGID) - args.gid = (__u64)parent->i_gid; + args.gid = parent->i_gid; else - args.gid = (__u64)current_fsgid(); + args.gid = current_fsgid(); } else { - args.uid = NO_CHANGE_64; - args.gid = NO_CHANGE_64; + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ } CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, cifs_sb->local_nls, @@ -2013,12 +2023,12 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (attrs->ia_valid & ATTR_UID) args->uid = attrs->ia_uid; else - args->uid = NO_CHANGE_64; + args->uid = INVALID_UID; /* no change */ if (attrs->ia_valid & ATTR_GID) args->gid = attrs->ia_gid; else - args->gid = NO_CHANGE_64; + args->gid = INVALID_GID; /* no change */ if (attrs->ia_valid & ATTR_ATIME) args->atime = cifs_UnixTimeToNT(attrs->ia_atime); @@ -2086,8 +2096,8 @@ static int cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) { unsigned int xid; - uid_t uid = NO_CHANGE_32; - gid_t gid = NO_CHANGE_32; + kuid_t uid = INVALID_UID; + kgid_t gid = INVALID_GID; struct inode *inode = direntry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); @@ -2146,7 +2156,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) #ifdef CONFIG_CIFS_ACL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) { + if (uid_valid(uid) || gid_valid(gid)) { rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, uid, gid); if (rc) { @@ -2170,7 +2180,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) #ifdef CONFIG_CIFS_ACL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { rc = id_mode_to_cifs_acl(inode, full_path, mode, - NO_CHANGE_32, NO_CHANGE_32); + INVALID_UID, INVALID_GID); if (rc) { cFYI(1, "%s: Setting ACL failed with error: %d", __func__, rc); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 51dc2fb..9f6c4c4 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -76,7 +76,7 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) } rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); if (rc) { - cERROR(1, "%s: Could not update iwth link_str", __func__); + cERROR(1, "%s: Could not update with link_str", __func__); goto symlink_hash_err; } rc = crypto_shash_final(&sdescmd5->shash, md5_hash); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 3a00c0d..1b15bf8 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -569,7 +569,7 @@ bool backup_cred(struct cifs_sb_info *cifs_sb) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) { - if (cifs_sb->mnt_backupuid == current_fsuid()) + if (uid_eq(cifs_sb->mnt_backupuid, current_fsuid())) return true; } if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) { diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 958ae0e..1da168c 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -33,7 +33,7 @@ void coda_cache_enter(struct inode *inode, int mask) spin_lock(&cii->c_lock); cii->c_cached_epoch = atomic_read(&permission_epoch); - if (cii->c_uid != current_fsuid()) { + if (!uid_eq(cii->c_uid, current_fsuid())) { cii->c_uid = current_fsuid(); cii->c_cached_perm = mask; } else @@ -65,7 +65,7 @@ int coda_cache_check(struct inode *inode, int mask) spin_lock(&cii->c_lock); hit = (mask & cii->c_cached_perm) == mask && - cii->c_uid == current_fsuid() && + uid_eq(cii->c_uid, current_fsuid()) && cii->c_cached_epoch == atomic_read(&permission_epoch); spin_unlock(&cii->c_lock); diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h index b24fdfd..c640752 100644 --- a/fs/coda/coda_fs_i.h +++ b/fs/coda/coda_fs_i.h @@ -25,7 +25,7 @@ struct coda_inode_info { u_short c_flags; /* flags (see below) */ unsigned int c_mapcount; /* nr of times this inode is mapped */ unsigned int c_cached_epoch; /* epoch for cached permissions */ - vuid_t c_uid; /* fsuid for cached permissions */ + kuid_t c_uid; /* fsuid for cached permissions */ unsigned int c_cached_perm; /* cached access permissions */ spinlock_t c_lock; struct inode vfs_inode; diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 854ace7..2849f41 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -100,9 +100,9 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_mode != (u_short) -1) inode->i_mode = attr->va_mode | inode_type; if (attr->va_uid != -1) - inode->i_uid = (uid_t) attr->va_uid; + inode->i_uid = make_kuid(&init_user_ns, (uid_t) attr->va_uid); if (attr->va_gid != -1) - inode->i_gid = (gid_t) attr->va_gid; + inode->i_gid = make_kgid(&init_user_ns, (gid_t) attr->va_gid); if (attr->va_nlink != -1) set_nlink(inode, attr->va_nlink); if (attr->va_size != -1) @@ -171,10 +171,10 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) vattr->va_mode = iattr->ia_mode; } if ( valid & ATTR_UID ) { - vattr->va_uid = (vuid_t) iattr->ia_uid; + vattr->va_uid = (vuid_t) from_kuid(&init_user_ns, iattr->ia_uid); } if ( valid & ATTR_GID ) { - vattr->va_gid = (vgid_t) iattr->ia_gid; + vattr->va_gid = (vgid_t) from_kgid(&init_user_ns, iattr->ia_gid); } if ( valid & ATTR_SIZE ) { vattr->va_size = iattr->ia_size; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 6df708c..dada9d0 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -20,6 +20,7 @@ #include <linux/file.h> #include <linux/vfs.h> #include <linux/slab.h> +#include <linux/pid_namespace.h> #include <asm/uaccess.h> @@ -48,7 +49,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb) return NULL; memset(&ei->c_fid, 0, sizeof(struct CodaFid)); ei->c_flags = 0; - ei->c_uid = 0; + ei->c_uid = GLOBAL_ROOT_UID; ei->c_cached_perm = 0; spin_lock_init(&ei->c_lock); return &ei->vfs_inode; @@ -157,6 +158,9 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) int error; int idx; + if (task_active_pid_ns(current) != &init_pid_ns) + return -EINVAL; + idx = get_device_index((struct coda_mount_data *) data); /* Ignore errors in data, for backward compatibility */ diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 761d5b3..ebc2bae 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -37,6 +37,7 @@ #include <linux/list.h> #include <linux/mutex.h> #include <linux/device.h> +#include <linux/pid_namespace.h> #include <asm/io.h> #include <asm/poll.h> #include <asm/uaccess.h> @@ -266,6 +267,12 @@ static int coda_psdev_open(struct inode * inode, struct file * file) struct venus_comm *vcp; int idx, err; + if (task_active_pid_ns(current) != &init_pid_ns) + return -EINVAL; + + if (current_user_ns() != &init_user_ns) + return -EINVAL; + idx = iminor(inode); if (idx < 0 || idx >= MAX_CODADEVS) return -ENODEV; diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 0c68fd3..3a73197 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -50,9 +50,9 @@ static void *alloc_upcall(int opcode, int size) return ERR_PTR(-ENOMEM); inp->ih.opcode = opcode; - inp->ih.pid = current->pid; - inp->ih.pgid = task_pgrp_nr(current); - inp->ih.uid = current_fsuid(); + inp->ih.pid = task_pid_nr_ns(current, &init_pid_ns); + inp->ih.pgid = task_pgrp_nr_ns(current, &init_pid_ns); + inp->ih.uid = from_kuid(&init_user_ns, current_fsuid()); return (void*)inp; } @@ -157,7 +157,7 @@ int venus_lookup(struct super_block *sb, struct CodaFid *fid, } int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, - vuid_t uid) + kuid_t uid) { union inputArgs *inp; union outputArgs *outp; @@ -166,7 +166,7 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, insize = SIZE(release); UPARG(CODA_CLOSE); - inp->ih.uid = uid; + inp->ih.uid = from_kuid(&init_user_ns, uid); inp->coda_close.VFid = *fid; inp->coda_close.flags = flags; diff --git a/fs/compat.c b/fs/compat.c index 015e1e1..fe40fde 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1278,8 +1278,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, * Exactly like fs/open.c:sys_open(), except that it doesn't set the * O_LARGEFILE flag. */ -asmlinkage long -compat_sys_open(const char __user *filename, int flags, umode_t mode) +COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(AT_FDCWD, filename, flags, mode); } @@ -1288,8 +1287,7 @@ compat_sys_open(const char __user *filename, int flags, umode_t mode) * Exactly like fs/open.c:sys_openat(), except that it doesn't set the * O_LARGEFILE flag. */ -asmlinkage long -compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, umode_t mode) +COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(dfd, filename, flags, mode); } @@ -1739,55 +1737,13 @@ asmlinkage long compat_sys_signalfd(int ufd, } #endif /* CONFIG_SIGNALFD */ -#ifdef CONFIG_TIMERFD - -asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, - const struct compat_itimerspec __user *utmr, - struct compat_itimerspec __user *otmr) -{ - int error; - struct itimerspec t; - struct itimerspec __user *ut; - - if (get_compat_itimerspec(&t, utmr)) - return -EFAULT; - ut = compat_alloc_user_space(2 * sizeof(struct itimerspec)); - if (copy_to_user(&ut[0], &t, sizeof(t))) - return -EFAULT; - error = sys_timerfd_settime(ufd, flags, &ut[0], &ut[1]); - if (!error && otmr) - error = (copy_from_user(&t, &ut[1], sizeof(struct itimerspec)) || - put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0; - - return error; -} - -asmlinkage long compat_sys_timerfd_gettime(int ufd, - struct compat_itimerspec __user *otmr) -{ - int error; - struct itimerspec t; - struct itimerspec __user *ut; - - ut = compat_alloc_user_space(sizeof(struct itimerspec)); - error = sys_timerfd_gettime(ufd, ut); - if (!error) - error = (copy_from_user(&t, ut, sizeof(struct itimerspec)) || - put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0; - - return error; -} - -#endif /* CONFIG_TIMERFD */ - #ifdef CONFIG_FHANDLE /* * Exactly like fs/open.c:sys_open_by_handle_at(), except that it * doesn't set the O_LARGEFILE flag. */ -asmlinkage long -compat_sys_open_by_handle_at(int mountdirfd, - struct file_handle __user *handle, int flags) +COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, int, flags) { return do_handle_open(mountdirfd, handle, flags); } diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 90d222f..7aabc6a 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1037,10 +1037,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level) static int configfs_depend_prep(struct dentry *origin, struct config_item *target) { - struct configfs_dirent *child_sd, *sd = origin->d_fsdata; + struct configfs_dirent *child_sd, *sd; int ret = 0; - BUG_ON(!origin || !sd); + BUG_ON(!origin || !origin->d_fsdata); + sd = origin->d_fsdata; if (sd->s_element == target) /* Boo-yah */ goto out; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index a5f12b7..0c4f80b 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -322,7 +322,6 @@ static struct dentry *__create_file(const char *name, umode_t mode, if (!parent) parent = debugfs_mount->mnt_root; - dentry = NULL; mutex_lock(&parent->d_inode->i_mutex); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry)) { diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 472e6be..073d30b 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -243,6 +243,13 @@ static int mknod_ptmx(struct super_block *sb) struct dentry *root = sb->s_root; struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; + kuid_t root_uid; + kgid_t root_gid; + + root_uid = make_kuid(current_user_ns(), 0); + root_gid = make_kgid(current_user_ns(), 0); + if (!uid_valid(root_uid) || !gid_valid(root_gid)) + return -EINVAL; mutex_lock(&root->d_inode->i_mutex); @@ -273,6 +280,8 @@ static int mknod_ptmx(struct super_block *sb) mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); + inode->i_uid = root_uid; + inode->i_gid = root_gid; d_add(dentry, inode); @@ -438,6 +447,12 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type, if (error) return ERR_PTR(error); + /* Require newinstance for all user namespace mounts to ensure + * the mount options are not changed. + */ + if ((current_user_ns() != &init_user_ns) && !opts.newinstance) + return ERR_PTR(-EINVAL); + if (opts.newinstance) s = sget(fs_type, NULL, set_anon_super, flags, NULL); else @@ -491,6 +506,9 @@ static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, +#endif }; /* diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 77c0f70..e7665c3 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -96,10 +96,13 @@ do { \ } +#define DLM_RTF_SHRINK 0x00000001 + struct dlm_rsbtable { struct rb_root keep; struct rb_root toss; spinlock_t lock; + uint32_t flags; }; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index a579f30..f750165 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1132,6 +1132,7 @@ static void toss_rsb(struct kref *kref) rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep); rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss); r->res_toss_time = jiffies; + ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK; if (r->res_lvbptr) { dlm_free_lvb(r->res_lvbptr); r->res_lvbptr = NULL; @@ -1659,11 +1660,18 @@ static void shrink_bucket(struct dlm_ls *ls, int b) char *name; int our_nodeid = dlm_our_nodeid(); int remote_count = 0; + int need_shrink = 0; int i, len, rv; memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX); spin_lock(&ls->ls_rsbtbl[b].lock); + + if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } + for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) { next = rb_next(n); r = rb_entry(n, struct dlm_rsb, res_hashnode); @@ -1679,6 +1687,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b) continue; } + need_shrink = 1; + if (!time_after_eq(jiffies, r->res_toss_time + dlm_config.ci_toss_secs * HZ)) { continue; @@ -1710,6 +1720,11 @@ static void shrink_bucket(struct dlm_ls *ls, int b) rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); dlm_free_rsb(r); } + + if (need_shrink) + ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK; + else + ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK; spin_unlock(&ls->ls_rsbtbl[b].lock); /* diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 7ff4985..911649a 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -503,11 +503,11 @@ static ssize_t device_write(struct file *file, const char __user *buf, #endif return -EINVAL; -#ifdef CONFIG_COMPAT - if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN) -#else + /* + * can't compare against COMPAT/dlm_write_request32 because + * we don't yet know if is64bit is zero + */ if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) -#endif return -EINVAL; kbuf = kzalloc(count + 1, GFP_NOFS); diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index cc16562..e15ef38 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,6 +1,6 @@ config ECRYPT_FS - tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) + tristate "eCrypt filesystem layer support" + depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) select CRYPTO_ECB select CRYPTO_CBC select CRYPTO_MD5 diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig index 6ebfc1c..d020e3c 100644 --- a/fs/efs/Kconfig +++ b/fs/efs/Kconfig @@ -1,6 +1,6 @@ config EFS_FS - tristate "EFS file system support (read only) (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + tristate "EFS file system support (read only)" + depends on BLOCK help EFS is an older file system used for non-ISO9660 CD-ROMs and hard disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 2616d0e..9f9992b 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -159,15 +159,6 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) return bh; } -static void release_blocks(struct super_block *sb, int count) -{ - if (count) { - struct ext2_sb_info *sbi = EXT2_SB(sb); - - percpu_counter_add(&sbi->s_freeblocks_counter, count); - } -} - static void group_adjust_blocks(struct super_block *sb, int group_no, struct ext2_group_desc *desc, struct buffer_head *bh, int count) { @@ -568,8 +559,11 @@ do_more: } error_return: brelse(bitmap_bh); - release_blocks(sb, freed); - dquot_free_block_nodirty(inode, freed); + if (freed) { + percpu_counter_add(&sbi->s_freeblocks_counter, freed); + dquot_free_block_nodirty(inode, freed); + mark_inode_dirty(inode); + } } /** @@ -1239,10 +1233,6 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, *errp = -ENOSPC; sb = inode->i_sb; - if (!sb) { - printk("ext2_new_blocks: nonexistent device"); - return 0; - } /* * Check quota for allocation of this block. @@ -1416,9 +1406,11 @@ allocated: *errp = 0; brelse(bitmap_bh); - dquot_free_block_nodirty(inode, *count-num); - mark_inode_dirty(inode); - *count = num; + if (num < *count) { + dquot_free_block_nodirty(inode, *count-num); + mark_inode_dirty(inode); + *count = num; + } return ret_block; io_error: diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 6363ac6..c3881e5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -495,6 +495,10 @@ static int ext2_alloc_branch(struct inode *inode, * parent to disk. */ bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } branch[n].bh = bh; lock_buffer(bh); memset(bh->b_data, 0, blocksize); @@ -523,6 +527,14 @@ static int ext2_alloc_branch(struct inode *inode, } *blks = num; return err; + +failed: + for (i = 1; i < n; i++) + bforget(branch[i].bh); + for (i = 0; i < indirect_blks; i++) + ext2_free_blocks(inode, new_blocks[i], 1); + ext2_free_blocks(inode, new_blocks[i], num); + return err; } /** diff --git a/fs/ext2/super.c b/fs/ext2/super.c index fa04d02..7f68c81 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1500,7 +1500,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, bh = sb_bread(sb, tmp_bh.b_blocknr); else bh = sb_getblk(sb, tmp_bh.b_blocknr); - if (!bh) { + if (unlikely(!bh)) { err = -EIO; goto out; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index b6754db..2d7557d 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -662,10 +662,10 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, ea_idebug(inode, "creating block %d", block); new_bh = sb_getblk(sb, block); - if (!new_bh) { + if (unlikely(!new_bh)) { ext2_free_blocks(inode, block, 1); mark_inode_dirty(inode); - error = -EIO; + error = -ENOMEM; goto cleanup; } lock_buffer(new_bh); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index b176d42..d512c4b 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -676,6 +676,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode, * parent to disk. */ bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } branch[n].bh = bh; lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); @@ -717,7 +721,7 @@ failed: BUFFER_TRACE(branch[i].bh, "call journal_forget"); ext3_journal_forget(handle, branch[i].bh); } - for (i = 0; i <indirect_blks; i++) + for (i = 0; i < indirect_blks; i++) ext3_free_blocks(handle, inode, new_blocks[i], 1); ext3_free_blocks(handle, inode, new_blocks[i], num); @@ -1078,8 +1082,8 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, if (!err && buffer_mapped(&dummy)) { struct buffer_head *bh; bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (!bh) { - *errp = -EIO; + if (unlikely(!bh)) { + *errp = -ENOMEM; goto err; } if (buffer_new(&dummy)) { @@ -2729,12 +2733,12 @@ static int __ext3_get_inode_loc(struct inode *inode, return -EIO; bh = sb_getblk(inode->i_sb, block); - if (!bh) { + if (unlikely(!bh)) { ext3_error (inode->i_sb, "ext3_get_inode_loc", "unable to read inode block - " "inode=%lu, block="E3FSBLK, inode->i_ino, block); - return -EIO; + return -ENOMEM; } if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -2783,7 +2787,7 @@ static int __ext3_get_inode_loc(struct inode *inode, bitmap_bh = sb_getblk(inode->i_sb, le32_to_cpu(desc->bg_inode_bitmap)); - if (!bitmap_bh) + if (unlikely(!bitmap_bh)) goto make_io; /* diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 61fa09e..692de13 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -36,7 +36,6 @@ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) static struct buffer_head *ext3_append(handle_t *handle, struct inode *inode, diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 0f814f3..2710565 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -116,8 +116,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, int err; bh = sb_getblk(sb, blk); - if (!bh) - return ERR_PTR(-EIO); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); if ((err = ext3_journal_get_write_access(handle, bh))) { brelse(bh); bh = ERR_PTR(err); @@ -234,8 +234,8 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; + if (unlikely(!gdb)) { + err = -ENOMEM; goto exit_bh; } if ((err = ext3_journal_get_write_access(handle, gdb))) { @@ -722,8 +722,8 @@ static void update_backups(struct super_block *sb, break; bh = sb_getblk(sb, group * bpg + blk_off); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; break; } ext3_debug("update metadata backup %#04lx\n", diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6e50223..5546ca2 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -916,21 +916,24 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "Not enough memory for storing quotafile name"); return 0; } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext3_msg(sb, KERN_ERR, - "%s quota file already specified", QTYPE2NAME(qtype)); + if (sbi->s_qf_names[qtype]) { + int same = !strcmp(sbi->s_qf_names[qtype], qname); + kfree(qname); - return 0; + if (!same) { + ext3_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + } + return same; } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { + if (strchr(qname, '/')) { ext3_msg(sb, KERN_ERR, "quotafile must be on filesystem root"); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + kfree(qname); return 0; } + sbi->s_qf_names[qtype] = qname; set_opt(sbi->s_mount_opt, QUOTA); return 1; } @@ -945,11 +948,10 @@ static int clear_qf_name(struct super_block *sb, int qtype) { " when quota turned on"); return 0; } - /* - * The space will be released later when all options are confirmed - * to be correct - */ - sbi->s_qf_names[qtype] = NULL; + if (sbi->s_qf_names[qtype]) { + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + } return 1; } #endif @@ -2065,6 +2067,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); + sb->s_flags |= MS_SNAP_STABLE; return 0; @@ -2605,7 +2608,18 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) - old_opts.s_qf_names[i] = sbi->s_qf_names[i]; + if (sbi->s_qf_names[i]) { + old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!old_opts.s_qf_names[i]) { + int j; + + for (j = 0; j < i; j++) + kfree(old_opts.s_qf_names[j]); + return -ENOMEM; + } + } else + old_opts.s_qf_names[i] = NULL; #endif /* @@ -2698,9 +2712,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - if (old_opts.s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(old_opts.s_qf_names[i]); + kfree(old_opts.s_qf_names[i]); #endif if (enable_quota) dquot_resume(sb, -1); @@ -2714,9 +2726,7 @@ restore_opts: #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(sbi->s_qf_names[i]); + kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index d22ebb7..b1fc963 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -813,10 +813,10 @@ inserted: ea_idebug(inode, "creating block %d", block); new_bh = sb_getblk(sb, block); - if (!new_bh) { + if (unlikely(!new_bh)) { getblk_failed: ext3_free_blocks(handle, inode, block, 1); - error = -EIO; + error = -ENOMEM; goto cleanup; } lock_buffer(new_bh); diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index e6e0d98..39a54a0 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -324,8 +324,8 @@ ext4_acl_chmod(struct inode *inode) if (error) return error; retry: - handle = ext4_journal_start(inode, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + handle = ext4_journal_start(inode, EXT4_HT_XATTR, + ext4_jbd2_credits_xattr(inode)); if (IS_ERR(handle)) { error = PTR_ERR(handle); ext4_std_error(inode->i_sb, error); @@ -422,7 +422,8 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, acl = NULL; retry: - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + handle = ext4_journal_start(inode, EXT4_HT_XATTR, + ext4_jbd2_credits_xattr(inode)); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto release_and_out; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index cf18217..2f2e0da 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -358,7 +358,7 @@ void ext4_validate_block_bitmap(struct super_block *sb, } /** - * ext4_read_block_bitmap() + * ext4_read_block_bitmap_nowait() * @sb: super block * @block_group: given block group * @@ -457,6 +457,8 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) struct buffer_head *bh; bh = ext4_read_block_bitmap_nowait(sb, block_group); + if (!bh) + return NULL; if (ext4_wait_block_bitmap(sb, block_group, bh)) { put_bh(bh); return NULL; @@ -482,11 +484,16 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); - root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es)); + + /* + * r_blocks_count should always be multiple of the cluster ratio so + * we are safe to do a plane bit shift only. + */ + root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; if (free_clusters - (nclusters + root_clusters + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { - free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); + free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); } /* Check whether we have space after accounting for current diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index dc149d1..6dda04f 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -185,6 +185,7 @@ static int ext4_readdir(struct file *filp, "at offset %llu", (unsigned long long)filp->f_pos); filp->f_pos += sb->s_blocksize - offset; + brelse(bh); continue; } set_buffer_verified(bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8462eb3..6e16c18 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -194,8 +194,7 @@ struct mpage_da_data { */ #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_ERROR 0x0002 -#define EXT4_IO_END_QUEUED 0x0004 -#define EXT4_IO_END_DIRECT 0x0008 +#define EXT4_IO_END_DIRECT 0x0004 struct ext4_io_page { struct page *p_page; @@ -215,10 +214,8 @@ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ - struct page *page; /* for writepage() path */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ - struct work_struct work; /* data work queue */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ int num_io_pages; /* for writepages() */ @@ -582,6 +579,8 @@ enum { #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* Do not take i_data_sem locking in ext4_map_blocks */ #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 + /* Do not put hole in extent cache */ +#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 /* * Flags used by ext4_free_blocks @@ -810,17 +809,6 @@ do { \ #endif /* defined(__KERNEL__) || defined(__linux__) */ -/* - * storage for cached extent - * If ec_len == 0, then the cache is invalid. - * If ec_start == 0, then the cache represents a gap (null mapping) - */ -struct ext4_ext_cache { - ext4_fsblk_t ec_start; - ext4_lblk_t ec_block; - __u32 ec_len; /* must be 32bit to return holes */ -}; - #include "extents_status.h" /* @@ -887,7 +875,6 @@ struct ext4_inode_info { struct inode vfs_inode; struct jbd2_inode *jinode; - struct ext4_ext_cache i_cached_extent; /* * File creation time. Its function is same as that of * struct timespec i_{a,c,m}time in the generic inode. @@ -901,6 +888,8 @@ struct ext4_inode_info { /* extents status tree */ struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; + struct list_head i_es_lru; + unsigned int i_es_lru_nr; /* protected by i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; @@ -930,6 +919,7 @@ struct ext4_inode_info { spinlock_t i_completed_io_lock; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + struct work_struct i_unwritten_work; /* deferred extent conversion */ spinlock_t i_block_reservation_lock; @@ -985,7 +975,6 @@ struct ext4_inode_info { #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ -#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ @@ -1316,6 +1305,11 @@ struct ext4_sb_info { /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_lru; + spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -2007,9 +2001,20 @@ extern int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ -extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t, - const struct qstr *qstr, __u32 goal, - uid_t *owner); +extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, int handle_type, + unsigned int line_no, int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ + __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ + 0, 0, 0) +#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + (type), __LINE__, (nblocks)) + + extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); @@ -2103,6 +2108,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern void ext4_ind_truncate(struct inode *inode); +extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -2151,6 +2157,8 @@ extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern void ext4_kvfree(void *ptr); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, const char *, ...); @@ -2227,6 +2235,8 @@ extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); static inline int ext4_has_group_desc_csum(struct super_block *sb) { @@ -2454,6 +2464,75 @@ extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); extern void ext4_unwritten_wait(struct inode *inode); +/* inline.c */ +extern int ext4_has_inline_data(struct inode *inode); +extern int ext4_get_inline_size(struct inode *inode); +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern void ext4_write_inline_data(struct inode *inode, + struct ext4_iloc *iloc, + void *buffer, loff_t pos, + unsigned int len); +extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + void *dirent, filldir_t filldir, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern int empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline); +extern int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed); +extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct inode *inode); + /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; @@ -2520,6 +2599,9 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end); extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); @@ -2537,6 +2619,7 @@ extern void ext4_exit_pageio(void); extern void ext4_ioend_wait(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern void ext4_end_io_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 487fda1..8643ff5 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -193,12 +193,6 @@ static inline unsigned short ext_depth(struct inode *inode) return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } -static inline void -ext4_ext_invalidate_cache(struct inode *inode) -{ - EXT4_I(inode)->i_cached_extent.ec_len = 0; -} - static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) { /* We can not have an uninitialized extent of zero length! */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b4323ba..7058975 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -6,6 +6,108 @@ #include <trace/events/ext4.h> +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) +{ + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; +} + + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) +{ + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; +} + +/* + * Wrappers for jbd2_journal_start/end. + */ +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, + int type, int nblocks) +{ + journal_t *journal; + + trace_ext4_journal_start(sb, nblocks, _RET_IP_); + if (sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); + journal = EXT4_SB(sb)->s_journal; + if (!journal) + return ext4_get_nojournal(); + /* + * Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. + */ + if (is_journal_aborted(journal)) { + ext4_abort(sb, "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line); +} + +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + if (!ext4_handle_valid(handle)) { + ext4_put_nojournal(handle); + return 0; + } + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = jbd2_journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext4_std_error(sb, where, line, err); + return err; +} + +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, struct buffer_head *bh, + handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext4_decode_error(NULL, err, nbuf); + + BUG_ON(!ext4_handle_valid(handle)); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", + caller, line, errstr, err_fn); + + jbd2_journal_abort_handle(handle); +} + int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 7177f9b..4c216b1 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -59,12 +59,6 @@ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) -/* Delete operations potentially hit one directory's namespace plus an - * entire inode, plus arbitrary amounts of bitmap/indirection data. Be - * generous. We can grow the delete transaction later if necessary. */ - -#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64) - /* Define an arbitrary limit for the amount of data we will anticipate * writing to any given transaction. For unbounded transactions such as * write(2) and truncate(2) we can write more than this, but we always @@ -110,6 +104,36 @@ #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) +static inline int ext4_jbd2_credits_xattr(struct inode *inode) +{ + int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + + /* + * In case of inline data, we may push out the data to a block, + * so we need to reserve credits for this eventuality + */ + if (ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + return credits; +} + + +/* + * Ext4 handle operation types -- for logging purposes + */ +#define EXT4_HT_MISC 0 +#define EXT4_HT_INODE 1 +#define EXT4_HT_WRITE_PAGE 2 +#define EXT4_HT_MAP_BLOCKS 3 +#define EXT4_HT_DIR 4 +#define EXT4_HT_TRUNCATE 5 +#define EXT4_HT_QUOTA 6 +#define EXT4_HT_RESIZE 7 +#define EXT4_HT_MIGRATE 8 +#define EXT4_HT_MOVE_EXTENTS 9 +#define EXT4_HT_XATTR 10 +#define EXT4_HT_MAX 11 + /** * struct ext4_journal_cb_entry - Base structure for callback information. * @@ -234,7 +258,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, #define ext4_handle_dirty_super(handle, sb) \ __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) -handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, + int type, int nblocks); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -268,9 +293,17 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) return 1; } -static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) +#define ext4_journal_start_sb(sb, type, nblocks) \ + __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks)) + +#define ext4_journal_start(inode, type, nblocks) \ + __ext4_journal_start((inode), __LINE__, (type), (nblocks)) + +static inline handle_t *__ext4_journal_start(struct inode *inode, + unsigned int line, int type, + int nblocks) { - return ext4_journal_start_sb(inode->i_sb, nblocks); + return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks); } #define ext4_journal_stop(handle) \ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7817ca7..28dd8ee 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -112,7 +112,7 @@ static int ext4_split_extent_at(handle_t *handle, int flags); static int ext4_find_delayed_extent(struct inode *inode, - struct ext4_ext_cache *newex); + struct extent_status *newes); static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, @@ -714,7 +714,6 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); ext4_mark_inode_dirty(handle, inode); - ext4_ext_invalidate_cache(inode); return 0; } @@ -725,6 +724,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, struct ext4_extent_header *eh; struct buffer_head *bh; short int depth, i, ppos = 0, alloc = 0; + int ret; eh = ext_inode_hdr(inode); depth = ext_depth(inode); @@ -752,12 +752,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_ext = NULL; bh = sb_getblk(inode->i_sb, path[ppos].p_block); - if (unlikely(!bh)) + if (unlikely(!bh)) { + ret = -ENOMEM; goto err; + } if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, block, path[ppos].p_block); - if (bh_submit_read(bh) < 0) { + ret = bh_submit_read(bh); + if (ret < 0) { put_bh(bh); goto err; } @@ -768,13 +771,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, put_bh(bh); EXT4_ERROR_INODE(inode, "ppos %d > depth %d", ppos, depth); + ret = -EIO; goto err; } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; i--; - if (ext4_ext_check_block(inode, eh, i, bh)) + ret = ext4_ext_check_block(inode, eh, i, bh); + if (ret < 0) goto err; } @@ -796,7 +801,7 @@ err: ext4_ext_drop_refs(path); if (alloc) kfree(path); - return ERR_PTR(-EIO); + return ERR_PTR(ret); } /* @@ -950,8 +955,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, goto cleanup; } bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; goto cleanup; } lock_buffer(bh); @@ -1023,8 +1028,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, oldblock = newblock; newblock = ablocks[--a]; bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; goto cleanup; } lock_buffer(bh); @@ -1136,11 +1141,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, return err; bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; - ext4_std_error(inode->i_sb, err); - return err; - } + if (unlikely(!bh)) + return -ENOMEM; lock_buffer(bh); err = ext4_journal_get_create_access(handle, bh); @@ -1960,7 +1962,6 @@ cleanup: ext4_ext_drop_refs(npath); kfree(npath); } - ext4_ext_invalidate_cache(inode); return err; } @@ -1969,8 +1970,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode, struct fiemap_extent_info *fieinfo) { struct ext4_ext_path *path = NULL; - struct ext4_ext_cache newex; struct ext4_extent *ex; + struct extent_status es; ext4_lblk_t next, next_del, start = 0, end = 0; ext4_lblk_t last = block + num; int exists, depth = 0, err = 0; @@ -2044,37 +2045,47 @@ static int ext4_fill_fiemap_extents(struct inode *inode, BUG_ON(end <= start); if (!exists) { - newex.ec_block = start; - newex.ec_len = end - start; - newex.ec_start = 0; + es.es_lblk = start; + es.es_len = end - start; + es.es_pblk = 0; } else { - newex.ec_block = le32_to_cpu(ex->ee_block); - newex.ec_len = ext4_ext_get_actual_len(ex); - newex.ec_start = ext4_ext_pblock(ex); + es.es_lblk = le32_to_cpu(ex->ee_block); + es.es_len = ext4_ext_get_actual_len(ex); + es.es_pblk = ext4_ext_pblock(ex); if (ext4_ext_is_uninitialized(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; } /* - * Find delayed extent and update newex accordingly. We call - * it even in !exists case to find out whether newex is the + * Find delayed extent and update es accordingly. We call + * it even in !exists case to find out whether es is the * last existing extent or not. */ - next_del = ext4_find_delayed_extent(inode, &newex); + next_del = ext4_find_delayed_extent(inode, &es); if (!exists && next_del) { exists = 1; flags |= FIEMAP_EXTENT_DELALLOC; } up_read(&EXT4_I(inode)->i_data_sem); - if (unlikely(newex.ec_len == 0)) { - EXT4_ERROR_INODE(inode, "newex.ec_len == 0"); + if (unlikely(es.es_len == 0)) { + EXT4_ERROR_INODE(inode, "es.es_len == 0"); err = -EIO; break; } - /* This is possible iff next == next_del == EXT_MAX_BLOCKS */ - if (next == next_del) { + /* + * This is possible iff next == next_del == EXT_MAX_BLOCKS. + * we need to check next == EXT_MAX_BLOCKS because it is + * possible that an extent is with unwritten and delayed + * status due to when an extent is delayed allocated and + * is allocated by fallocate status tree will track both of + * them in a extent. + * + * So we could return a unwritten and delayed extent, and + * its block is equal to 'next'. + */ + if (next == next_del && next == EXT_MAX_BLOCKS) { flags |= FIEMAP_EXTENT_LAST; if (unlikely(next_del != EXT_MAX_BLOCKS || next != EXT_MAX_BLOCKS)) { @@ -2089,9 +2100,9 @@ static int ext4_fill_fiemap_extents(struct inode *inode, if (exists) { err = fiemap_fill_next_extent(fieinfo, - (__u64)newex.ec_block << blksize_bits, - (__u64)newex.ec_start << blksize_bits, - (__u64)newex.ec_len << blksize_bits, + (__u64)es.es_lblk << blksize_bits, + (__u64)es.es_pblk << blksize_bits, + (__u64)es.es_len << blksize_bits, flags); if (err < 0) break; @@ -2101,7 +2112,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, } } - block = newex.ec_block + newex.ec_len; + block = es.es_lblk + es.es_len; } if (path) { @@ -2112,21 +2123,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode, return err; } -static void -ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, - __u32 len, ext4_fsblk_t start) -{ - struct ext4_ext_cache *cex; - BUG_ON(len == 0); - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - trace_ext4_ext_put_in_cache(inode, block, len, start); - cex = &EXT4_I(inode)->i_cached_extent; - cex->ec_block = block; - cex->ec_len = len; - cex->ec_start = start; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); -} - /* * ext4_ext_put_gap_in_cache: * calculate boundaries of the gap that the requested block fits into @@ -2143,9 +2139,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; if (ex == NULL) { - /* there is no extent yet, so gap is [0;-] */ - lblock = 0; - len = EXT_MAX_BLOCKS; + /* + * there is no extent yet, so gap is [0;-] and we + * don't cache it + */ ext_debug("cache gap(whole file):"); } else if (block < le32_to_cpu(ex->ee_block)) { lblock = block; @@ -2154,6 +2151,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block, le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); + if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) + ext4_es_insert_extent(inode, lblock, len, ~0, + EXTENT_STATUS_HOLE); } else if (block >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; @@ -2167,58 +2167,15 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block); BUG_ON(next == lblock); len = next - lblock; + if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) + ext4_es_insert_extent(inode, lblock, len, ~0, + EXTENT_STATUS_HOLE); } else { lblock = len = 0; BUG(); } ext_debug(" -> %u:%lu\n", lblock, len); - ext4_ext_put_in_cache(inode, lblock, len, 0); -} - -/* - * ext4_ext_in_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * cache extent pointer. - * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * - * Return 0 if cache is invalid; 1 if the cache is valid - */ -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_extent *ex) -{ - struct ext4_ext_cache *cex; - int ret = 0; - - /* - * We borrow i_block_reservation_lock to protect i_cached_extent - */ - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - cex = &EXT4_I(inode)->i_cached_extent; - - /* has cache valid data? */ - if (cex->ec_len == 0) - goto errout; - - if (in_range(block, cex->ec_block, cex->ec_len)) { - ex->ee_block = cpu_to_le32(cex->ec_block); - ext4_ext_store_pblock(ex, cex->ec_start); - ex->ee_len = cpu_to_le16(cex->ec_len); - ext_debug("%u cached by %u:%u:%llu\n", - block, - cex->ec_block, cex->ec_len, cex->ec_start); - ret = 1; - } -errout: - trace_ext4_ext_in_cache(inode, block, ret); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return ret; } /* @@ -2653,13 +2610,11 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ - handle = ext4_journal_start(inode, depth + 1); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); if (IS_ERR(handle)) return PTR_ERR(handle); again: - ext4_ext_invalidate_cache(inode); - trace_ext4_ext_remove_space(inode, start, depth); /* @@ -3519,19 +3474,19 @@ out: * * Return 1 if there is a delalloc block in the range, otherwise 0. */ -static int ext4_find_delalloc_range(struct inode *inode, - ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end) +int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end) { struct extent_status es; - es.start = lblk_start; - ext4_es_find_extent(inode, &es); - if (es.len == 0) + ext4_es_find_delayed_extent(inode, lblk_start, &es); + if (es.es_len == 0) return 0; /* there is no delay extent in this tree */ - else if (es.start <= lblk_start && lblk_start < es.start + es.len) + else if (es.es_lblk <= lblk_start && + lblk_start < es.es_lblk + es.es_len) return 1; - else if (lblk_start <= es.start && es.start <= lblk_end) + else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) return 1; else return 0; @@ -3656,6 +3611,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ext4_set_io_unwritten_flag(inode, io); else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + map->m_flags |= EXT4_MAP_UNWRITTEN; if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; goto out; @@ -3677,8 +3633,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * repeat fallocate creation request * we already have an unwritten extent */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) { + map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; + } /* buffered READ or buffered write_begin() lookup */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { @@ -3898,35 +3856,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_lblk, map->m_len, inode->i_ino); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); - /* check in cache */ - if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { - if (!newex.ee_start_lo && !newex.ee_start_hi) { - if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - /* - * block isn't allocated yet and - * user doesn't want to allocate it - */ - goto out2; - } - /* we should allocate requested block */ - } else { - /* block is already allocated */ - if (sbi->s_cluster_ratio > 1) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - newblock = map->m_lblk - - le32_to_cpu(newex.ee_block) - + ext4_ext_pblock(&newex); - /* number of remaining blocks in the extent */ - allocated = ext4_ext_get_actual_len(&newex) - - (map->m_lblk - le32_to_cpu(newex.ee_block)); - goto out; - } - } - /* find extent for this block */ path = ext4_ext_find_extent(inode, map->m_lblk, NULL); if (IS_ERR(path)) { @@ -3973,15 +3902,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - /* - * Do not put uninitialized extent - * in the cache - */ - if (!ext4_ext_is_uninitialized(ex)) { - ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start); + if (!ext4_ext_is_uninitialized(ex)) goto out; - } + allocated = ext4_ext_handle_uninitialized_extents( handle, inode, map, path, flags, allocated, newblock); @@ -4002,7 +3925,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * put just found gap into cache to speed up * subsequent requests */ - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } @@ -4108,6 +4032,7 @@ got_allocated_blocks: /* Mark uninitialized */ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ ext4_ext_mark_uninitialized(&newex); + map->m_flags |= EXT4_MAP_UNWRITTEN; /* * io_end structure was created for every IO write to an * uninitialized extent. To avoid unnecessary conversion, @@ -4241,10 +4166,9 @@ got_allocated_blocks: * Cache the extent and update transaction to commit on fdatasync only * when it is _not_ an uninitialized extent. */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { - ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); - } else + else ext4_update_inode_fsync_trans(handle, inode, 0); out: if (allocated > map->m_len) @@ -4284,7 +4208,7 @@ void ext4_ext_truncate(struct inode *inode) * probably first extent we're gonna free will be last in block */ err = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, err); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err); if (IS_ERR(handle)) return; @@ -4303,7 +4227,6 @@ void ext4_ext_truncate(struct inode *inode) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); - ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); @@ -4397,13 +4320,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; - /* - * currently supporting (pre)allocate mode for extent-based - * files _only_ - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return -EOPNOTSUPP; - /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; @@ -4415,6 +4331,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; + /* + * currently supporting (pre)allocate mode for extent-based + * files _only_ + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; + trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* @@ -4451,7 +4374,8 @@ retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; map.m_len = max_blocks = max_blocks - ret; - handle = ext4_journal_start(inode, credits); + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; @@ -4459,11 +4383,11 @@ retry: ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { #ifdef EXT4FS_DEBUG - WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_map_blocks " - "returned error inode#%lu, block=%u, " - "max_blocks=%u", __func__, - inode->i_ino, map.m_lblk, max_blocks); + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); #endif ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); @@ -4529,21 +4453,19 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); - handle = ext4_journal_start(inode, credits); + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT); - if (ret <= 0) { - WARN_ON(ret <= 0); - ext4_msg(inode->i_sb, KERN_ERR, - "%s:%d: inode #%lu: block %u: len %u: " - "ext4_ext_map_blocks returned %d", - __func__, __LINE__, inode->i_ino, map.m_lblk, - map.m_len, ret); - } + if (ret <= 0) + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); if (ret <= 0 || ret2 ) @@ -4553,42 +4475,48 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, } /* - * If newex is not existing extent (newex->ec_start equals zero) find - * delayed extent at start of newex and update newex accordingly and + * If newes is not existing extent (newes->ec_pblk equals zero) find + * delayed extent at start of newes and update newes accordingly and * return start of the next delayed extent. * - * If newex is existing extent (newex->ec_start is not equal zero) + * If newes is existing extent (newes->ec_pblk is not equal zero) * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed - * extent found. Leave newex unmodified. + * extent found. Leave newes unmodified. */ static int ext4_find_delayed_extent(struct inode *inode, - struct ext4_ext_cache *newex) + struct extent_status *newes) { struct extent_status es; - ext4_lblk_t next_del; + ext4_lblk_t block, next_del; - es.start = newex->ec_block; - next_del = ext4_es_find_extent(inode, &es); + ext4_es_find_delayed_extent(inode, newes->es_lblk, &es); - if (newex->ec_start == 0) { + if (newes->es_pblk == 0) { /* - * No extent in extent-tree contains block @newex->ec_start, + * No extent in extent-tree contains block @newes->es_pblk, * then the block may stay in 1)a hole or 2)delayed-extent. */ - if (es.len == 0) + if (es.es_len == 0) /* A hole found. */ return 0; - if (es.start > newex->ec_block) { + if (es.es_lblk > newes->es_lblk) { /* A hole found. */ - newex->ec_len = min(es.start - newex->ec_block, - newex->ec_len); + newes->es_len = min(es.es_lblk - newes->es_lblk, + newes->es_len); return 0; } - newex->ec_len = es.start + es.len - newex->ec_block; + newes->es_len = es.es_lblk + es.es_len - newes->es_lblk; } + block = newes->es_lblk + newes->es_len; + ext4_es_find_delayed_extent(inode, block, &es); + if (es.es_len == 0) + next_del = EXT_MAX_BLOCKS; + else + next_del = es.es_lblk; + return next_del; } /* fiemap flags we can handle specified here */ @@ -4709,7 +4637,7 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) inode_dio_wait(inode); credits = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, credits); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_dio; @@ -4786,14 +4714,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) goto out; down_write(&EXT4_I(inode)->i_data_sem); - ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); err = ext4_es_remove_extent(inode, first_block, stop_block - first_block); err = ext4_ext_remove_space(inode, first_block, stop_block - 1); - ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); if (IS_SYNC(inode)) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 564d981..f768f4a 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -23,40 +23,53 @@ * (e.g. Reservation space warning), and provide extent-level locking. * Delay extent tree is the first step to achieve this goal. It is * original built by Yongqiang Yang. At that time it is called delay - * extent tree, whose goal is only track delay extent in memory to + * extent tree, whose goal is only track delayed extents in memory to * simplify the implementation of fiemap and bigalloc, and introduce * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called - * delay extent tree at the following comment. But for better - * understand what it does, it has been rename to extent status tree. + * delay extent tree at the first commit. But for better understand + * what it does, it has been rename to extent status tree. * - * Currently the first step has been done. All delay extents are - * tracked in the tree. It maintains the delay extent when a delay - * allocation is issued, and the delay extent is written out or + * Step1: + * Currently the first step has been done. All delayed extents are + * tracked in the tree. It maintains the delayed extent when a delayed + * allocation is issued, and the delayed extent is written out or * invalidated. Therefore the implementation of fiemap and bigalloc * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. * * The following comment describes the implemenmtation of extent * status tree and future works. + * + * Step2: + * In this step all extent status are tracked by extent status tree. + * Thus, we can first try to lookup a block mapping in this tree before + * finding it in extent tree. Hence, single extent cache can be removed + * because extent status tree can do a better job. Extents in status + * tree are loaded on-demand. Therefore, the extent status tree may not + * contain all of the extents in a file. Meanwhile we define a shrinker + * to reclaim memory from extent status tree because fragmented extent + * tree will make status tree cost too much memory. written/unwritten/- + * hole extents in the tree will be reclaimed by this shrinker when we + * are under high memory pressure. Delayed extents will not be + * reclimed because fiemap, bigalloc, and seek_data/hole need it. */ /* - * extents status tree implementation for ext4. + * Extent status tree implementation for ext4. * * * ========================================================================== - * Extents status encompass delayed extents and extent locks + * Extent status tree tracks all extent status. * - * 1. Why delayed extent implementation ? + * 1. Why we need to implement extent status tree? * - * Without delayed extent, ext4 identifies a delayed extent by looking + * Without extent status tree, ext4 identifies a delayed extent by looking * up page cache, this has several deficiencies - complicated, buggy, * and inefficient code. * - * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need - * to know if a block or a range of blocks are belonged to a delayed - * extent. + * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a + * block or a range of blocks are belonged to a delayed extent. * - * Let us have a look at how they do without delayed extents implementation. + * Let us have a look at how they do without extent status tree. * -- FIEMAP * FIEMAP looks up page cache to identify delayed allocations from holes. * @@ -68,47 +81,48 @@ * already under delayed allocation or not to determine whether * quota reserving is needed for the cluster. * - * -- punch hole - * punch hole looks up page cache to identify a delayed extent. - * * -- writeout * Writeout looks up whole page cache to see if a buffer is * mapped, If there are not very many delayed buffers, then it is * time comsuming. * - * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, + * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA, * bigalloc and writeout can figure out if a block or a range of * blocks is under delayed allocation(belonged to a delayed extent) or - * not by searching the delayed extent tree. + * not by searching the extent tree. * * * ========================================================================== - * 2. ext4 delayed extents impelmentation + * 2. Ext4 extent status tree impelmentation + * + * -- extent + * A extent is a range of blocks which are contiguous logically and + * physically. Unlike extent in extent tree, this extent in ext4 is + * a in-memory struct, there is no corresponding on-disk data. There + * is no limit on length of extent, so an extent can contain as many + * blocks as they are contiguous logically and physically. * - * -- delayed extent - * A delayed extent is a range of blocks which are contiguous - * logically and under delayed allocation. Unlike extent in - * ext4, delayed extent in ext4 is a in-memory struct, there is - * no corresponding on-disk data. There is no limit on length of - * delayed extent, so a delayed extent can contain as many blocks - * as they are contiguous logically. + * -- extent status tree + * Every inode has an extent status tree and all allocation blocks + * are added to the tree with different status. The extent in the + * tree are ordered by logical block no. * - * -- delayed extent tree - * Every inode has a delayed extent tree and all under delayed - * allocation blocks are added to the tree as delayed extents. - * Delayed extents in the tree are ordered by logical block no. + * -- operations on a extent status tree + * There are three important operations on a delayed extent tree: find + * next extent, adding a extent(a range of blocks) and removing a extent. * - * -- operations on a delayed extent tree - * There are three operations on a delayed extent tree: find next - * delayed extent, adding a space(a range of blocks) and removing - * a space. + * -- race on a extent status tree + * Extent status tree is protected by inode->i_es_lock. * - * -- race on a delayed extent tree - * Delayed extent tree is protected inode->i_es_lock. + * -- memory consumption + * Fragmented extent tree will make extent status tree cost too much + * memory. Hence, we will reclaim written/unwritten/hole extents from + * the tree under a heavy memory pressure. * * * ========================================================================== - * 3. performance analysis + * 3. Performance analysis + * * -- overhead * 1. There is a cache extent for write access, so if writes are * not very random, adding space operaions are in O(1) time. @@ -120,15 +134,21 @@ * * ========================================================================== * 4. TODO list - * -- Track all extent status * - * -- Improve get block process + * -- Refactor delayed space reservation * * -- Extent-level locking */ static struct kmem_cache *ext4_es_cachep; +static int __es_insert_extent(struct inode *inode, struct extent_status *newes); +static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end); +static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan); +static int ext4_es_reclaim_extents_count(struct super_block *sb); + int __init ext4_init_es(void) { ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); @@ -161,7 +181,9 @@ static void ext4_es_print_tree(struct inode *inode) while (node) { struct extent_status *es; es = rb_entry(node, struct extent_status, rb_node); - printk(KERN_DEBUG " [%u/%u)", es->start, es->len); + printk(KERN_DEBUG " [%u/%u) %llu %llx", + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); node = rb_next(node); } printk(KERN_DEBUG "\n"); @@ -170,10 +192,10 @@ static void ext4_es_print_tree(struct inode *inode) #define ext4_es_print_tree(inode) #endif -static inline ext4_lblk_t extent_status_end(struct extent_status *es) +static inline ext4_lblk_t ext4_es_end(struct extent_status *es) { - BUG_ON(es->start + es->len < es->start); - return es->start + es->len - 1; + BUG_ON(es->es_lblk + es->es_len < es->es_lblk); + return es->es_lblk + es->es_len - 1; } /* @@ -181,25 +203,25 @@ static inline ext4_lblk_t extent_status_end(struct extent_status *es) * it can't be found, try to find next extent. */ static struct extent_status *__es_tree_search(struct rb_root *root, - ext4_lblk_t offset) + ext4_lblk_t lblk) { struct rb_node *node = root->rb_node; struct extent_status *es = NULL; while (node) { es = rb_entry(node, struct extent_status, rb_node); - if (offset < es->start) + if (lblk < es->es_lblk) node = node->rb_left; - else if (offset > extent_status_end(es)) + else if (lblk > ext4_es_end(es)) node = node->rb_right; else return es; } - if (es && offset < es->start) + if (es && lblk < es->es_lblk) return es; - if (es && offset > extent_status_end(es)) { + if (es && lblk > ext4_es_end(es)) { node = rb_next(&es->rb_node); return node ? rb_entry(node, struct extent_status, rb_node) : NULL; @@ -209,79 +231,121 @@ static struct extent_status *__es_tree_search(struct rb_root *root, } /* - * ext4_es_find_extent: find the 1st delayed extent covering @es->start - * if it exists, otherwise, the next extent after @es->start. + * ext4_es_find_delayed_extent: find the 1st delayed extent covering @es->lblk + * if it exists, otherwise, the next extent after @es->lblk. * * @inode: the inode which owns delayed extents + * @lblk: the offset where we start to search * @es: delayed extent that we found - * - * Returns the first block of the next extent after es, otherwise - * EXT_MAX_BLOCKS if no delay extent is found. - * Delayed extent is returned via @es. */ -ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) +void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es) { struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; struct rb_node *node; - ext4_lblk_t ret = EXT_MAX_BLOCKS; - trace_ext4_es_find_extent_enter(inode, es->start); + BUG_ON(es == NULL); + trace_ext4_es_find_delayed_extent_enter(inode, lblk); read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; - /* find delay extent in cache firstly */ + /* find extent in cache firstly */ + es->es_lblk = es->es_len = es->es_pblk = 0; if (tree->cache_es) { es1 = tree->cache_es; - if (in_range(es->start, es1->start, es1->len)) { - es_debug("%u cached by [%u/%u)\n", - es->start, es1->start, es1->len); + if (in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u) %llu %llx\n", + lblk, es1->es_lblk, es1->es_len, + ext4_es_pblock(es1), ext4_es_status(es1)); goto out; } } - es->len = 0; - es1 = __es_tree_search(&tree->root, es->start); + es1 = __es_tree_search(&tree->root, lblk); out: - if (es1) { - tree->cache_es = es1; - es->start = es1->start; - es->len = es1->len; - node = rb_next(&es1->rb_node); - if (node) { + if (es1 && !ext4_es_is_delayed(es1)) { + while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); - ret = es1->start; + if (ext4_es_is_delayed(es1)) + break; } } + if (es1 && ext4_es_is_delayed(es1)) { + tree->cache_es = es1; + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + } + read_unlock(&EXT4_I(inode)->i_es_lock); - trace_ext4_es_find_extent_exit(inode, es, ret); - return ret; + ext4_es_lru_add(inode); + trace_ext4_es_find_delayed_extent_exit(inode, es); } static struct extent_status * -ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len) +ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + ext4_fsblk_t pblk) { struct extent_status *es; es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); if (es == NULL) return NULL; - es->start = start; - es->len = len; + es->es_lblk = lblk; + es->es_len = len; + es->es_pblk = pblk; + + /* + * We don't count delayed extent because we never try to reclaim them + */ + if (!ext4_es_is_delayed(es)) + EXT4_I(inode)->i_es_lru_nr++; + return es; } -static void ext4_es_free_extent(struct extent_status *es) +static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) { + /* Decrease the lru counter when this es is not delayed */ + if (!ext4_es_is_delayed(es)) { + BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); + EXT4_I(inode)->i_es_lru_nr--; + } + kmem_cache_free(ext4_es_cachep, es); } +/* + * Check whether or not two extents can be merged + * Condition: + * - logical block number is contiguous + * - physical block number is contiguous + * - status is equal + */ +static int ext4_es_can_be_merged(struct extent_status *es1, + struct extent_status *es2) +{ + if (es1->es_lblk + es1->es_len != es2->es_lblk) + return 0; + + if (ext4_es_status(es1) != ext4_es_status(es2)) + return 0; + + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2))) + return 0; + + return 1; +} + static struct extent_status * -ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) +ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) { + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct extent_status *es1; struct rb_node *node; @@ -290,10 +354,10 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) return es; es1 = rb_entry(node, struct extent_status, rb_node); - if (es->start == extent_status_end(es1) + 1) { - es1->len += es->len; + if (ext4_es_can_be_merged(es1, es)) { + es1->es_len += es->es_len; rb_erase(&es->rb_node, &tree->root); - ext4_es_free_extent(es); + ext4_es_free_extent(inode, es); es = es1; } @@ -301,8 +365,9 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) } static struct extent_status * -ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) +ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) { + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct extent_status *es1; struct rb_node *node; @@ -311,69 +376,57 @@ ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) return es; es1 = rb_entry(node, struct extent_status, rb_node); - if (es1->start == extent_status_end(es) + 1) { - es->len += es1->len; + if (ext4_es_can_be_merged(es, es1)) { + es->es_len += es1->es_len; rb_erase(node, &tree->root); - ext4_es_free_extent(es1); + ext4_es_free_extent(inode, es1); } return es; } -static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset, - ext4_lblk_t len) +static int __es_insert_extent(struct inode *inode, struct extent_status *newes) { + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node **p = &tree->root.rb_node; struct rb_node *parent = NULL; struct extent_status *es; - ext4_lblk_t end = offset + len - 1; - - BUG_ON(end < offset); - es = tree->cache_es; - if (es && offset == (extent_status_end(es) + 1)) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - es->len += len; - es = ext4_es_try_to_merge_right(tree, es); - goto out; - } else if (es && es->start == end + 1) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - es->start = offset; - es->len += len; - es = ext4_es_try_to_merge_left(tree, es); - goto out; - } else if (es && es->start <= offset && - end <= extent_status_end(es)) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - goto out; - } while (*p) { parent = *p; es = rb_entry(parent, struct extent_status, rb_node); - if (offset < es->start) { - if (es->start == end + 1) { - es->start = offset; - es->len += len; - es = ext4_es_try_to_merge_left(tree, es); + if (newes->es_lblk < es->es_lblk) { + if (ext4_es_can_be_merged(newes, es)) { + /* + * Here we can modify es_lblk directly + * because it isn't overlapped. + */ + es->es_lblk = newes->es_lblk; + es->es_len += newes->es_len; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) + ext4_es_store_pblock(es, + newes->es_pblk); + es = ext4_es_try_to_merge_left(inode, es); goto out; } p = &(*p)->rb_left; - } else if (offset > extent_status_end(es)) { - if (offset == extent_status_end(es) + 1) { - es->len += len; - es = ext4_es_try_to_merge_right(tree, es); + } else if (newes->es_lblk > ext4_es_end(es)) { + if (ext4_es_can_be_merged(es, newes)) { + es->es_len += newes->es_len; + es = ext4_es_try_to_merge_right(inode, es); goto out; } p = &(*p)->rb_right; } else { - if (extent_status_end(es) <= end) - es->len = offset - es->start + len; - goto out; + BUG_ON(1); + return -EINVAL; } } - es = ext4_es_alloc_extent(offset, len); + es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len, + newes->es_pblk); if (!es) return -ENOMEM; rb_link_node(&es->rb_node, parent, p); @@ -385,85 +438,166 @@ out: } /* - * ext4_es_insert_extent() adds a space to a delayed extent tree. - * Caller holds inode->i_es_lock. + * ext4_es_insert_extent() adds a space to a extent status tree. * * ext4_es_insert_extent is called by ext4_da_write_begin and * ext4_es_remove_extent. * * Return 0 on success, error code on failure. */ -int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, - ext4_lblk_t len) +int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned long long status) { - struct ext4_es_tree *tree; + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; int err = 0; - trace_ext4_es_insert_extent(inode, offset, len); - es_debug("add [%u/%u) to extent status tree of inode %lu\n", - offset, len, inode->i_ino); + es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n", + lblk, len, pblk, status, inode->i_ino); + + if (!len) + return 0; + + BUG_ON(end < lblk); + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock(&newes, pblk); + ext4_es_store_status(&newes, status); + trace_ext4_es_insert_extent(inode, &newes); write_lock(&EXT4_I(inode)->i_es_lock); - tree = &EXT4_I(inode)->i_es_tree; - err = __es_insert_extent(tree, offset, len); + err = __es_remove_extent(inode, lblk, end); + if (err != 0) + goto error; + err = __es_insert_extent(inode, &newes); + +error: write_unlock(&EXT4_I(inode)->i_es_lock); + ext4_es_lru_add(inode); ext4_es_print_tree(inode); return err; } /* - * ext4_es_remove_extent() removes a space from a delayed extent tree. - * Caller holds inode->i_es_lock. + * ext4_es_lookup_extent() looks up an extent in extent status tree. * - * Return 0 on success, error code on failure. + * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. + * + * Return: 1 on found, 0 on not */ -int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, - ext4_lblk_t len) +int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es) { - struct rb_node *node; struct ext4_es_tree *tree; + struct extent_status *es1 = NULL; + struct rb_node *node; + int found = 0; + + trace_ext4_es_lookup_extent_enter(inode, lblk); + es_debug("lookup extent in block %u\n", lblk); + + tree = &EXT4_I(inode)->i_es_tree; + read_lock(&EXT4_I(inode)->i_es_lock); + + /* find extent in cache firstly */ + es->es_lblk = es->es_len = es->es_pblk = 0; + if (tree->cache_es) { + es1 = tree->cache_es; + if (in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u)\n", + lblk, es1->es_lblk, es1->es_len); + found = 1; + goto out; + } + } + + node = tree->root.rb_node; + while (node) { + es1 = rb_entry(node, struct extent_status, rb_node); + if (lblk < es1->es_lblk) + node = node->rb_left; + else if (lblk > ext4_es_end(es1)) + node = node->rb_right; + else { + found = 1; + break; + } + } + +out: + if (found) { + BUG_ON(!es1); + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_lru_add(inode); + trace_ext4_es_lookup_extent_exit(inode, es, found); + return found; +} + +static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct rb_node *node; struct extent_status *es; struct extent_status orig_es; - ext4_lblk_t len1, len2, end; + ext4_lblk_t len1, len2; + ext4_fsblk_t block; int err = 0; - trace_ext4_es_remove_extent(inode, offset, len); - es_debug("remove [%u/%u) from extent status tree of inode %lu\n", - offset, len, inode->i_ino); - - end = offset + len - 1; - BUG_ON(end < offset); - write_lock(&EXT4_I(inode)->i_es_lock); - tree = &EXT4_I(inode)->i_es_tree; - es = __es_tree_search(&tree->root, offset); + es = __es_tree_search(&tree->root, lblk); if (!es) goto out; - if (es->start > end) + if (es->es_lblk > end) goto out; /* Simply invalidate cache_es. */ tree->cache_es = NULL; - orig_es.start = es->start; - orig_es.len = es->len; - len1 = offset > es->start ? offset - es->start : 0; - len2 = extent_status_end(es) > end ? - extent_status_end(es) - end : 0; + orig_es.es_lblk = es->es_lblk; + orig_es.es_len = es->es_len; + orig_es.es_pblk = es->es_pblk; + + len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0; + len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0; if (len1 > 0) - es->len = len1; + es->es_len = len1; if (len2 > 0) { if (len1 > 0) { - err = __es_insert_extent(tree, end + 1, len2); + struct extent_status newes; + + newes.es_lblk = end + 1; + newes.es_len = len2; + if (ext4_es_is_written(&orig_es) || + ext4_es_is_unwritten(&orig_es)) { + block = ext4_es_pblock(&orig_es) + + orig_es.es_len - len2; + ext4_es_store_pblock(&newes, block); + } + ext4_es_store_status(&newes, ext4_es_status(&orig_es)); + err = __es_insert_extent(inode, &newes); if (err) { - es->start = orig_es.start; - es->len = orig_es.len; + es->es_lblk = orig_es.es_lblk; + es->es_len = orig_es.es_len; goto out; } } else { - es->start = end + 1; - es->len = len2; + es->es_lblk = end + 1; + es->es_len = len2; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) { + block = orig_es.es_pblk + orig_es.es_len - len2; + ext4_es_store_pblock(es, block); + } } goto out; } @@ -476,10 +610,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, es = NULL; } - while (es && extent_status_end(es) <= end) { + while (es && ext4_es_end(es) <= end) { node = rb_next(&es->rb_node); rb_erase(&es->rb_node, &tree->root); - ext4_es_free_extent(es); + ext4_es_free_extent(inode, es); if (!node) { es = NULL; break; @@ -487,14 +621,183 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, es = rb_entry(node, struct extent_status, rb_node); } - if (es && es->start < end + 1) { - len1 = extent_status_end(es) - end; - es->start = end + 1; - es->len = len1; + if (es && es->es_lblk < end + 1) { + ext4_lblk_t orig_len = es->es_len; + + len1 = ext4_es_end(es) - end; + es->es_lblk = end + 1; + es->es_len = len1; + if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { + block = es->es_pblk + orig_len - len1; + ext4_es_store_pblock(es, block); + } } out: + return err; +} + +/* + * ext4_es_remove_extent() removes a space from a extent status tree. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + ext4_lblk_t end; + int err = 0; + + trace_ext4_es_remove_extent(inode, lblk, len); + es_debug("remove [%u/%u) from extent status tree of inode %lu\n", + lblk, len, inode->i_ino); + + if (!len) + return err; + + end = lblk + len - 1; + BUG_ON(end < lblk); + + write_lock(&EXT4_I(inode)->i_es_lock); + err = __es_remove_extent(inode, lblk, end); write_unlock(&EXT4_I(inode)->i_es_lock); ext4_es_print_tree(inode); return err; } + +static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) +{ + struct ext4_sb_info *sbi = container_of(shrink, + struct ext4_sb_info, s_es_shrinker); + struct ext4_inode_info *ei; + struct list_head *cur, *tmp, scanned; + int nr_to_scan = sc->nr_to_scan; + int ret, nr_shrunk = 0; + + trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan); + + if (!nr_to_scan) + return ext4_es_reclaim_extents_count(sbi->s_sb); + + INIT_LIST_HEAD(&scanned); + + spin_lock(&sbi->s_es_lru_lock); + list_for_each_safe(cur, tmp, &sbi->s_es_lru) { + list_move_tail(cur, &scanned); + + ei = list_entry(cur, struct ext4_inode_info, i_es_lru); + + read_lock(&ei->i_es_lock); + if (ei->i_es_lru_nr == 0) { + read_unlock(&ei->i_es_lock); + continue; + } + read_unlock(&ei->i_es_lock); + + write_lock(&ei->i_es_lock); + ret = __es_try_to_reclaim_extents(ei, nr_to_scan); + write_unlock(&ei->i_es_lock); + + nr_shrunk += ret; + nr_to_scan -= ret; + if (nr_to_scan == 0) + break; + } + list_splice_tail(&scanned, &sbi->s_es_lru); + spin_unlock(&sbi->s_es_lru_lock); + trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk); + + return ext4_es_reclaim_extents_count(sbi->s_sb); +} + +void ext4_es_register_shrinker(struct super_block *sb) +{ + struct ext4_sb_info *sbi; + + sbi = EXT4_SB(sb); + INIT_LIST_HEAD(&sbi->s_es_lru); + spin_lock_init(&sbi->s_es_lru_lock); + sbi->s_es_shrinker.shrink = ext4_es_shrink; + sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&sbi->s_es_shrinker); +} + +void ext4_es_unregister_shrinker(struct super_block *sb) +{ + unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker); +} + +void ext4_es_lru_add(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lru_lock); + if (list_empty(&ei->i_es_lru)) + list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); + else + list_move_tail(&ei->i_es_lru, &sbi->s_es_lru); + spin_unlock(&sbi->s_es_lru_lock); +} + +void ext4_es_lru_del(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lru_lock); + if (!list_empty(&ei->i_es_lru)) + list_del_init(&ei->i_es_lru); + spin_unlock(&sbi->s_es_lru_lock); +} + +static int ext4_es_reclaim_extents_count(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_inode_info *ei; + struct list_head *cur; + int nr_cached = 0; + + spin_lock(&sbi->s_es_lru_lock); + list_for_each(cur, &sbi->s_es_lru) { + ei = list_entry(cur, struct ext4_inode_info, i_es_lru); + read_lock(&ei->i_es_lock); + nr_cached += ei->i_es_lru_nr; + read_unlock(&ei->i_es_lock); + } + spin_unlock(&sbi->s_es_lru_lock); + trace_ext4_es_reclaim_extents_count(sb, nr_cached); + return nr_cached; +} + +static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan) +{ + struct inode *inode = &ei->vfs_inode; + struct ext4_es_tree *tree = &ei->i_es_tree; + struct rb_node *node; + struct extent_status *es; + int nr_shrunk = 0; + + if (ei->i_es_lru_nr == 0) + return 0; + + node = rb_first(&tree->root); + while (node != NULL) { + es = rb_entry(node, struct extent_status, rb_node); + node = rb_next(&es->rb_node); + /* + * We can't reclaim delayed extent from status tree because + * fiemap, bigallic, and seek_data/hole need to use it. + */ + if (!ext4_es_is_delayed(es)) { + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + nr_shrunk++; + if (--nr_to_scan == 0) + break; + } + } + tree->cache_es = NULL; + return nr_shrunk; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 077f82d..cf83e77 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -20,10 +20,21 @@ #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +#define EXTENT_STATUS_WRITTEN 0x80000000 /* written extent */ +#define EXTENT_STATUS_UNWRITTEN 0x40000000 /* unwritten extent */ +#define EXTENT_STATUS_DELAYED 0x20000000 /* delayed extent */ +#define EXTENT_STATUS_HOLE 0x10000000 /* hole */ + +#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) + struct extent_status { struct rb_node rb_node; - ext4_lblk_t start; /* first block extent covers */ - ext4_lblk_t len; /* length of extent in block */ + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ }; struct ext4_es_tree { @@ -35,11 +46,69 @@ extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); -extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start, +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned long long status); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t len); -extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, - struct extent_status *es); +extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es); + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_WRITTEN); +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_UNWRITTEN); +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_DELAYED); +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_HOLE); +} + +static inline ext4_fsblk_t ext4_es_status(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_FLAGS); +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return (es->es_pblk & ~EXTENT_STATUS_FLAGS); +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~EXTENT_STATUS_FLAGS) | + (es->es_pblk & EXTENT_STATUS_FLAGS); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned long long status) +{ + ext4_fsblk_t block; + + block = (status & EXTENT_STATUS_FLAGS) | + (es->es_pblk & ~EXTENT_STATUS_FLAGS); + es->es_pblk = block; +} + +extern void ext4_es_register_shrinker(struct super_block *sb); +extern void ext4_es_unregister_shrinker(struct super_block *sb); +extern void ext4_es_lru_add(struct inode *inode); +extern void ext4_es_lru_del(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index c00ea79..64848b5 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -240,7 +240,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) handle_t *handle; int err; - handle = ext4_journal_start_sb(sb, 1); + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); err = ext4_journal_get_write_access(handle, sbi->s_sbh); @@ -464,10 +464,8 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * it will be as a data. */ - es.start = last; - (void)ext4_es_find_extent(inode, &es); - if (last >= es.start && - last < es.start + es.len) { + ext4_es_find_delayed_extent(inode, last, &es); + if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { if (last != start) dataoff = last << blkbits; break; @@ -549,11 +547,9 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * we will skip this extent. */ - es.start = last; - (void)ext4_es_find_extent(inode, &es); - if (last >= es.start && - last < es.start + es.len) { - last = es.start + es.len; + ext4_es_find_delayed_extent(inode, last, &es); + if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { + last = es.es_lblk + es.es_len; holeoff = last << blkbits; continue; } diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index fa8e491..3d586f0 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -155,11 +155,11 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) /* Check to see if the seed is all zero's */ if (hinfo->seed) { for (i = 0; i < 4; i++) { - if (hinfo->seed[i]) + if (hinfo->seed[i]) { + memcpy(buf, hinfo->seed, sizeof(buf)); break; + } } - if (i < 4) - memcpy(buf, hinfo->seed, sizeof(buf)); } switch (hinfo->hash_version) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3f32c80..32fd2b9 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -634,8 +634,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, - const struct qstr *qstr, __u32 goal, uid_t *owner) +struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, + umode_t mode, const struct qstr *qstr, + __u32 goal, uid_t *owner, int handle_type, + unsigned int line_no, int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -725,6 +727,15 @@ repeat_in_this_group: "inode=%lu", ino + 1); continue; } + if (!handle) { + BUG_ON(nblocks <= 0); + handle = __ext4_journal_start_sb(dir->i_sb, line_no, + handle_type, nblocks); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto fail; + } + } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode_bitmap_bh); if (err) @@ -1017,17 +1028,17 @@ iget_failed: inode = NULL; bad_orphan: ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); - printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", + printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_NOTICE "inode=%p\n", inode); + printk(KERN_WARNING "inode=%p\n", inode); if (inode) { - printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + printk(KERN_WARNING "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); - printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); - printk(KERN_NOTICE "max_ino=%lu\n", max_ino); - printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); + printk(KERN_WARNING "max_ino=%lu\n", max_ino); + printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; @@ -1137,7 +1148,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) goto out; - handle = ext4_journal_start_sb(sb, 1); + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 20862f9..c541ab8 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -146,6 +146,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; + int ret = -EIO; *err = 0; /* i_data is not going away, no lock needed */ @@ -154,8 +155,10 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, goto no_block; while (--depth) { bh = sb_getblk(sb, le32_to_cpu(p->key)); - if (unlikely(!bh)) + if (unlikely(!bh)) { + ret = -ENOMEM; goto failure; + } if (!bh_uptodate_or_lock(bh)) { if (bh_submit_read(bh) < 0) { @@ -177,7 +180,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, return NULL; failure: - *err = -EIO; + *err = ret; no_block: return p; } @@ -355,9 +358,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, * for the first direct block */ new_blocks[index] = current_block; - printk(KERN_INFO "%s returned more blocks than " + WARN(1, KERN_INFO "%s returned more blocks than " "requested\n", __func__); - WARN_ON(1); break; } } @@ -471,7 +473,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, */ bh = sb_getblk(inode->i_sb, new_blocks[n-1]); if (unlikely(!bh)) { - err = -EIO; + err = -ENOMEM; goto failed; } @@ -789,7 +791,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, if (final_size > inode->i_size) { /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -849,7 +851,7 @@ locked: int err; /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { /* This is really bad luck. We've written the data * but cannot extend i_size. Bail out and pretend @@ -948,7 +950,8 @@ static handle_t *start_transaction(struct inode *inode) { handle_t *result; - result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); + result = ext4_journal_start(inode, EXT4_HT_TRUNCATE, + ext4_blocks_for_truncate(inode)); if (!IS_ERR(result)) return result; @@ -1515,3 +1518,243 @@ out_stop: trace_ext4_truncate_exit(inode); } +static int free_hole_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, __le32 *i_data, + int level, ext4_lblk_t first, + ext4_lblk_t count, int max) +{ + struct buffer_head *bh = NULL; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ret = 0; + int i, inc; + ext4_lblk_t offset; + __le32 blk; + + inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level); + for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) { + if (offset >= count + first) + break; + if (*i_data == 0 || (offset + inc) <= first) + continue; + blk = *i_data; + if (level > 0) { + ext4_lblk_t first2; + bh = sb_bread(inode->i_sb, blk); + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, blk, + "Read failure"); + return -EIO; + } + first2 = (first > offset) ? first - offset : 0; + ret = free_hole_blocks(handle, inode, bh, + (__le32 *)bh->b_data, level - 1, + first2, count - offset, + inode->i_sb->s_blocksize >> 2); + if (ret) { + brelse(bh); + goto err; + } + } + if (level == 0 || + (bh && all_zeroes((__le32 *)bh->b_data, + (__le32 *)bh->b_data + addr_per_block))) { + ext4_free_data(handle, inode, parent_bh, &blk, &blk+1); + *i_data = 0; + } + brelse(bh); + bh = NULL; + } + +err: + return ret; +} + +static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop) +{ + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int level, ret = 0; + int num = EXT4_NDIR_BLOCKS; + ext4_lblk_t count, max = EXT4_NDIR_BLOCKS; + __le32 *i_data = EXT4_I(inode)->i_data; + + count = stop - first; + for (level = 0; level < 4; level++, max *= addr_per_block) { + if (first < max) { + ret = free_hole_blocks(handle, inode, NULL, i_data, + level, first, count, num); + if (ret) + goto err; + if (count > max - first) + count -= max - first; + else + break; + first = 0; + } else { + first -= max; + } + i_data += num; + if (level == 0) { + num = 1; + max = 1; + } + } + +err: + return ret; +} + +int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + ext4_lblk_t first_block, stop_block; + struct address_space *mapping = inode->i_mapping; + handle_t *handle = NULL; + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; + int err = 0; + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + err = filemap_write_and_wait_range(mapping, + offset, offset + length - 1); + if (err) + return err; + } + + mutex_lock(&inode->i_mutex); + /* It's not possible punch hole on append only file */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + err = -EPERM; + goto out_mutex; + } + if (IS_SWAPFILE(inode)) { + err = -ETXTBSY; + goto out_mutex; + } + + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + goto out_mutex; + + /* + * If the hole extents beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + + first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + last_page = (offset + length) >> PAGE_CACHE_SHIFT; + + first_page_offset = first_page << PAGE_CACHE_SHIFT; + last_page_offset = last_page << PAGE_CACHE_SHIFT; + + /* Now release the pages */ + if (last_page_offset > first_page_offset) { + truncate_pagecache_range(inode, first_page_offset, + last_page_offset - 1); + } + + /* Wait all existing dio works, newcomers will block on i_mutex */ + inode_dio_wait(inode); + + handle = start_transaction(inode); + if (IS_ERR(handle)) + goto out_mutex; + + /* + * Now we need to zero out the non-page-aligned data in the + * pages at the start and tail of the hole, and unmap the buffer + * heads for the block aligned regions of the page that were + * completely zerod. + */ + if (first_page > last_page) { + /* + * If the file space being truncated is contained within a page + * just zero out and unmap the middle of that page + */ + err = ext4_discard_partial_page_buffers(handle, + mapping, offset, length, 0); + if (err) + goto out; + } else { + /* + * Zero out and unmap the paritial page that contains + * the start of the hole + */ + page_len = first_page_offset - offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + offset, page_len, 0); + if (err) + goto out; + } + + /* + * Zero out and unmap the partial page that contains + * the end of the hole + */ + page_len = offset + length - last_page_offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + last_page_offset, page_len, 0); + if (err) + goto out; + } + } + + /* + * If i_size contained in the last page, we need to + * unmap and zero the paritial page after i_size + */ + if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && + inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + if (err) + goto out; + } + } + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + if (first_block >= stop_block) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + err = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + err = ext4_free_hole_blocks(handle, inode, first_block, stop_block); + + ext4_discard_preallocations(inode); + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + up_write(&EXT4_I(inode)->i_data_sem); + +out: + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + +out_mutex: + mutex_unlock(&inode->i_mutex); + + return err; +} diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 8106dca..c0fd1a1 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -545,7 +545,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, return ret; retry: - handle = ext4_journal_start(inode, needed_blocks); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; @@ -657,7 +657,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, * The possible write could happen in the inode, * so try to reserve the space in inode first. */ - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; @@ -853,7 +853,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, if (ret) return ret; - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; @@ -1188,7 +1188,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, data_bh = sb_getblk(inode->i_sb, map.m_pblk); if (!data_bh) { - error = -EIO; + error = -ENOMEM; goto out_restore; } @@ -1770,7 +1770,7 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) needed_blocks = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, needed_blocks); + handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) return; @@ -1862,7 +1862,7 @@ int ext4_convert_inline_data(struct inode *inode) if (error) return error; - handle = ext4_journal_start(inode, needed_blocks); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_free; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 521bd4a..9c4f4b1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -132,10 +132,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, } static void ext4_invalidatepage(struct page *page, unsigned long offset); -static int noalloc_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, @@ -238,7 +234,8 @@ void ext4_evict_inode(struct inode *inode) * protection against it */ sb_start_intwrite(inode->i_sb); - handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, + ext4_blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -346,7 +343,7 @@ void ext4_da_update_reserve_space(struct inode *inode, spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { - ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " + ext4_warning(inode->i_sb, "%s: ino %lu, used %d " "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); @@ -355,10 +352,12 @@ void ext4_da_update_reserve_space(struct inode *inode, } if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) { - ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d " - "with only %d reserved metadata blocks\n", __func__, - inode->i_ino, ei->i_allocated_meta_blocks, - ei->i_reserved_meta_blocks); + ext4_warning(inode->i_sb, "ino %lu, allocated %d " + "with only %d reserved metadata blocks " + "(releasing %d blocks with reserved %d data blocks)", + inode->i_ino, ei->i_allocated_meta_blocks, + ei->i_reserved_meta_blocks, used, + ei->i_reserved_data_blocks); WARN_ON(1); ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks; } @@ -508,12 +507,33 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { + struct extent_status es; int retval; map->m_flags = 0; ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { + map->m_pblk = ext4_es_pblock(&es) + + map->m_lblk - es.es_lblk; + map->m_flags |= ext4_es_is_written(&es) ? + EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; + retval = es.es_len - (map->m_lblk - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { + retval = 0; + } else { + BUG_ON(1); + } + goto found; + } + /* * Try to see if we can get the block without requesting a new * file system block. @@ -527,20 +547,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, retval = ext4_ind_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); } + if (retval > 0) { + int ret; + unsigned long long status; + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + ext4_find_delalloc_range(inode, map->m_lblk, + map->m_lblk + map->m_len - 1)) + status |= EXTENT_STATUS_DELAYED; + ret = ext4_es_insert_extent(inode, map->m_lblk, + map->m_len, map->m_pblk, status); + if (ret < 0) + retval = ret; + } if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) up_read((&EXT4_I(inode)->i_data_sem)); +found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret; - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - /* delayed alloc may be allocated by fallocate and - * coverted to initialized by directIO. - * we need to handle delayed extent here. - */ - down_write((&EXT4_I(inode)->i_data_sem)); - goto delayed_mapped; - } - ret = check_block_validity(inode, map); + int ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -560,16 +587,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, return retval; /* - * When we call get_blocks without the create flag, the - * BH_Unwritten flag could have gotten set if the blocks - * requested were part of a uninitialized extent. We need to - * clear this flag now that we are committed to convert all or - * part of the uninitialized extent to be an initialized - * extent. This is because we need to avoid the combination - * of BH_Unwritten and BH_Mapped flags being simultaneously - * set on the buffer_head. + * Here we clear m_flags because after allocating an new extent, + * it will be set again. */ - map->m_flags &= ~EXT4_MAP_UNWRITTEN; + map->m_flags &= ~EXT4_MAP_FLAGS; /* * New blocks allocate and/or writing to uninitialized extent @@ -615,18 +636,23 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) ext4_da_update_reserve_space(inode, retval, 1); } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret; -delayed_mapped: - /* delayed allocation blocks has been allocated */ - ret = ext4_es_remove_extent(inode, map->m_lblk, - map->m_len); - if (ret < 0) - retval = ret; - } + if (retval > 0) { + int ret; + unsigned long long status; + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + ext4_find_delalloc_range(inode, map->m_lblk, + map->m_lblk + map->m_len - 1)) + status |= EXTENT_STATUS_DELAYED; + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + if (ret < 0) + retval = ret; } up_write((&EXT4_I(inode)->i_data_sem)); @@ -660,7 +686,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, if (map.m_len > DIO_MAX_BLOCKS) map.m_len = DIO_MAX_BLOCKS; dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); - handle = ext4_journal_start(inode, dio_credits); + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + dio_credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); return ret; @@ -707,14 +734,16 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, /* ensure we send some value back into *errp */ *errp = 0; + if (create && err == 0) + err = -ENOSPC; /* should never happen */ if (err < 0) *errp = err; if (err <= 0) return NULL; bh = sb_getblk(inode->i_sb, map.m_pblk); - if (!bh) { - *errp = -EIO; + if (unlikely(!bh)) { + *errp = -ENOMEM; return NULL; } if (map.m_flags & EXT4_MAP_NEW) { @@ -808,11 +837,10 @@ int ext4_walk_page_buffers(handle_t *handle, * and the commit_write(). So doing the jbd2_journal_start at the start of * prepare_write() is the right place. * - * Also, this function can nest inside ext4_writepage() -> - * block_write_full_page(). In that case, we *know* that ext4_writepage() - * has generated enough buffer credits to do the whole page. So we won't - * block on the journal in that case, which is good, because the caller may - * be PF_MEMALLOC. + * Also, this function can nest inside ext4_writepage(). In that case, we + * *know* that ext4_writepage() has generated enough buffer credits to do the + * whole page. So we won't block on the journal in that case, which is good, + * because the caller may be PF_MEMALLOC. * * By accident, ext4 can be reentered when a transaction is open via * quota file writes. If we were to commit the transaction while thus @@ -878,32 +906,40 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, flags, pagep); if (ret < 0) - goto out; - if (ret == 1) { - ret = 0; - goto out; - } + return ret; + if (ret == 1) + return 0; } -retry: - handle = ext4_journal_start(inode, needed_blocks); + /* + * grab_cache_page_write_begin() can take a long time if the + * system is thrashing due to memory pressure, or if the page + * is being written back. So grab it first before we start + * the transaction handle. This also allows us to allocate + * the page (if needed) without using GFP_NOFS. + */ +retry_grab: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + unlock_page(page); + +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; + page_cache_release(page); + return PTR_ERR(handle); } - /* We cannot recurse into the filesystem as the transaction is already - * started */ - flags |= AOP_FLAG_NOFS; - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + unlock_page(page); + page_cache_release(page); ext4_journal_stop(handle); - ret = -ENOMEM; - goto out; + goto retry_grab; } - - *pagep = page; + wait_on_page_writeback(page); if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(page, pos, len, ext4_get_block_write); @@ -918,7 +954,6 @@ retry: if (ret) { unlock_page(page); - page_cache_release(page); /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -942,11 +977,14 @@ retry: if (inode->i_nlink) ext4_orphan_del(NULL, inode); } - } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + page_cache_release(page); + return ret; + } + *pagep = page; return ret; } @@ -1256,7 +1294,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) * function is called from invalidate page, it's * harmless to return without any action. */ - ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " + ext4_warning(inode->i_sb, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); @@ -1357,7 +1395,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, loff_t size = i_size_read(inode); unsigned int len, block_start; struct buffer_head *bh, *page_bufs = NULL; - int journal_data = ext4_should_journal_data(inode); sector_t pblock = 0, cur_logical = 0; struct ext4_io_submit io_submit; @@ -1378,7 +1415,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { - int commit_write = 0, skip_page = 0; + int skip_page = 0; struct page *page = pvec.pages[i]; index = page->index; @@ -1400,27 +1437,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - /* - * If the page does not have buffers (for - * whatever reason), try to create them using - * __block_write_begin. If this fails, - * skip the page and move on. - */ - if (!page_has_buffers(page)) { - if (__block_write_begin(page, 0, len, - noalloc_get_block_write)) { - skip_page: - unlock_page(page); - continue; - } - commit_write = 1; - } - bh = page_bufs = page_buffers(page); block_start = 0; do { - if (!bh) - goto skip_page; if (map && (cur_logical >= map->m_lblk) && (cur_logical <= (map->m_lblk + (map->m_len - 1)))) { @@ -1448,33 +1467,14 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, pblock++; } while (bh != page_bufs); - if (skip_page) - goto skip_page; - - if (commit_write) - /* mark the buffer_heads as dirty & uptodate */ - block_commit_write(page, 0, len); + if (skip_page) { + unlock_page(page); + continue; + } clear_page_dirty_for_io(page); - /* - * Delalloc doesn't support data journalling, - * but eventually maybe we'll lift this - * restriction. - */ - if (unlikely(journal_data && PageChecked(page))) - err = __ext4_journalled_writepage(page, len); - else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) - err = ext4_bio_write_page(&io_submit, page, - len, mpd->wbc); - else if (buffer_uninit(page_bufs)) { - ext4_set_bh_endio(page_bufs, inode); - err = block_write_full_page_endio(page, - noalloc_get_block_write, - mpd->wbc, ext4_end_io_buffer_write); - } else - err = block_write_full_page(page, - noalloc_get_block_write, mpd->wbc); - + err = ext4_bio_write_page(&io_submit, page, len, + mpd->wbc); if (!err) mpd->pages_written++; /* @@ -1640,7 +1640,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) (unsigned long long) next, mpd->b_size >> mpd->inode->i_blkbits, err); ext4_msg(sb, KERN_CRIT, - "This should not happen!! Data will be lost\n"); + "This should not happen!! Data will be lost"); if (err == -ENOSPC) ext4_print_free_blocks(mpd->inode); } @@ -1690,16 +1690,16 @@ submit_io: * * @mpd->lbh - extent of blocks * @logical - logical number of the block in the file - * @bh - bh of the block (used to access block's state) + * @b_state - b_state of the buffer head added * * the function is used to collect contig. blocks in same state */ -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, - sector_t logical, size_t b_size, +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, unsigned long b_state) { sector_t next; - int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; + int blkbits = mpd->inode->i_blkbits; + int nrblocks = mpd->b_size >> blkbits; /* * XXX Don't go larger than mballoc is willing to allocate @@ -1707,11 +1707,11 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, * mpage_da_submit_io() into this function and then call * ext4_map_blocks() multiple times in a loop */ - if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) + if (nrblocks >= (8*1024*1024 >> blkbits)) goto flush_it; - /* check if thereserved journal credits might overflow */ - if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { + /* check if the reserved journal credits might overflow */ + if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) { if (nrblocks >= EXT4_MAX_TRANS_DATA) { /* * With non-extent format we are limited by the journal @@ -1720,16 +1720,6 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, * nrblocks. So limit nrblocks. */ goto flush_it; - } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > - EXT4_MAX_TRANS_DATA) { - /* - * Adding the new buffer_head would make it cross the - * allowed limit for which we have journal credit - * reserved. So limit the new bh->b_size - */ - b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << - mpd->inode->i_blkbits; - /* we will do mpage_da_submit_io in the next loop */ } } /* @@ -1737,7 +1727,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, */ if (mpd->b_size == 0) { mpd->b_blocknr = logical; - mpd->b_size = b_size; + mpd->b_size = 1 << blkbits; mpd->b_state = b_state & BH_FLAGS; return; } @@ -1747,7 +1737,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, * Can we merge the block to our big extent? */ if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { - mpd->b_size += b_size; + mpd->b_size += 1 << blkbits; return; } @@ -1775,6 +1765,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct ext4_map_blocks *map, struct buffer_head *bh) { + struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); @@ -1785,6 +1776,42 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," "logical block %lu\n", inode->i_ino, map->m_len, (unsigned long) map->m_lblk); + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, iblock, &es)) { + + if (ext4_es_is_hole(&es)) { + retval = 0; + down_read((&EXT4_I(inode)->i_data_sem)); + goto add_delayed; + } + + /* + * Delayed extent could be allocated by fallocate. + * So we need to check it. + */ + if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + + map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; + retval = es.es_len - (iblock - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + if (ext4_es_is_written(&es)) + map->m_flags |= EXT4_MAP_MAPPED; + else if (ext4_es_is_unwritten(&es)) + map->m_flags |= EXT4_MAP_UNWRITTEN; + else + BUG_ON(1); + + return retval; + } + /* * Try to see if we can get the block without requesting a new * file system block. @@ -1803,11 +1830,15 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, map->m_flags |= EXT4_MAP_FROM_CLUSTER; retval = 0; } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, 0); + retval = ext4_ext_map_blocks(NULL, inode, map, + EXT4_GET_BLOCKS_NO_PUT_HOLE); else - retval = ext4_ind_map_blocks(NULL, inode, map, 0); + retval = ext4_ind_map_blocks(NULL, inode, map, + EXT4_GET_BLOCKS_NO_PUT_HOLE); +add_delayed: if (retval == 0) { + int ret; /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? @@ -1815,15 +1846,20 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, /* If the block was allocated from previously allocated cluster, * then we dont need to reserve it again. */ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { - retval = ext4_da_reserve_space(inode, iblock); - if (retval) + ret = ext4_da_reserve_space(inode, iblock); + if (ret) { /* not enough space to reserve */ + retval = ret; goto out_unlock; + } } - retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len); - if (retval) + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + ~0, EXTENT_STATUS_DELAYED); + if (ret) { + retval = ret; goto out_unlock; + } /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served * and it should not appear on the bh->b_state. @@ -1833,6 +1869,16 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); + } else if (retval > 0) { + int ret; + unsigned long long status; + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + if (ret != 0) + retval = ret; } out_unlock: @@ -1890,27 +1936,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -/* - * This function is used as a standard get_block_t calback function - * when there is no desire to allocate any blocks. It is used as a - * callback function for block_write_begin() and block_write_full_page(). - * These functions should only try to map a single block at a time. - * - * Since this function doesn't do block allocations even if the caller - * requests it by passing in create=1, it is critically important that - * any caller checks to make sure that any buffer heads are returned - * by this function are either all already mapped or marked for - * delayed allocation before calling block_write_full_page(). Otherwise, - * b_blocknr could be left unitialized, and the page write functions will - * be taken by surprise. - */ -static int noalloc_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); - return _ext4_get_block(inode, iblock, bh_result, 0); -} - static int bget_one(handle_t *handle, struct buffer_head *bh) { get_bh(bh); @@ -1955,7 +1980,8 @@ static int __ext4_journalled_writepage(struct page *page, * references to buffers so we are safe */ unlock_page(page); - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -2035,11 +2061,12 @@ out: static int ext4_writepage(struct page *page, struct writeback_control *wbc) { - int ret = 0, commit_write = 0; + int ret = 0; loff_t size; unsigned int len; struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; + struct ext4_io_submit io_submit; trace_ext4_writepage(page); size = i_size_read(inode); @@ -2048,39 +2075,29 @@ static int ext4_writepage(struct page *page, else len = PAGE_CACHE_SIZE; + page_bufs = page_buffers(page); /* - * If the page does not have buffers (for whatever reason), - * try to create them using __block_write_begin. If this - * fails, redirty the page and move on. + * We cannot do block allocation or other extent handling in this + * function. If there are buffers needing that, we have to redirty + * the page. But we may reach here when we do a journal commit via + * journal_submit_inode_data_buffers() and in that case we must write + * allocated buffers to achieve data=ordered mode guarantees. */ - if (!page_has_buffers(page)) { - if (__block_write_begin(page, 0, len, - noalloc_get_block_write)) { - redirty_page: - redirty_page_for_writepage(wbc, page); + if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { + redirty_page_for_writepage(wbc, page); + if (current->flags & PF_MEMALLOC) { + /* + * For memory cleaning there's no point in writing only + * some buffers. So just bail out. Warn if we came here + * from direct reclaim. + */ + WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) + == PF_MEMALLOC); unlock_page(page); return 0; } - commit_write = 1; - } - page_bufs = page_buffers(page); - if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - /* - * We don't want to do block allocation, so redirty - * the page and return. We may reach here when we do - * a journal commit via journal_submit_inode_data_buffers. - * We can also reach here via shrink_page_list but it - * should never be for direct reclaim so warn if that - * happens - */ - WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC); - goto redirty_page; } - if (commit_write) - /* now mark the buffer_heads as dirty and uptodate */ - block_commit_write(page, 0, len); if (PageChecked(page) && ext4_should_journal_data(inode)) /* @@ -2089,14 +2106,9 @@ static int ext4_writepage(struct page *page, */ return __ext4_journalled_writepage(page, len); - if (buffer_uninit(page_bufs)) { - ext4_set_bh_endio(page_bufs, inode); - ret = block_write_full_page_endio(page, noalloc_get_block_write, - wbc, ext4_end_io_buffer_write); - } else - ret = block_write_full_page(page, noalloc_get_block_write, - wbc); - + memset(&io_submit, 0, sizeof(io_submit)); + ret = ext4_bio_write_page(&io_submit, page, len, wbc); + ext4_io_submit(&io_submit); return ret; } @@ -2228,51 +2240,38 @@ static int write_cache_pages_da(handle_t *handle, logical = (sector_t) page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!page_has_buffers(page)) { - mpage_add_bh_to_extent(mpd, logical, - PAGE_CACHE_SIZE, - (1 << BH_Dirty) | (1 << BH_Uptodate)); - if (mpd->io_done) - goto ret_extent_tail; - } else { + /* Add all dirty buffers to mpd */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); /* - * Page with regular buffer heads, - * just add all dirty ones + * We need to try to allocate unmapped blocks + * in the same page. Otherwise we won't make + * progress with the page in ext4_writepage */ - head = page_buffers(page); - bh = head; - do { - BUG_ON(buffer_locked(bh)); + if (ext4_bh_delay_or_unwritten(NULL, bh)) { + mpage_add_bh_to_extent(mpd, logical, + bh->b_state); + if (mpd->io_done) + goto ret_extent_tail; + } else if (buffer_dirty(bh) && + buffer_mapped(bh)) { /* - * We need to try to allocate - * unmapped blocks in the same page. - * Otherwise we won't make progress - * with the page in ext4_writepage + * mapped dirty buffer. We need to + * update the b_state because we look + * at b_state in mpage_da_map_blocks. + * We don't update b_size because if we + * find an unmapped buffer_head later + * we need to use the b_state flag of + * that buffer_head. */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) { - mpage_add_bh_to_extent(mpd, logical, - bh->b_size, - bh->b_state); - if (mpd->io_done) - goto ret_extent_tail; - } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { - /* - * mapped dirty buffer. We need - * to update the b_state - * because we look at b_state - * in mpage_da_map_blocks. We - * don't update b_size because - * if we find an unmapped - * buffer_head later we need to - * use the b_state flag of that - * buffer_head. - */ - if (mpd->b_size == 0) - mpd->b_state = bh->b_state & BH_FLAGS; - } - logical++; - } while ((bh = bh->b_this_page) != head); - } + if (mpd->b_size == 0) + mpd->b_state = + bh->b_state & BH_FLAGS; + } + logical++; + } while ((bh = bh->b_this_page) != head); if (nr_to_write > 0) { nr_to_write--; @@ -2413,7 +2412,8 @@ retry: needed_blocks = ext4_da_writepages_trans_blocks(inode); /* start a new transaction*/ - handle = ext4_journal_start(inode, needed_blocks); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " @@ -2555,42 +2555,52 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pos, len, flags, pagep, fsdata); if (ret < 0) - goto out; - if (ret == 1) { - ret = 0; - goto out; - } + return ret; + if (ret == 1) + return 0; } -retry: + /* + * grab_cache_page_write_begin() can take a long time if the + * system is thrashing due to memory pressure, or if the page + * is being written back. So grab it first before we start + * the transaction handle. This also allows us to allocate + * the page (if needed) without using GFP_NOFS. + */ +retry_grab: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + unlock_page(page); + /* * With delayed allocation, we don't log the i_disksize update * if there is delayed block allocation. But we still need * to journalling the i_disksize update if writes to the end * of file which has an already mapped buffer. */ - handle = ext4_journal_start(inode, 1); +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1); if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; + page_cache_release(page); + return PTR_ERR(handle); } - /* We cannot recurse into the filesystem as the transaction is already - * started */ - flags |= AOP_FLAG_NOFS; - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + unlock_page(page); + page_cache_release(page); ext4_journal_stop(handle); - ret = -ENOMEM; - goto out; + goto retry_grab; } - *pagep = page; + /* In case writeback began while the page was unlocked */ + wait_on_page_writeback(page); ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); if (ret < 0) { unlock_page(page); ext4_journal_stop(handle); - page_cache_release(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -2598,11 +2608,16 @@ retry: */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); + + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + + page_cache_release(page); + return ret; } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: + *pagep = page; return ret; } @@ -2858,36 +2873,10 @@ ext4_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } -static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) -{ - struct buffer_head *head, *bh; - unsigned int curr_off = 0; - - if (!page_has_buffers(page)) - return; - head = bh = page_buffers(page); - do { - if (offset <= curr_off && test_clear_buffer_uninit(bh) - && bh->b_private) { - ext4_free_io_end(bh->b_private); - bh->b_private = NULL; - bh->b_end_io = NULL; - } - curr_off = curr_off + bh->b_size; - bh = bh->b_this_page; - } while (bh != head); -} - static void ext4_invalidatepage(struct page *page, unsigned long offset) { trace_ext4_invalidatepage(page, offset); - /* - * free any io_end structure allocated for buffers to be discarded - */ - if (ext4_should_dioread_nolock(page->mapping->host)) - ext4_invalidatepage_free_endio(page, offset); - /* No journalling happens on data buffers when this function is used */ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); @@ -2977,9 +2966,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { ext4_free_io_end(io_end); out: + inode_dio_done(inode); if (is_async) aio_complete(iocb, ret, 0); - inode_dio_done(inode); return; } @@ -2993,65 +2982,6 @@ out: ext4_add_complete_io(io_end); } -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) -{ - ext4_io_end_t *io_end = bh->b_private; - struct inode *inode; - - if (!test_clear_buffer_uninit(bh) || !io_end) - goto out; - - if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { - ext4_msg(io_end->inode->i_sb, KERN_INFO, - "sb umounted, discard end_io request for inode %lu", - io_end->inode->i_ino); - ext4_free_io_end(io_end); - goto out; - } - - /* - * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, - * but being more careful is always safe for the future change. - */ - inode = io_end->inode; - ext4_set_io_unwritten_flag(inode, io_end); - ext4_add_complete_io(io_end); -out: - bh->b_private = NULL; - bh->b_end_io = NULL; - clear_buffer_uninit(bh); - end_buffer_async_write(bh, uptodate); -} - -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) -{ - ext4_io_end_t *io_end; - struct page *page = bh->b_page; - loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; - size_t size = bh->b_size; - -retry: - io_end = ext4_init_io_end(inode, GFP_ATOMIC); - if (!io_end) { - pr_warn_ratelimited("%s: allocation fail\n", __func__); - schedule(); - goto retry; - } - io_end->offset = offset; - io_end->size = size; - /* - * We need to hold a reference to the page to make sure it - * doesn't get evicted before ext4_end_io_work() has a chance - * to convert the extent from written to unwritten. - */ - io_end->page = page; - get_page(io_end->page); - - bh->b_private = io_end; - bh->b_end_io = ext4_end_io_buffer_write; - return 0; -} - /* * For ext4 extent files, ext4 will do direct-io write to holes, * preallocated extents, and those write extend the file, no need to @@ -3557,16 +3487,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - /* TODO: Add support for non extent hole punching */ - return -EOPNOTSUPP; - } + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return ext4_ind_punch_hole(file, offset, length); if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { /* TODO: Add support for bigalloc file systems */ return -EOPNOTSUPP; } + trace_ext4_punch_hole(inode, offset, length); + return ext4_ext_punch_hole(file, offset, length); } @@ -3660,11 +3590,8 @@ static int __ext4_get_inode_loc(struct inode *inode, iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); bh = sb_getblk(sb, block); - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, block, - "unable to read itable block"); - return -EIO; - } + if (unlikely(!bh)) + return -ENOMEM; if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -3696,7 +3623,7 @@ static int __ext4_get_inode_loc(struct inode *inode, /* Is the inode bitmap in cache? */ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); - if (!bitmap_bh) + if (unlikely(!bitmap_bh)) goto make_io; /* @@ -4404,8 +4331,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ - EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -4440,7 +4368,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) (attr->ia_size < inode->i_size)) { handle_t *handle; - handle = ext4_journal_start(inode, 3); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -4460,7 +4388,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size); if (error) { /* Do as much error cleanup as possible */ - handle = ext4_journal_start(inode, 3); + handle = ext4_journal_start(inode, + EXT4_HT_INODE, 3); if (IS_ERR(handle)) { ext4_orphan_del(NULL, inode); goto err_out; @@ -4801,7 +4730,7 @@ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; - handle = ext4_journal_start(inode, 2); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) goto out; @@ -4902,7 +4831,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) /* Finally we can mark the inode as dirty. */ - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4968,7 +4897,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ - wait_on_page_writeback(page); + wait_for_stable_page(page); ret = VM_FAULT_LOCKED; goto out; } @@ -4980,7 +4909,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) else get_block = ext4_get_block; retry_alloc: - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = VM_FAULT_SIGBUS; goto out; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c2f8e06..721f4d3 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -104,7 +104,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } else if (oldflags & EXT4_EOFBLOCKS_FL) ext4_truncate(inode); - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto flags_out; @@ -173,7 +173,7 @@ flags_out: } mutex_lock(&inode->i_mutex); - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto unlock_out; @@ -313,6 +313,9 @@ mext_out: if (err == 0) err = err2; mnt_drop_write_file(filp); + if (!err && ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, input.group); group_add_out: ext4_resize_end(sb); return err; @@ -358,6 +361,7 @@ group_add_out: ext4_fsblk_t n_blocks_count; struct super_block *sb = inode->i_sb; int err = 0, err2 = 0; + ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { @@ -388,6 +392,11 @@ group_add_out: if (err == 0) err = err2; mnt_drop_write_file(filp); + if (!err && (o_group > EXT4_SB(sb)->s_groups_count) && + ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, o_group); + resizefs_out: ext4_resize_end(sb); return err; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1bf6fe7..6540ebe 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -23,11 +23,18 @@ #include "ext4_jbd2.h" #include "mballoc.h" -#include <linux/debugfs.h> #include <linux/log2.h> +#include <linux/module.h> #include <linux/slab.h> #include <trace/events/ext4.h> +#ifdef CONFIG_EXT4_DEBUG +ushort ext4_mballoc_debug __read_mostly; + +module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); +MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); +#endif + /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() @@ -1884,15 +1891,19 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, case 0: BUG_ON(ac->ac_2order == 0); - if (grp->bb_largest_free_order < ac->ac_2order) - return 0; - /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return 0; + if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || + (free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + + if (grp->bb_largest_free_order < ac->ac_2order) + return 0; + return 1; case 1: if ((free / fragments) >= ac->ac_g_ex.fe_len) @@ -2007,7 +2018,7 @@ repeat: } ac->ac_groups_scanned++; - if (cr == 0) + if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && sbi->s_stripe && !(ac->ac_g_ex.fe_len % sbi->s_stripe)) @@ -2656,40 +2667,6 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } -#ifdef CONFIG_EXT4_DEBUG -u8 mb_enable_debug __read_mostly; - -static struct dentry *debugfs_dir; -static struct dentry *debugfs_debug; - -static void __init ext4_create_debugfs_entry(void) -{ - debugfs_dir = debugfs_create_dir("ext4", NULL); - if (debugfs_dir) - debugfs_debug = debugfs_create_u8("mballoc-debug", - S_IRUGO | S_IWUSR, - debugfs_dir, - &mb_enable_debug); -} - -static void ext4_remove_debugfs_entry(void) -{ - debugfs_remove(debugfs_debug); - debugfs_remove(debugfs_dir); -} - -#else - -static void __init ext4_create_debugfs_entry(void) -{ -} - -static void ext4_remove_debugfs_entry(void) -{ -} - -#endif - int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, @@ -2711,7 +2688,6 @@ int __init ext4_init_mballoc(void) kmem_cache_destroy(ext4_ac_cachep); return -ENOMEM; } - ext4_create_debugfs_entry(); return 0; } @@ -2726,7 +2702,6 @@ void ext4_exit_mballoc(void) kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); - ext4_remove_debugfs_entry(); } @@ -3872,7 +3847,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) struct super_block *sb = ac->ac_sb; ext4_group_t ngroups, i; - if (!mb_enable_debug || + if (!ext4_mballoc_debug || (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; @@ -4005,8 +3980,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, len = ar->len; /* just a dirty hack to filter too big requests */ - if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10) - len = EXT4_CLUSTERS_PER_GROUP(sb) - 10; + if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) + len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ goal = ar->goal; @@ -4136,7 +4111,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; /* Add the prealloc space to lg */ - rcu_read_lock(); + spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], pa_inode_list) { spin_lock(&tmp_pa->pa_lock); @@ -4160,12 +4135,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) if (!added) list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list[order]); - rcu_read_unlock(); + spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ if (lg_prealloc_count > 8) { ext4_mb_discard_lg_preallocations(sb, lg, - order, lg_prealloc_count); + order, lg_prealloc_count); return; } return ; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 3ccd889..08481ee 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -37,11 +37,11 @@ /* */ #ifdef CONFIG_EXT4_DEBUG -extern u8 mb_enable_debug; +extern ushort ext4_mballoc_debug; #define mb_debug(n, fmt, a...) \ do { \ - if ((n) <= mb_enable_debug) { \ + if ((n) <= ext4_mballoc_debug) { \ printk(KERN_DEBUG "(%s, %d): %s: ", \ __FILE__, __LINE__, __func__); \ printk(fmt, ## a); \ diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index db8226d..480acf4 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -456,11 +456,14 @@ int ext4_ext_migrate(struct inode *inode) */ return retval; - handle = ext4_journal_start(inode, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) - + 1); + /* + * Worst case we can touch the allocation bitmaps, a bgd + * block, and a block to link in the orphan list. We do need + * need to worry about credits for modifying the quota inode. + */ + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, + 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { retval = PTR_ERR(handle); return retval; @@ -507,7 +510,7 @@ int ext4_ext_migrate(struct inode *inode) ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { /* * It is impossible to update on-disk structures without diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index fe7c63f..f9b5515 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -80,6 +80,8 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, * is not blocked in the elevator. */ if (!*bh) *bh = sb_getblk(sb, mmp_block); + if (!*bh) + return -ENOMEM; if (*bh) { get_bh(*bh); lock_buffer(*bh); @@ -91,7 +93,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, *bh = NULL; } } - if (!*bh) { + if (unlikely(!*bh)) { ext4_warning(sb, "Error while reading MMP block %llu", mmp_block); return -EIO; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 796f7ac..4e81d47 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -681,6 +681,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, depth = ext_depth(donor_inode); dext = donor_path[depth].p_ext; + if (unlikely(!dext)) + goto missing_donor_extent; tmp_dext = *dext; *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, @@ -691,7 +693,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, /* Loop for the donor extents */ while (1) { /* The extent for donor must be found. */ - if (!dext) { + if (unlikely(!dext)) { + missing_donor_extent: EXT4_ERROR_INODE(donor_inode, "The extent for donor must be found"); *err = -EIO; @@ -761,9 +764,6 @@ out: kfree(donor_path); } - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); - return replaced_count; } @@ -920,7 +920,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, again: *err = 0; jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; - handle = ext4_journal_start(orig_inode, jblocks); + handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); if (IS_ERR(handle)) { *err = PTR_ERR(handle); return 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bb97ad6..3825d6a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -47,38 +47,111 @@ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) static struct buffer_head *ext4_append(handle_t *handle, struct inode *inode, - ext4_lblk_t *block, int *err) + ext4_lblk_t *block) { struct buffer_head *bh; + int err = 0; if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && ((inode->i_size >> 10) >= - EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) { - *err = -ENOSPC; - return NULL; - } + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - bh = ext4_bread(handle, inode, *block, 1, err); - if (bh) { - inode->i_size += inode->i_sb->s_blocksize; - EXT4_I(inode)->i_disksize = inode->i_size; - *err = ext4_journal_get_write_access(handle, bh); - if (*err) { + bh = ext4_bread(handle, inode, *block, 1, &err); + if (!bh) + return ERR_PTR(err); + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + ext4_std_error(inode->i_sb, err); + return ERR_PTR(err); + } + return bh; +} + +static int ext4_dx_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); + +typedef enum { + EITHER, INDEX, DIRENT +} dirblock_type_t; + +#define ext4_read_dirblock(inode, block, type) \ + __ext4_read_dirblock((inode), (block), (type), __LINE__) + +static struct buffer_head *__ext4_read_dirblock(struct inode *inode, + ext4_lblk_t block, + dirblock_type_t type, + unsigned int line) +{ + struct buffer_head *bh; + struct ext4_dir_entry *dirent; + int err = 0, is_dx_block = 0; + + bh = ext4_bread(NULL, inode, block, 0, &err); + if (!bh) { + if (err == 0) { + ext4_error_inode(inode, __func__, line, block, + "Directory hole found"); + return ERR_PTR(-EIO); + } + __ext4_warning(inode->i_sb, __func__, line, + "error reading directory block " + "(ino %lu, block %lu)", inode->i_ino, + (unsigned long) block); + return ERR_PTR(err); + } + dirent = (struct ext4_dir_entry *) bh->b_data; + /* Determine whether or not we have an index block */ + if (is_dx(inode)) { + if (block == 0) + is_dx_block = 1; + else if (ext4_rec_len_from_disk(dirent->rec_len, + inode->i_sb->s_blocksize) == + inode->i_sb->s_blocksize) + is_dx_block = 1; + } + if (!is_dx_block && type == INDEX) { + ext4_error_inode(inode, __func__, line, block, + "directory leaf block found instead of index block"); + return ERR_PTR(-EIO); + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) || + buffer_verified(bh)) + return bh; + + /* + * An empty leaf block can get mistaken for a index block; for + * this reason, we can only check the index checksum when the + * caller is sure it should be an index block. + */ + if (is_dx_block && type == INDEX) { + if (ext4_dx_csum_verify(inode, dirent)) + set_buffer_verified(bh); + else { + ext4_error_inode(inode, __func__, line, block, + "Directory index failed checksum"); brelse(bh); - bh = NULL; + return ERR_PTR(-EIO); } } - if (!bh && !(*err)) { - *err = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); + if (!is_dx_block) { + if (ext4_dirent_csum_verify(inode, dirent)) + set_buffer_verified(bh); + else { + ext4_error_inode(inode, __func__, line, block, + "Directory block failed checksum"); + brelse(bh); + return ERR_PTR(-EIO); + } } return bh; } @@ -604,9 +677,9 @@ dx_probe(const struct qstr *d_name, struct inode *dir, u32 hash; frame->bh = NULL; - if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) { - if (*err == 0) - *err = ERR_BAD_DX_DIR; + bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); goto fail; } root = (struct dx_root *) bh->b_data; @@ -643,15 +716,6 @@ dx_probe(const struct qstr *d_name, struct inode *dir, goto fail; } - if (!buffer_verified(bh) && - !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) { - ext4_warning(dir->i_sb, "Root failed checksum"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - set_buffer_verified(bh); - entries = (struct dx_entry *) (((char *)&root->info) + root->info.info_length); @@ -709,22 +773,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir, frame->entries = entries; frame->at = at; if (!indirect--) return frame; - if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) { - if (!(*err)) - *err = ERR_BAD_DX_DIR; - goto fail2; - } - at = entries = ((struct dx_node *) bh->b_data)->entries; - - if (!buffer_verified(bh) && - !ext4_dx_csum_verify(dir, - (struct ext4_dir_entry *)bh->b_data)) { - ext4_warning(dir->i_sb, "Node failed checksum"); - brelse(bh); - *err = ERR_BAD_DX_DIR; + bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); goto fail2; } - set_buffer_verified(bh); + entries = ((struct dx_node *) bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { ext4_warning(dir->i_sb, @@ -783,7 +837,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, { struct dx_frame *p; struct buffer_head *bh; - int err, num_frames = 0; + int num_frames = 0; __u32 bhash; p = frame; @@ -822,25 +876,9 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, * block so no check is necessary */ while (num_frames--) { - if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) { - if (!err) { - ext4_error(dir->i_sb, - "Directory hole detected on inode %lu\n", - dir->i_ino); - return -EIO; - } - return err; /* Failure */ - } - - if (!buffer_verified(bh) && - !ext4_dx_csum_verify(dir, - (struct ext4_dir_entry *)bh->b_data)) { - ext4_warning(dir->i_sb, "Node failed checksum"); - return -EIO; - } - set_buffer_verified(bh); - + bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); + if (IS_ERR(bh)) + return PTR_ERR(bh); p++; brelse(p->bh); p->bh = bh; @@ -866,20 +904,9 @@ static int htree_dirblock_to_tree(struct file *dir_file, dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); - if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) { - if (!err) { - err = -EIO; - ext4_error(dir->i_sb, - "Directory hole detected on inode %lu\n", - dir->i_ino); - } - return err; - } - - if (!buffer_verified(bh) && - !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) - return -EIO; - set_buffer_verified(bh); + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) + return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + @@ -1333,26 +1360,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q return NULL; do { block = dx_get_block(frame->at); - if (!(bh = ext4_bread(NULL, dir, block, 0, err))) { - if (!(*err)) { - *err = -EIO; - ext4_error(dir->i_sb, - "Directory hole detected on inode %lu\n", - dir->i_ino); - } - goto errout; - } - - if (!buffer_verified(bh) && - !ext4_dirent_csum_verify(dir, - (struct ext4_dir_entry *)bh->b_data)) { - EXT4_ERROR_INODE(dir, "checksumming directory " - "block %lu", (unsigned long)block); - brelse(bh); - *err = -EIO; + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); goto errout; } - set_buffer_verified(bh); retval = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); @@ -1536,11 +1548,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) { + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; - goto errout; + *error = PTR_ERR(bh2); + return NULL; } BUFFER_TRACE(*bh, "get_write_access"); @@ -1621,7 +1634,6 @@ journal_error: brelse(bh2); *bh = NULL; ext4_std_error(dir->i_sb, err); -errout: *error = err; return NULL; } @@ -1699,7 +1711,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned int blocksize = dir->i_sb->s_blocksize; - unsigned short reclen; int csum_size = 0; int err; @@ -1707,7 +1718,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); - reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { err = ext4_find_dest_de(dir, inode, bh, bh->b_data, blocksize - csum_size, @@ -1798,10 +1808,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, len = ((char *) root) + (blocksize - csum_size) - (char *) de; /* Allocate new block for the 0th block's dirents */ - bh2 = ext4_append(handle, dir, &block, &retval); - if (!(bh2)) { + bh2 = ext4_append(handle, dir, &block); + if (IS_ERR(bh2)) { brelse(bh); - return retval; + return PTR_ERR(bh2); } ext4_set_inode_flag(dir, EXT4_INODE_INDEX); data1 = bh2->b_data; @@ -1918,20 +1928,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { - if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) { - if (!retval) { - retval = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } - return retval; - } - if (!buffer_verified(bh) && - !ext4_dirent_csum_verify(dir, - (struct ext4_dir_entry *)bh->b_data)) - return -EIO; - set_buffer_verified(bh); + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) + return PTR_ERR(bh); + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); if (retval != -ENOSPC) { brelse(bh); @@ -1943,9 +1943,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return make_indexed_dir(handle, dentry, inode, bh); brelse(bh); } - bh = ext4_append(handle, dir, &block, &retval); - if (!bh) - return retval; + bh = ext4_append(handle, dir, &block); + if (IS_ERR(bh)) + return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); @@ -1982,22 +1982,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, return err; entries = frame->entries; at = frame->at; - - if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) { - if (!err) { - err = -EIO; - ext4_error(dir->i_sb, - "Directory hole detected on inode %lu\n", - dir->i_ino); - } + bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; goto cleanup; } - if (!buffer_verified(bh) && - !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) - goto journal_error; - set_buffer_verified(bh); - BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) @@ -2025,9 +2016,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, err = -ENOSPC; goto cleanup; } - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); goto cleanup; + } node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; memset(&node2->fake, 0, sizeof(struct fake_dirent)); @@ -2106,8 +2099,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, journal_error: ext4_std_error(dir->i_sb, err); cleanup: - if (bh) - brelse(bh); + brelse(bh); dx_release(frames); return err; } @@ -2254,29 +2246,28 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, { handle_t *handle; struct inode *inode; - int err, retries = 0; + int err, credits, retries = 0; dquot_initialize(dir); + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); + inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, + NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); } - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2287,31 +2278,30 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, { handle_t *handle; struct inode *inode; - int err, retries = 0; + int err, credits, retries = 0; if (!new_valid_dev(rdev)) return -EINVAL; dquot_initialize(dir); + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); + inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, + NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); } - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2351,6 +2341,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; + ext4_lblk_t block = 0; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err; @@ -2367,16 +2358,10 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, goto out; } - inode->i_size = EXT4_I(inode)->i_disksize = blocksize; - if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { - if (!err) { - err = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } - goto out; - } + inode->i_size = 0; + dir_block = ext4_append(handle, inode, &block); + if (IS_ERR(dir_block)) + return PTR_ERR(dir_block); BUFFER_TRACE(dir_block, "get_write_access"); err = ext4_journal_get_write_access(handle, dir_block); if (err) @@ -2403,25 +2388,21 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; - int err, retries = 0; + int err, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; dquot_initialize(dir); + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, S_IFDIR | mode, - &dentry->d_name, 0, NULL); + inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, + &dentry->d_name, + 0, NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2449,8 +2430,12 @@ out_clear_inode: goto out_clear_inode; unlock_new_inode(inode); d_instantiate(dentry, inode); + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + out_stop: - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2476,25 +2461,14 @@ static int empty_dir(struct inode *inode) } sb = inode->i_sb; - if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || - !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { - if (err) - EXT4_ERROR_INODE(inode, - "error %d reading directory lblock 0", err); - else - ext4_warning(inode->i_sb, - "bad directory (dir #%lu) - no data block", - inode->i_ino); + if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { + EXT4_ERROR_INODE(inode, "invalid size"); return 1; } - if (!buffer_verified(bh) && - !ext4_dirent_csum_verify(inode, - (struct ext4_dir_entry *)bh->b_data)) { - EXT4_ERROR_INODE(inode, "checksum error reading directory " - "lblock 0"); - return -EIO; - } - set_buffer_verified(bh); + bh = ext4_read_dirblock(inode, 0, EITHER); + if (IS_ERR(bh)) + return 1; + de = (struct ext4_dir_entry_2 *) bh->b_data; de1 = ext4_next_entry(de, sb->s_blocksize); if (le32_to_cpu(de->inode) != inode->i_ino || @@ -2517,28 +2491,9 @@ static int empty_dir(struct inode *inode) err = 0; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); - bh = ext4_bread(NULL, inode, lblock, 0, &err); - if (!bh) { - if (err) - EXT4_ERROR_INODE(inode, - "error %d reading directory " - "lblock %u", err, lblock); - else - ext4_warning(inode->i_sb, - "bad directory (dir #%lu) - no data block", - inode->i_ino); - - offset += sb->s_blocksize; - continue; - } - if (!buffer_verified(bh) && - !ext4_dirent_csum_verify(inode, - (struct ext4_dir_entry *)bh->b_data)) { - EXT4_ERROR_INODE(inode, "checksum error " - "reading directory lblock 0"); - return -EIO; - } - set_buffer_verified(bh); + bh = ext4_read_dirblock(inode, lblock, EITHER); + if (IS_ERR(bh)) + return 1; de = (struct ext4_dir_entry_2 *) bh->b_data; } if (ext4_check_dir_entry(inode, NULL, de, bh, @@ -2717,25 +2672,18 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct inode *inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de; - handle_t *handle; + handle_t *handle = NULL; /* Initialize quotas before so that eventual writes go in * separate transaction */ dquot_initialize(dir); dquot_initialize(dentry->d_inode); - handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) goto end_rmdir; - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - inode = dentry->d_inode; retval = -EIO; @@ -2746,6 +2694,17 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (!empty_dir(inode)) goto end_rmdir; + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rmdir; + } + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; @@ -2767,8 +2726,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) ext4_mark_inode_dirty(handle, dir); end_rmdir: - ext4_journal_stop(handle); brelse(bh); + if (handle) + ext4_journal_stop(handle); return retval; } @@ -2778,7 +2738,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de; - handle_t *handle; + handle_t *handle = NULL; trace_ext4_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go @@ -2786,13 +2746,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) dquot_initialize(dir); dquot_initialize(dentry->d_inode); - handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) @@ -2804,6 +2757,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (le32_to_cpu(de->inode) != inode->i_ino) goto end_unlink; + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_unlink; + } + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + if (!inode->i_nlink) { ext4_warning(inode->i_sb, "Deleting nonexistent file (%lu), %d", @@ -2824,8 +2788,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = 0; end_unlink: - ext4_journal_stop(handle); brelse(bh); + if (handle) + ext4_journal_stop(handle); trace_ext4_unlink_exit(dentry, retval); return retval; } @@ -2865,15 +2830,10 @@ static int ext4_symlink(struct inode *dir, EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); } retry: - handle = ext4_journal_start(dir, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, - &dentry->d_name, 0, NULL); + inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, + &dentry->d_name, 0, NULL, + EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2903,7 +2863,7 @@ retry: * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified */ - handle = ext4_journal_start(dir, + handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); if (IS_ERR(handle)) { @@ -2926,8 +2886,12 @@ retry: } EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + out_stop: - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2950,8 +2914,9 @@ static int ext4_link(struct dentry *old_dentry, dquot_initialize(dir); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS); + handle = ext4_journal_start(dir, EXT4_HT_DIR, + (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2991,13 +2956,9 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, struct buffer_head *bh; if (!ext4_has_inline_data(inode)) { - if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) { - if (!*retval) { - *retval = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } + bh = ext4_read_dirblock(inode, 0, EITHER); + if (IS_ERR(bh)) { + *retval = PTR_ERR(bh); return NULL; } *parent_de = ext4_next_entry( @@ -3034,9 +2995,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * in separate transaction */ if (new_dentry->d_inode) dquot_initialize(new_dentry->d_inode); - handle = ext4_journal_start(old_dir, 2 * - EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); + handle = ext4_journal_start(old_dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -3076,11 +3037,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, &inlined); if (!dir_bh) goto end_rename; - if (!inlined && !buffer_verified(dir_bh) && - !ext4_dirent_csum_verify(old_inode, - (struct ext4_dir_entry *)dir_bh->b_data)) - goto end_rename; - set_buffer_verified(dir_bh); if (le32_to_cpu(parent_de->inode) != old_dir->i_ino) goto end_rename; retval = -EMLINK; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0016fbc..809b310 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -23,6 +23,7 @@ #include <linux/workqueue.h> #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/mm.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -73,8 +74,6 @@ void ext4_free_io_end(ext4_io_end_t *io) BUG_ON(!list_empty(&io->list)); BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); - if (io->page) - put_page(io->page); for (i = 0; i < io->num_io_pages; i++) put_io_page(io->pages[i]); io->num_io_pages = 0; @@ -103,14 +102,13 @@ static int ext4_end_io(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - if (io->iocb) - aio_complete(io->iocb, io->result, 0); - - if (io->flag & EXT4_IO_END_DIRECT) - inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) wake_up_all(ext4_ioend_wq(inode)); + if (io->flag & EXT4_IO_END_DIRECT) + inode_dio_done(inode); + if (io->iocb) + aio_complete(io->iocb, io->result, 0); return ret; } @@ -119,7 +117,6 @@ static void dump_completed_IO(struct inode *inode) #ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; - unsigned long flags; if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { ext4_debug("inode %lu completed_io list is empty\n", @@ -152,26 +149,20 @@ void ext4_add_complete_io(ext4_io_end_t *io_end) wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (list_empty(&ei->i_completed_io_list)) { - io_end->flag |= EXT4_IO_END_QUEUED; - queue_work(wq, &io_end->work); - } + if (list_empty(&ei->i_completed_io_list)) + queue_work(wq, &ei->i_unwritten_work); list_add_tail(&io_end->list, &ei->i_completed_io_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); } -static int ext4_do_flush_completed_IO(struct inode *inode, - ext4_io_end_t *work_io) +static int ext4_do_flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; - struct list_head unwritten, complete, to_free; + struct list_head unwritten; unsigned long flags; struct ext4_inode_info *ei = EXT4_I(inode); int err, ret = 0; - INIT_LIST_HEAD(&complete); - INIT_LIST_HEAD(&to_free); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); dump_completed_IO(inode); list_replace_init(&ei->i_completed_io_list, &unwritten); @@ -185,32 +176,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode, err = ext4_end_io(io); if (unlikely(!ret && err)) ret = err; - - list_add_tail(&io->list, &complete); - } - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - while (!list_empty(&complete)) { - io = list_entry(complete.next, ext4_io_end_t, list); io->flag &= ~EXT4_IO_END_UNWRITTEN; - /* end_io context can not be destroyed now because it still - * used by queued worker. Worker thread will destroy it later */ - if (io->flag & EXT4_IO_END_QUEUED) - list_del_init(&io->list); - else - list_move(&io->list, &to_free); - } - /* If we are called from worker context, it is time to clear queued - * flag, and destroy it's end_io if it was converted already */ - if (work_io) { - work_io->flag &= ~EXT4_IO_END_QUEUED; - if (!(work_io->flag & EXT4_IO_END_UNWRITTEN)) - list_add_tail(&work_io->list, &to_free); - } - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - - while (!list_empty(&to_free)) { - io = list_entry(to_free.next, ext4_io_end_t, list); - list_del_init(&io->list); ext4_free_io_end(io); } return ret; @@ -219,10 +185,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode, /* * work on completed aio dio IO, to convert unwritten extents to extents */ -static void ext4_end_io_work(struct work_struct *work) +void ext4_end_io_work(struct work_struct *work) { - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - ext4_do_flush_completed_IO(io->inode, io); + struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, + i_unwritten_work); + ext4_do_flush_completed_IO(&ei->vfs_inode); } int ext4_flush_unwritten_io(struct inode *inode) @@ -230,7 +197,7 @@ int ext4_flush_unwritten_io(struct inode *inode) int ret; WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && !(inode->i_state & I_FREEING)); - ret = ext4_do_flush_completed_IO(inode, NULL); + ret = ext4_do_flush_completed_IO(inode); ext4_unwritten_wait(inode); return ret; } @@ -241,7 +208,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) if (io) { atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; - INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } return io; @@ -382,14 +348,6 @@ static int io_submit_add_bh(struct ext4_io_submit *io, unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); } - if (!buffer_mapped(bh) || buffer_delay(bh)) { - if (!buffer_mapped(bh)) - clear_buffer_dirty(bh); - if (io->io_bio) - ext4_io_submit(io); - return 0; - } - if (io->io_bio && bh->b_blocknr != io->io_next_block) { submit_and_retry: ext4_io_submit(io); @@ -436,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); if (!io_page) { - set_page_dirty(page); + redirty_page_for_writepage(wbc, page); unlock_page(page); return -ENOMEM; } @@ -468,7 +426,15 @@ int ext4_bio_write_page(struct ext4_io_submit *io, set_buffer_uptodate(bh); continue; } - clear_buffer_dirty(bh); + if (!buffer_dirty(bh) || buffer_delay(bh) || + !buffer_mapped(bh) || buffer_unwritten(bh)) { + /* A hole? We can safely clear the dirty bit */ + if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + if (io->io_bio) + ext4_io_submit(io); + continue; + } ret = io_submit_add_bh(io, io_page, inode, wbc, bh); if (ret) { /* @@ -476,9 +442,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * we can do but mark the page as dirty, and * better luck next time. */ - set_page_dirty(page); + redirty_page_for_writepage(wbc, page); break; } + clear_buffer_dirty(bh); } unlock_page(page); /* diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index d99387b..c7f4d75 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -333,8 +333,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, int err; bh = sb_getblk(sb, blk); - if (!bh) - return ERR_PTR(-EIO); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); if ((err = ext4_journal_get_write_access(handle, bh))) { brelse(bh); bh = ERR_PTR(err); @@ -410,8 +410,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, return err; bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); - if (!bh) - return -EIO; + if (unlikely(!bh)) + return -ENOMEM; err = ext4_journal_get_write_access(handle, bh); if (err) @@ -466,7 +466,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); /* This transaction may be extended/restarted along the way */ - handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -500,8 +500,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, goto out; gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; + if (unlikely(!gdb)) { + err = -ENOMEM; goto out; } @@ -1031,7 +1031,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data, handle_t *handle; int err = 0, err2; - handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); if (IS_ERR(handle)) { group = 1; err = PTR_ERR(handle); @@ -1064,8 +1064,8 @@ static void update_backups(struct super_block *sb, int blk_off, char *data, ext4_bg_has_super(sb, group)); bh = sb_getblk(sb, backup_block); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; break; } ext4_debug("update metadata backup %llu(+%llu)\n", @@ -1168,7 +1168,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) { struct buffer_head *bh = sb_getblk(sb, block); - if (!bh) + if (unlikely(!bh)) return NULL; if (!bh_uptodate_or_lock(bh)) { if (bh_submit_read(bh) < 0) { @@ -1412,7 +1412,7 @@ static int ext4_flex_group_add(struct super_block *sb, * modify each of the reserved GDT dindirect blocks. */ credit = flex_gd->count * 4 + reserved_gdb; - handle = ext4_journal_start_sb(sb, credit); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto exit; @@ -1506,10 +1506,12 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, group_data[i].blocks_count = blocks_per_group; overhead = ext4_group_overhead_blocks(sb, group + i); group_data[i].free_blocks_count = blocks_per_group - overhead; - if (ext4_has_group_desc_csum(sb)) + if (ext4_has_group_desc_csum(sb)) { flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | EXT4_BG_INODE_UNINIT; - else + if (!test_opt(sb, INIT_INODE_TABLE)) + flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED; + } else flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; } @@ -1594,7 +1596,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) err = ext4_alloc_flex_bg_array(sb, input->group + 1); if (err) - return err; + goto out; err = ext4_mb_alloc_groupinfo(sb, input->group + 1); if (err) @@ -1622,7 +1624,7 @@ static int ext4_group_extend_no_check(struct super_block *sb, /* We will update the superblock, one block bitmap, and * one group descriptor via ext4_group_add_blocks(). */ - handle = ext4_journal_start_sb(sb, 3); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_warning(sb, "error %d on journal start", err); @@ -1786,7 +1788,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) credits += 3; /* block bitmap, bg descriptor, resize inode */ } - handle = ext4_journal_start_sb(sb, credits); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4df78dd..620cf561 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -69,8 +69,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb, static void ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); -static const char *ext4_decode_error(struct super_block *sb, int errno, - char nbuf[16]); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); @@ -296,107 +294,6 @@ void ext4_itable_unused_set(struct super_block *sb, } -/* Just increment the non-pointer handle value */ -static handle_t *ext4_get_nojournal(void) -{ - handle_t *handle = current->journal_info; - unsigned long ref_cnt = (unsigned long)handle; - - BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); - - ref_cnt++; - handle = (handle_t *)ref_cnt; - - current->journal_info = handle; - return handle; -} - - -/* Decrement the non-pointer handle value */ -static void ext4_put_nojournal(handle_t *handle) -{ - unsigned long ref_cnt = (unsigned long)handle; - - BUG_ON(ref_cnt == 0); - - ref_cnt--; - handle = (handle_t *)ref_cnt; - - current->journal_info = handle; -} - -/* - * Wrappers for jbd2_journal_start/end. - */ -handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) -{ - journal_t *journal; - - trace_ext4_journal_start(sb, nblocks, _RET_IP_); - if (sb->s_flags & MS_RDONLY) - return ERR_PTR(-EROFS); - - WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); - journal = EXT4_SB(sb)->s_journal; - if (!journal) - return ext4_get_nojournal(); - /* - * Special case here: if the journal has aborted behind our - * backs (eg. EIO in the commit thread), then we still need to - * take the FS itself readonly cleanly. - */ - if (is_journal_aborted(journal)) { - ext4_abort(sb, "Detected aborted journal"); - return ERR_PTR(-EROFS); - } - return jbd2_journal_start(journal, nblocks); -} - -int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) -{ - struct super_block *sb; - int err; - int rc; - - if (!ext4_handle_valid(handle)) { - ext4_put_nojournal(handle); - return 0; - } - sb = handle->h_transaction->t_journal->j_private; - err = handle->h_err; - rc = jbd2_journal_stop(handle); - - if (!err) - err = rc; - if (err) - __ext4_std_error(sb, where, line, err); - return err; -} - -void ext4_journal_abort_handle(const char *caller, unsigned int line, - const char *err_fn, struct buffer_head *bh, - handle_t *handle, int err) -{ - char nbuf[16]; - const char *errstr = ext4_decode_error(NULL, err, nbuf); - - BUG_ON(!ext4_handle_valid(handle)); - - if (bh) - BUFFER_TRACE(bh, "abort"); - - if (!handle->h_err) - handle->h_err = err; - - if (is_handle_aborted(handle)) - return; - - printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", - caller, line, errstr, err_fn); - - jbd2_journal_abort_handle(handle); -} - static void __save_error_info(struct super_block *sb, const char *func, unsigned int line) { @@ -582,8 +479,8 @@ void ext4_error_file(struct file *file, const char *function, ext4_handle_error(inode->i_sb); } -static const char *ext4_decode_error(struct super_block *sb, int errno, - char nbuf[16]) +const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]) { char *errstr = NULL; @@ -858,6 +755,7 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, "Couldn't clean up the journal"); } + ext4_es_unregister_shrinker(sb); del_timer(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); @@ -939,11 +837,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; - memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); + INIT_LIST_HEAD(&ei->i_es_lru); + ei->i_es_lru_nr = 0; ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -960,6 +859,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); + INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); return &ei->vfs_inode; } @@ -1031,6 +931,7 @@ void ext4_clear_inode(struct inode *inode) dquot_drop(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + ext4_es_lru_del(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1280,8 +1181,8 @@ static const match_table_t tokens = { {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, - {Opt_mblk_io_submit, "mblk_io_submit"}, - {Opt_nomblk_io_submit, "nomblk_io_submit"}, + {Opt_removed, "mblk_io_submit"}, + {Opt_removed, "nomblk_io_submit"}, {Opt_block_validity, "block_validity"}, {Opt_noblock_validity, "noblock_validity"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, @@ -1337,6 +1238,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) { struct ext4_sb_info *sbi = EXT4_SB(sb); char *qname; + int ret = -1; if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { @@ -1351,23 +1253,26 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "Not enough memory for storing quotafile name"); return -1; } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext4_msg(sb, KERN_ERR, - "%s quota file already specified", QTYPE2NAME(qtype)); - kfree(qname); - return -1; + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 1; + else + ext4_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { + if (strchr(qname, '/')) { ext4_msg(sb, KERN_ERR, "quotafile must be on filesystem root"); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; - return -1; + goto errout; } + sbi->s_qf_names[qtype] = qname; set_opt(sb, QUOTA); return 1; +errout: + kfree(qname); + return ret; } static int clear_qf_name(struct super_block *sb, int qtype) @@ -1381,10 +1286,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) " when quota turned on"); return -1; } - /* - * The space will be released later when all options are confirmed - * to be correct - */ + kfree(sbi->s_qf_names[qtype]); sbi->s_qf_names[qtype] = NULL; return 1; } @@ -1404,6 +1306,9 @@ static int clear_qf_name(struct super_block *sb, int qtype) #define MOPT_QFMT MOPT_NOSUPPORT #endif #define MOPT_DATAJ 0x0080 +#define MOPT_NO_EXT2 0x0100 +#define MOPT_NO_EXT3 0x0200 +#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) static const struct mount_opts { int token; @@ -1414,25 +1319,31 @@ static const struct mount_opts { {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, - {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET}, - {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR}, {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, - {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET}, - {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR}, + {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, + MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, - {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT}, - {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT}, - {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET}, + {Opt_delalloc, EXT4_MOUNT_DELALLOC, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, + {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, + MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT}, + {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, + MOPT_EXT4_ONLY | MOPT_SET}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | - EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET}, - {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET}, + EXT4_MOUNT_JOURNAL_CHECKSUM), + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, - {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET}, - {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR}, + {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, + MOPT_NO_EXT2 | MOPT_SET}, + {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, + MOPT_NO_EXT2 | MOPT_CLEAR}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, @@ -1444,9 +1355,14 @@ static const struct mount_opts { {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, {Opt_stripe, 0, MOPT_GTE0}, - {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, - {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, - {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, + {Opt_resuid, 0, MOPT_GTE0}, + {Opt_resgid, 0, MOPT_GTE0}, + {Opt_journal_dev, 0, MOPT_GTE0}, + {Opt_journal_ioprio, 0, MOPT_GTE0}, + {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, + MOPT_NO_EXT2 | MOPT_DATAJ}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -1496,8 +1412,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, else if (token == Opt_offgrpjquota) return clear_qf_name(sb, GRPQUOTA); #endif - if (args->from && match_int(args, &arg)) - return -1; switch (token) { case Opt_noacl: case Opt_nouser_xattr: @@ -1506,138 +1420,149 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, case Opt_sb: return 1; /* handled by get_sb_block() */ case Opt_removed: - ext4_msg(sb, KERN_WARNING, - "Ignoring removed %s option", opt); + ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt); + return 1; + case Opt_abort: + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + return 1; + case Opt_i_version: + sb->s_flags |= MS_I_VERSION; return 1; - case Opt_resuid: + } + + for (m = ext4_mount_opts; m->token != Opt_err; m++) + if (token == m->token) + break; + + if (m->token == Opt_err) { + ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " + "or missing value", opt); + return -1; + } + + if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { + ext4_msg(sb, KERN_ERR, + "Mount option \"%s\" incompatible with ext2", opt); + return -1; + } + if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { + ext4_msg(sb, KERN_ERR, + "Mount option \"%s\" incompatible with ext3", opt); + return -1; + } + + if (args->from && match_int(args, &arg)) + return -1; + if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) + return -1; + if (m->flags & MOPT_EXPLICIT) + set_opt2(sb, EXPLICIT_DELALLOC); + if (m->flags & MOPT_CLEAR_ERR) + clear_opt(sb, ERRORS_MASK); + if (token == Opt_noquota && sb_any_quota_loaded(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); + return -1; + } + + if (m->flags & MOPT_NOSUPPORT) { + ext4_msg(sb, KERN_ERR, "%s option not supported", opt); + } else if (token == Opt_commit) { + if (arg == 0) + arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * arg; + } else if (token == Opt_max_batch_time) { + if (arg == 0) + arg = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = arg; + } else if (token == Opt_min_batch_time) { + sbi->s_min_batch_time = arg; + } else if (token == Opt_inode_readahead_blks) { + if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) { + ext4_msg(sb, KERN_ERR, + "EXT4-fs: inode_readahead_blks must be " + "0 or a power of 2 smaller than 2^31"); + return -1; + } + sbi->s_inode_readahead_blks = arg; + } else if (token == Opt_init_itable) { + set_opt(sb, INIT_INODE_TABLE); + if (!args->from) + arg = EXT4_DEF_LI_WAIT_MULT; + sbi->s_li_wait_mult = arg; + } else if (token == Opt_max_dir_size_kb) { + sbi->s_max_dir_size_kb = arg; + } else if (token == Opt_stripe) { + sbi->s_stripe = arg; + } else if (token == Opt_resuid) { uid = make_kuid(current_user_ns(), arg); if (!uid_valid(uid)) { ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg); return -1; } sbi->s_resuid = uid; - return 1; - case Opt_resgid: + } else if (token == Opt_resgid) { gid = make_kgid(current_user_ns(), arg); if (!gid_valid(gid)) { ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg); return -1; } sbi->s_resgid = gid; - return 1; - case Opt_abort: - sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; - return 1; - case Opt_i_version: - sb->s_flags |= MS_I_VERSION; - return 1; - case Opt_journal_dev: + } else if (token == Opt_journal_dev) { if (is_remount) { ext4_msg(sb, KERN_ERR, "Cannot specify journal on remount"); return -1; } *journal_devnum = arg; - return 1; - case Opt_journal_ioprio: - if (arg < 0 || arg > 7) - return -1; - *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); - return 1; - } - - for (m = ext4_mount_opts; m->token != Opt_err; m++) { - if (token != m->token) - continue; - if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) - return -1; - if (m->flags & MOPT_EXPLICIT) - set_opt2(sb, EXPLICIT_DELALLOC); - if (m->flags & MOPT_CLEAR_ERR) - clear_opt(sb, ERRORS_MASK); - if (token == Opt_noquota && sb_any_quota_loaded(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot change quota " - "options when quota turned on"); + } else if (token == Opt_journal_ioprio) { + if (arg > 7) { + ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" + " (must be 0-7)"); return -1; } - - if (m->flags & MOPT_NOSUPPORT) { - ext4_msg(sb, KERN_ERR, "%s option not supported", opt); - } else if (token == Opt_commit) { - if (arg == 0) - arg = JBD2_DEFAULT_MAX_COMMIT_AGE; - sbi->s_commit_interval = HZ * arg; - } else if (token == Opt_max_batch_time) { - if (arg == 0) - arg = EXT4_DEF_MAX_BATCH_TIME; - sbi->s_max_batch_time = arg; - } else if (token == Opt_min_batch_time) { - sbi->s_min_batch_time = arg; - } else if (token == Opt_inode_readahead_blks) { - if (arg > (1 << 30)) - return -1; - if (arg && !is_power_of_2(arg)) { + *journal_ioprio = + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); + } else if (m->flags & MOPT_DATAJ) { + if (is_remount) { + if (!sbi->s_journal) + ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); + else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) { ext4_msg(sb, KERN_ERR, - "EXT4-fs: inode_readahead_blks" - " must be a power of 2"); - return -1; - } - sbi->s_inode_readahead_blks = arg; - } else if (token == Opt_init_itable) { - set_opt(sb, INIT_INODE_TABLE); - if (!args->from) - arg = EXT4_DEF_LI_WAIT_MULT; - sbi->s_li_wait_mult = arg; - } else if (token == Opt_max_dir_size_kb) { - sbi->s_max_dir_size_kb = arg; - } else if (token == Opt_stripe) { - sbi->s_stripe = arg; - } else if (m->flags & MOPT_DATAJ) { - if (is_remount) { - if (!sbi->s_journal) - ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); - else if (test_opt(sb, DATA_FLAGS) != - m->mount_opt) { - ext4_msg(sb, KERN_ERR, "Cannot change data mode on remount"); - return -1; - } - } else { - clear_opt(sb, DATA_FLAGS); - sbi->s_mount_opt |= m->mount_opt; - } -#ifdef CONFIG_QUOTA - } else if (m->flags & MOPT_QFMT) { - if (sb_any_quota_loaded(sb) && - sbi->s_jquota_fmt != m->mount_opt) { - ext4_msg(sb, KERN_ERR, "Cannot " - "change journaled quota options " - "when quota turned on"); return -1; } - sbi->s_jquota_fmt = m->mount_opt; -#endif } else { - if (!args->from) - arg = 1; - if (m->flags & MOPT_CLEAR) - arg = !arg; - else if (unlikely(!(m->flags & MOPT_SET))) { - ext4_msg(sb, KERN_WARNING, - "buggy handling of option %s", opt); - WARN_ON(1); - return -1; - } - if (arg != 0) - sbi->s_mount_opt |= m->mount_opt; - else - sbi->s_mount_opt &= ~m->mount_opt; + clear_opt(sb, DATA_FLAGS); + sbi->s_mount_opt |= m->mount_opt; } - return 1; +#ifdef CONFIG_QUOTA + } else if (m->flags & MOPT_QFMT) { + if (sb_any_quota_loaded(sb) && + sbi->s_jquota_fmt != m->mount_opt) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled " + "quota options when quota turned on"); + return -1; + } + sbi->s_jquota_fmt = m->mount_opt; +#endif + } else { + if (!args->from) + arg = 1; + if (m->flags & MOPT_CLEAR) + arg = !arg; + else if (unlikely(!(m->flags & MOPT_SET))) { + ext4_msg(sb, KERN_WARNING, + "buggy handling of option %s", opt); + WARN_ON(1); + return -1; + } + if (arg != 0) + sbi->s_mount_opt |= m->mount_opt; + else + sbi->s_mount_opt &= ~m->mount_opt; } - ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " - "or missing value", opt); - return -1; + return 1; } static int parse_options(char *options, struct super_block *sb, @@ -2776,7 +2701,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) break; } - if (group == ngroups) + if (group >= ngroups) ret = 1; if (!ret) { @@ -3016,33 +2941,34 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, return elr; } -static int ext4_register_li_request(struct super_block *sb, - ext4_group_t first_not_zeroed) +int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_li_request *elr; + struct ext4_li_request *elr = NULL; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; int ret = 0; + mutex_lock(&ext4_li_mtx); if (sbi->s_li_request != NULL) { /* * Reset timeout so it can be computed again, because * s_li_wait_mult might have changed. */ sbi->s_li_request->lr_timeout = 0; - return 0; + goto out; } if (first_not_zeroed == ngroups || (sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) - return 0; + goto out; elr = ext4_li_request_new(sb, first_not_zeroed); - if (!elr) - return -ENOMEM; - - mutex_lock(&ext4_li_mtx); + if (!elr) { + ret = -ENOMEM; + goto out; + } if (NULL == ext4_li_info) { ret = ext4_li_info_new(); @@ -3379,7 +3305,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif - set_opt(sb, MBLK_IO_SUBMIT); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) @@ -3772,6 +3697,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_max_writeback_mb_bump = 128; sbi->s_extent_max_zeroout_kb = 32; + /* Register extent status tree shrinker */ + ext4_es_register_shrinker(sb); + /* * set up enough so that it can read an inode */ @@ -4008,7 +3936,7 @@ no_journal: !(sb->s_flags & MS_RDONLY)) { err = ext4_enable_quotas(sb); if (err) - goto failed_mount7; + goto failed_mount8; } #endif /* CONFIG_QUOTA */ @@ -4035,6 +3963,10 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; +#ifdef CONFIG_QUOTA +failed_mount8: + kobject_del(&sbi->s_kobj); +#endif failed_mount7: ext4_unregister_li_request(sb); failed_mount6: @@ -4476,16 +4408,12 @@ static void ext4_clear_journal_err(struct super_block *sb, int ext4_force_commit(struct super_block *sb) { journal_t *journal; - int ret = 0; if (sb->s_flags & MS_RDONLY) return 0; journal = EXT4_SB(sb)->s_journal; - if (journal) - ret = ext4_journal_force_commit(journal); - - return ret; + return ext4_journal_force_commit(journal); } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -4588,7 +4516,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err = 0; #ifdef CONFIG_QUOTA - int i; + int i, j; #endif char *orig_data = kstrdup(data, GFP_KERNEL); @@ -4604,7 +4532,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) - old_opts.s_qf_names[i] = sbi->s_qf_names[i]; + if (sbi->s_qf_names[i]) { + old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!old_opts.s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(old_opts.s_qf_names[j]); + return -ENOMEM; + } + } else + old_opts.s_qf_names[i] = NULL; #endif if (sbi->s_journal && sbi->s_journal->j_task->io_context) journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; @@ -4737,9 +4674,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - if (old_opts.s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(old_opts.s_qf_names[i]); + kfree(old_opts.s_qf_names[i]); if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); @@ -4768,9 +4703,7 @@ restore_opts: #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(sbi->s_qf_names[i]); + kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif @@ -4835,7 +4768,7 @@ static int ext4_write_dquot(struct dquot *dquot) struct inode *inode; inode = dquot_to_inode(dquot); - handle = ext4_journal_start(inode, + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4851,7 +4784,7 @@ static int ext4_acquire_dquot(struct dquot *dquot) int ret, err; handle_t *handle; - handle = ext4_journal_start(dquot_to_inode(dquot), + handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4867,7 +4800,7 @@ static int ext4_release_dquot(struct dquot *dquot) int ret, err; handle_t *handle; - handle = ext4_journal_start(dquot_to_inode(dquot), + handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ @@ -4899,7 +4832,7 @@ static int ext4_write_info(struct super_block *sb, int type) handle_t *handle; /* Data block + inode block */ - handle = ext4_journal_start(sb->s_root->d_inode, 2); + handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); @@ -5005,9 +4938,9 @@ static int ext4_enable_quotas(struct super_block *sb) DQUOT_USAGE_ENABLED); if (err) { ext4_warning(sb, - "Failed to enable quota (type=%d) " - "tracking. Please run e2fsck to fix.", - type); + "Failed to enable quota tracking " + "(type=%d, err=%d). Please run " + "e2fsck to fix.", type, err); return err; } } @@ -5045,7 +4978,7 @@ static int ext4_quota_off(struct super_block *sb, int type) /* Update modification times of quota files when userspace can * start looking at them */ - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto out; inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3a91ebc..3a120b2 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -549,7 +549,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, error = ext4_handle_dirty_xattr_block(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); - dquot_free_block(inode, 1); + dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); } @@ -832,7 +832,8 @@ inserted: else { /* The old block is released after updating the inode. */ - error = dquot_alloc_block(inode, 1); + error = dquot_alloc_block(inode, + EXT4_C2B(EXT4_SB(sb), 1)); if (error) goto cleanup; error = ext4_journal_get_write_access(handle, @@ -886,17 +887,18 @@ inserted: (unsigned long long)block); new_bh = sb_getblk(sb, block); - if (!new_bh) { + if (unlikely(!new_bh)) { + error = -ENOMEM; getblk_failed: ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA); - error = -EIO; goto cleanup; } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, new_bh); if (error) { unlock_buffer(new_bh); + error = -EIO; goto getblk_failed; } memcpy(new_bh->b_data, s->base, new_bh->b_size); @@ -928,7 +930,7 @@ cleanup: return error; cleanup_dquot: - dquot_free_block(inode, 1); + dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); goto cleanup; bad_block: @@ -1164,17 +1166,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, { handle_t *handle; int error, retries = 0; - int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + int credits = ext4_jbd2_credits_xattr(inode); retry: - /* - * In case of inline data, we may push out the data to a block, - * So reserve the journal space first. - */ - if (ext4_has_inline_data(inode)) - credits += ext4_writepage_trans_blocks(inode) + 1; - - handle = ext4_journal_start(inode, credits); + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 69eda78..aa25deb 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -125,74 +125,6 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); -extern int ext4_has_inline_data(struct inode *inode); -extern int ext4_get_inline_size(struct inode *inode); -extern int ext4_get_max_inline_size(struct inode *inode); -extern int ext4_find_inline_data_nolock(struct inode *inode); -extern void ext4_write_inline_data(struct inode *inode, - struct ext4_iloc *iloc, - void *buffer, loff_t pos, - unsigned int len); -extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); -extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); -extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); - -extern int ext4_readpage_inline(struct inode *inode, struct page *page); -extern int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep); -extern int ext4_write_inline_data_end(struct inode *inode, - loff_t pos, unsigned len, - unsigned copied, - struct page *page); -extern struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page); -extern int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata); -extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page); -extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode); -extern int ext4_try_create_inline_dir(handle_t *handle, - struct inode *parent, - struct inode *inode); -extern int ext4_read_inline_dir(struct file *filp, - void *dirent, filldir_t filldir, - int *has_inline_data); -extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *has_inline_data); -extern int ext4_delete_inline_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh, - int *has_inline_data); -extern int empty_inline_dir(struct inode *dir, int *has_inline_data); -extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, - struct ext4_dir_entry_2 **parent_de, - int *retval); -extern int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline); -extern int ext4_try_to_evict_inline_data(handle_t *handle, - struct inode *inode, - int needed); -extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); - -extern int ext4_convert_inline_data(struct inode *inode); - #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr); diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index e95b949..137af42 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -191,15 +191,14 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) retval = f2fs_getxattr(inode, name_index, "", value, retval); } - if (retval < 0) { - if (retval == -ENODATA) - acl = NULL; - else - acl = ERR_PTR(retval); - } else { + if (retval > 0) acl = f2fs_acl_from_disk(value, retval); - } + else if (retval == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(retval); kfree(value); + if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6ef36c3..2b6fc13 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -72,22 +72,22 @@ static int f2fs_write_meta_page(struct page *page, { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int err; - wait_on_page_writeback(page); - - err = write_meta_page(sbi, page, wbc); - if (err) { + /* Should not write any meta pages, if any IO error was occurred */ + if (wbc->for_reclaim || + is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) { + dec_page_count(sbi, F2FS_DIRTY_META); wbc->pages_skipped++; set_page_dirty(page); + return AOP_WRITEPAGE_ACTIVATE; } - dec_page_count(sbi, F2FS_DIRTY_META); + wait_on_page_writeback(page); - /* In this case, we should not unlock this page */ - if (err != AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); - return err; + write_meta_page(sbi, page); + dec_page_count(sbi, F2FS_DIRTY_META); + unlock_page(page); + return 0; } static int f2fs_write_meta_pages(struct address_space *mapping, @@ -138,7 +138,10 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, BUG_ON(page->mapping != mapping); BUG_ON(!PageDirty(page)); clear_page_dirty_for_io(page); - f2fs_write_meta_page(page, &wbc); + if (f2fs_write_meta_page(page, &wbc)) { + unlock_page(page); + break; + } if (nwritten++ >= nr_to_write) break; } @@ -161,7 +164,6 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(sbi, F2FS_DIRTY_META); - F2FS_SET_SB_DIRT(sbi); return 1; } return 0; @@ -214,22 +216,13 @@ retry: goto retry; } new->ino = ino; - INIT_LIST_HEAD(&new->list); /* add new_oentry into list which is sorted by inode number */ - if (orphan) { - struct orphan_inode_entry *prev; - - /* get previous entry */ - prev = list_entry(orphan->list.prev, typeof(*prev), list); - if (&prev->list != head) - /* insert new orphan inode entry */ - list_add(&new->list, &prev->list); - else - list_add(&new->list, head); - } else { + if (orphan) + list_add(&new->list, this->prev); + else list_add_tail(&new->list, head); - } + sbi->n_orphans++; out: mutex_unlock(&sbi->orphan_inode_mutex); @@ -546,7 +539,7 @@ retry: /* * Freeze all the FS-operations for checkpoint. */ -void block_operations(struct f2fs_sb_info *sbi) +static void block_operations(struct f2fs_sb_info *sbi) { int t; struct writeback_control wbc = { @@ -718,27 +711,24 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) sbi->alloc_valid_block_count = 0; /* Here, we only have one bio having CP pack */ - if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) - sbi->sb->s_flags |= MS_RDONLY; - else - sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - clear_prefree_segments(sbi); - F2FS_RESET_SB_DIRT(sbi); + if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + clear_prefree_segments(sbi); + F2FS_RESET_SB_DIRT(sbi); + } } /* * We guarantee that this checkpoint procedure should not fail. */ -void write_checkpoint(struct f2fs_sb_info *sbi, bool blocked, bool is_umount) +void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; - if (!blocked) { - mutex_lock(&sbi->cp_mutex); - block_operations(sbi); - } + mutex_lock(&sbi->cp_mutex); + block_operations(sbi); f2fs_submit_bio(sbi, DATA, true); f2fs_submit_bio(sbi, NODE, true); @@ -772,7 +762,7 @@ void init_orphan_info(struct f2fs_sb_info *sbi) sbi->n_orphans = 0; } -int create_checkpoint_caches(void) +int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", sizeof(struct orphan_inode_entry), NULL); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3aa5ce7..7bd22a2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -547,6 +547,15 @@ redirty_out: #define MAX_DESIRED_PAGES_WP 4096 +static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -563,7 +572,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!S_ISDIR(inode->i_mode)) mutex_lock(&sbi->writepages); - ret = generic_writepages(mapping, wbc); + ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); if (!S_ISDIR(inode->i_mode)) mutex_unlock(&sbi->writepages); f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); @@ -689,6 +698,11 @@ static int f2fs_set_data_page_dirty(struct page *page) return 0; } +static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, get_data_block_ro); +} + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, @@ -700,4 +714,5 @@ const struct address_space_operations f2fs_dblock_aops = { .invalidatepage = f2fs_invalidate_data_page, .releasepage = f2fs_release_data_page, .direct_IO = f2fs_direct_IO, + .bmap = f2fs_bmap, }; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0e0380a..025b9e2 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -26,6 +26,7 @@ static LIST_HEAD(f2fs_stat_list); static struct dentry *debugfs_root; +static DEFINE_MUTEX(f2fs_stat_mutex); static void update_general_status(struct f2fs_sb_info *sbi) { @@ -180,18 +181,16 @@ static int stat_show(struct seq_file *s, void *v) int i = 0; int j; + mutex_lock(&f2fs_stat_mutex); list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { + char devname[BDEVNAME_SIZE]; - mutex_lock(&si->stat_lock); - if (!si->sbi) { - mutex_unlock(&si->stat_lock); - continue; - } update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++); - seq_printf(s, "[SB: 1] [CP: 2] [NAT: %d] [SIT: %d] ", - si->nat_area_segs, si->sit_area_segs); + seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n", + bdevname(si->sbi->sb->s_bdev, devname), i++); + seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", + si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", @@ -286,8 +285,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n", (si->base_mem + si->cache_mem) >> 10, si->base_mem >> 10, si->cache_mem >> 10); - mutex_unlock(&si->stat_lock); } + mutex_unlock(&f2fs_stat_mutex); return 0; } @@ -303,7 +302,7 @@ static const struct file_operations stat_fops = { .release = single_release, }; -static int init_stats(struct f2fs_sb_info *sbi) +int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; @@ -313,9 +312,6 @@ static int init_stats(struct f2fs_sb_info *sbi) return -ENOMEM; si = sbi->stat_info; - mutex_init(&si->stat_lock); - list_add_tail(&si->stat_list, &f2fs_stat_list); - si->all_area_segs = le32_to_cpu(raw_super->segment_count); si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); @@ -325,21 +321,11 @@ static int init_stats(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); si->sbi = sbi; - return 0; -} -int f2fs_build_stats(struct f2fs_sb_info *sbi) -{ - int retval; - - retval = init_stats(sbi); - if (retval) - return retval; - - if (!debugfs_root) - debugfs_root = debugfs_create_dir("f2fs", NULL); + mutex_lock(&f2fs_stat_mutex); + list_add_tail(&si->stat_list, &f2fs_stat_list); + mutex_unlock(&f2fs_stat_mutex); - debugfs_create_file("status", S_IRUGO, debugfs_root, NULL, &stat_fops); return 0; } @@ -347,14 +333,22 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = sbi->stat_info; + mutex_lock(&f2fs_stat_mutex); list_del(&si->stat_list); - mutex_lock(&si->stat_lock); - si->sbi = NULL; - mutex_unlock(&si->stat_lock); + mutex_unlock(&f2fs_stat_mutex); + kfree(sbi->stat_info); } -void destroy_root_stats(void) +void __init f2fs_create_root_stats(void) +{ + debugfs_root = debugfs_create_dir("f2fs", NULL); + if (debugfs_root) + debugfs_create_file("status", S_IRUGO, debugfs_root, + NULL, &stat_fops); +} + +void f2fs_destroy_root_stats(void) { debugfs_remove_recursive(debugfs_root); debugfs_root = NULL; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index fda0bcc..a1f3844 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -265,7 +265,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, mutex_unlock_op(sbi, DENTRY_OPS); } -void init_dent_inode(struct dentry *dentry, struct page *ipage) +void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_node *rn; @@ -274,20 +274,19 @@ void init_dent_inode(struct dentry *dentry, struct page *ipage) wait_on_page_writeback(ipage); - /* copy dentry info. to this inode page */ + /* copy name info. to this inode page */ rn = (struct f2fs_node *)page_address(ipage); - rn->i.i_namelen = cpu_to_le32(dentry->d_name.len); - memcpy(rn->i.i_name, dentry->d_name.name, dentry->d_name.len); + rn->i.i_namelen = cpu_to_le32(name->len); + memcpy(rn->i.i_name, name->name, name->len); set_page_dirty(ipage); } -static int init_inode_metadata(struct inode *inode, struct dentry *dentry) +static int init_inode_metadata(struct inode *inode, + struct inode *dir, const struct qstr *name) { - struct inode *dir = dentry->d_parent->d_inode; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { int err; - err = new_inode_page(inode, dentry); + err = new_inode_page(inode, name); if (err) return err; @@ -310,7 +309,7 @@ static int init_inode_metadata(struct inode *inode, struct dentry *dentry) if (IS_ERR(ipage)) return PTR_ERR(ipage); set_cold_node(inode, ipage); - init_dent_inode(dentry, ipage); + init_dent_inode(name, ipage); f2fs_put_page(ipage, 1); } if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { @@ -371,7 +370,7 @@ next: goto next; } -int f2fs_add_link(struct dentry *dentry, struct inode *inode) +int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) { unsigned int bit_pos; unsigned int level; @@ -380,17 +379,15 @@ int f2fs_add_link(struct dentry *dentry, struct inode *inode) f2fs_hash_t dentry_hash; struct f2fs_dir_entry *de; unsigned int nbucket, nblock; - struct inode *dir = dentry->d_parent->d_inode; struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - const char *name = dentry->d_name.name; - size_t namelen = dentry->d_name.len; + size_t namelen = name->len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; int slots = GET_DENTRY_SLOTS(namelen); int err = 0; int i; - dentry_hash = f2fs_dentry_hash(name, dentry->d_name.len); + dentry_hash = f2fs_dentry_hash(name->name, name->len); level = 0; current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -433,7 +430,7 @@ start: ++level; goto start; add_dentry: - err = init_inode_metadata(inode, dentry); + err = init_inode_metadata(inode, dir, name); if (err) goto fail; @@ -442,7 +439,7 @@ add_dentry: de = &dentry_blk->dentry[bit_pos]; de->hash_code = dentry_hash; de->name_len = cpu_to_le16(namelen); - memcpy(dentry_blk->filename[bit_pos], name, namelen); + memcpy(dentry_blk->filename[bit_pos], name->name, name->len); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode); for (i = 0; i < slots; i++) @@ -503,7 +500,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, } if (inode) { - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_ctime = CURRENT_TIME; drop_nlink(inode); if (S_ISDIR(inode->i_mode)) { drop_nlink(inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 13c6dfb..cc2213a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -104,6 +104,20 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) } /* + * ioctl commands + */ +#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS +#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#endif + +/* * For INODE and NODE manager */ #define XATTR_NODE_OFFSET (-1) /* @@ -141,7 +155,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ - unsigned long long data_version;/* lastes version of data for fsync */ + unsigned long long data_version;/* latest version of data for fsync */ atomic_t dirty_dents; /* # of dirty dentry pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ @@ -211,11 +225,11 @@ struct dnode_of_data { static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, struct page *ipage, struct page *npage, nid_t nid) { + memset(dn, 0, sizeof(*dn)); dn->inode = inode; dn->inode_page = ipage; dn->node_page = npage; dn->nid = nid; - dn->inode_page_locked = 0; } /* @@ -573,6 +587,14 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } +static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) +{ + unsigned int pages_per_sec = sbi->segs_per_sec * + (1 << sbi->log_blocks_per_seg); + return ((get_pages(sbi, block_type) + pages_per_sec - 1) + >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; +} + static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) { block_t ret; @@ -842,12 +864,12 @@ void f2fs_truncate(struct inode *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); long f2fs_ioctl(struct file *, unsigned int, unsigned long); +long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); /* * inode.c */ void f2fs_set_inode_flags(struct inode *); -struct inode *f2fs_iget_nowait(struct super_block *, unsigned long); struct inode *f2fs_iget(struct super_block *, unsigned long); void update_inode(struct inode *, struct page *); int f2fs_write_inode(struct inode *, struct writeback_control *); @@ -867,16 +889,24 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); ino_t f2fs_inode_by_name(struct inode *, struct qstr *); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); -void init_dent_inode(struct dentry *, struct page *); -int f2fs_add_link(struct dentry *, struct inode *); +void init_dent_inode(const struct qstr *, struct page *); +int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); int f2fs_make_empty(struct inode *, struct inode *); bool f2fs_empty_dir(struct inode *); +static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) +{ + return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name, + inode); +} + /* * super.c */ int f2fs_sync_fs(struct super_block *, int); +extern __printf(3, 4) +void f2fs_msg(struct super_block *, const char *, const char *, ...); /* * hash.c @@ -894,7 +924,7 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int remove_inode_page(struct inode *); -int new_inode_page(struct inode *, struct dentry *); +int new_inode_page(struct inode *, const struct qstr *); struct page *new_node_page(struct dnode_of_data *, unsigned int); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); @@ -912,7 +942,7 @@ int restore_node_summary(struct f2fs_sb_info *, unsigned int, void flush_nat_entries(struct f2fs_sb_info *); int build_node_manager(struct f2fs_sb_info *); void destroy_node_manager(struct f2fs_sb_info *); -int create_node_manager_caches(void); +int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* @@ -927,8 +957,7 @@ void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); struct bio *f2fs_bio_alloc(struct block_device *, int); void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync); -int write_meta_page(struct f2fs_sb_info *, struct page *, - struct writeback_control *); +void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, block_t, block_t *); void write_data_page(struct inode *, struct page *, struct dnode_of_data*, @@ -961,10 +990,9 @@ int get_valid_checkpoint(struct f2fs_sb_info *); void set_dirty_dir_page(struct inode *, struct page *); void remove_dirty_dir_inode(struct inode *); void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void block_operations(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, bool, bool); +void write_checkpoint(struct f2fs_sb_info *, bool); void init_orphan_info(struct f2fs_sb_info *); -int create_checkpoint_caches(void); +int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* @@ -984,9 +1012,9 @@ int do_write_data_page(struct page *); int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int); -int f2fs_gc(struct f2fs_sb_info *, int); +int f2fs_gc(struct f2fs_sb_info *); void build_gc_manager(struct f2fs_sb_info *); -int create_gc_caches(void); +int __init create_gc_caches(void); void destroy_gc_caches(void); /* @@ -1058,7 +1086,8 @@ struct f2fs_stat_info { int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void destroy_root_stats(void); +void __init f2fs_create_root_stats(void); +void f2fs_destroy_root_stats(void); #else #define stat_inc_call_count(si) #define stat_inc_seg_count(si, type) @@ -1068,7 +1097,8 @@ void destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void destroy_root_stats(void) { } +static inline void __init f2fs_create_root_stats(void) { } +static inline void f2fs_destroy_root_stats(void) { } #endif extern const struct file_operations f2fs_dir_operations; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7f9ea92..b7a053d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -15,6 +15,7 @@ #include <linux/writeback.h> #include <linux/falloc.h> #include <linux/types.h> +#include <linux/compat.h> #include <linux/uaccess.h> #include <linux/mount.h> @@ -96,8 +97,9 @@ out: } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = f2fs_vm_page_mkwrite, + .fault = filemap_fault, + .page_mkwrite = f2fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) @@ -137,6 +139,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; + /* guarantee free sections for fsync */ + f2fs_balance_fs(sbi); + mutex_lock(&inode->i_mutex); if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) @@ -153,11 +158,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP)) + else if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP)) need_cp = true; - if (!space_for_roll_forward(sbi)) + else if (!space_for_roll_forward(sbi)) need_cp = true; - if (need_to_sync_dir(sbi, inode)) + else if (need_to_sync_dir(sbi, inode)) need_cp = true; if (need_cp) { @@ -294,8 +299,6 @@ void f2fs_truncate(struct inode *inode) inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } - - f2fs_balance_fs(F2FS_SB(inode->i_sb)); } static int f2fs_getattr(struct vfsmount *mnt, @@ -352,6 +355,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size != i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); f2fs_truncate(inode); + f2fs_balance_fs(F2FS_SB(inode->i_sb)); } __setattr_copy(inode, attr); @@ -383,12 +387,17 @@ const struct inode_operations f2fs_file_inode_operations = { static void fill_zero(struct inode *inode, pgoff_t index, loff_t start, loff_t len) { + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; if (!len) return; + f2fs_balance_fs(sbi); + + mutex_lock_op(sbi, DATA_NEW); page = get_new_data_page(inode, index, false); + mutex_unlock_op(sbi, DATA_NEW); if (!IS_ERR(page)) { wait_on_page_writeback(page); @@ -407,6 +416,8 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + f2fs_balance_fs(sbi); + mutex_lock_op(sbi, DATA_TRUNC); set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, RDONLY_NODE); @@ -534,7 +545,6 @@ static long f2fs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file->f_path.dentry->d_inode; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); long ret; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -545,7 +555,10 @@ static long f2fs_fallocate(struct file *file, int mode, else ret = expand_inode_data(inode, offset, len, mode); - f2fs_balance_fs(sbi); + if (!ret) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + } return ret; } @@ -622,6 +635,23 @@ out: } } +#ifdef CONFIG_COMPAT +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case F2FS_IOC32_GETFLAGS: + cmd = F2FS_IOC_GETFLAGS; + break; + case F2FS_IOC32_SETFLAGS: + cmd = F2FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + const struct file_operations f2fs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -633,6 +663,9 @@ const struct file_operations f2fs_file_operations = { .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = f2fs_compat_ioctl, +#endif .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b0ec721..94b8a0c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -44,10 +44,10 @@ static int gc_thread_func(void *data) if (kthread_should_stop()) break; - f2fs_balance_fs(sbi); - - if (!test_opt(sbi, BG_GC)) + if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { + wait_ms = GC_THREAD_MAX_SLEEP_TIME; continue; + } /* * [GC triggering condition] @@ -78,7 +78,8 @@ static int gc_thread_func(void *data) sbi->bg_gc++; - if (f2fs_gc(sbi, 1) == GC_NONE) + /* if return value is not zero, no victim was selected */ + if (f2fs_gc(sbi)) wait_ms = GC_THREAD_NOGC_SLEEP_TIME; else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) wait_ms = GC_THREAD_MAX_SLEEP_TIME; @@ -90,7 +91,10 @@ static int gc_thread_func(void *data) int start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; + dev_t dev = sbi->sb->s_bdev->bd_dev; + if (!test_opt(sbi, BG_GC)) + return 0; gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) return -ENOMEM; @@ -98,9 +102,10 @@ int start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, - GC_THREAD_NAME); + "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { kfree(gc_th); + sbi->gc_thread = NULL; return -ENOMEM; } return 0; @@ -141,6 +146,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, static unsigned int get_max_cost(struct f2fs_sb_info *sbi, struct victim_sel_policy *p) { + /* SSR allocates in a segment unit */ + if (p->alloc_mode == SSR) + return 1 << sbi->log_blocks_per_seg; if (p->gc_mode == GC_GREEDY) return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; else if (p->gc_mode == GC_CB) @@ -356,7 +364,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi, sentry = get_seg_entry(sbi, segno); ret = f2fs_test_bit(offset, sentry->cur_valid_map); mutex_unlock(&sit_i->sentry_lock); - return ret ? GC_OK : GC_NEXT; + return ret; } /* @@ -364,7 +372,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. */ -static int gc_node_segment(struct f2fs_sb_info *sbi, +static void gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { bool initial = true; @@ -376,21 +384,12 @@ next_step: for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; - int err; - /* - * It makes sure that free segments are able to write - * all the dirty node pages before CP after this CP. - * So let's check the space of dirty node pages. - */ - if (should_do_checkpoint(sbi)) { - mutex_lock(&sbi->cp_mutex); - block_operations(sbi); - return GC_BLOCKED; - } + /* stop BG_GC if there is not enough free sections. */ + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) + return; - err = check_valid_map(sbi, segno, off); - if (err == GC_NEXT) + if (check_valid_map(sbi, segno, off) == 0) continue; if (initial) { @@ -420,11 +419,14 @@ next_step: }; sync_node_pages(sbi, 0, &wbc); } - return GC_DONE; } /* - * Calculate start block index that this node page contains + * Calculate start block index indicating the given node offset. + * Be careful, caller should give this node offset only indicating direct node + * blocks. If any node offsets, which point the other types of node blocks such + * as indirect or double indirect node blocks, are given, it must be a caller's + * bug. */ block_t start_bidx_of_node(unsigned int node_ofs) { @@ -459,13 +461,13 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) - return GC_NEXT; + return 0; get_node_info(sbi, nid, dni); if (sum->version != dni->version) { f2fs_put_page(node_page, 1); - return GC_NEXT; + return 0; } *nofs = ofs_of_node(node_page); @@ -473,8 +475,8 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) - return GC_NEXT; - return GC_OK; + return 0; + return 1; } static void move_data_page(struct inode *inode, struct page *page, int gc_type) @@ -515,13 +517,13 @@ out: * If the parent node is not valid or the data block address is different, * the victim data block is ignored. */ -static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct list_head *ilist, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; struct f2fs_summary *entry; block_t start_addr; - int err, off; + int off; int phase = 0; start_addr = START_BLOCK(sbi, segno); @@ -535,20 +537,11 @@ next_step: unsigned int ofs_in_node, nofs; block_t start_bidx; - /* - * It makes sure that free segments are able to write - * all the dirty node pages before CP after this CP. - * So let's check the space of dirty node pages. - */ - if (should_do_checkpoint(sbi)) { - mutex_lock(&sbi->cp_mutex); - block_operations(sbi); - err = GC_BLOCKED; - goto stop; - } + /* stop BG_GC if there is not enough free sections. */ + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) + return; - err = check_valid_map(sbi, segno, off); - if (err == GC_NEXT) + if (check_valid_map(sbi, segno, off) == 0) continue; if (phase == 0) { @@ -557,8 +550,7 @@ next_step: } /* Get an inode by ino with checking validity */ - err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs); - if (err == GC_NEXT) + if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0) continue; if (phase == 1) { @@ -570,7 +562,7 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 2) { - inode = f2fs_iget_nowait(sb, dni.ino); + inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode)) continue; @@ -598,11 +590,9 @@ next_iput: } if (++phase < 4) goto next_step; - err = GC_DONE; -stop: + if (gc_type == FG_GC) f2fs_submit_bio(sbi, DATA, true); - return err; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -616,17 +606,16 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, return ret; } -static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, +static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, struct list_head *ilist, int gc_type) { struct page *sum_page; struct f2fs_summary_block *sum; - int ret = GC_DONE; /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); if (IS_ERR(sum_page)) - return GC_ERROR; + return; /* * CP needs to lock sum_page. In this time, we don't need @@ -638,76 +627,55 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, switch (GET_SUM_TYPE((&sum->footer))) { case SUM_TYPE_NODE: - ret = gc_node_segment(sbi, sum->entries, segno, gc_type); + gc_node_segment(sbi, sum->entries, segno, gc_type); break; case SUM_TYPE_DATA: - ret = gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); + gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); break; } stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); stat_inc_call_count(sbi->stat_info); f2fs_put_page(sum_page, 0); - return ret; } -int f2fs_gc(struct f2fs_sb_info *sbi, int nGC) +int f2fs_gc(struct f2fs_sb_info *sbi) { - unsigned int segno; - int old_free_secs, cur_free_secs; - int gc_status, nfree; struct list_head ilist; + unsigned int segno, i; int gc_type = BG_GC; + int nfree = 0; + int ret = -1; INIT_LIST_HEAD(&ilist); gc_more: - nfree = 0; - gc_status = GC_NONE; + if (!(sbi->sb->s_flags & MS_ACTIVE)) + goto stop; - if (has_not_enough_free_secs(sbi)) - old_free_secs = reserved_sections(sbi); - else - old_free_secs = free_sections(sbi); + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) + gc_type = FG_GC; - while (sbi->sb->s_flags & MS_ACTIVE) { - int i; - if (has_not_enough_free_secs(sbi)) - gc_type = FG_GC; + if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) + goto stop; + ret = 0; - cur_free_secs = free_sections(sbi) + nfree; + for (i = 0; i < sbi->segs_per_sec; i++) + do_garbage_collect(sbi, segno + i, &ilist, gc_type); - /* We got free space successfully. */ - if (nGC < cur_free_secs - old_free_secs) - break; + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + nfree++; - if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) - break; + if (has_not_enough_free_secs(sbi, nfree)) + goto gc_more; - for (i = 0; i < sbi->segs_per_sec; i++) { - /* - * do_garbage_collect will give us three gc_status: - * GC_ERROR, GC_DONE, and GC_BLOCKED. - * If GC is finished uncleanly, we have to return - * the victim to dirty segment list. - */ - gc_status = do_garbage_collect(sbi, segno + i, - &ilist, gc_type); - if (gc_status != GC_DONE) - goto stop; - nfree++; - } - } + if (gc_type == FG_GC) + write_checkpoint(sbi, false); stop: - if (has_not_enough_free_secs(sbi) || gc_status == GC_BLOCKED) { - write_checkpoint(sbi, (gc_status == GC_BLOCKED), false); - if (nfree) - goto gc_more; - } mutex_unlock(&sbi->gc_mutex); put_gc_inode(&ilist); - BUG_ON(!list_empty(&ilist)); - return gc_status; + return ret; } void build_gc_manager(struct f2fs_sb_info *sbi) @@ -715,7 +683,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) DIRTY_I(sbi)->v_ops = &default_v_ops; } -int create_gc_caches(void) +int __init create_gc_caches(void) { winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", sizeof(struct inode_entry), NULL); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b026d93..30b2db0 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -8,7 +8,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#define GC_THREAD_NAME "f2fs_gc_task" #define GC_THREAD_MIN_WB_PAGES 1 /* * a threshold to determine * whether IO subsystem is idle @@ -23,15 +22,6 @@ /* Search max. number of dirty segments to select a victim segment */ #define MAX_VICTIM_SEARCH 20 -enum { - GC_NONE = 0, - GC_ERROR, - GC_OK, - GC_NEXT, - GC_BLOCKED, - GC_DONE, -}; - struct f2fs_gc_kthread { struct task_struct *f2fs_gc_task; wait_queue_head_t gc_wait_queue_head; @@ -104,14 +94,3 @@ static inline int is_idle(struct f2fs_sb_info *sbi) struct request_list *rl = &q->root_rl; return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); } - -static inline bool should_do_checkpoint(struct f2fs_sb_info *sbi) -{ - unsigned int pages_per_sec = sbi->segs_per_sec * - (1 << sbi->log_blocks_per_seg); - int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; - int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; - return free_sections(sbi) <= (node_secs + 2 * dent_secs + 2); -} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bf20b4d..ddae412 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,11 +16,6 @@ #include "f2fs.h" #include "node.h" -struct f2fs_iget_args { - u64 ino; - int on_free; -}; - void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; @@ -40,34 +35,6 @@ void f2fs_set_inode_flags(struct inode *inode) inode->i_flags |= S_DIRSYNC; } -static int f2fs_iget_test(struct inode *inode, void *data) -{ - struct f2fs_iget_args *args = data; - - if (inode->i_ino != args->ino) - return 0; - if (inode->i_state & (I_FREEING | I_WILL_FREE)) { - args->on_free = 1; - return 0; - } - return 1; -} - -struct inode *f2fs_iget_nowait(struct super_block *sb, unsigned long ino) -{ - struct f2fs_iget_args args = { - .ino = ino, - .on_free = 0 - }; - struct inode *inode = ilookup5(sb, ino, f2fs_iget_test, &args); - - if (inode) - return inode; - if (!args.on_free) - return f2fs_iget(sb, ino); - return ERR_PTR(-ENOENT); -} - static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -100,6 +67,10 @@ static int do_read_inode(struct inode *inode) inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); inode->i_generation = le32_to_cpu(ri->i_generation); + if (ri->i_addr[0]) + inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); + else + inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); fi->i_current_depth = le32_to_cpu(ri->i_current_depth); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); @@ -203,6 +174,20 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + ri->i_addr[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + ri->i_addr[1] = 0; + } else { + ri->i_addr[0] = 0; + ri->i_addr[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + ri->i_addr[2] = 0; + } + } + set_cold_node(inode, node_page); set_page_dirty(node_page); } @@ -217,6 +202,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; + if (wbc) + f2fs_balance_fs(sbi); + node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); @@ -257,6 +245,7 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; + sb_start_intwrite(inode->i_sb); set_inode_flag(F2FS_I(inode), FI_NO_ALLOC); i_size_write(inode, 0); @@ -264,6 +253,7 @@ void f2fs_evict_inode(struct inode *inode) f2fs_truncate(inode); remove_inode_page(inode); + sb_end_intwrite(inode->i_sb); no_delete: clear_inode(inode); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5066bfd..e275218 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -104,7 +104,7 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) f2fs_put_page(page, 1); continue; } - page_cache_release(page); + f2fs_put_page(page, 0); } } @@ -660,7 +660,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int err = 0, cont = 1; int level, offset[4], noffset[4]; - unsigned int nofs; + unsigned int nofs = 0; struct f2fs_node *rn; struct dnode_of_data dn; struct page *page; @@ -780,7 +780,7 @@ int remove_inode_page(struct inode *inode) return 0; } -int new_inode_page(struct inode *inode, struct dentry *dentry) +int new_inode_page(struct inode *inode, const struct qstr *name) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; @@ -790,7 +790,7 @@ int new_inode_page(struct inode *inode, struct dentry *dentry) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); mutex_lock_op(sbi, NODE_NEW); page = new_node_page(&dn, 0); - init_dent_inode(dentry, page); + init_dent_inode(name, page); mutex_unlock_op(sbi, NODE_NEW); if (IS_ERR(page)) return PTR_ERR(page); @@ -874,15 +874,11 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) return; if (read_node_page(apage, READA)) - goto unlock_out; + unlock_page(apage); - page_cache_release(apage); - return; - -unlock_out: - unlock_page(apage); release_out: - page_cache_release(apage); + f2fs_put_page(apage, 0); + return; } struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) @@ -1124,6 +1120,12 @@ static int f2fs_write_node_page(struct page *page, return 0; } +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * Be default, 512 pages (2MB), a segment size, is quite reasonable. + */ +#define COLLECT_DIRTY_NODES 512 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1131,17 +1133,16 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct block_device *bdev = sbi->sb->s_bdev; long nr_to_write = wbc->nr_to_write; - if (wbc->for_kupdate) - return 0; - - if (get_pages(sbi, F2FS_DIRTY_NODES) == 0) - return 0; - + /* First check balancing cached NAT entries */ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { - write_checkpoint(sbi, false, false); + write_checkpoint(sbi, false); return 0; } + /* collect a number of dirty node pages and write together */ + if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) + return 0; + /* if mounting is failed, skip writing node pages */ wbc->nr_to_write = bio_get_nr_vecs(bdev); sync_node_pages(sbi, 0, wbc); @@ -1732,7 +1733,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kfree(nm_i); } -int create_node_manager_caches(void) +int __init create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", sizeof(struct nat_entry), NULL); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b571fee..b235215 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -42,7 +42,7 @@ static int recover_dentry(struct page *ipage, struct inode *inode) { struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage); struct f2fs_inode *raw_inode = &(raw_node->i); - struct dentry dent, parent; + struct qstr name; struct f2fs_dir_entry *de; struct page *page; struct inode *dir; @@ -57,17 +57,15 @@ static int recover_dentry(struct page *ipage, struct inode *inode) goto out; } - parent.d_inode = dir; - dent.d_parent = &parent; - dent.d_name.len = le32_to_cpu(raw_inode->i_namelen); - dent.d_name.name = raw_inode->i_name; + name.len = le32_to_cpu(raw_inode->i_namelen); + name.name = raw_inode->i_name; - de = f2fs_find_entry(dir, &dent.d_name, &page); + de = f2fs_find_entry(dir, &name, &page); if (de) { kunmap(page); f2fs_put_page(page, 0); } else { - f2fs_add_link(&dent, inode); + err = __f2fs_add_link(dir, &name, inode); } iput(dir); out: @@ -151,7 +149,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto out; } - INIT_LIST_HEAD(&entry->list); list_add_tail(&entry->list, head); entry->blkaddr = blkaddr; } @@ -174,10 +171,9 @@ out: static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - struct list_head *this; - struct fsync_inode_entry *entry; - list_for_each(this, head) { - entry = list_entry(this, struct fsync_inode_entry, list); + struct fsync_inode_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, list) { iput(entry->inode); list_del(&entry->list); kmem_cache_free(fsync_entry_slab, entry); @@ -228,7 +224,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, f2fs_put_page(node_page, 1); /* Deallocate previous index in the node page */ - inode = f2fs_iget_nowait(sbi->sb, ino); + inode = f2fs_iget(sbi->sb, ino); if (IS_ERR(inode)) return; @@ -375,5 +371,5 @@ void recover_fsync_data(struct f2fs_sb_info *sbi) out: destroy_fsync_dnodes(sbi, &inode_list); kmem_cache_destroy(fsync_entry_slab); - write_checkpoint(sbi, false, false); + write_checkpoint(sbi, false); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index de62409..777f17e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -29,9 +29,9 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ - if (has_not_enough_free_secs(sbi)) { + if (has_not_enough_free_secs(sbi, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, 1); + f2fs_gc(sbi); } } @@ -308,7 +308,7 @@ static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, * If there is not enough reserved sections, * we should not reuse prefree segments. */ - if (has_not_enough_free_secs(sbi)) + if (has_not_enough_free_secs(sbi, 0)) return NULL_SEGNO; /* @@ -536,6 +536,23 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) } } +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + + if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0)) + return v_ops->get_victim(sbi, + &(curseg)->next_segno, BG_GC, type, SSR); + + /* For data segments, let's do SSR more intensively */ + for (; type >= CURSEG_HOT_DATA; type--) + if (v_ops->get_victim(sbi, &(curseg)->next_segno, + BG_GC, type, SSR)) + return 1; + return 0; +} + /* * flush out current segment and replace it with new segment * This function should be returned with success, otherwise BUG @@ -600,6 +617,7 @@ static void f2fs_end_io_write(struct bio *bio, int err) if (page->mapping) set_bit(AS_EIO, &page->mapping->flags); set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); + p->sbi->sb->s_flags |= MS_RDONLY; } end_page_writeback(page); dec_page_count(p->sbi, F2FS_WRITEBACK); @@ -815,15 +833,10 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); } -int write_meta_page(struct f2fs_sb_info *sbi, struct page *page, - struct writeback_control *wbc) +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) { - if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; - set_page_writeback(page); submit_write_page(sbi, page, page->index, META); - return 0; } void write_node_page(struct f2fs_sb_info *sbi, struct page *page, diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 66a288a..552dadb 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -450,29 +450,16 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) return (free_sections(sbi) < overprovision_sections(sbi)); } -static inline int get_ssr_segment(struct f2fs_sb_info *sbi, int type) +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) { - struct curseg_info *curseg = CURSEG_I(sbi, type); - return DIRTY_I(sbi)->v_ops->get_victim(sbi, - &(curseg)->next_segno, BG_GC, type, SSR); -} - -static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi) -{ - unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) * - sbi->segs_per_sec; - int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; - int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); if (sbi->por_doing) return false; - if (free_sections(sbi) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi))) - return true; - return false; + return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi))); } static inline int utilization(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 08a94c8..8c11764 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -53,6 +53,18 @@ static match_table_t f2fs_tokens = { {Opt_err, NULL}, }; +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + static void init_once(void *foo) { struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; @@ -100,7 +112,7 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_stats(sbi); stop_gc_thread(sbi); - write_checkpoint(sbi, false, true); + write_checkpoint(sbi, true); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -124,11 +136,29 @@ int f2fs_sync_fs(struct super_block *sb, int sync) return 0; if (sync) - write_checkpoint(sbi, false, false); + write_checkpoint(sbi, false); + else + f2fs_balance_fs(sbi); return 0; } +static int f2fs_freeze(struct super_block *sb) +{ + int err; + + if (sb->s_flags & MS_RDONLY) + return 0; + + err = f2fs_sync_fs(sb, 1); + return err; +} + +static int f2fs_unfreeze(struct super_block *sb) +{ + return 0; +} + static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -184,7 +214,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noacl"); #endif if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) - seq_puts(seq, ",disable_ext_indentify"); + seq_puts(seq, ",disable_ext_identify"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); @@ -199,6 +229,8 @@ static struct super_operations f2fs_sops = { .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, .sync_fs = f2fs_sync_fs, + .freeze_fs = f2fs_freeze, + .unfreeze_fs = f2fs_unfreeze, .statfs = f2fs_statfs, }; @@ -247,7 +279,8 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static int parse_options(struct f2fs_sb_info *sbi, char *options) +static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi, + char *options) { substring_t args[MAX_OPT_ARGS]; char *p; @@ -286,7 +319,8 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) break; #else case Opt_nouser_xattr: - pr_info("nouser_xattr options not supported\n"); + f2fs_msg(sb, KERN_INFO, + "nouser_xattr options not supported"); break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL @@ -295,7 +329,7 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) break; #else case Opt_noacl: - pr_info("noacl options not supported\n"); + f2fs_msg(sb, KERN_INFO, "noacl options not supported"); break; #endif case Opt_active_logs: @@ -309,8 +343,9 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) set_opt(sbi, DISABLE_EXT_IDENTIFY); break; default: - pr_err("Unrecognized mount option \"%s\" or missing value\n", - p); + f2fs_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" or missing value", + p); return -EINVAL; } } @@ -337,30 +372,53 @@ static loff_t max_file_size(unsigned bits) return result; } -static int sanity_check_raw_super(struct f2fs_super_block *raw_super) +static int sanity_check_raw_super(struct super_block *sb, + struct f2fs_super_block *raw_super) { unsigned int blocksize; - if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) + if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { + f2fs_msg(sb, KERN_INFO, + "Magic Mismatch, valid(0x%x) - read(0x%x)", + F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); return 1; + } + + /* Currently, support only 4KB page cache size */ + if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid page_cache_size (%lu), supports only 4KB\n", + PAGE_CACHE_SIZE); + return 1; + } /* Currently, support only 4KB block size */ blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); - if (blocksize != PAGE_CACHE_SIZE) + if (blocksize != F2FS_BLKSIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid blocksize (%u), supports only 4KB\n", + blocksize); return 1; + } + if (le32_to_cpu(raw_super->log_sectorsize) != - F2FS_LOG_SECTOR_SIZE) + F2FS_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); return 1; + } if (le32_to_cpu(raw_super->log_sectors_per_block) != - F2FS_LOG_SECTORS_PER_BLOCK) + F2FS_LOG_SECTORS_PER_BLOCK) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); return 1; + } return 0; } -static int sanity_check_ckpt(struct f2fs_super_block *raw_super, - struct f2fs_checkpoint *ckpt) +static int sanity_check_ckpt(struct f2fs_sb_info *sbi) { unsigned int total, fsmeta; + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -371,6 +429,11 @@ static int sanity_check_ckpt(struct f2fs_super_block *raw_super, if (fsmeta >= total) return 1; + + if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); + return 1; + } return 0; } @@ -399,6 +462,32 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); } +static int validate_superblock(struct super_block *sb, + struct f2fs_super_block **raw_super, + struct buffer_head **raw_super_buf, sector_t block) +{ + const char *super = (block == 0 ? "first" : "second"); + + /* read f2fs raw super block */ + *raw_super_buf = sb_bread(sb, block); + if (!*raw_super_buf) { + f2fs_msg(sb, KERN_ERR, "unable to read %s superblock", + super); + return 1; + } + + *raw_super = (struct f2fs_super_block *) + ((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET); + + /* sanity checking of raw super */ + if (!sanity_check_raw_super(sb, *raw_super)) + return 0; + + f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " + "in %s superblock", super); + return 1; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -413,19 +502,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; - /* set a temporary block size */ - if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) - goto free_sbi; - - /* read f2fs raw super block */ - raw_super_buf = sb_bread(sb, 0); - if (!raw_super_buf) { - err = -EIO; + /* set a block size */ + if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { + f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; } - raw_super = (struct f2fs_super_block *) - ((char *)raw_super_buf->b_data + F2FS_SUPER_OFFSET); + if (validate_superblock(sb, &raw_super, &raw_super_buf, 0)) { + brelse(raw_super_buf); + if (validate_superblock(sb, &raw_super, &raw_super_buf, 1)) + goto free_sb_buf; + } /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; @@ -438,11 +525,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi, POSIX_ACL); #endif /* parse mount options */ - if (parse_options(sbi, (char *)data)) - goto free_sb_buf; - - /* sanity checking of raw super */ - if (sanity_check_raw_super(raw_super)) + if (parse_options(sb, sbi, (char *)data)) goto free_sb_buf; sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); @@ -477,18 +560,23 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); goto free_sb_buf; } err = get_valid_checkpoint(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); goto free_meta_inode; + } /* sanity checking of checkpoint */ err = -EINVAL; - if (sanity_check_ckpt(raw_super, sbi->ckpt)) + if (sanity_check_ckpt(sbi)) { + f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); goto free_cp; + } sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); @@ -502,25 +590,28 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->dir_inode_list); spin_lock_init(&sbi->dir_inode_lock); - /* init super block */ - if (!sb_set_blocksize(sb, sbi->blocksize)) - goto free_cp; - init_orphan_info(sbi); /* setup f2fs internal modules */ err = build_segment_manager(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS segment manager"); goto free_sm; + } err = build_node_manager(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS node manager"); goto free_nm; + } build_gc_manager(sbi); /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); if (IS_ERR(sbi->node_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); err = PTR_ERR(sbi->node_inode); goto free_nm; } @@ -533,6 +624,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { + f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); err = PTR_ERR(root); goto free_node_inode; } @@ -596,7 +688,7 @@ static struct file_system_type f2fs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static int init_inodecache(void) +static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), NULL); @@ -631,14 +723,17 @@ static int __init init_f2fs_fs(void) err = create_checkpoint_caches(); if (err) goto fail; - return register_filesystem(&f2fs_fs_type); + err = register_filesystem(&f2fs_fs_type); + if (err) + goto fail; + f2fs_create_root_stats(); fail: return err; } static void __exit exit_f2fs_fs(void) { - destroy_root_stats(); + f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); destroy_gc_caches(); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 940136a..8038c04 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -318,6 +318,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, if (name_len > 255 || value_len > MAX_VALUE_LEN) return -ERANGE; + f2fs_balance_fs(sbi); + mutex_lock_op(sbi, NODE_NEW); if (!fi->i_xattr_nid) { /* Allocate new attribute block */ @@ -516,7 +516,7 @@ struct files_struct init_files = { .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, }, - .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), + .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), }; /* diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 0cf160a..1b2f6c2 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -4,12 +4,24 @@ config FUSE_FS With FUSE it is possible to implement a fully functional filesystem in a userspace program. - There's also companion library: libfuse. This library along with - utilities is available from the FUSE homepage: + There's also a companion library: libfuse2. This library is available + from the FUSE homepage: <http://fuse.sourceforge.net/> + although chances are your distribution already has that library + installed if you've installed the "fuse" package itself. See <file:Documentation/filesystems/fuse.txt> for more information. See <file:Documentation/Changes> for needed library/utility version. If you want to develop a userspace FS, or if you want to use a filesystem based on FUSE, answer Y or M. + +config CUSE + tristate "Character device in Userspace support" + depends on FUSE_FS + help + This FUSE extension allows character devices to be + implemented in userspace. + + If you want to develop or use a userspace character device + based on CUSE, answer Y or M. diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index ee8d550..6f96a8d 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -45,7 +45,6 @@ #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/slab.h> -#include <linux/spinlock.h> #include <linux/stat.h> #include <linux/module.h> @@ -63,7 +62,7 @@ struct cuse_conn { bool unrestricted_ioctl; }; -static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */ +static DEFINE_MUTEX(cuse_lock); /* protects registration */ static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; static struct class *cuse_class; @@ -92,19 +91,22 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { loff_t pos = 0; + struct iovec iov = { .iov_base = buf, .iov_len = count }; - return fuse_direct_io(file, buf, count, &pos, 0); + return fuse_direct_io(file, &iov, 1, count, &pos, 0); } static ssize_t cuse_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { loff_t pos = 0; + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + /* * No locking or generic_write_checks(), the server is * responsible for locking and sanity checks. */ - return fuse_direct_io(file, buf, count, &pos, 1); + return fuse_direct_io(file, &iov, 1, count, &pos, 1); } static int cuse_open(struct inode *inode, struct file *file) @@ -114,14 +116,14 @@ static int cuse_open(struct inode *inode, struct file *file) int rc; /* look up and get the connection */ - spin_lock(&cuse_lock); + mutex_lock(&cuse_lock); list_for_each_entry(pos, cuse_conntbl_head(devt), list) if (pos->dev->devt == devt) { fuse_conn_get(&pos->fc); cc = pos; break; } - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* dead? */ if (!cc) @@ -267,7 +269,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) { char *end = p + len; - char *key, *val; + char *uninitialized_var(key), *uninitialized_var(val); int rc; while (true) { @@ -305,14 +307,14 @@ static void cuse_gendev_release(struct device *dev) */ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) { - struct cuse_conn *cc = fc_to_cc(fc); + struct cuse_conn *cc = fc_to_cc(fc), *pos; struct cuse_init_out *arg = req->out.args[0].value; struct page *page = req->pages[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; dev_t devt; - int rc; + int rc, i; if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { @@ -356,15 +358,24 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) dev_set_drvdata(dev, cc); dev_set_name(dev, "%s", devinfo.name); + mutex_lock(&cuse_lock); + + /* make sure the device-name is unique */ + for (i = 0; i < CUSE_CONNTBL_LEN; ++i) { + list_for_each_entry(pos, &cuse_conntbl[i], list) + if (!strcmp(dev_name(pos->dev), dev_name(dev))) + goto err_unlock; + } + rc = device_add(dev); if (rc) - goto err_device; + goto err_unlock; /* register cdev */ rc = -ENOMEM; cdev = cdev_alloc(); if (!cdev) - goto err_device; + goto err_unlock; cdev->owner = THIS_MODULE; cdev->ops = &cuse_frontend_fops; @@ -377,9 +388,8 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) cc->cdev = cdev; /* make the device available */ - spin_lock(&cuse_lock); list_add(&cc->list, cuse_conntbl_head(devt)); - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* announce device availability */ dev_set_uevent_suppress(dev, 0); @@ -391,7 +401,8 @@ out: err_cdev: cdev_del(cdev); -err_device: +err_unlock: + mutex_unlock(&cuse_lock); put_device(dev); err_region: unregister_chrdev_region(devt, 1); @@ -411,7 +422,7 @@ static int cuse_send_init(struct cuse_conn *cc) BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); - req = fuse_get_req(fc); + req = fuse_get_req(fc, 1); if (IS_ERR(req)) { rc = PTR_ERR(req); goto err; @@ -441,6 +452,7 @@ static int cuse_send_init(struct cuse_conn *cc) req->out.argvar = 1; req->out.argpages = 1; req->pages[0] = page; + req->page_descs[0].length = req->out.args[1].size; req->num_pages = 1; req->end = cuse_process_init_reply; fuse_request_send_background(fc, req); @@ -520,9 +532,9 @@ static int cuse_channel_release(struct inode *inode, struct file *file) int rc; /* remove from the conntbl, no more access from this point on */ - spin_lock(&cuse_lock); + mutex_lock(&cuse_lock); list_del_init(&cc->list); - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* remove device */ if (cc->dev) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c163353..e9bdec0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -34,34 +34,67 @@ static struct fuse_conn *fuse_get_conn(struct file *file) return file->private_data; } -static void fuse_request_init(struct fuse_req *req) +static void fuse_request_init(struct fuse_req *req, struct page **pages, + struct fuse_page_desc *page_descs, + unsigned npages) { memset(req, 0, sizeof(*req)); + memset(pages, 0, sizeof(*pages) * npages); + memset(page_descs, 0, sizeof(*page_descs) * npages); INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); atomic_set(&req->count, 1); + req->pages = pages; + req->page_descs = page_descs; + req->max_pages = npages; } -struct fuse_req *fuse_request_alloc(void) +static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL); - if (req) - fuse_request_init(req); + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags); + if (req) { + struct page **pages; + struct fuse_page_desc *page_descs; + + if (npages <= FUSE_REQ_INLINE_PAGES) { + pages = req->inline_pages; + page_descs = req->inline_page_descs; + } else { + pages = kmalloc(sizeof(struct page *) * npages, flags); + page_descs = kmalloc(sizeof(struct fuse_page_desc) * + npages, flags); + } + + if (!pages || !page_descs) { + kfree(pages); + kfree(page_descs); + kmem_cache_free(fuse_req_cachep, req); + return NULL; + } + + fuse_request_init(req, pages, page_descs, npages); + } return req; } + +struct fuse_req *fuse_request_alloc(unsigned npages) +{ + return __fuse_request_alloc(npages, GFP_KERNEL); +} EXPORT_SYMBOL_GPL(fuse_request_alloc); -struct fuse_req *fuse_request_alloc_nofs(void) +struct fuse_req *fuse_request_alloc_nofs(unsigned npages) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); - if (req) - fuse_request_init(req); - return req; + return __fuse_request_alloc(npages, GFP_NOFS); } void fuse_request_free(struct fuse_req *req) { + if (req->pages != req->inline_pages) { + kfree(req->pages); + kfree(req->page_descs); + } kmem_cache_free(fuse_req_cachep, req); } @@ -97,7 +130,7 @@ static void fuse_req_init_context(struct fuse_req *req) req->in.h.pid = current->pid; } -struct fuse_req *fuse_get_req(struct fuse_conn *fc) +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) { struct fuse_req *req; sigset_t oldset; @@ -116,7 +149,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) if (!fc->connected) goto out; - req = fuse_request_alloc(); + req = fuse_request_alloc(npages); err = -ENOMEM; if (!req) goto out; @@ -165,7 +198,7 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) struct fuse_file *ff = file->private_data; spin_lock(&fc->lock); - fuse_request_init(req); + fuse_request_init(req, req->pages, req->page_descs, req->max_pages); BUG_ON(ff->reserved_req); ff->reserved_req = req; wake_up_all(&fc->reserved_req_waitq); @@ -186,13 +219,14 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) * filesystem should not have it's own file open. If deadlock is * intentional, it can still be broken by "aborting" the filesystem. */ -struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file) +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, + struct file *file) { struct fuse_req *req; atomic_inc(&fc->num_waiting); wait_event(fc->blocked_waitq, !fc->blocked); - req = fuse_request_alloc(); + req = fuse_request_alloc(0); if (!req) req = get_reserved_req(fc, file); @@ -406,9 +440,8 @@ __acquires(fc->lock) } } -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { - req->isreply = 1; spin_lock(&fc->lock); if (!fc->connected) req->out.h.error = -ENOTCONN; @@ -425,6 +458,12 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) } spin_unlock(&fc->lock); } + +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + __fuse_request_send(fc, req); +} EXPORT_SYMBOL_GPL(fuse_request_send); static void fuse_request_send_nowait_locked(struct fuse_conn *fc, @@ -491,6 +530,27 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, fuse_request_send_nowait_locked(fc, req); } +void fuse_force_forget(struct file *file, u64 nodeid) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_forget_in inarg; + + memset(&inarg, 0, sizeof(inarg)); + inarg.nlookup = 1; + req = fuse_get_req_nofail_nopages(fc, file); + req->in.h.opcode = FUSE_FORGET; + req->in.h.nodeid = nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->isreply = 0; + __fuse_request_send(fc, req); + /* ignore errors */ + fuse_put_request(fc, req); +} + /* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already @@ -692,8 +752,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *oldpage = *pagep; struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; - struct address_space *mapping; - pgoff_t index; unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); @@ -724,9 +782,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (fuse_check_page(newpage) != 0) goto out_fallback_unlock; - mapping = oldpage->mapping; - index = oldpage->index; - /* * This is a new and locked page, it shouldn't be mapped or * have any special flags on it @@ -855,11 +910,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, { unsigned i; struct fuse_req *req = cs->req; - unsigned offset = req->page_offset; - unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { int err; + unsigned offset = req->page_descs[i].offset; + unsigned count = min(nbytes, req->page_descs[i].length); err = fuse_copy_page(cs, &req->pages[i], offset, count, zeroing); @@ -867,8 +922,6 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, return err; nbytes -= count; - count = min(nbytes, (unsigned) PAGE_SIZE); - offset = 0; } return 0; } @@ -1541,29 +1594,34 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; + int num_pages; + + offset = outarg->offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + + num = outarg->size; + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; - req = fuse_get_req(fc); + num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ); + + req = fuse_get_req(fc, num_pages); if (IS_ERR(req)) return PTR_ERR(req); - offset = outarg->offset & ~PAGE_CACHE_MASK; - req->in.h.opcode = FUSE_NOTIFY_REPLY; req->in.h.nodeid = outarg->nodeid; req->in.numargs = 2; req->in.argpages = 1; - req->page_offset = offset; + req->page_descs[0].offset = offset; req->end = fuse_retrieve_end; index = outarg->offset >> PAGE_CACHE_SHIFT; - file_size = i_size_read(inode); - num = outarg->size; - if (outarg->offset > file_size) - num = 0; - else if (outarg->offset + num > file_size) - num = file_size - outarg->offset; - while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { + while (num && req->num_pages < num_pages) { struct page *page; unsigned int this_num; @@ -1573,6 +1631,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = this_num; req->num_pages++; offset = 0; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 80ba395..ff15522 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,6 +14,29 @@ #include <linux/namei.h> #include <linux/slab.h> +static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_inode *fi = get_fuse_inode(dir); + + if (!fc->do_readdirplus) + return false; + if (!fc->readdirplus_auto) + return true; + if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) + return true; + if (filp->f_pos == 0) + return true; + return false; +} + +static void fuse_advise_use_readdirplus(struct inode *dir) +{ + struct fuse_inode *fi = get_fuse_inode(dir); + + set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); +} + #if BITS_PER_LONG >= 64 static inline void fuse_dentry_settime(struct dentry *entry, u64 time) { @@ -178,7 +201,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) return -ECHILD; fc = get_fuse_conn(inode); - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return 0; @@ -219,6 +242,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) attr_version); fuse_change_entry_timeout(entry, &outarg); } + fuse_advise_use_readdirplus(inode); return 1; } @@ -271,7 +295,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, if (name->len > FUSE_NAME_MAX) goto out; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); err = PTR_ERR(req); if (IS_ERR(req)) goto out; @@ -355,6 +379,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, else fuse_invalidate_entry_cache(entry); + fuse_advise_use_readdirplus(dir); return newent; out_iput: @@ -391,7 +416,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (!forget) goto out_err; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); err = PTR_ERR(req); if (IS_ERR(req)) goto out_put_forget_req; @@ -592,7 +617,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -623,7 +648,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -647,7 +672,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -664,7 +689,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -682,7 +707,14 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) spin_lock(&fc->lock); fi->attr_version = ++fc->attr_version; - drop_nlink(inode); + /* + * If i_nlink == 0 then unlink doesn't make sense, yet this can + * happen if userspace filesystem is careless. It would be + * difficult to enforce correct nlink usage so just ignore this + * condition here + */ + if (inode->i_nlink > 0) + drop_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); @@ -696,7 +728,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -723,7 +755,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, int err; struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -776,7 +808,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_link_in inarg; struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -848,7 +880,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, struct fuse_req *req; u64 attr_version; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -985,7 +1017,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, /* * Calling into a user-controlled filesystem gives the filesystem - * daemon ptrace-like capabilities over the requester process. This + * daemon ptrace-like capabilities over the current process. This * means, that the filesystem daemon is able to record the exact * filesystem operations performed, and can also control the behavior * of the requester process in otherwise impossible ways. For example @@ -996,27 +1028,23 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, * for which the owner of the mount has ptrace privilege. This * excludes processes started by other users, suid or sgid processes. */ -int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) +int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; - int ret; if (fc->flags & FUSE_ALLOW_OTHER) return 1; - rcu_read_lock(); - ret = 0; - cred = __task_cred(task); + cred = current_cred(); if (uid_eq(cred->euid, fc->user_id) && uid_eq(cred->suid, fc->user_id) && uid_eq(cred->uid, fc->user_id) && gid_eq(cred->egid, fc->group_id) && gid_eq(cred->sgid, fc->group_id) && gid_eq(cred->gid, fc->group_id)) - ret = 1; - rcu_read_unlock(); + return 1; - return ret; + return 0; } static int fuse_access(struct inode *inode, int mask) @@ -1029,7 +1057,7 @@ static int fuse_access(struct inode *inode, int mask) if (fc->no_access) return 0; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1077,7 +1105,7 @@ static int fuse_permission(struct inode *inode, int mask) bool refreshed = false; int err = 0; - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; /* @@ -1155,19 +1183,157 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, return 0; } -static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +static int fuse_direntplus_link(struct file *file, + struct fuse_direntplus *direntplus, + u64 attr_version) { int err; + struct fuse_entry_out *o = &direntplus->entry_out; + struct fuse_dirent *dirent = &direntplus->dirent; + struct dentry *parent = file->f_path.dentry; + struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = parent->d_inode; + struct fuse_conn *fc; + struct inode *inode; + + if (!o->nodeid) { + /* + * Unlike in the case of fuse_lookup, zero nodeid does not mean + * ENOENT. Instead, it only means the userspace filesystem did + * not want to return attributes/handle for this entry. + * + * So do nothing. + */ + return 0; + } + + if (name.name[0] == '.') { + /* + * We could potentially refresh the attributes of the directory + * and its parent? + */ + if (name.len == 1) + return 0; + if (name.name[1] == '.' && name.len == 2) + return 0; + } + fc = get_fuse_conn(dir); + + name.hash = full_name_hash(name.name, name.len); + dentry = d_lookup(parent, &name); + if (dentry && dentry->d_inode) { + inode = dentry->d_inode; + if (get_node_id(inode) == o->nodeid) { + struct fuse_inode *fi; + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + + /* + * The other branch to 'found' comes via fuse_iget() + * which bumps nlookup inside + */ + goto found; + } + err = d_invalidate(dentry); + if (err) + goto out; + dput(dentry); + dentry = NULL; + } + + dentry = d_alloc(parent, &name); + err = -ENOMEM; + if (!dentry) + goto out; + + inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, + &o->attr, entry_attr_timeout(o), attr_version); + if (!inode) + goto out; + + alias = d_materialise_unique(dentry, inode); + err = PTR_ERR(alias); + if (IS_ERR(alias)) + goto out; + if (alias) { + dput(dentry); + dentry = alias; + } + +found: + fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), + attr_version); + + fuse_change_entry_timeout(dentry, o); + + err = 0; +out: + if (dentry) + dput(dentry); + return err; +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, + void *dstbuf, filldir_t filldir, u64 attr_version) +{ + struct fuse_direntplus *direntplus; + struct fuse_dirent *dirent; + size_t reclen; + int over = 0; + int ret; + + while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { + direntplus = (struct fuse_direntplus *) buf; + dirent = &direntplus->dirent; + reclen = FUSE_DIRENTPLUS_SIZE(direntplus); + + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + + if (!over) { + /* We fill entries into dstbuf only as much as + it can hold. But we still continue iterating + over remaining entries to link them. If not, + we need to send a FORGET for each of those + which we did not link. + */ + over = filldir(dstbuf, dirent->name, dirent->namelen, + file->f_pos, dirent->ino, + dirent->type); + file->f_pos = dirent->off; + } + + buf += reclen; + nbytes -= reclen; + + ret = fuse_direntplus_link(file, direntplus, attr_version); + if (ret) + fuse_force_forget(file, direntplus->entry_out.nodeid); + } + + return 0; +} + +static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +{ + int plus, err; size_t nbytes; struct page *page; struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; + u64 attr_version = 0; if (is_bad_inode(inode)) return -EIO; - req = fuse_get_req(fc); + req = fuse_get_req(fc, 1); if (IS_ERR(req)) return PTR_ERR(req); @@ -1176,17 +1342,34 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) fuse_put_request(fc, req); return -ENOMEM; } + + plus = fuse_use_readdirplus(inode, file); req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; - fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR); + req->page_descs[0].length = PAGE_SIZE; + if (plus) { + attr_version = fuse_get_attr_version(fc); + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + FUSE_READDIRPLUS); + } else { + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + FUSE_READDIR); + } fuse_request_send(fc, req); nbytes = req->out.args[0].size; err = req->out.h.error; fuse_put_request(fc, req); - if (!err) - err = parse_dirfile(page_address(page), nbytes, file, dstbuf, - filldir); + if (!err) { + if (plus) { + err = parse_dirplusfile(page_address(page), nbytes, + file, dstbuf, filldir, + attr_version); + } else { + err = parse_dirfile(page_address(page), nbytes, file, + dstbuf, filldir); + } + } __free_page(page); fuse_invalidate_attr(inode); /* atime changed */ @@ -1197,7 +1380,7 @@ static char *read_link(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); char *link; if (IS_ERR(req)) @@ -1391,7 +1574,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, loff_t oldsize; int err; - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) @@ -1410,7 +1593,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, if (attr->ia_valid & ATTR_SIZE) is_truncate = true; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1500,7 +1683,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; return fuse_update_attributes(inode, stat, NULL, NULL); @@ -1518,7 +1701,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name, if (fc->no_setxattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1557,7 +1740,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, if (fc->no_getxattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1603,13 +1786,13 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) struct fuse_getxattr_out outarg; ssize_t ret; - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; if (fc->no_listxattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1654,7 +1837,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e21d4d8..c807176 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -25,7 +25,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, struct fuse_req *req; int err; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -57,7 +57,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) return NULL; ff->fc = fc; - ff->reserved_req = fuse_request_alloc(); + ff->reserved_req = fuse_request_alloc(0); if (unlikely(!ff->reserved_req)) { kfree(ff); return NULL; @@ -368,7 +368,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; - req = fuse_get_req_nofail(fc, file); + req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); @@ -436,7 +436,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, fuse_sync_writes(inode); - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -544,7 +544,7 @@ static int fuse_readpage(struct file *file, struct page *page) */ fuse_wait_on_page_writeback(inode, page->index); - req = fuse_get_req(fc); + req = fuse_get_req(fc, 1); err = PTR_ERR(req); if (IS_ERR(req)) goto out; @@ -555,6 +555,7 @@ static int fuse_readpage(struct file *file, struct page *page) req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; + req->page_descs[0].length = count; num_read = fuse_send_read(req, file, pos, count, NULL); err = req->out.h.error; fuse_put_request(fc, req); @@ -641,6 +642,7 @@ struct fuse_fill_data { struct fuse_req *req; struct file *file; struct inode *inode; + unsigned nr_pages; }; static int fuse_readpages_fill(void *_data, struct page *page) @@ -656,16 +658,26 @@ static int fuse_readpages_fill(void *_data, struct page *page) (req->num_pages == FUSE_MAX_PAGES_PER_REQ || (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { + int nr_alloc = min_t(unsigned, data->nr_pages, + FUSE_MAX_PAGES_PER_REQ); fuse_send_readpages(req, data->file); - data->req = req = fuse_get_req(fc); + data->req = req = fuse_get_req(fc, nr_alloc); if (IS_ERR(req)) { unlock_page(page); return PTR_ERR(req); } } + + if (WARN_ON(req->num_pages >= req->max_pages)) { + fuse_put_request(fc, req); + return -EIO; + } + page_cache_get(page); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = PAGE_SIZE; req->num_pages++; + data->nr_pages--; return 0; } @@ -676,6 +688,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_data data; int err; + int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ); err = -EIO; if (is_bad_inode(inode)) @@ -683,7 +696,8 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - data.req = fuse_get_req(fc); + data.req = fuse_get_req(fc, nr_alloc); + data.nr_pages = nr_pages; err = PTR_ERR(data.req); if (IS_ERR(data.req)) goto out; @@ -786,7 +800,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, res = fuse_send_write(req, file, pos, count, NULL); - offset = req->page_offset; + offset = req->page_descs[0].offset; count = res; for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; @@ -817,7 +831,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, int err; req->in.argpages = 1; - req->page_offset = offset; + req->page_descs[0].offset = offset; do { size_t tmp; @@ -857,6 +871,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, err = 0; req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = tmp; req->num_pages++; iov_iter_advance(ii, tmp); @@ -869,11 +884,19 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); + req->num_pages < req->max_pages && offset == 0); return count > 0 ? count : err; } +static inline unsigned fuse_wr_pages(loff_t pos, size_t len) +{ + return min_t(unsigned, + ((pos + len - 1) >> PAGE_CACHE_SHIFT) - + (pos >> PAGE_CACHE_SHIFT) + 1, + FUSE_MAX_PAGES_PER_REQ); +} + static ssize_t fuse_perform_write(struct file *file, struct address_space *mapping, struct iov_iter *ii, loff_t pos) @@ -889,8 +912,9 @@ static ssize_t fuse_perform_write(struct file *file, do { struct fuse_req *req; ssize_t count; + unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii)); - req = fuse_get_req(fc); + req = fuse_get_req(fc, nr_pages); if (IS_ERR(req)) { err = PTR_ERR(req); break; @@ -1023,47 +1047,110 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } } -static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, +static inline void fuse_page_descs_length_init(struct fuse_req *req, + unsigned index, unsigned nr_pages) +{ + int i; + + for (i = index; i < index + nr_pages; i++) + req->page_descs[i].length = PAGE_SIZE - + req->page_descs[i].offset; +} + +static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) +{ + return (unsigned long)ii->iov->iov_base + ii->iov_offset; +} + +static inline size_t fuse_get_frag_size(const struct iov_iter *ii, + size_t max_size) +{ + return min(iov_iter_single_seg_count(ii), max_size); +} + +static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, size_t *nbytesp, int write) { - size_t nbytes = *nbytesp; - unsigned long user_addr = (unsigned long) buf; - unsigned offset = user_addr & ~PAGE_MASK; - int npages; + size_t nbytes = 0; /* # bytes already packed in req */ /* Special case for kernel I/O: can copy directly into the buffer */ if (segment_eq(get_fs(), KERNEL_DS)) { + unsigned long user_addr = fuse_get_user_addr(ii); + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + if (write) req->in.args[1].value = (void *) user_addr; else req->out.args[0].value = (void *) user_addr; + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; return 0; } - nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); - npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); - npages = get_user_pages_fast(user_addr, npages, !write, req->pages); - if (npages < 0) - return npages; + while (nbytes < *nbytesp && req->num_pages < req->max_pages) { + unsigned npages; + unsigned long user_addr = fuse_get_user_addr(ii); + unsigned offset = user_addr & ~PAGE_MASK; + size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes); + int ret; + + unsigned n = req->max_pages - req->num_pages; + frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT); + + npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + npages = clamp(npages, 1U, n); + + ret = get_user_pages_fast(user_addr, npages, !write, + &req->pages[req->num_pages]); + if (ret < 0) + return ret; - req->num_pages = npages; - req->page_offset = offset; + npages = ret; + frag_size = min_t(size_t, frag_size, + (npages << PAGE_SHIFT) - offset); + iov_iter_advance(ii, frag_size); + + req->page_descs[req->num_pages].offset = offset; + fuse_page_descs_length_init(req, req->num_pages, npages); + + req->num_pages += npages; + req->page_descs[req->num_pages - 1].length -= + (npages << PAGE_SHIFT) - offset - frag_size; + + nbytes += frag_size; + } if (write) req->in.argpages = 1; else req->out.argpages = 1; - nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; - *nbytesp = min(*nbytesp, nbytes); + *nbytesp = nbytes; return 0; } -ssize_t fuse_direct_io(struct file *file, const char __user *buf, - size_t count, loff_t *ppos, int write) +static inline int fuse_iter_npages(const struct iov_iter *ii_p) +{ + struct iov_iter ii = *ii_p; + int npages = 0; + + while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) { + unsigned long user_addr = fuse_get_user_addr(&ii); + unsigned offset = user_addr & ~PAGE_MASK; + size_t frag_size = iov_iter_single_seg_count(&ii); + + npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + iov_iter_advance(&ii, frag_size); + } + + return min(npages, FUSE_MAX_PAGES_PER_REQ); +} + +ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, + unsigned long nr_segs, size_t count, loff_t *ppos, + int write) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -1071,8 +1158,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, loff_t pos = *ppos; ssize_t res = 0; struct fuse_req *req; + struct iov_iter ii; + + iov_iter_init(&ii, iov, nr_segs, count, 0); - req = fuse_get_req(fc); + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1080,7 +1170,7 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, size_t nres; fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); - int err = fuse_get_user_pages(req, buf, &nbytes, write); + int err = fuse_get_user_pages(req, &ii, &nbytes, write); if (err) { res = err; break; @@ -1103,12 +1193,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, count -= nres; res += nres; pos += nres; - buf += nres; if (nres != nbytes) break; if (count) { fuse_put_request(fc, req); - req = fuse_get_req(fc); + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) break; } @@ -1122,8 +1211,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, } EXPORT_SYMBOL_GPL(fuse_direct_io); -static ssize_t fuse_direct_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { ssize_t res; struct inode *inode = file->f_path.dentry->d_inode; @@ -1131,22 +1220,31 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf, if (is_bad_inode(inode)) return -EIO; - res = fuse_direct_io(file, buf, count, ppos, 0); + res = fuse_direct_io(file, iov, nr_segs, iov_length(iov, nr_segs), + ppos, 0); fuse_invalidate_attr(inode); return res; } -static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t fuse_direct_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct iovec iov = { .iov_base = buf, .iov_len = count }; + return __fuse_direct_read(file, &iov, 1, ppos); +} + +static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { struct inode *inode = file->f_path.dentry->d_inode; + size_t count = iov_length(iov, nr_segs); ssize_t res; res = generic_write_checks(file, ppos, &count, 0); if (!res) { - res = fuse_direct_io(file, buf, count, ppos, 1); + res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1); if (res > 0) fuse_write_update_size(inode, *ppos); } @@ -1159,6 +1257,7 @@ static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, static ssize_t fuse_direct_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; struct inode *inode = file->f_path.dentry->d_inode; ssize_t res; @@ -1167,7 +1266,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, /* Don't allow parallel writes to the same file */ mutex_lock(&inode->i_mutex); - res = __fuse_direct_write(file, buf, count, ppos); + res = __fuse_direct_write(file, &iov, 1, ppos); mutex_unlock(&inode->i_mutex); return res; @@ -1272,7 +1371,7 @@ static int fuse_writepage_locked(struct page *page) set_page_writeback(page); - req = fuse_request_alloc_nofs(); + req = fuse_request_alloc_nofs(1); if (!req) goto err; @@ -1293,7 +1392,8 @@ static int fuse_writepage_locked(struct page *page) req->in.argpages = 1; req->num_pages = 1; req->pages[0] = tmp_page; - req->page_offset = 0; + req->page_descs[0].offset = 0; + req->page_descs[0].length = PAGE_SIZE; req->end = fuse_writepage_end; req->inode = inode; @@ -1471,7 +1571,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) struct fuse_lk_out outarg; int err; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1506,7 +1606,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) if (fl->fl_flags & FL_CLOSE) return 0; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1575,7 +1675,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) if (!inode->i_sb->s_bdev || fc->no_bmap) return 0; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return 0; @@ -1873,7 +1973,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, num_pages++; } - req = fuse_get_req(fc); + req = fuse_get_req(fc, num_pages); if (IS_ERR(req)) { err = PTR_ERR(req); req = NULL; @@ -1881,6 +1981,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, } memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); req->num_pages = num_pages; + fuse_page_descs_length_init(req, 0, req->num_pages); /* okay, let's send it to the client */ req->in.h.opcode = FUSE_IOCTL; @@ -1981,7 +2082,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, struct inode *inode = file->f_dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; if (is_bad_inode(inode)) @@ -2066,6 +2167,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) return DEFAULT_POLLMASK; poll_wait(file, &ff->poll_wait, wait); + inarg.events = (__u32)poll_requested_events(wait); /* * Ask for notification iff there's someone waiting for it. @@ -2076,7 +2178,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) fuse_register_polled_file(fc, ff); } - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return POLLERR; @@ -2126,41 +2228,6 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, return 0; } -static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos, int rw) -{ - const struct iovec *vector = iov; - ssize_t ret = 0; - - while (nr_segs > 0) { - void __user *base; - size_t len; - ssize_t nr; - - base = vector->iov_base; - len = vector->iov_len; - vector++; - nr_segs--; - - if (rw == WRITE) - nr = __fuse_direct_write(filp, base, len, ppos); - else - nr = fuse_direct_read(filp, base, len, ppos); - - if (nr < 0) { - if (!ret) - ret = nr; - break; - } - ret += nr; - if (nr != len) - break; - } - - return ret; -} - - static ssize_t fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -2172,13 +2239,16 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, file = iocb->ki_filp; pos = offset; - ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw); + if (rw == WRITE) + ret = __fuse_direct_write(file, iov, nr_segs, &pos); + else + ret = __fuse_direct_read(file, iov, nr_segs, &pos); return ret; } -long fuse_file_fallocate(struct file *file, int mode, loff_t offset, - loff_t length) +static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + loff_t length) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -2194,7 +2264,7 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (fc->no_fallocate) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -2213,7 +2283,6 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, return err; } -EXPORT_SYMBOL_GPL(fuse_file_fallocate); static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e105a53..6aeba86 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -44,6 +44,9 @@ doing the mount will be allowed to access the filesystem */ #define FUSE_ALLOW_OTHER (1 << 1) +/** Number of page pointers embedded in fuse_req */ +#define FUSE_REQ_INLINE_PAGES 1 + /** List of active connections */ extern struct list_head fuse_conn_list; @@ -103,6 +106,15 @@ struct fuse_inode { /** List of writepage requestst (pending or sent) */ struct list_head writepages; + + /** Miscellaneous bits describing inode state */ + unsigned long state; +}; + +/** FUSE inode state bits */ +enum { + /** Advise readdirplus */ + FUSE_I_ADVISE_RDPLUS, }; struct fuse_conn; @@ -200,6 +212,12 @@ struct fuse_out { struct fuse_arg args[3]; }; +/** FUSE page descriptor */ +struct fuse_page_desc { + unsigned int length; + unsigned int offset; +}; + /** The request state */ enum fuse_req_state { FUSE_REQ_INIT = 0, @@ -291,14 +309,23 @@ struct fuse_req { } misc; /** page vector */ - struct page *pages[FUSE_MAX_PAGES_PER_REQ]; + struct page **pages; + + /** page-descriptor vector */ + struct fuse_page_desc *page_descs; + + /** size of the 'pages' array */ + unsigned max_pages; + + /** inline page vector */ + struct page *inline_pages[FUSE_REQ_INLINE_PAGES]; + + /** inline page-descriptor vector */ + struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES]; /** number of pages in vector */ unsigned num_pages; - /** offset of data on first page */ - unsigned page_offset; - /** File used in the request (or NULL) */ struct fuse_file *ff; @@ -487,6 +514,12 @@ struct fuse_conn { /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; + /** Does the filesystem support readdirplus? */ + unsigned do_readdirplus:1; + + /** Does the filesystem want adaptive readdirplus? */ + unsigned readdirplus_auto:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -578,6 +611,9 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); +/* Used by READDIRPLUS */ +void fuse_force_forget(struct file *file, u64 nodeid); + /** * Initialize READ or READDIR request */ @@ -658,9 +694,9 @@ void fuse_ctl_cleanup(void); /** * Allocate a request */ -struct fuse_req *fuse_request_alloc(void); +struct fuse_req *fuse_request_alloc(unsigned npages); -struct fuse_req *fuse_request_alloc_nofs(void); +struct fuse_req *fuse_request_alloc_nofs(unsigned npages); /** * Free a request @@ -668,14 +704,25 @@ struct fuse_req *fuse_request_alloc_nofs(void); void fuse_request_free(struct fuse_req *req); /** - * Get a request, may fail with -ENOMEM + * Get a request, may fail with -ENOMEM, + * caller should specify # elements in req->pages[] explicitly */ -struct fuse_req *fuse_get_req(struct fuse_conn *fc); +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); + +/** + * Get a request, may fail with -ENOMEM, + * useful for callers who doesn't use req->pages[] + */ +static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc) +{ + return fuse_get_req(fc, 0); +} /** * Gets a requests for a file operation, always succeeds */ -struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file); +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, + struct file *file); /** * Decrement reference count of a request. If count goes to zero free @@ -739,9 +786,9 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc); int fuse_valid_type(int m); /** - * Is task allowed to perform filesystem operation? + * Is current process allowed to perform filesystem operation? */ -int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task); +int fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); @@ -776,8 +823,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); -ssize_t fuse_direct_io(struct file *file, const char __user *buf, - size_t count, loff_t *ppos, int write); +ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, + unsigned long nr_segs, size_t count, loff_t *ppos, + int write); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); long fuse_ioctl_common(struct file *file, unsigned int cmd, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b730fda..df00993 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -92,6 +92,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->attr_version = 0; fi->writectr = 0; fi->orig_ino = 0; + fi->state = 0; INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); INIT_LIST_HEAD(&fi->writepages); @@ -408,12 +409,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) struct fuse_statfs_out outarg; int err; - if (!fuse_allow_task(fc, current)) { + if (!fuse_allow_current_process(fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; } - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -863,6 +864,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->dont_mask = 1; if (arg->flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; + if (arg->flags & FUSE_DO_READDIRPLUS) + fc->do_readdirplus = 1; + if (arg->flags & FUSE_READDIRPLUS_AUTO) + fc->readdirplus_auto = 1; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -889,7 +894,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | - FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA; + FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | + FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1034,12 +1040,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) /* only now - we want root dentry with NULL ->d_op */ sb->s_d_op = &fuse_dentry_operations; - init_req = fuse_request_alloc(); + init_req = fuse_request_alloc(0); if (!init_req) goto err_put_root; if (is_bdev) { - fc->destroy_req = fuse_request_alloc(); + fc->destroy_req = fuse_request_alloc(0); if (!fc->destroy_req) goto err_free_init_req; } diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index f850020..f69ac0a 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -237,7 +237,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, return -EINVAL; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return value ? -EACCES : 0; - if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) + if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER)) return -EPERM; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 30de4f2..24f414f 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -51,7 +51,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, continue; if (gfs2_is_jdata(ip)) set_buffer_uptodate(bh); - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); } } @@ -230,16 +230,14 @@ out_ignore: } /** - * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk + * gfs2_writepages - Write a bunch of dirty pages back to disk * @mapping: The mapping to write * @wbc: Write-back control * - * For the data=writeback case we can already ignore buffer heads - * and write whole extents at once. This is a big reduction in the - * number of I/O requests we send and the bmap calls we make in this case. + * Used for both ordered and writeback modes. */ -static int gfs2_writeback_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int gfs2_writepages(struct address_space *mapping, + struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); } @@ -852,7 +850,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, goto failed; } - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); @@ -1102,7 +1100,7 @@ cannot_release: static const struct address_space_operations gfs2_writeback_aops = { .writepage = gfs2_writeback_writepage, - .writepages = gfs2_writeback_writepages, + .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, .write_begin = gfs2_write_begin, @@ -1118,6 +1116,7 @@ static const struct address_space_operations gfs2_writeback_aops = { static const struct address_space_operations gfs2_ordered_aops = { .writepage = gfs2_ordered_writepage, + .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, .write_begin = gfs2_write_begin, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index a68e91b..5e83657 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -22,6 +22,7 @@ #include "meta_io.h" #include "quota.h" #include "rgrp.h" +#include "log.h" #include "super.h" #include "trans.h" #include "dir.h" @@ -93,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, if (!gfs2_is_jdata(ip)) mark_buffer_dirty(bh); if (!gfs2_is_writeback(ip)) - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); if (release) { unlock_page(page); @@ -153,7 +154,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) /* Set up the pointer to the new block */ - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); di = (struct gfs2_dinode *)dibh->b_data; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); @@ -405,7 +406,7 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp, BUG_ON(i < 1); BUG_ON(mp->mp_bh[i] != NULL); mp->mp_bh[i] = gfs2_meta_new(gl, bn); - gfs2_trans_add_bh(gl, mp->mp_bh[i], 1); + gfs2_trans_add_meta(gl, mp->mp_bh[i]); gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); ptr += offset; @@ -468,7 +469,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, BUG_ON(sheight < 1); BUG_ON(dibh == NULL); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); if (height == sheight) { struct buffer_head *bh; @@ -544,7 +545,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, /* Branching from existing tree */ case ALLOC_GROW_DEPTH: if (i > 1 && i < height) - gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1); + gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); for (; i < height && n > 0; i++, n--) gfs2_indirect_init(mp, ip->i_gl, i, mp->mp_list[i-1], bn++); @@ -556,7 +557,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, case ALLOC_DATA: BUG_ON(n > dblks); BUG_ON(mp->mp_bh[end_of_metadata] == NULL); - gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1); + gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]); dblks = n; ptr = metapointer(end_of_metadata, mp); dblock = bn; @@ -796,8 +797,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, down_write(&ip->i_rw_mutex); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_trans_add_meta(ip->i_gl, bh); bstart = 0; blen = 0; @@ -981,7 +982,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) } if (!gfs2_is_writeback(ip)) - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); zero_user(page, offset, length); mark_buffer_dirty(bh); @@ -1046,7 +1047,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) if (error) goto out; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) { gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); @@ -1098,7 +1099,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) if (error) return error; - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) return error; @@ -1137,11 +1138,12 @@ static int trunc_end(struct gfs2_inode *ip) ip->i_height = 0; ip->i_goal = ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + gfs2_ordered_del_inode(ip); } ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1246,7 +1248,7 @@ static int do_grow(struct inode *inode, u64 size) i_size_write(inode, size); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1286,6 +1288,10 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) inode_dio_wait(inode); + ret = gfs2_rs_alloc(GFS2_I(inode)); + if (ret) + return ret; + oldsize = inode->i_size; if (newsize >= oldsize) return do_grow(inode, newsize); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 9a35670..c3e82bd 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -93,7 +93,7 @@ int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head *bh; bh = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); *bhp = bh; @@ -127,7 +127,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, if (error) return error; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); if (ip->i_inode.i_size < offset + size) i_size_write(&ip->i_inode, offset + size); @@ -209,7 +209,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, if (error) goto fail; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); memcpy(bh->b_data + o, buf, amount); brelse(bh); @@ -231,7 +231,7 @@ out: i_size_write(&ip->i_inode, offset + copied); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -647,7 +647,7 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, return; } - gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_trans_add_meta(dip->i_gl, bh); /* If there is no prev entry, this is the first entry in the block. The de_rec_len is already as big as it needs to be. Just zero @@ -690,7 +690,7 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); totlen = be16_to_cpu(dent->de_rec_len); BUG_ON(offset + name->len > totlen); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); ndent = (struct gfs2_dirent *)((char *)dent + offset); dent->de_rec_len = cpu_to_be16(offset); gfs2_qstr2dirent(name, totlen - offset, ndent); @@ -831,7 +831,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, return NULL; gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); leaf = (struct gfs2_leaf *)bh->b_data; leaf->lf_depth = cpu_to_be16(depth); @@ -916,7 +916,7 @@ static int dir_make_exhash(struct inode *inode) /* We're done with the new leaf block, now setup the new hash table. */ - gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_trans_add_meta(dip->i_gl, dibh); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); @@ -976,7 +976,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) return 1; /* can't split */ } - gfs2_trans_add_bh(dip->i_gl, obh, 1); + gfs2_trans_add_meta(dip->i_gl, obh); nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1); if (!nleaf) { @@ -1069,7 +1069,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) error = gfs2_meta_inode_buffer(dip, &dibh); if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { - gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_trans_add_meta(dip->i_gl, dibh); gfs2_add_inode_blocks(&dip->i_inode, 1); gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); @@ -1622,7 +1622,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) return error; } while(1); - gfs2_trans_add_bh(ip->i_gl, obh, 1); + gfs2_trans_add_meta(ip->i_gl, obh); leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth)); if (!leaf) { @@ -1636,7 +1636,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) error = gfs2_meta_inode_buffer(ip, &bh); if (error) return error; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_add_inode_blocks(&ip->i_inode, 1); gfs2_dinode_out(ip, bh->b_data); brelse(bh); @@ -1795,7 +1795,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, if (IS_ERR(dent)) return PTR_ERR(dent); - gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_trans_add_meta(dip->i_gl, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(new_type); @@ -1804,7 +1804,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, error = gfs2_meta_inode_buffer(dip, &bh); if (error) return error; - gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_trans_add_meta(dip->i_gl, bh); } dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; @@ -1849,7 +1849,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (!ht) return -ENOMEM; - error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_hold(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) goto out; @@ -1917,7 +1917,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (error) goto out_end_trans; - gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_trans_add_meta(dip->i_gl, dibh); /* On the last dealloc, make this a regular file in case we crash. (We don't want to free these blocks a second time.) */ if (last_dealloc) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 44543df..019f45e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -276,7 +276,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out_trans_end; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); ip->i_diskflags = new_flags; gfs2_dinode_out(ip, bh->b_data); brelse(bh); @@ -483,7 +483,7 @@ out: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); - wait_on_page_writeback(page); + wait_for_stable_page(page); } sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); @@ -708,7 +708,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, if (unlikely(error)) return error; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) { error = gfs2_unstuff_dinode(ip, NULL); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 992c5c0..cf35155 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -30,6 +30,7 @@ #include <linux/rculist_bl.h> #include <linux/bit_spinlock.h> #include <linux/percpu.h> +#include <linux/list_sort.h> #include "gfs2.h" #include "incore.h" @@ -1376,56 +1377,105 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) gfs2_glock_put(gl); } +static int glock_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct gfs2_glock *gla, *glb; -static int gfs2_shrink_glock_memory(struct shrinker *shrink, - struct shrink_control *sc) + gla = list_entry(a, struct gfs2_glock, gl_lru); + glb = list_entry(b, struct gfs2_glock, gl_lru); + + if (gla->gl_name.ln_number > glb->gl_name.ln_number) + return 1; + if (gla->gl_name.ln_number < glb->gl_name.ln_number) + return -1; + + return 0; +} + +/** + * gfs2_dispose_glock_lru - Demote a list of glocks + * @list: The list to dispose of + * + * Disposing of glocks may involve disk accesses, so that here we sort + * the glocks by number (i.e. disk location of the inodes) so that if + * there are any such accesses, they'll be sent in order (mostly). + * + * Must be called under the lru_lock, but may drop and retake this + * lock. While the lru_lock is dropped, entries may vanish from the + * list, but no new entries will appear on the list (since it is + * private) + */ + +static void gfs2_dispose_glock_lru(struct list_head *list) +__releases(&lru_lock) +__acquires(&lru_lock) { struct gfs2_glock *gl; - int may_demote; - int nr_skipped = 0; - int nr = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - LIST_HEAD(skipped); - if (nr == 0) - goto out; + list_sort(NULL, list, glock_cmp); - if (!(gfp_mask & __GFP_FS)) - return -1; + while(!list_empty(list)) { + gl = list_entry(list->next, struct gfs2_glock, gl_lru); + list_del_init(&gl->gl_lru); + clear_bit(GLF_LRU, &gl->gl_flags); + gfs2_glock_hold(gl); + spin_unlock(&lru_lock); + spin_lock(&gl->gl_spin); + if (demote_ok(gl)) + handle_callback(gl, LM_ST_UNLOCKED, 0); + WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); + smp_mb__after_clear_bit(); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put_nolock(gl); + spin_unlock(&gl->gl_spin); + spin_lock(&lru_lock); + } +} + +/** + * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote + * @nr: The number of entries to scan + * + * This function selects the entries on the LRU which are able to + * be demoted, and then kicks off the process by calling + * gfs2_dispose_glock_lru() above. + */ + +static void gfs2_scan_glock_lru(int nr) +{ + struct gfs2_glock *gl; + LIST_HEAD(skipped); + LIST_HEAD(dispose); spin_lock(&lru_lock); while(nr && !list_empty(&lru_list)) { gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); - list_del_init(&gl->gl_lru); - clear_bit(GLF_LRU, &gl->gl_flags); - atomic_dec(&lru_count); /* Test for being demotable */ if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { - gfs2_glock_hold(gl); - spin_unlock(&lru_lock); - spin_lock(&gl->gl_spin); - may_demote = demote_ok(gl); - if (may_demote) { - handle_callback(gl, LM_ST_UNLOCKED, 0); - nr--; - } - clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_clear_bit(); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put_nolock(gl); - spin_unlock(&gl->gl_spin); - spin_lock(&lru_lock); + list_move(&gl->gl_lru, &dispose); + atomic_dec(&lru_count); + nr--; continue; } - nr_skipped++; - list_add(&gl->gl_lru, &skipped); - set_bit(GLF_LRU, &gl->gl_flags); + + list_move(&gl->gl_lru, &skipped); } list_splice(&skipped, &lru_list); - atomic_add(nr_skipped, &lru_count); + if (!list_empty(&dispose)) + gfs2_dispose_glock_lru(&dispose); spin_unlock(&lru_lock); -out: +} + +static int gfs2_shrink_glock_memory(struct shrinker *shrink, + struct shrink_control *sc) +{ + if (sc->nr_to_scan) { + if (!(sc->gfp_mask & __GFP_FS)) + return -1; + gfs2_scan_glock_lru(sc->nr_to_scan); + } + return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 78d4184..444b650 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -322,8 +322,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) break; }; - ip->i_inode.i_uid = be32_to_cpu(str->di_uid); - ip->i_inode.i_gid = be32_to_cpu(str->di_gid); + i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); + i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c373a24..156e42e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -52,7 +52,6 @@ struct gfs2_log_header_host { */ struct gfs2_log_operations { - void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); void (*lo_before_commit) (struct gfs2_sbd *sdp); void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); void (*lo_before_scan) (struct gfs2_jdesc *jd, @@ -341,6 +340,7 @@ enum { GIF_QD_LOCKED = 1, GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, + GIF_ORDERED = 4, }; struct gfs2_inode { @@ -357,6 +357,7 @@ struct gfs2_inode { struct gfs2_rgrpd *i_rgd; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; + struct list_head i_ordered; struct list_head i_trunc_list; __be64 *i_hash_cache; u32 i_entries; @@ -391,7 +392,6 @@ struct gfs2_revoke_replay { }; enum { - QDF_USER = 0, QDF_CHANGE = 1, QDF_LOCKED = 2, QDF_REFRESH = 3, @@ -403,7 +403,7 @@ struct gfs2_quota_data { atomic_t qd_count; - u32 qd_id; + struct kqid qd_id; unsigned long qd_flags; /* QDF_... */ s64 qd_change; @@ -641,6 +641,7 @@ struct gfs2_sbd { wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; + struct completion sd_wdack; struct delayed_work sd_control_work; /* Inode Stuff */ @@ -723,6 +724,7 @@ struct gfs2_sbd { struct list_head sd_log_le_revoke; struct list_head sd_log_le_databuf; struct list_head sd_log_le_ordered; + spinlock_t sd_ordered_lock; atomic_t sd_log_thresh1; atomic_t sd_log_thresh2; @@ -758,10 +760,7 @@ struct gfs2_sbd { unsigned int sd_replayed_blocks; /* For quiescing the filesystem */ - struct gfs2_holder sd_freeze_gh; - struct mutex sd_freeze_lock; - unsigned int sd_freeze_count; char sd_fsname[GFS2_FSNAME_LEN]; char sd_table_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2b6f569..cc00bd1 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -368,10 +368,11 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip, struct inode *inode) { if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && - (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { + (dip->i_inode.i_mode & S_ISUID) && + !uid_eq(dip->i_inode.i_uid, GLOBAL_ROOT_UID)) { if (S_ISDIR(inode->i_mode)) inode->i_mode |= S_ISUID; - else if (dip->i_inode.i_uid != current_fsuid()) + else if (!uid_eq(dip->i_inode.i_uid, current_fsuid())) inode->i_mode &= ~07111; inode->i_uid = dip->i_inode.i_uid; } else @@ -447,7 +448,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, struct timespec tv = CURRENT_TIME; dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); di = (struct gfs2_dinode *)dibh->b_data; @@ -455,8 +456,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); di->di_num.no_addr = cpu_to_be64(ip->i_no_addr); di->di_mode = cpu_to_be32(ip->i_inode.i_mode); - di->di_uid = cpu_to_be32(ip->i_inode.i_uid); - di->di_gid = cpu_to_be32(ip->i_inode.i_gid); + di->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); + di->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode)); di->di_nlink = 0; di->di_size = cpu_to_be64(ip->i_inode.i_size); di->di_blocks = cpu_to_be64(1); @@ -548,7 +549,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) return error; - error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_lock(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) goto fail; @@ -584,7 +585,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_end_trans; set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); return 0; @@ -931,7 +932,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_brelse; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); inc_nlink(&ip->i_inode); ip->i_inode.i_ctime = CURRENT_TIME; ihold(inode); @@ -978,8 +979,8 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, return -EPERM; if ((dip->i_inode.i_mode & S_ISVTX) && - dip->i_inode.i_uid != current_fsuid() && - ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) + !uid_eq(dip->i_inode.i_uid, current_fsuid()) && + !uid_eq(ip->i_inode.i_uid, current_fsuid()) && !capable(CAP_FOWNER)) return -EPERM; if (IS_APPEND(&dip->i_inode)) @@ -1412,7 +1413,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1580,7 +1581,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - u32 ouid, ogid, nuid, ngid; + kuid_t ouid, nuid; + kgid_t ogid, ngid; int error; ouid = inode->i_uid; @@ -1588,16 +1590,17 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) nuid = attr->ia_uid; ngid = attr->ia_gid; - if (!(attr->ia_valid & ATTR_UID) || ouid == nuid) - ouid = nuid = NO_QUOTA_CHANGE; - if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) - ogid = ngid = NO_QUOTA_CHANGE; + if (!(attr->ia_valid & ATTR_UID) || uid_eq(ouid, nuid)) + ouid = nuid = NO_UID_QUOTA_CHANGE; + if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) + ogid = ngid = NO_GID_QUOTA_CHANGE; error = gfs2_quota_lock(ip, nuid, ngid); if (error) return error; - if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || + !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { error = gfs2_quota_check(ip, nuid, ngid); if (error) goto out_gunlock_q; @@ -1611,7 +1614,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (error) goto out_end_trans; - if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || + !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); gfs2_quota_change(ip, -blocks, ouid, ogid); gfs2_quota_change(ip, blocks, nuid, ngid); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index b906ed1..9802de0 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -281,6 +281,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int lvb_needs_unlock = 0; int error; if (gl->gl_lksb.sb_lkid == 0) { @@ -294,8 +295,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_update_request_times(gl); /* don't want to skip dlm_unlock writing the lvb when lock is ex */ + + if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE)) + lvb_needs_unlock = 1; + if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && - gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) { + !lvb_needs_unlock) { gfs2_glock_free(gl); return; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f4beeb9..9a2ca8b 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -482,70 +482,66 @@ static void log_flush_wait(struct gfs2_sbd *sdp) } } -static int bd_cmp(void *priv, struct list_head *a, struct list_head *b) +static int ip_cmp(void *priv, struct list_head *a, struct list_head *b) { - struct gfs2_bufdata *bda, *bdb; + struct gfs2_inode *ipa, *ipb; - bda = list_entry(a, struct gfs2_bufdata, bd_list); - bdb = list_entry(b, struct gfs2_bufdata, bd_list); + ipa = list_entry(a, struct gfs2_inode, i_ordered); + ipb = list_entry(b, struct gfs2_inode, i_ordered); - if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) + if (ipa->i_no_addr < ipb->i_no_addr) return -1; - if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) + if (ipa->i_no_addr > ipb->i_no_addr) return 1; return 0; } static void gfs2_ordered_write(struct gfs2_sbd *sdp) { - struct gfs2_bufdata *bd; - struct buffer_head *bh; + struct gfs2_inode *ip; LIST_HEAD(written); - gfs2_log_lock(sdp); - list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp); + spin_lock(&sdp->sd_ordered_lock); + list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp); while (!list_empty(&sdp->sd_log_le_ordered)) { - bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list); - list_move(&bd->bd_list, &written); - bh = bd->bd_bh; - if (!buffer_dirty(bh)) + ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); + list_move(&ip->i_ordered, &written); + if (ip->i_inode.i_mapping->nrpages == 0) continue; - get_bh(bh); - gfs2_log_unlock(sdp); - lock_buffer(bh); - if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { - bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE_SYNC, bh); - } else { - unlock_buffer(bh); - brelse(bh); - } - gfs2_log_lock(sdp); + spin_unlock(&sdp->sd_ordered_lock); + filemap_fdatawrite(ip->i_inode.i_mapping); + spin_lock(&sdp->sd_ordered_lock); } list_splice(&written, &sdp->sd_log_le_ordered); - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ordered_lock); } static void gfs2_ordered_wait(struct gfs2_sbd *sdp) { - struct gfs2_bufdata *bd; - struct buffer_head *bh; + struct gfs2_inode *ip; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ordered_lock); while (!list_empty(&sdp->sd_log_le_ordered)) { - bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list); - bh = bd->bd_bh; - if (buffer_locked(bh)) { - get_bh(bh); - gfs2_log_unlock(sdp); - wait_on_buffer(bh); - brelse(bh); - gfs2_log_lock(sdp); + ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); + list_del(&ip->i_ordered); + WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags)); + if (ip->i_inode.i_mapping->nrpages == 0) continue; - } - list_del_init(&bd->bd_list); + spin_unlock(&sdp->sd_ordered_lock); + filemap_fdatawait(ip->i_inode.i_mapping); + spin_lock(&sdp->sd_ordered_lock); } - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ordered_lock); +} + +void gfs2_ordered_del_inode(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + spin_lock(&sdp->sd_ordered_lock); + if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) + list_del(&ip->i_ordered); + spin_unlock(&sdp->sd_ordered_lock); } /** diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 3fd5215..3566f35 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -48,6 +48,18 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, sdp->sd_log_head = sdp->sd_log_tail = value; } +static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + if (!test_bit(GIF_ORDERED, &ip->i_flags)) { + spin_lock(&sdp->sd_ordered_lock); + if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) + list_add(&ip->i_ordered, &sdp->sd_log_le_ordered); + spin_unlock(&sdp->sd_ordered_lock); + } +} +extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 9ceccb1..a505597 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -37,7 +37,7 @@ * * The log lock must be held when calling this function */ -static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) { struct gfs2_bufdata *bd; @@ -388,32 +388,6 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, return page; } -static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - struct gfs2_meta_header *mh; - struct gfs2_trans *tr; - - tr = current->journal_info; - tr->tr_touched = 1; - if (!list_empty(&bd->bd_list)) - return; - set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); - set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); - mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; - if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { - printk(KERN_ERR - "Attempting to add uninitialised block to journal (inplace block=%lld)\n", - (unsigned long long)bd->bd_bh->b_blocknr); - BUG(); - } - gfs2_pin(sdp, bd->bd_bh); - mh->__pad0 = cpu_to_be64(0); - mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); - sdp->sd_log_num_buf++; - list_add(&bd->bd_list, &sdp->sd_log_le_buf); - tr->tr_num_buf_new++; -} - static void gfs2_check_magic(struct buffer_head *bh) { void *kaddr; @@ -600,20 +574,6 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); } -static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - struct gfs2_glock *gl = bd->bd_gl; - struct gfs2_trans *tr; - - tr = current->journal_info; - tr->tr_touched = 1; - tr->tr_num_revoke++; - sdp->sd_log_num_revoke++; - atomic_inc(&gl->gl_revokes); - set_bit(GLF_LFLUSH, &gl->gl_flags); - list_add(&bd->bd_list, &sdp->sd_log_le_revoke); -} - static void revoke_lo_before_commit(struct gfs2_sbd *sdp) { struct gfs2_meta_header *mh; @@ -749,44 +709,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) } /** - * databuf_lo_add - Add a databuf to the transaction. - * - * This is used in two distinct cases: - * i) In ordered write mode - * We put the data buffer on a list so that we can ensure that its - * synced to disk at the right time - * ii) In journaled data mode - * We need to journal the data block in the same way as metadata in - * the functions above. The difference is that here we have a tag - * which is two __be64's being the block number (as per meta data) - * and a flag which says whether the data block needs escaping or - * not. This means we need a new log entry for each 251 or so data - * blocks, which isn't an enormous overhead but twice as much as - * for normal metadata blocks. - */ -static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - struct gfs2_trans *tr = current->journal_info; - struct address_space *mapping = bd->bd_bh->b_page->mapping; - struct gfs2_inode *ip = GFS2_I(mapping->host); - - if (tr) - tr->tr_touched = 1; - if (!list_empty(&bd->bd_list)) - return; - set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); - set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); - if (gfs2_is_jdata(ip)) { - gfs2_pin(sdp, bd->bd_bh); - tr->tr_num_databuf_new++; - sdp->sd_log_num_databuf++; - list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); - } else { - list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered); - } -} - -/** * databuf_lo_before_commit - Scan the data buffers, writing as we go * */ @@ -885,7 +807,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) const struct gfs2_log_operations gfs2_buf_lops = { - .lo_add = buf_lo_add, .lo_before_commit = buf_lo_before_commit, .lo_after_commit = buf_lo_after_commit, .lo_before_scan = buf_lo_before_scan, @@ -895,7 +816,6 @@ const struct gfs2_log_operations gfs2_buf_lops = { }; const struct gfs2_log_operations gfs2_revoke_lops = { - .lo_add = revoke_lo_add, .lo_before_commit = revoke_lo_before_commit, .lo_after_commit = revoke_lo_after_commit, .lo_before_scan = revoke_lo_before_scan, @@ -909,7 +829,6 @@ const struct gfs2_log_operations gfs2_rg_lops = { }; const struct gfs2_log_operations gfs2_databuf_lops = { - .lo_add = databuf_lo_add, .lo_before_commit = databuf_lo_before_commit, .lo_after_commit = databuf_lo_after_commit, .lo_scan_elements = databuf_lo_scan_elements, diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 954a330..ba77b7d 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -29,6 +29,7 @@ extern const struct gfs2_log_operations gfs2_databuf_lops; extern const struct gfs2_log_operations *gfs2_log_ops[]; extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw); +extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { @@ -46,19 +47,6 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) return limit; } -static inline void lops_init_le(struct gfs2_bufdata *bd, - const struct gfs2_log_operations *lops) -{ - INIT_LIST_HEAD(&bd->bd_list); - bd->bd_ops = lops; -} - -static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - if (bd->bd_ops->lo_add) - bd->bd_ops->lo_add(sdp, bd); -} - static inline void lops_before_commit(struct gfs2_sbd *sdp) { int x; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 22255d9..b059bbb 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -271,41 +271,6 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) return 0; } -/** - * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer - * @gl: the glock the buffer belongs to - * @bh: The buffer to be attached to - * @meta: Flag to indicate whether its metadata or not - */ - -void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, - int meta) -{ - struct gfs2_bufdata *bd; - - if (meta) - lock_page(bh->b_page); - - if (bh->b_private) { - if (meta) - unlock_page(bh->b_page); - return; - } - - bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); - bd->bd_bh = bh; - bd->bd_gl = gl; - - if (meta) - lops_init_le(bd, &gfs2_buf_lops); - else - lops_init_le(bd, &gfs2_databuf_lops); - bh->b_private = bd; - - if (meta) - unlock_page(bh->b_page); -} - void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) { struct address_space *mapping = bh->b_page->mapping; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index c30973b..0d4c843 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -56,9 +56,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); -void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, - int meta); - void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 0e3554e..1b612be 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -81,6 +81,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_waitqueue_head(&sdp->sd_glock_wait); atomic_set(&sdp->sd_glock_disposal, 0); init_completion(&sdp->sd_locking_init); + init_completion(&sdp->sd_wdack); spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); @@ -102,6 +103,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_log_le_revoke); INIT_LIST_HEAD(&sdp->sd_log_le_databuf); INIT_LIST_HEAD(&sdp->sd_log_le_ordered); + spin_lock_init(&sdp->sd_ordered_lock); init_waitqueue_head(&sdp->sd_log_waitq); init_waitqueue_head(&sdp->sd_logd_waitq); @@ -115,8 +117,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_revoke_list); - mutex_init(&sdp->sd_freeze_lock); - return sdp; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index ae55e24..c7c840e 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -65,13 +65,10 @@ #include "inode.h" #include "util.h" -#define QUOTA_USER 1 -#define QUOTA_GROUP 0 - struct gfs2_quota_change_host { u64 qc_change; u32 qc_flags; /* GFS2_QCF_... */ - u32 qc_id; + struct kqid qc_id; }; static LIST_HEAD(qd_lru_list); @@ -120,17 +117,24 @@ out: return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100; } +static u64 qd2index(struct gfs2_quota_data *qd) +{ + struct kqid qid = qd->qd_id; + return (2 * (u64)from_kqid(&init_user_ns, qid)) + + (qid.type == USRQUOTA) ? 0 : 1; +} + static u64 qd2offset(struct gfs2_quota_data *qd) { u64 offset; - offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags); + offset = qd2index(qd); offset *= sizeof(struct gfs2_quota); return offset; } -static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, +static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { struct gfs2_quota_data *qd; @@ -141,13 +145,11 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, return -ENOMEM; atomic_set(&qd->qd_count, 1); - qd->qd_id = id; - if (user) - set_bit(QDF_USER, &qd->qd_flags); + qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_reclaim); - error = gfs2_glock_get(sdp, 2 * (u64)id + !user, + error = gfs2_glock_get(sdp, qd2index(qd), &gfs2_quota_glops, CREATE, &qd->qd_gl); if (error) goto fail; @@ -161,7 +163,7 @@ fail: return error; } -static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { struct gfs2_quota_data *qd = NULL, *new_qd = NULL; @@ -173,8 +175,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, found = 0; spin_lock(&qd_lru_lock); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - if (qd->qd_id == id && - !test_bit(QDF_USER, &qd->qd_flags) == !user) { + if (qid_eq(qd->qd_id, qid)) { if (!atomic_read(&qd->qd_count) && !list_empty(&qd->qd_reclaim)) { /* Remove it from reclaim list */ @@ -208,7 +209,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, return 0; } - error = qd_alloc(sdp, user, id, &new_qd); + error = qd_alloc(sdp, qid, &new_qd); if (error) return error; } @@ -458,12 +459,12 @@ static void qd_unlock(struct gfs2_quota_data *qd) qd_put(qd); } -static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, +static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { int error; - error = qd_get(sdp, user, id, qdp); + error = qd_get(sdp, qid, qdp); if (error) return error; @@ -491,7 +492,7 @@ static void qdsb_put(struct gfs2_quota_data *qd) qd_put(qd); } -int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) +int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data **qd; @@ -512,28 +513,30 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; - error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd); + error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd); if (error) goto out; ip->i_res->rs_qa_qd_num++; qd++; - error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd); + error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd); if (error) goto out; ip->i_res->rs_qa_qd_num++; qd++; - if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) { - error = qdsb_get(sdp, QUOTA_USER, uid, qd); + if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) && + !uid_eq(uid, ip->i_inode.i_uid)) { + error = qdsb_get(sdp, make_kqid_uid(uid), qd); if (error) goto out; ip->i_res->rs_qa_qd_num++; qd++; } - if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) { - error = qdsb_get(sdp, QUOTA_GROUP, gid, qd); + if (!gid_eq(gid, NO_GID_QUOTA_CHANGE) && + !gid_eq(gid, ip->i_inode.i_gid)) { + error = qdsb_get(sdp, make_kqid_gid(gid), qd); if (error) goto out; ip->i_res->rs_qa_qd_num++; @@ -567,18 +570,10 @@ static int sort_qd(const void *a, const void *b) const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a; const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b; - if (!test_bit(QDF_USER, &qd_a->qd_flags) != - !test_bit(QDF_USER, &qd_b->qd_flags)) { - if (test_bit(QDF_USER, &qd_a->qd_flags)) - return -1; - else - return 1; - } - if (qd_a->qd_id < qd_b->qd_id) + if (qid_lt(qd_a->qd_id, qd_b->qd_id)) return -1; - if (qd_a->qd_id > qd_b->qd_id) + if (qid_lt(qd_b->qd_id, qd_a->qd_id)) return 1; - return 0; } @@ -590,14 +585,14 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) s64 x; mutex_lock(&sdp->sd_quota_mutex); - gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1); + gfs2_trans_add_meta(ip->i_gl, qd->qd_bh); if (!test_bit(QDF_CHANGE, &qd->qd_flags)) { qc->qc_change = 0; qc->qc_flags = 0; - if (test_bit(QDF_USER, &qd->qd_flags)) + if (qd->qd_id.type == USRQUOTA) qc->qc_flags = cpu_to_be32(GFS2_QCF_USER); - qc->qc_id = cpu_to_be32(qd->qd_id); + qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id)); } x = be64_to_cpu(qc->qc_change) + change; @@ -726,7 +721,7 @@ get_a_page: goto unlock_out; } - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_meta(ip->i_gl, bh); kaddr = kmap_atomic(page); if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) @@ -925,7 +920,7 @@ fail: return error; } -int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) +int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; @@ -1040,13 +1035,13 @@ static int print_message(struct gfs2_quota_data *qd, char *type) printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n", sdp->sd_fsname, type, - (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group", - qd->qd_id); + (qd->qd_id.type == USRQUOTA) ? "user" : "group", + from_kqid(&init_user_ns, qd->qd_id)); return 0; } -int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) +int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; @@ -1063,8 +1058,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { qd = ip->i_res->rs_qa_qd[x]; - if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || - (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags)))) + if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) || + qid_eq(qd->qd_id, make_kqid_gid(gid)))) continue; value = (s64)be64_to_cpu(qd->qd_qb.qb_value); @@ -1074,10 +1069,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { print_message(qd, "exceeded"); - quota_send_warning(make_kqid(&init_user_ns, - test_bit(QDF_USER, &qd->qd_flags) ? - USRQUOTA : GRPQUOTA, - qd->qd_id), + quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); error = -EDQUOT; @@ -1087,10 +1079,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) time_after_eq(jiffies, qd->qd_last_warn + gfs2_tune_get(sdp, gt_quota_warn_period) * HZ)) { - quota_send_warning(make_kqid(&init_user_ns, - test_bit(QDF_USER, &qd->qd_flags) ? - USRQUOTA : GRPQUOTA, - qd->qd_id), + quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); error = print_message(qd, "warning"); qd->qd_last_warn = jiffies; @@ -1101,7 +1090,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) } void gfs2_quota_change(struct gfs2_inode *ip, s64 change, - u32 uid, u32 gid) + kuid_t uid, kgid_t gid) { struct gfs2_quota_data *qd; unsigned int x; @@ -1114,8 +1103,8 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { qd = ip->i_res->rs_qa_qd[x]; - if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || - (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { + if (qid_eq(qd->qd_id, make_kqid_uid(uid)) || + qid_eq(qd->qd_id, make_kqid_gid(gid))) { do_qc(qd, change); } } @@ -1170,13 +1159,13 @@ static int gfs2_quota_sync_timeo(struct super_block *sb, int type) return gfs2_quota_sync(sb, type); } -int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) +int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; struct gfs2_holder q_gh; int error; - error = qd_get(sdp, user, id, &qd); + error = qd_get(sdp, qid, &qd); if (error) return error; @@ -1194,7 +1183,9 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void * qc->qc_change = be64_to_cpu(str->qc_change); qc->qc_flags = be32_to_cpu(str->qc_flags); - qc->qc_id = be32_to_cpu(str->qc_id); + qc->qc_id = make_kqid(&init_user_ns, + (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA, + be32_to_cpu(str->qc_id)); } int gfs2_quota_init(struct gfs2_sbd *sdp) @@ -1257,8 +1248,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) if (!qc.qc_change) continue; - error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER), - qc.qc_id, &qd); + error = qd_alloc(sdp, qc.qc_id, &qd); if (error) { brelse(bh); goto fail; @@ -1485,21 +1475,17 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, struct gfs2_quota_data *qd; struct gfs2_holder q_gh; int error; - int type; memset(fdq, 0, sizeof(struct fs_disk_quota)); if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ - if (qid.type == USRQUOTA) - type = QUOTA_USER; - else if (qid.type == GRPQUOTA) - type = QUOTA_GROUP; - else + if ((qid.type != USRQUOTA) && + (qid.type != GRPQUOTA)) return -EINVAL; - error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd); + error = qd_get(sdp, qid, &qd); if (error) return error; error = do_glock(qd, FORCE, &q_gh); @@ -1508,8 +1494,8 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; fdq->d_version = FS_DQUOT_VERSION; - fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; - fdq->d_id = from_kqid(&init_user_ns, qid); + fdq->d_flags = (qid.type == USRQUOTA) ? FS_USER_QUOTA : FS_GROUP_QUOTA; + fdq->d_id = from_kqid_munged(current_user_ns(), qid); fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; @@ -1535,32 +1521,18 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, int alloc_required; loff_t offset; int error; - int type; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ - switch(qid.type) { - case USRQUOTA: - type = QUOTA_USER; - if (fdq->d_flags != FS_USER_QUOTA) - return -EINVAL; - break; - case GRPQUOTA: - type = QUOTA_GROUP; - if (fdq->d_flags != FS_GROUP_QUOTA) - return -EINVAL; - break; - default: + if ((qid.type != USRQUOTA) && + (qid.type != GRPQUOTA)) return -EINVAL; - } if (fdq->d_fieldmask & ~GFS2_FIELDMASK) return -EINVAL; - if (fdq->d_id != from_kqid(&init_user_ns, qid)) - return -EINVAL; - error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd); + error = qd_get(sdp, qid, &qd); if (error) return error; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index f25d98b..4f5e6e4 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -14,20 +14,21 @@ struct gfs2_inode; struct gfs2_sbd; struct shrink_control; -#define NO_QUOTA_CHANGE ((u32)-1) +#define NO_UID_QUOTA_CHANGE INVALID_UID +#define NO_GID_QUOTA_CHANGE INVALID_GID -extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); +extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); extern void gfs2_quota_unhold(struct gfs2_inode *ip); -extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); +extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); extern void gfs2_quota_unlock(struct gfs2_inode *ip); -extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); +extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, - u32 uid, u32 gid); + kuid_t uid, kgid_t gid); extern int gfs2_quota_sync(struct super_block *sb, int type); -extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); +extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid); extern int gfs2_quota_init(struct gfs2_sbd *sdp); extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); @@ -41,7 +42,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) int ret; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; - ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) return ret; if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 04af1cf..d1f51fd 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1323,7 +1323,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (ret == 0) { bh = rgd->rd_bits[0].bi_bh; rgd->rd_flags |= GFS2_RGF_TRIMMED; - gfs2_trans_add_bh(rgd->rd_gl, bh, 1); + gfs2_trans_add_meta(rgd->rd_gl, bh); gfs2_rgrp_out(rgd, bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data); gfs2_trans_end(sdp); @@ -1968,14 +1968,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, *n = 1; block = gfs2_rbm_to_block(rbm); - gfs2_trans_add_bh(rbm->rgd->rd_gl, rbm->bi->bi_bh, 1); + gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh); gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); block++; while (*n < elen) { ret = gfs2_rbm_from_block(&pos, block); if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE) break; - gfs2_trans_add_bh(pos.rgd->rd_gl, pos.bi->bi_bh, 1); + gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh); gfs2_setbit(&pos, true, GFS2_BLKST_USED); (*n)++; block++; @@ -2014,7 +2014,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, rbm.bi->bi_bh->b_data + rbm.bi->bi_offset, rbm.bi->bi_len); } - gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.bi->bi_bh, 1); + gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh); gfs2_setbit(&rbm, false, new_state); } @@ -2157,7 +2157,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, if (error == 0) { struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); brelse(dibh); @@ -2176,7 +2176,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, *generation = rbm.rgd->rd_igeneration++; } - gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh, 1); + gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data); @@ -2223,7 +2223,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); rgd->rd_free += blen; rgd->rd_flags &= ~GFS2_RGF_TRIMMED; - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); @@ -2260,7 +2260,7 @@ void gfs2_unlink_di(struct inode *inode) if (!rgd) return; trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); update_rgrp_lvb_unlinked(rgd, 1); @@ -2281,7 +2281,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) rgd->rd_dinodes--; rgd->rd_free++; - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); update_rgrp_lvb_unlinked(rgd, -1); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index d648867..cab77b8 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -500,7 +500,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, if (error) return; - gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + gfs2_trans_add_meta(l_ip->i_gl, l_bh); spin_lock(&sdp->sd_statfs_spin); l_sc->sc_total += total; @@ -528,7 +528,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + gfs2_trans_add_meta(l_ip->i_gl, l_bh); spin_lock(&sdp->sd_statfs_spin); m_sc->sc_total += l_sc->sc_total; @@ -539,7 +539,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, 0, sizeof(struct gfs2_statfs_change)); spin_unlock(&sdp->sd_statfs_spin); - gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); + gfs2_trans_add_meta(m_ip->i_gl, m_bh); gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); } @@ -663,54 +663,6 @@ out: return error; } -/** - * gfs2_freeze_fs - freezes the file system - * @sdp: the file system - * - * This function flushes data and meta data for all machines by - * acquiring the transaction log exclusively. All journals are - * ensured to be in a clean state as well. - * - * Returns: errno - */ - -int gfs2_freeze_fs(struct gfs2_sbd *sdp) -{ - int error = 0; - - mutex_lock(&sdp->sd_freeze_lock); - - if (!sdp->sd_freeze_count++) { - error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); - if (error) - sdp->sd_freeze_count--; - } - - mutex_unlock(&sdp->sd_freeze_lock); - - return error; -} - -/** - * gfs2_unfreeze_fs - unfreezes the file system - * @sdp: the file system - * - * This function allows the file system to proceed by unlocking - * the exclusively held transaction lock. Other GFS2 nodes are - * now free to acquire the lock shared and go on with their lives. - * - */ - -void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) -{ - mutex_lock(&sdp->sd_freeze_lock); - - if (sdp->sd_freeze_count && !--sdp->sd_freeze_count) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); - - mutex_unlock(&sdp->sd_freeze_lock); -} - void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) { struct gfs2_dinode *str = buf; @@ -721,8 +673,8 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); str->di_mode = cpu_to_be32(ip->i_inode.i_mode); - str->di_uid = cpu_to_be32(ip->i_inode.i_uid); - str->di_gid = cpu_to_be32(ip->i_inode.i_gid); + str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); + str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode)); str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); @@ -824,7 +776,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) ret = gfs2_meta_inode_buffer(ip, &bh); if (ret == 0) { - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_dinode_out(ip, bh->b_data); brelse(bh); } @@ -888,13 +840,6 @@ static void gfs2_put_super(struct super_block *sb) int error; struct gfs2_jdesc *jd; - /* Unfreeze the filesystem, if we need to */ - - mutex_lock(&sdp->sd_freeze_lock); - if (sdp->sd_freeze_count) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); - mutex_unlock(&sdp->sd_freeze_lock); - /* No more recovery requests */ set_bit(SDF_NORECOVERY, &sdp->sd_flags); smp_mb(); @@ -985,7 +930,7 @@ static int gfs2_freeze(struct super_block *sb) return -EINVAL; for (;;) { - error = gfs2_freeze_fs(sdp); + error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); if (!error) break; @@ -1013,7 +958,9 @@ static int gfs2_freeze(struct super_block *sb) static int gfs2_unfreeze(struct super_block *sb) { - gfs2_unfreeze_fs(sb->s_fs_info); + struct gfs2_sbd *sdp = sb->s_fs_info; + + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); return 0; } @@ -1429,7 +1376,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) if (error) return error; - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) return error; @@ -1577,6 +1524,7 @@ out: /* Case 3 starts here */ truncate_inode_pages(&inode->i_data, 0); gfs2_rs_delete(ip); + gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); ip->i_gl->gl_object = NULL; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index a046468..90e3322 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -46,9 +46,6 @@ extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); -extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); -extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); - extern struct file_system_type gfs2_fs_type; extern struct file_system_type gfs2meta_fs_type; extern const struct export_operations gfs2_export_ops; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 0acbe2f..aa5c480 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -91,19 +91,15 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) { - unsigned int count; - - mutex_lock(&sdp->sd_freeze_lock); - count = sdp->sd_freeze_count; - mutex_unlock(&sdp->sd_freeze_lock); + struct super_block *sb = sdp->sd_vfs; + int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; - return snprintf(buf, PAGE_SIZE, "%u\n", count); + return snprintf(buf, PAGE_SIZE, "%u\n", frozen); } static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { - ssize_t ret = len; - int error = 0; + int error; int n = simple_strtol(buf, NULL, 0); if (!capable(CAP_SYS_ADMIN)) @@ -111,19 +107,21 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) switch (n) { case 0: - gfs2_unfreeze_fs(sdp); + error = thaw_super(sdp->sd_vfs); break; case 1: - error = gfs2_freeze_fs(sdp); + error = freeze_super(sdp->sd_vfs); break; default: - ret = -EINVAL; + return -EINVAL; } - if (error) + if (error) { fs_warn(sdp, "freeze %d error %d", n, error); + return error; + } - return ret; + return len; } static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) @@ -175,6 +173,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + struct kqid qid; int error; u32 id; @@ -183,13 +182,18 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, id = simple_strtoul(buf, NULL, 0); - error = gfs2_quota_refresh(sdp, 1, id); + qid = make_kqid(current_user_ns(), USRQUOTA, id); + if (!qid_valid(qid)) + return -EINVAL; + + error = gfs2_quota_refresh(sdp, qid); return error ? error : len; } static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + struct kqid qid; int error; u32 id; @@ -198,7 +202,11 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, id = simple_strtoul(buf, NULL, 0); - error = gfs2_quota_refresh(sdp, 0, id); + qid = make_kqid(current_user_ns(), GRPQUOTA, id); + if (!qid_valid(qid)) + return -EINVAL; + + error = gfs2_quota_refresh(sdp, qid); return error ? error : len; } @@ -332,6 +340,28 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) return ret; } +static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf) +{ + int val = completion_done(&sdp->sd_wdack) ? 1 : 0; + + return sprintf(buf, "%d\n", val); +} + +static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if ((val == 1) && + !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) + complete(&sdp->sd_wdack); + else + ret = -EINVAL; + return ret; +} + static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -463,7 +493,7 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); GDLM_ATTR(block, 0644, block_show, block_store); -GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(withdraw, 0644, wdack_show, wdack_store); GDLM_ATTR(jid, 0644, jid_show, jid_store); GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store); GDLM_ATTR(first_done, 0444, first_done_show, NULL); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 4136270..88162fa 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -18,6 +18,7 @@ #include "gfs2.h" #include "incore.h" #include "glock.h" +#include "inode.h" #include "log.h" #include "lops.h" #include "meta_io.h" @@ -142,44 +143,143 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) sb_end_intwrite(sdp->sd_vfs); } +static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, + struct buffer_head *bh, + const struct gfs2_log_operations *lops) +{ + struct gfs2_bufdata *bd; + + bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); + bd->bd_bh = bh; + bd->bd_gl = gl; + bd->bd_ops = lops; + INIT_LIST_HEAD(&bd->bd_list); + bh->b_private = bd; + return bd; +} + /** - * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction - * @gl: the glock the buffer belongs to + * gfs2_trans_add_data - Add a databuf to the transaction. + * @gl: The inode glock associated with the buffer * @bh: The buffer to add - * @meta: True in the case of adding metadata * + * This is used in two distinct cases: + * i) In ordered write mode + * We put the data buffer on a list so that we can ensure that its + * synced to disk at the right time + * ii) In journaled data mode + * We need to journal the data block in the same way as metadata in + * the functions above. The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. */ +void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) +{ + struct gfs2_trans *tr = current->journal_info; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = bh->b_page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_bufdata *bd; -void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) + if (!gfs2_is_jdata(ip)) { + gfs2_ordered_add_inode(ip); + return; + } + + lock_buffer(bh); + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd == NULL) { + gfs2_log_unlock(sdp); + unlock_buffer(bh); + if (bh->b_private == NULL) + bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops); + lock_buffer(bh); + gfs2_log_lock(sdp); + } + gfs2_assert(sdp, bd->bd_gl == gl); + tr->tr_touched = 1; + if (list_empty(&bd->bd_list)) { + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + gfs2_pin(sdp, bd->bd_bh); + tr->tr_num_databuf_new++; + sdp->sd_log_num_databuf++; + list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); + } + gfs2_log_unlock(sdp); + unlock_buffer(bh); +} + +static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { + struct gfs2_meta_header *mh; + struct gfs2_trans *tr; + + tr = current->journal_info; + tr->tr_touched = 1; + if (!list_empty(&bd->bd_list)) + return; + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; + if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { + printk(KERN_ERR + "Attempting to add uninitialised block to journal (inplace block=%lld)\n", + (unsigned long long)bd->bd_bh->b_blocknr); + BUG(); + } + gfs2_pin(sdp, bd->bd_bh); + mh->__pad0 = cpu_to_be64(0); + mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + sdp->sd_log_num_buf++; + list_add(&bd->bd_list, &sdp->sd_log_le_buf); + tr->tr_num_buf_new++; +} + +void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_bufdata *bd; lock_buffer(bh); gfs2_log_lock(sdp); bd = bh->b_private; - if (bd) - gfs2_assert(sdp, bd->bd_gl == gl); - else { + if (bd == NULL) { gfs2_log_unlock(sdp); unlock_buffer(bh); - gfs2_attach_bufdata(gl, bh, meta); - bd = bh->b_private; + lock_page(bh->b_page); + if (bh->b_private == NULL) + bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops); + unlock_page(bh->b_page); lock_buffer(bh); gfs2_log_lock(sdp); } - lops_add(sdp, bd); + gfs2_assert(sdp, bd->bd_gl == gl); + meta_lo_add(sdp, bd); gfs2_log_unlock(sdp); unlock_buffer(bh); } void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { + struct gfs2_glock *gl = bd->bd_gl; + struct gfs2_trans *tr = current->journal_info; + BUG_ON(!list_empty(&bd->bd_list)); BUG_ON(!list_empty(&bd->bd_ail_st_list)); BUG_ON(!list_empty(&bd->bd_ail_gl_list)); - lops_init_le(bd, &gfs2_revoke_lops); - lops_add(sdp, bd); + bd->bd_ops = &gfs2_revoke_lops; + tr->tr_touched = 1; + tr->tr_num_revoke++; + sdp->sd_log_num_revoke++; + atomic_inc(&gl->gl_revokes); + set_bit(GLF_LFLUSH, &gl->gl_flags); + list_add(&bd->bd_list, &sdp->sd_log_le_revoke); } void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index bf2ae9a..1e6e7da 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -39,7 +39,8 @@ extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes); extern void gfs2_trans_end(struct gfs2_sbd *sdp); -extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); +extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); +extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index f00d7c5..6402fb6 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -54,6 +54,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); + if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) + wait_for_completion(&sdp->sd_wdack); + if (lm->lm_unmount) { fs_err(sdp, "telling LM to unmount\n"); lm->lm_unmount(sdp); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 76c144b3..ecd37f3 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -270,7 +270,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, if (error) goto out_gunlock; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); dataptrs = GFS2_EA2DATAPTRS(ea); for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { @@ -309,7 +309,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -331,7 +331,7 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, if (error) return error; - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) goto out_alloc; @@ -509,7 +509,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, } if (din) { - gfs2_trans_add_bh(ip->i_gl, bh[x], 1); + gfs2_trans_add_meta(ip->i_gl, bh[x]); memcpy(pos, din, cp_size); din += sdp->sd_jbsize; } @@ -629,7 +629,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) return error; gfs2_trans_add_unrevoke(sdp, block, 1); *bhp = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, *bhp, 1); + gfs2_trans_add_meta(ip->i_gl, *bhp); gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); @@ -691,7 +691,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, return error; gfs2_trans_add_unrevoke(sdp, block, 1); bh = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); gfs2_add_inode_blocks(&ip->i_inode, 1); @@ -751,7 +751,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -834,7 +834,7 @@ static void ea_set_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_header *prev = el->el_prev; u32 len; - gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + gfs2_trans_add_meta(ip->i_gl, el->el_bh); if (!prev || !GFS2_EA_IS_STUFFED(ea)) { ea->ea_type = GFS2_EATYPE_UNUSED; @@ -872,7 +872,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, if (error) return error; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); if (es->ea_split) ea = ea_split_ea(ea); @@ -886,7 +886,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, if (error) goto out; ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); out: @@ -901,7 +901,7 @@ static int ea_set_simple_alloc(struct gfs2_inode *ip, struct gfs2_ea_header *ea = es->es_ea; int error; - gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1); + gfs2_trans_add_meta(ip->i_gl, es->es_bh); if (es->ea_split) ea = ea_split_ea(ea); @@ -997,7 +997,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out; } - gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_trans_add_meta(ip->i_gl, indbh); } else { u64 blk; unsigned int n = 1; @@ -1006,7 +1006,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, return error; gfs2_trans_add_unrevoke(sdp, blk, 1); indbh = gfs2_meta_new(ip->i_gl, blk); - gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_trans_add_meta(ip->i_gl, indbh); gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); gfs2_buffer_clear_tail(indbh, mh_size); @@ -1092,7 +1092,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) if (error) return error; - gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + gfs2_trans_add_meta(ip->i_gl, el->el_bh); if (prev) { u32 len; @@ -1109,7 +1109,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1265,7 +1265,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) if (GFS2_EA_IS_STUFFED(el.el_ea)) { error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); if (error == 0) { - gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); + gfs2_trans_add_meta(ip->i_gl, el.el_bh); memcpy(GFS2_EA2DATA(el.el_ea), data, GFS2_EA_DATA_LEN(el.el_ea)); } @@ -1352,7 +1352,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) if (error) goto out_gunlock; - gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_trans_add_meta(ip->i_gl, indbh); eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); bstart = 0; @@ -1384,7 +1384,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1434,7 +1434,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1461,7 +1461,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) if (error) return error; - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) return error; diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index b77c5bc..998e3a6 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -1,6 +1,6 @@ config HFS_FS - tristate "Apple Macintosh file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + tristate "Apple Macintosh file system support" + depends on BLOCK select NLS help If you say Y here, you will be able to mount Macintosh-formatted diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 5dc06c8..9edeeb0 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -147,7 +147,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode) /*if (le32_to_cpu(fnode->acl_size_l) || le16_to_cpu(fnode->acl_size_s)) { Some unknown structures like ACL may be in fnode, we'd better not overwrite them - hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino); + hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 structures", i->i_ino); } else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) { __le32 ea; if (!uid_eq(i->i_uid, hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3091d42..750c701 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -435,7 +435,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) trace_jbd2_commit_locking(journal, commit_transaction); stats.run.rs_wait = commit_transaction->t_max_wait; + stats.run.rs_request_delay = 0; stats.run.rs_locked = jiffies; + if (commit_transaction->t_requested) + stats.run.rs_request_delay = + jbd2_time_diff(commit_transaction->t_requested, + stats.run.rs_locked); stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, stats.run.rs_locked); @@ -1116,7 +1121,10 @@ restart_loop: */ spin_lock(&journal->j_history_lock); journal->j_stats.ts_tid++; + if (commit_transaction->t_requested) + journal->j_stats.ts_requested++; journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; journal->j_stats.run.rs_running += stats.run.rs_running; journal->j_stats.run.rs_locked += stats.run.rs_locked; journal->j_stats.run.rs_flushing += stats.run.rs_flushing; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index dbf41f9..ed10991 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -35,7 +35,6 @@ #include <linux/kthread.h> #include <linux/poison.h> #include <linux/proc_fs.h> -#include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/math64.h> #include <linux/hash.h> @@ -51,6 +50,14 @@ #include <asm/uaccess.h> #include <asm/page.h> +#ifdef CONFIG_JBD2_DEBUG +ushort jbd2_journal_enable_debug __read_mostly; +EXPORT_SYMBOL(jbd2_journal_enable_debug); + +module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644); +MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2"); +#endif + EXPORT_SYMBOL(jbd2_journal_extend); EXPORT_SYMBOL(jbd2_journal_stop); EXPORT_SYMBOL(jbd2_journal_lock_updates); @@ -513,6 +520,10 @@ int __jbd2_log_space_left(journal_t *journal) */ int __jbd2_log_start_commit(journal_t *journal, tid_t target) { + /* Return if the txn has already requested to be committed */ + if (journal->j_commit_request == target) + return 0; + /* * The only transaction we can possibly wait upon is the * currently running transaction (if it exists). Otherwise, @@ -529,6 +540,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) jbd_debug(1, "JBD2: requesting commit %d/%d\n", journal->j_commit_request, journal->j_commit_sequence); + journal->j_running_transaction->t_requested = jiffies; wake_up(&journal->j_wait_commit); return 1; } else if (!tid_geq(journal->j_commit_request, target)) @@ -894,13 +906,18 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v) if (v != SEQ_START_TOKEN) return 0; - seq_printf(seq, "%lu transaction, each up to %u blocks\n", - s->stats->ts_tid, - s->journal->j_max_transaction_buffers); + seq_printf(seq, "%lu transactions (%lu requested), " + "each up to %u blocks\n", + s->stats->ts_tid, s->stats->ts_requested, + s->journal->j_max_transaction_buffers); if (s->stats->ts_tid == 0) return 0; seq_printf(seq, "average: \n %ums waiting for transaction\n", jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); + seq_printf(seq, " %ums request delay\n", + (s->stats->ts_requested == 0) ? 0 : + jiffies_to_msecs(s->stats->run.rs_request_delay / + s->stats->ts_requested)); seq_printf(seq, " %ums running transaction\n", jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid)); seq_printf(seq, " %ums transaction was being locked\n", @@ -2485,45 +2502,6 @@ restart: spin_unlock(&journal->j_list_lock); } -/* - * debugfs tunables - */ -#ifdef CONFIG_JBD2_DEBUG -u8 jbd2_journal_enable_debug __read_mostly; -EXPORT_SYMBOL(jbd2_journal_enable_debug); - -#define JBD2_DEBUG_NAME "jbd2-debug" - -static struct dentry *jbd2_debugfs_dir; -static struct dentry *jbd2_debug; - -static void __init jbd2_create_debugfs_entry(void) -{ - jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); - if (jbd2_debugfs_dir) - jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, - S_IRUGO | S_IWUSR, - jbd2_debugfs_dir, - &jbd2_journal_enable_debug); -} - -static void __exit jbd2_remove_debugfs_entry(void) -{ - debugfs_remove(jbd2_debug); - debugfs_remove(jbd2_debugfs_dir); -} - -#else - -static void __init jbd2_create_debugfs_entry(void) -{ -} - -static void __exit jbd2_remove_debugfs_entry(void) -{ -} - -#endif #ifdef CONFIG_PROC_FS @@ -2609,7 +2587,6 @@ static int __init journal_init(void) ret = journal_init_caches(); if (ret == 0) { - jbd2_create_debugfs_entry(); jbd2_create_jbd_stats_proc_entry(); } else { jbd2_journal_destroy_caches(); @@ -2624,7 +2601,6 @@ static void __exit journal_exit(void) if (n) printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n); #endif - jbd2_remove_debugfs_entry(); jbd2_remove_jbd_stats_proc_entry(); jbd2_journal_destroy_caches(); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index df9f297..b7e2385 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -30,6 +30,8 @@ #include <linux/bug.h> #include <linux/module.h> +#include <trace/events/jbd2.h> + static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); static void __jbd2_journal_unfile_buffer(struct journal_head *jh); @@ -100,6 +102,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) journal->j_running_transaction = transaction; transaction->t_max_wait = 0; transaction->t_start = jiffies; + transaction->t_requested = 0; return transaction; } @@ -306,6 +309,8 @@ repeat: */ update_t_max_wait(transaction, ts); handle->h_transaction = transaction; + handle->h_requested_credits = nblocks; + handle->h_start_jiffies = jiffies; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", @@ -352,7 +357,8 @@ static handle_t *new_handle(int nblocks) * Return a pointer to a newly allocated handle, or an ERR_PTR() value * on failure. */ -handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, + unsigned int type, unsigned int line_no) { handle_t *handle = journal_current_handle(); int err; @@ -378,6 +384,11 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) current->journal_info = NULL; handle = ERR_PTR(err); } + handle->h_type = type; + handle->h_line_no = line_no; + trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, + handle->h_transaction->t_tid, type, + line_no, nblocks); return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -385,7 +396,7 @@ EXPORT_SYMBOL(jbd2__journal_start); handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { - return jbd2__journal_start(journal, nblocks, GFP_NOFS); + return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0); } EXPORT_SYMBOL(jbd2_journal_start); @@ -447,7 +458,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) goto unlock; } + trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, + handle->h_transaction->t_tid, + handle->h_type, handle->h_line_no, + handle->h_buffer_credits, + nblocks); + handle->h_buffer_credits += nblocks; + handle->h_requested_credits += nblocks; atomic_add(nblocks, &transaction->t_outstanding_credits); result = 0; @@ -1376,6 +1394,13 @@ int jbd2_journal_stop(handle_t *handle) } jbd_debug(4, "Handle %p going down\n", handle); + trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, + handle->h_transaction->t_tid, + handle->h_type, handle->h_line_no, + jiffies - handle->h_start_jiffies, + handle->h_sync, handle->h_requested_credits, + (handle->h_requested_credits - + handle->h_buffer_credits)); /* * Implement synchronous transaction batching. If the handle diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig index 6ae169c..d8bb6c4 100644 --- a/fs/jffs2/Kconfig +++ b/fs/jffs2/Kconfig @@ -50,8 +50,8 @@ config JFFS2_FS_WBUF_VERIFY write-buffer, and check for errors. config JFFS2_SUMMARY - bool "JFFS2 summary support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL + bool "JFFS2 summary support" + depends on JFFS2_FS default n help This feature makes it possible to use summary information @@ -63,8 +63,8 @@ config JFFS2_SUMMARY If unsure, say 'N'. config JFFS2_FS_XATTR - bool "JFFS2 XATTR support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL + bool "JFFS2 XATTR support" + depends on JFFS2_FS default n help Extended attributes are name:value pairs associated with inodes by @@ -173,7 +173,7 @@ config JFFS2_CMODE_PRIORITY successful one. config JFFS2_CMODE_SIZE - bool "size (EXPERIMENTAL)" + bool "size" help Tries all compressors and chooses the one which has the smallest result. diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 1a543be..060ba63 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -154,7 +154,7 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) /* * If we really return the number of allocated & free inodes, some * applications will fail because they won't see enough free inodes. - * We'll try to calculate some guess as to how may inodes we can + * We'll try to calculate some guess as to how many inodes we can * really allocate * * buf->f_files = atomic_read(&imap->im_numinos); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index d7e1ec1..3662771 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -550,6 +550,9 @@ again: status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); if (status < 0) break; + /* Resend the blocking lock request after a server reboot */ + if (resp->status == nlm_lck_denied_grace_period) + continue; if (resp->status != nlm_lck_blocked) break; } diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig index daf9a9b..09ed066 100644 --- a/fs/logfs/Kconfig +++ b/fs/logfs/Kconfig @@ -1,6 +1,6 @@ config LOGFS - tristate "LogFS file system (EXPERIMENTAL)" - depends on (MTD || BLOCK) && EXPERIMENTAL + tristate "LogFS file system" + depends on (MTD || BLOCK) select ZLIB_INFLATE select ZLIB_DEFLATE select CRC32 diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index c41e029..7dafd6899 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -331,12 +331,15 @@ static int ncp_show_options(struct seq_file *seq, struct dentry *root) struct ncp_server *server = NCP_SBP(root->d_sb); unsigned int tmp; - if (server->m.uid != 0) - seq_printf(seq, ",uid=%u", server->m.uid); - if (server->m.gid != 0) - seq_printf(seq, ",gid=%u", server->m.gid); - if (server->m.mounted_uid != 0) - seq_printf(seq, ",owner=%u", server->m.mounted_uid); + if (!uid_eq(server->m.uid, GLOBAL_ROOT_UID)) + seq_printf(seq, ",uid=%u", + from_kuid_munged(&init_user_ns, server->m.uid)); + if (!gid_eq(server->m.gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", + from_kgid_munged(&init_user_ns, server->m.gid)); + if (!uid_eq(server->m.mounted_uid, GLOBAL_ROOT_UID)) + seq_printf(seq, ",owner=%u", + from_kuid_munged(&init_user_ns, server->m.mounted_uid)); tmp = server->m.file_mode & S_IALLUGO; if (tmp != NCP_DEFAULT_FILE_MODE) seq_printf(seq, ",mode=0%o", tmp); @@ -381,13 +384,13 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) data->flags = 0; data->int_flags = 0; - data->mounted_uid = 0; + data->mounted_uid = GLOBAL_ROOT_UID; data->wdog_pid = NULL; data->ncp_fd = ~0; data->time_out = NCP_DEFAULT_TIME_OUT; data->retry_count = NCP_DEFAULT_RETRY_COUNT; - data->uid = 0; - data->gid = 0; + data->uid = GLOBAL_ROOT_UID; + data->gid = GLOBAL_ROOT_GID; data->file_mode = NCP_DEFAULT_FILE_MODE; data->dir_mode = NCP_DEFAULT_DIR_MODE; data->info_fd = -1; @@ -399,13 +402,19 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) goto err; switch (optval) { case 'u': - data->uid = optint; + data->uid = make_kuid(current_user_ns(), optint); + if (!uid_valid(data->uid)) + goto err; break; case 'g': - data->gid = optint; + data->gid = make_kgid(current_user_ns(), optint); + if (!gid_valid(data->gid)) + goto err; break; case 'o': - data->mounted_uid = optint; + data->mounted_uid = make_kuid(current_user_ns(), optint); + if (!uid_valid(data->mounted_uid)) + goto err; break; case 'm': data->file_mode = optint; @@ -480,13 +489,13 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) data.flags = md->flags; data.int_flags = NCP_IMOUNT_LOGGEDIN_POSSIBLE; - data.mounted_uid = md->mounted_uid; + data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid); data.wdog_pid = find_get_pid(md->wdog_pid); data.ncp_fd = md->ncp_fd; data.time_out = md->time_out; data.retry_count = md->retry_count; - data.uid = md->uid; - data.gid = md->gid; + data.uid = make_kuid(current_user_ns(), md->uid); + data.gid = make_kgid(current_user_ns(), md->gid); data.file_mode = md->file_mode; data.dir_mode = md->dir_mode; data.info_fd = -1; @@ -499,13 +508,13 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data; data.flags = md->flags; - data.mounted_uid = md->mounted_uid; + data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid); data.wdog_pid = find_get_pid(md->wdog_pid); data.ncp_fd = md->ncp_fd; data.time_out = md->time_out; data.retry_count = md->retry_count; - data.uid = md->uid; - data.gid = md->gid; + data.uid = make_kuid(current_user_ns(), md->uid); + data.gid = make_kgid(current_user_ns(), md->gid); data.file_mode = md->file_mode; data.dir_mode = md->dir_mode; data.info_fd = -1; @@ -520,6 +529,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) goto out; break; } + error = -EINVAL; + if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) || + !gid_valid(data.gid)) + goto out; error = -EBADF; ncp_filp = fget(data.ncp_fd); if (!ncp_filp) @@ -886,12 +899,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) goto out; result = -EPERM; - if (((attr->ia_valid & ATTR_UID) && - (attr->ia_uid != server->m.uid))) + if ((attr->ia_valid & ATTR_UID) && !uid_eq(attr->ia_uid, server->m.uid)) goto out; - if (((attr->ia_valid & ATTR_GID) && - (attr->ia_gid != server->m.gid))) + if ((attr->ia_valid & ATTR_GID) && !gid_eq(attr->ia_gid, server->m.gid)) goto out; if (((attr->ia_valid & ATTR_MODE) && diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 811d411..60426cc 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -45,7 +45,7 @@ ncp_get_fs_info(struct ncp_server * server, struct inode *inode, return -EINVAL; } /* TODO: info.addr = server->m.serv_addr; */ - SET_UID(info.mounted_uid, server->m.mounted_uid); + SET_UID(info.mounted_uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid)); info.connection = server->connection; info.buffer_size = server->buffer_size; info.volume_number = NCP_FINFO(inode)->volNumber; @@ -69,7 +69,7 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode, DPRINTK("info.version invalid: %d\n", info2.version); return -EINVAL; } - info2.mounted_uid = server->m.mounted_uid; + info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); info2.connection = server->connection; info2.buffer_size = server->buffer_size; info2.volume_number = NCP_FINFO(inode)->volNumber; @@ -135,7 +135,7 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode, DPRINTK("info.version invalid: %d\n", info2.version); return -EINVAL; } - info2.mounted_uid = server->m.mounted_uid; + info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); info2.connection = server->connection; info2.buffer_size = server->buffer_size; info2.volume_number = NCP_FINFO(inode)->volNumber; @@ -348,22 +348,25 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg { u16 uid; - SET_UID(uid, server->m.mounted_uid); + SET_UID(uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid)); if (put_user(uid, (u16 __user *)argp)) return -EFAULT; return 0; } case NCP_IOC_GETMOUNTUID32: - if (put_user(server->m.mounted_uid, - (u32 __user *)argp)) + { + uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); + if (put_user(uid, (u32 __user *)argp)) return -EFAULT; return 0; + } case NCP_IOC_GETMOUNTUID64: - if (put_user(server->m.mounted_uid, - (u64 __user *)argp)) + { + uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); + if (put_user(uid, (u64 __user *)argp)) return -EFAULT; return 0; - + } case NCP_IOC_GETROOT: { struct ncp_setroot_ioctl sr; @@ -810,7 +813,7 @@ long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct ncp_server *server = NCP_SERVER(inode); - uid_t uid = current_uid(); + kuid_t uid = current_uid(); int need_drop_write = 0; long ret; @@ -824,7 +827,7 @@ long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } break; } - if (server->m.mounted_uid != uid) { + if (!uid_eq(server->m.mounted_uid, uid)) { switch (cmd) { /* * Only mount owner can issue these ioctls. Information diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index 54cc0cd..c51b2c5 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -23,15 +23,15 @@ struct ncp_mount_data_kernel { unsigned long flags; /* NCP_MOUNT_* flags */ unsigned int int_flags; /* internal flags */ #define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001 - uid_t mounted_uid; /* Who may umount() this filesystem? */ + kuid_t mounted_uid; /* Who may umount() this filesystem? */ struct pid *wdog_pid; /* Who cares for our watchdog packets? */ unsigned int ncp_fd; /* The socket to the ncp port */ unsigned int time_out; /* How long should I wait after sending a NCP request? */ unsigned int retry_count; /* And how often should I retry? */ unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; umode_t file_mode; umode_t dir_mode; int info_fd; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 4fa788c..434b93e 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -1273,6 +1273,7 @@ static const struct nfs_pageio_ops bl_pg_write_ops = { static struct pnfs_layoutdriver_type blocklayout_type = { .id = LAYOUT_BLOCK_VOLUME, .name = "LAYOUT_BLOCK_VOLUME", + .owner = THIS_MODULE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, .alloc_layout_hdr = bl_alloc_layout_hdr, diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 264d1aa..2960512 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -183,60 +183,15 @@ static u32 initiate_file_draining(struct nfs_client *clp, static u32 initiate_bulk_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { - struct nfs_server *server; - struct pnfs_layout_hdr *lo; - struct inode *ino; - u32 rv = NFS4ERR_NOMATCHING_LAYOUT; - struct pnfs_layout_hdr *tmp; - LIST_HEAD(recall_list); - LIST_HEAD(free_me_list); - struct pnfs_layout_range range = { - .iomode = IOMODE_ANY, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; - - spin_lock(&clp->cl_lock); - rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - if ((args->cbl_recall_type == RETURN_FSID) && - memcmp(&server->fsid, &args->cbl_fsid, - sizeof(struct nfs_fsid))) - continue; + int stat; - list_for_each_entry(lo, &server->layouts, plh_layouts) { - ino = igrab(lo->plh_inode); - if (!ino) - continue; - spin_lock(&ino->i_lock); - /* Is this layout in the process of being freed? */ - if (NFS_I(ino)->layout != lo) { - spin_unlock(&ino->i_lock); - iput(ino); - continue; - } - pnfs_get_layout_hdr(lo); - spin_unlock(&ino->i_lock); - list_add(&lo->plh_bulk_recall, &recall_list); - } - } - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - - list_for_each_entry_safe(lo, tmp, - &recall_list, plh_bulk_recall) { - ino = lo->plh_inode; - spin_lock(&ino->i_lock); - set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range)) - rv = NFS4ERR_DELAY; - list_del_init(&lo->plh_bulk_recall); - spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&free_me_list); - pnfs_put_layout_hdr(lo); - iput(ino); - } - return rv; + if (args->cbl_recall_type == RETURN_FSID) + stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true); + else + stat = pnfs_destroy_layouts_byclid(clp, true); + if (stat != 0) + return NFS4ERR_DELAY; + return NFS4ERR_NOMATCHING_LAYOUT; } static u32 do_callback_layoutrecall(struct nfs_client *clp, diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 81c5eec..6390a4b 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -55,7 +55,8 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags) flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL && (delegation->type & flags) == flags) { + if (delegation != NULL && (delegation->type & flags) == flags && + !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { nfs_mark_delegation_referenced(delegation); ret = 1; } @@ -70,8 +71,10 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ int status = 0; if (inode->i_flock == NULL) - goto out; + return 0; + if (inode->i_flock == NULL) + goto out; /* Protect inode->i_flock using the file locks lock */ lock_flocks(); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { @@ -94,7 +97,9 @@ static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *s { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; + struct nfs4_state_owner *sp; struct nfs4_state *state; + unsigned int seq; int err; again: @@ -109,9 +114,16 @@ again: continue; get_nfs_open_context(ctx); spin_unlock(&inode->i_lock); + sp = state->owner; + /* Block nfs4_proc_unlck */ + mutex_lock(&sp->so_delegreturn_mutex); + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid); - if (err >= 0) + if (!err) err = nfs_delegation_claim_locks(ctx, state); + if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) + err = -EAGAIN; + mutex_unlock(&sp->so_delegreturn_mutex); put_nfs_open_context(ctx); if (err != 0) return err; @@ -182,39 +194,91 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation } static struct nfs_delegation * +nfs_start_delegation_return_locked(struct nfs_inode *nfsi) +{ + struct nfs_delegation *ret = NULL; + struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); + + if (delegation == NULL) + goto out; + spin_lock(&delegation->lock); + if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + ret = delegation; + spin_unlock(&delegation->lock); +out: + return ret; +} + +static struct nfs_delegation * +nfs_start_delegation_return(struct nfs_inode *nfsi) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = nfs_start_delegation_return_locked(nfsi); + rcu_read_unlock(); + return delegation; +} + +static void +nfs_abort_delegation_return(struct nfs_delegation *delegation, + struct nfs_client *clp) +{ + + spin_lock(&delegation->lock); + clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + spin_unlock(&delegation->lock); + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); +} + +static struct nfs_delegation * nfs_detach_delegation_locked(struct nfs_inode *nfsi, - struct nfs_server *server) + struct nfs_delegation *delegation, + struct nfs_client *clp) { - struct nfs_delegation *delegation = + struct nfs_delegation *deleg_cur = rcu_dereference_protected(nfsi->delegation, - lockdep_is_held(&server->nfs_client->cl_lock)); + lockdep_is_held(&clp->cl_lock)); - if (delegation == NULL) - goto nomatch; + if (deleg_cur == NULL || delegation != deleg_cur) + return NULL; spin_lock(&delegation->lock); + set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); list_del_rcu(&delegation->super_list); delegation->inode = NULL; nfsi->delegation_state = 0; rcu_assign_pointer(nfsi->delegation, NULL); spin_unlock(&delegation->lock); return delegation; -nomatch: - return NULL; } static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, - struct nfs_server *server) + struct nfs_delegation *delegation, + struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; - struct nfs_delegation *delegation; spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, server); + delegation = nfs_detach_delegation_locked(nfsi, delegation, clp); spin_unlock(&clp->cl_lock); return delegation; } +static struct nfs_delegation * +nfs_inode_detach_delegation(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_delegation *delegation; + + delegation = nfs_start_delegation_return(nfsi); + if (delegation == NULL) + return NULL; + return nfs_detach_delegation(nfsi, delegation, server); +} + /** * nfs_inode_set_delegation - set up a delegation on an inode * @inode: inode to which delegation applies @@ -268,7 +332,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, server); + freeme = nfs_detach_delegation_locked(nfsi, + old_delegation, clp); + if (freeme == NULL) + goto out; } list_add_rcu(&delegation->super_list, &server->delegations); nfsi->delegation_state = delegation->type; @@ -292,19 +359,29 @@ out: /* * Basic procedure for returning a delegation to the server */ -static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); int err; - /* - * Guard against new delegated open/lock/unlock calls and against - * state recovery - */ - down_write(&nfsi->rwsem); - err = nfs_delegation_claim_opens(inode, &delegation->stateid); - up_write(&nfsi->rwsem); - if (err) + if (delegation == NULL) + return 0; + do { + err = nfs_delegation_claim_opens(inode, &delegation->stateid); + if (!issync || err != -EAGAIN) + break; + /* + * Guard against state recovery + */ + err = nfs4_wait_clnt_recover(clp); + } while (err == 0); + + if (err) { + nfs_abort_delegation_return(delegation, clp); + goto out; + } + if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode))) goto out; err = nfs_do_return_delegation(inode, delegation, issync); @@ -340,13 +417,10 @@ restart: inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) continue; - delegation = nfs_detach_delegation(NFS_I(inode), - server); + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); - if (delegation != NULL) - err = __nfs_inode_return_delegation(inode, - delegation, 0); + err = nfs_end_delegation_return(inode, delegation, 0); iput(inode); if (!err) goto restart; @@ -367,15 +441,11 @@ restart: */ void nfs_inode_return_delegation_noreclaim(struct inode *inode) { - struct nfs_server *server = NFS_SERVER(inode); - struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - if (rcu_access_pointer(nfsi->delegation) != NULL) { - delegation = nfs_detach_delegation(nfsi, server); - if (delegation != NULL) - nfs_do_return_delegation(inode, delegation, 0); - } + delegation = nfs_inode_detach_delegation(inode); + if (delegation != NULL) + nfs_do_return_delegation(inode, delegation, 0); } /** @@ -390,18 +460,14 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) */ int nfs4_inode_return_delegation(struct inode *inode) { - struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; int err = 0; nfs_wb_all(inode); - if (rcu_access_pointer(nfsi->delegation) != NULL) { - delegation = nfs_detach_delegation(nfsi, server); - if (delegation != NULL) { - err = __nfs_inode_return_delegation(inode, delegation, 1); - } - } + delegation = nfs_start_delegation_return(nfsi); + if (delegation != NULL) + err = nfs_end_delegation_return(inode, delegation, 1); return err; } @@ -471,7 +537,7 @@ void nfs_remove_bad_delegation(struct inode *inode) { struct nfs_delegation *delegation; - delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode)); + delegation = nfs_inode_detach_delegation(inode); if (delegation) { nfs_inode_find_state_and_recover(inode, &delegation->stateid); nfs_free_delegation(delegation); @@ -649,7 +715,7 @@ restart: if (inode == NULL) continue; delegation = nfs_detach_delegation(NFS_I(inode), - server); + delegation, server); rcu_read_unlock(); if (delegation != NULL) diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index bbc6a4d..d54d4fc 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -29,6 +29,7 @@ enum { NFS_DELEGATION_NEED_RECLAIM = 0, NFS_DELEGATION_RETURN, NFS_DELEGATION_REFERENCED, + NFS_DELEGATION_RETURNING, }; int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 033803c..44efaa8 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -126,8 +126,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, } spin_unlock(&ret->d_lock); out: - if (name) - kfree(name); + kfree(name); nfs_free_fattr(fsinfo.fattr); return ret; } diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 2ad8dea..dc0f98d 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -97,7 +97,7 @@ static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) { struct nfs4_string *owner = fattr->owner_name; - __u32 uid; + kuid_t uid; if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) return false; @@ -111,7 +111,7 @@ static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) { struct nfs4_string *group = fattr->group_name; - __u32 gid; + kgid_t gid; if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) return false; @@ -193,7 +193,8 @@ static int nfs_idmap_init_keyring(void) if (!cred) return -ENOMEM; - keyring = keyring_alloc(".id_resolver", 0, 0, cred, + keyring = keyring_alloc(".id_resolver", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL); @@ -836,43 +837,61 @@ idmap_release_pipe(struct inode *inode) nfs_idmap_abort_pipe_upcall(idmap, -EPIPE); } -int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid) { struct idmap *idmap = server->nfs_client->cl_idmap; + __u32 id = -1; + int ret = 0; - if (nfs_map_string_to_numeric(name, namelen, uid)) - return 0; - return nfs_idmap_lookup_id(name, namelen, "uid", uid, idmap); + if (!nfs_map_string_to_numeric(name, namelen, &id)) + ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); + if (ret == 0) { + *uid = make_kuid(&init_user_ns, id); + if (!uid_valid(*uid)) + ret = -ERANGE; + } + return ret; } -int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid) { struct idmap *idmap = server->nfs_client->cl_idmap; + __u32 id = -1; + int ret = 0; - if (nfs_map_string_to_numeric(name, namelen, gid)) - return 0; - return nfs_idmap_lookup_id(name, namelen, "gid", gid, idmap); + if (!nfs_map_string_to_numeric(name, namelen, &id)) + ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); + if (ret == 0) { + *gid = make_kgid(&init_user_ns, id); + if (!gid_valid(*gid)) + ret = -ERANGE; + } + return ret; } -int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) +int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; int ret = -EINVAL; + __u32 id; + id = from_kuid(&init_user_ns, uid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) - ret = nfs_idmap_lookup_name(uid, "user", buf, buflen, idmap); + ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); if (ret < 0) - ret = nfs_map_numeric_to_string(uid, buf, buflen); + ret = nfs_map_numeric_to_string(id, buf, buflen); return ret; } -int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) +int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; int ret = -EINVAL; + __u32 id; + id = from_kgid(&init_user_ns, gid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) - ret = nfs_idmap_lookup_name(gid, "group", buf, buflen, idmap); + ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); if (ret < 0) - ret = nfs_map_numeric_to_string(gid, buf, buflen); + ret = nfs_map_numeric_to_string(id, buf, buflen); return ret; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 548ae31..b586fe9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -332,8 +332,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_version = 0; inode->i_size = 0; clear_nlink(inode); - inode->i_uid = -2; - inode->i_gid = -2; + inode->i_uid = make_kuid(&init_user_ns, -2); + inode->i_gid = make_kgid(&init_user_ns, -2); inode->i_blocks = 0; memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); nfsi->write_io = 0; @@ -694,10 +694,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) if (ctx->cred != NULL) put_rpccred(ctx->cred); dput(ctx->dentry); - if (is_sync) - nfs_sb_deactive(sb); - else - nfs_sb_deactive_async(sb); + nfs_sb_deactive(sb); kfree(ctx->mdsthreshold); kfree(ctx); } @@ -1009,9 +1006,9 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; - if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) + if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; - if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) + if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid)) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; /* Has the link count changed? */ @@ -1440,7 +1437,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_OWNER) { - if (inode->i_uid != fattr->uid) { + if (!uid_eq(inode->i_uid, fattr->uid)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; } @@ -1451,7 +1448,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_GROUP) { - if (inode->i_gid != fattr->gid) { + if (!gid_eq(inode->i_gid, fattr->gid)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f0e6c7d..541c9eb 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -329,7 +329,6 @@ extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern void nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); -extern void nfs_sb_deactive_async(struct super_block *sb); /* namespace.c */ #define NFS_PATH_CANONICAL 1 diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index dd057bc..fc8dc20 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -177,11 +177,31 @@ out_nofree: return mnt; } +static int +nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + if (NFS_FH(dentry->d_inode)->size != 0) + return nfs_getattr(mnt, dentry, stat); + generic_fillattr(dentry->d_inode, stat); + return 0; +} + +static int +nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) +{ + if (NFS_FH(dentry->d_inode)->size != 0) + return nfs_setattr(dentry, attr); + return -EACCES; +} + const struct inode_operations nfs_mountpoint_inode_operations = { .getattr = nfs_getattr, + .setattr = nfs_setattr, }; const struct inode_operations nfs_referral_inode_operations = { + .getattr = nfs_namespace_getattr, + .setattr = nfs_namespace_setattr, }; static void nfs_expire_automounts(struct work_struct *work) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 06b9df4..62db136 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -290,8 +290,13 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->mode = be32_to_cpup(p++); fattr->nlink = be32_to_cpup(p++); - fattr->uid = be32_to_cpup(p++); - fattr->gid = be32_to_cpup(p++); + fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); + if (!uid_valid(fattr->uid)) + goto out_uid; + fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); + if (!gid_valid(fattr->gid)) + goto out_gid; + fattr->size = be32_to_cpup(p++); fattr->du.nfs2.blocksize = be32_to_cpup(p++); @@ -313,6 +318,12 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime); return 0; +out_uid: + dprintk("NFS: returned invalid uid\n"); + return -EINVAL; +out_gid: + dprintk("NFS: returned invalid gid\n"); + return -EINVAL; out_overflow: print_overflow_msg(__func__, xdr); return -EIO; @@ -351,11 +362,11 @@ static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_UID) - *p++ = cpu_to_be32(attr->ia_uid); + *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_GID) - *p++ = cpu_to_be32(attr->ia_gid); + *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_SIZE) diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index bffc324..fa6d721 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -592,13 +592,13 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) if (attr->ia_valid & ATTR_UID) { *p++ = xdr_one; - *p++ = cpu_to_be32(attr->ia_uid); + *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); } else *p++ = xdr_zero; if (attr->ia_valid & ATTR_GID) { *p++ = xdr_one; - *p++ = cpu_to_be32(attr->ia_gid); + *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); } else *p++ = xdr_zero; @@ -657,8 +657,12 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode; fattr->nlink = be32_to_cpup(p++); - fattr->uid = be32_to_cpup(p++); - fattr->gid = be32_to_cpup(p++); + fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); + if (!uid_valid(fattr->uid)) + goto out_uid; + fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); + if (!gid_valid(fattr->gid)) + goto out_gid; p = xdr_decode_size3(p, &fattr->size); p = xdr_decode_size3(p, &fattr->du.nfs3.used); @@ -675,6 +679,12 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->valid |= NFS_ATTR_FATTR_V3; return 0; +out_uid: + dprintk("NFS: returned invalid uid\n"); + return -EINVAL; +out_gid: + dprintk("NFS: returned invalid gid\n"); + return -EINVAL; out_overflow: print_overflow_msg(__func__, xdr); return -EIO; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a3f488b..944c9a5 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -13,6 +13,8 @@ #define NFS4_MAX_LOOP_ON_RECOVER (10) +#include <linux/seqlock.h> + struct idmap; enum nfs4_client_state { @@ -90,6 +92,8 @@ struct nfs4_state_owner { unsigned long so_flags; struct list_head so_states; struct nfs_seqid_counter so_seqid; + seqcount_t so_reclaim_seqcount; + struct mutex so_delegreturn_mutex; }; enum { diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index acc3472..2e9779b 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -236,11 +236,10 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; + nfs_put_client(clp); if (clp != old) { clp->cl_preserve_clid = true; - nfs_put_client(clp); clp = old; - atomic_inc(&clp->cl_count); } return clp; @@ -306,7 +305,7 @@ int nfs40_walk_client_list(struct nfs_client *new, .clientid = new->cl_clientid, .confirm = new->cl_confirm, }; - int status; + int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { @@ -332,40 +331,33 @@ int nfs40_walk_client_list(struct nfs_client *new, if (prev) nfs_put_client(prev); + prev = pos; status = nfs4_proc_setclientid_confirm(pos, &clid, cred); - if (status == 0) { + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + break; + case 0: nfs4_swap_callback_idents(pos, new); - nfs_put_client(pos); + prev = NULL; *result = pos; dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); - return 0; - } - if (status != -NFS4ERR_STALE_CLIENTID) { - nfs_put_client(pos); - dprintk("NFS: <-- %s status = %d, no result\n", - __func__, status); - return status; + default: + goto out; } spin_lock(&nn->nfs_client_lock); - prev = pos; } + spin_unlock(&nn->nfs_client_lock); - /* - * No matching nfs_client found. This should be impossible, - * because the new nfs_client has already been added to - * nfs_client_list by nfs_get_client(). - * - * Don't BUG(), since the caller is holding a mutex. - */ + /* No match found. The server lost our clientid */ +out: if (prev) nfs_put_client(prev); - spin_unlock(&nn->nfs_client_lock); - pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); - return -NFS4ERR_STALE_CLIENTID; + dprintk("NFS: <-- %s status = %d\n", __func__, status); + return status; } #ifdef CONFIG_NFS_V4_1 @@ -432,7 +424,7 @@ int nfs41_walk_client_list(struct nfs_client *new, { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); struct nfs_client *pos, *n, *prev = NULL; - int error; + int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { @@ -448,14 +440,17 @@ int nfs41_walk_client_list(struct nfs_client *new, nfs_put_client(prev); prev = pos; - error = nfs_wait_client_init_complete(pos); - if (error < 0) { + nfs4_schedule_lease_recovery(pos); + status = nfs_wait_client_init_complete(pos); + if (status < 0) { nfs_put_client(pos); spin_lock(&nn->nfs_client_lock); continue; } - + status = pos->cl_cons_state; spin_lock(&nn->nfs_client_lock); + if (status < 0) + continue; } if (pos->rpc_ops != new->rpc_ops) @@ -473,6 +468,7 @@ int nfs41_walk_client_list(struct nfs_client *new, if (!nfs4_match_serverowners(pos, new)) continue; + atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); @@ -481,16 +477,10 @@ int nfs41_walk_client_list(struct nfs_client *new, return 0; } - /* - * No matching nfs_client found. This should be impossible, - * because the new nfs_client has already been added to - * nfs_client_list by nfs_get_client(). - * - * Don't BUG(), since the caller is holding a mutex. - */ + /* No matching nfs_client found. */ spin_unlock(&nn->nfs_client_lock); - pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); - return -NFS4ERR_STALE_CLIENTID; + dprintk("NFS: <-- %s status = %d\n", __func__, status); + return status; } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cf747ef..eae83bf 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -896,6 +896,8 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) return 0; if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) return 0; + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + return 0; nfs_mark_delegation_referenced(delegation); return 1; } @@ -973,6 +975,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat spin_lock(&deleg_cur->lock); if (nfsi->delegation != deleg_cur || + test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) || (deleg_cur->type & fmode) != fmode) goto no_delegation_unlock; @@ -1352,19 +1355,18 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + err = -EAGAIN; goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); + err = -EAGAIN; goto out; - case -ERESTARTSYS: - /* - * The show must go on: exit, but mark the - * stateid as needing recovery. - */ case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: @@ -1375,6 +1377,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state err = 0; goto out; } + set_bit(NFS_DELEGATED_STATE, &state->flags); err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); out: @@ -1463,7 +1466,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) struct nfs4_state_owner *sp = data->owner; if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) - return; + goto out_wait; /* * Check if we still need to send an OPEN call, or if we can use * a delegation instead. @@ -1498,6 +1501,7 @@ unlock_no_action: rcu_read_unlock(); out_no_action: task->tk_action = NULL; +out_wait: nfs4_sequence_done(task, &data->o_res.seq_res); } @@ -1845,6 +1849,43 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct sattr->ia_valid |= ATTR_MTIME; } +static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, + fmode_t fmode, + int flags, + struct nfs4_state **res) +{ + struct nfs4_state_owner *sp = opendata->owner; + struct nfs_server *server = sp->so_server; + struct nfs4_state *state; + unsigned int seq; + int ret; + + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + + ret = _nfs4_proc_open(opendata); + if (ret != 0) + goto out; + + state = nfs4_opendata_to_nfs4_state(opendata); + ret = PTR_ERR(state); + if (IS_ERR(state)) + goto out; + if (server->caps & NFS_CAP_POSIX_LOCK) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + + ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); + if (ret != 0) + goto out; + + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { + nfs4_schedule_stateid_recovery(server, state); + nfs4_wait_clnt_recover(server->nfs_client); + } + *res = state; +out: + return ret; +} + /* * Returns a referenced nfs4_state */ @@ -1889,18 +1930,7 @@ static int _nfs4_do_open(struct inode *dir, if (dentry->d_inode != NULL) opendata->state = nfs4_get_open_state(dentry->d_inode, sp); - status = _nfs4_proc_open(opendata); - if (status != 0) - goto err_opendata_put; - - state = nfs4_opendata_to_nfs4_state(opendata); - status = PTR_ERR(state); - if (IS_ERR(state)) - goto err_opendata_put; - if (server->caps & NFS_CAP_POSIX_LOCK) - set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); - - status = nfs4_opendata_access(cred, opendata, state, fmode, flags); + status = _nfs4_open_and_get_state(opendata, fmode, flags, &state); if (status != 0) goto err_opendata_put; @@ -2088,7 +2118,7 @@ static void nfs4_free_closedata(void *data) nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); - nfs_sb_deactive_async(sb); + nfs_sb_deactive(sb); kfree(calldata); } @@ -2150,7 +2180,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) - return; + goto out_wait; task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; calldata->arg.fmode = FMODE_READ|FMODE_WRITE; @@ -2172,16 +2202,14 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (!call_close) { /* Note: exit _without_ calling nfs4_close_done */ - task->tk_action = NULL; - nfs4_sequence_done(task, &calldata->res.seq_res); - goto out; + goto out_no_action; } if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; if (calldata->roc && pnfs_roc_drain(inode, &calldata->roc_barrier, task)) - goto out; + goto out_wait; } nfs_fattr_init(calldata->res.fattr); @@ -2191,8 +2219,12 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); -out: dprintk("%s: done!\n", __func__); + return; +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &calldata->res.seq_res); } static const struct rpc_call_ops nfs4_close_ops = { @@ -4423,12 +4455,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) struct nfs4_unlockdata *calldata = data; if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) - return; + goto out_wait; if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ - task->tk_action = NULL; - nfs4_sequence_done(task, &calldata->res.seq_res); - return; + goto out_no_action; } calldata->timestamp = jiffies; if (nfs4_setup_sequence(calldata->server, @@ -4436,6 +4466,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); + return; +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &calldata->res.seq_res); } static const struct rpc_call_ops nfs4_locku_ops = { @@ -4482,7 +4517,9 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs_inode *nfsi = NFS_I(state->inode); + struct inode *inode = state->inode; + struct nfs4_state_owner *sp = state->owner; + struct nfs_inode *nfsi = NFS_I(inode); struct nfs_seqid *seqid; struct nfs4_lock_state *lsp; struct rpc_task *task; @@ -4492,12 +4529,17 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = nfs4_set_lock_state(state, request); /* Unlock _before_ we do the RPC call */ request->fl_flags |= FL_EXISTS; + /* Exclude nfs_delegation_claim_locks() */ + mutex_lock(&sp->so_delegreturn_mutex); + /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ down_read(&nfsi->rwsem); if (do_vfs_lock(request->fl_file, request) == -ENOENT) { up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); goto out; } up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); if (status != 0) goto out; /* Is this a delegated lock? */ @@ -4576,7 +4618,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) - return; + goto out_wait; /* Do we need to do an open_to_lock_owner? */ if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { @@ -4596,6 +4638,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) nfs_release_seqid(data->arg.open_seqid); out_release_lock_seqid: nfs_release_seqid(data->arg.lock_seqid); +out_wait: + nfs4_sequence_done(task, &data->res.seq_res); dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); } @@ -4813,8 +4857,10 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { + struct nfs4_state_owner *sp = state->owner; struct nfs_inode *nfsi = NFS_I(state->inode); unsigned char fl_flags = request->fl_flags; + unsigned int seq; int status = -ENOLCK; if ((fl_flags & FL_POSIX) && @@ -4836,9 +4882,16 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock status = do_vfs_lock(request->fl_file, request); goto out_unlock; } + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + up_read(&nfsi->rwsem); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); if (status != 0) + goto out; + down_read(&nfsi->rwsem); + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { + status = -NFS4ERR_DELAY; goto out_unlock; + } /* Note: we always want to sleep here! */ request->fl_flags = fl_flags | FL_SLEEP; if (do_vfs_lock(request->fl_file, request) < 0) @@ -4945,24 +4998,22 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case 0: case -ESTALE: goto out; - case -NFS4ERR_EXPIRED: - nfs4_schedule_stateid_recovery(server, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); + case -NFS4ERR_EXPIRED: nfs4_schedule_lease_recovery(server->nfs_client); + err = -EAGAIN; goto out; case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + err = -EAGAIN; goto out; - case -ERESTARTSYS: - /* - * The show must go on: exit, but mark the - * stateid as needing recovery. - */ case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: @@ -4975,9 +5026,8 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) /* kill_proc(fl->fl_pid, SIGLOST, 1); */ err = 0; goto out; - case -NFS4ERR_DELAY: - break; } + set_bit(NFS_DELEGATED_STATE, &state->flags); err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); out: @@ -6134,7 +6184,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) status = nfs4_wait_for_completion_rpc_task(task); if (status == 0) status = task->tk_status; - if (status == 0) + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ + if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9448c57..6ace365 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -136,16 +136,11 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, clp->cl_confirm = clid.confirm; status = nfs40_walk_client_list(clp, result, cred); - switch (status) { - case -NFS4ERR_STALE_CLIENTID: - set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - case 0: + if (status == 0) { /* Sustain the lease, even if it's empty. If the clientid4 * goes stale it's of no use for trunking discovery. */ nfs4_schedule_state_renewal(*result); - break; } - out: return status; } @@ -523,6 +518,8 @@ nfs4_alloc_state_owner(struct nfs_server *server, nfs4_init_seqid_counter(&sp->so_seqid); atomic_set(&sp->so_count, 1); INIT_LIST_HEAD(&sp->so_lru); + seqcount_init(&sp->so_reclaim_seqcount); + mutex_init(&sp->so_delegreturn_mutex); return sp; } @@ -1395,8 +1392,9 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs * recovering after a network partition or a reboot from a * server that doesn't support a grace period. */ -restart: spin_lock(&sp->so_lock); + write_seqcount_begin(&sp->so_reclaim_seqcount); +restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) continue; @@ -1417,6 +1415,7 @@ restart: } spin_unlock(&state->state_lock); nfs4_put_open_state(state); + spin_lock(&sp->so_lock); goto restart; } } @@ -1454,12 +1453,17 @@ restart: goto out_err; } nfs4_put_open_state(state); + spin_lock(&sp->so_lock); goto restart; } + write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return 0; out_err: nfs4_put_open_state(state); + spin_lock(&sp->so_lock); + write_seqcount_end(&sp->so_reclaim_seqcount); + spin_unlock(&sp->so_lock); return status; } @@ -1863,6 +1867,7 @@ again: case -ETIMEDOUT: case -EAGAIN: ssleep(1); + case -NFS4ERR_STALE_CLIENTID: dprintk("NFS: %s after status %d, retrying\n", __func__, status); goto again; @@ -2022,8 +2027,18 @@ static int nfs4_reset_session(struct nfs_client *clp) nfs4_begin_drain_session(clp); cred = nfs4_get_exchange_id_cred(clp); status = nfs4_proc_destroy_session(clp->cl_session, cred); - if (status && status != -NFS4ERR_BADSESSION && - status != -NFS4ERR_DEADSESSION) { + switch (status) { + case 0: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + break; + case -NFS4ERR_BACK_CHAN_BUSY: + case -NFS4ERR_DELAY: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + status = 0; + ssleep(1); + goto out; + default: status = nfs4_recovery_handle_error(clp, status); goto out; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 26b1439..e3edda5 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1002,7 +1002,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", - iap->ia_uid); + from_kuid(&init_user_ns, iap->ia_uid)); /* XXX */ strcpy(owner_name, "nobody"); owner_namelen = sizeof("nobody") - 1; @@ -1014,7 +1014,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", - iap->ia_gid); + from_kgid(&init_user_ns, iap->ia_gid)); strcpy(owner_group, "nobody"); owner_grouplen = sizeof("nobody") - 1; /* goto out; */ @@ -3778,14 +3778,14 @@ out_overflow: } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, - const struct nfs_server *server, uint32_t *uid, + const struct nfs_server *server, kuid_t *uid, struct nfs4_string *owner_name) { uint32_t len; __be32 *p; int ret = 0; - *uid = -2; + *uid = make_kuid(&init_user_ns, -2); if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { @@ -3813,7 +3813,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, __func__, len); bitmap[1] &= ~FATTR4_WORD1_OWNER; } - dprintk("%s: uid=%d\n", __func__, (int)*uid); + dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid)); return ret; out_overflow: print_overflow_msg(__func__, xdr); @@ -3821,14 +3821,14 @@ out_overflow: } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, - const struct nfs_server *server, uint32_t *gid, + const struct nfs_server *server, kgid_t *gid, struct nfs4_string *group_name) { uint32_t len; __be32 *p; int ret = 0; - *gid = -2; + *gid = make_kgid(&init_user_ns, -2); if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { @@ -3856,7 +3856,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, __func__, len); bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; } - dprintk("%s: gid=%d\n", __func__, (int)*gid); + dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid)); return ret; out_overflow: print_overflow_msg(__func__, xdr); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index c6f9906..88f9611 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -647,6 +647,7 @@ static struct pnfs_layoutdriver_type objlayout_type = { .flags = PNFS_LAYOUTRET_ON_SETATTR | PNFS_LAYOUTRET_ON_ERROR, + .owner = THIS_MODULE, .alloc_layout_hdr = objlayout_alloc_layout_hdr, .free_layout_hdr = objlayout_free_layout_hdr, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d00260b..6be70f6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -505,37 +505,147 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); -/* - * Called by the state manger to remove all layouts established under an - * expired lease. - */ -void -pnfs_destroy_all_layouts(struct nfs_client *clp) +static bool +pnfs_layout_add_bulk_destroy_list(struct inode *inode, + struct list_head *layout_list) { - struct nfs_server *server; struct pnfs_layout_hdr *lo; - LIST_HEAD(tmp_list); + bool ret = false; - nfs4_deviceid_mark_client_invalid(clp); - nfs4_deviceid_purge_client(clp); + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) { + pnfs_get_layout_hdr(lo); + list_add(&lo->plh_bulk_destroy, layout_list); + ret = true; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* Caller must hold rcu_read_lock and clp->cl_lock */ +static int +pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, + struct nfs_server *server, + struct list_head *layout_list) +{ + struct pnfs_layout_hdr *lo, *next; + struct inode *inode; + + list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { + inode = igrab(lo->plh_inode); + if (inode == NULL) + continue; + list_del_init(&lo->plh_layouts); + if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) + continue; + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); + rcu_read_lock(); + return -EAGAIN; + } + return 0; +} + +static int +pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, + bool is_bulk_recall) +{ + struct pnfs_layout_hdr *lo; + struct inode *inode; + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(lseg_list); + int ret = 0; + + while (!list_empty(layout_list)) { + lo = list_entry(layout_list->next, struct pnfs_layout_hdr, + plh_bulk_destroy); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->plh_inode->i_ino); + inode = lo->plh_inode; + spin_lock(&inode->i_lock); + list_del_init(&lo->plh_bulk_destroy); + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ + if (is_bulk_recall) + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) + ret = -EAGAIN; + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&lseg_list); + pnfs_put_layout_hdr(lo); + iput(inode); + } + return ret; +} + +int +pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); spin_lock(&clp->cl_lock); rcu_read_lock(); +restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - if (!list_empty(&server->layouts)) - list_splice_init(&server->layouts, &tmp_list); + if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) + continue; + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; } rcu_read_unlock(); spin_unlock(&clp->cl_lock); - while (!list_empty(&tmp_list)) { - lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, - plh_layouts); - dprintk("%s freeing layout for inode %lu\n", __func__, - lo->plh_inode->i_ino); - list_del_init(&lo->plh_layouts); - pnfs_destroy_layout(NFS_I(lo->plh_inode)); + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +int +pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); + + spin_lock(&clp->cl_lock); + rcu_read_lock(); +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +/* + * Called by the state manger to remove all layouts established under an + * expired lease. + */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + nfs4_deviceid_mark_client_invalid(clp); + nfs4_deviceid_purge_client(clp); + + pnfs_destroy_layouts_byclid(clp, false); } /* @@ -888,7 +998,7 @@ alloc_init_layout_hdr(struct inode *ino, atomic_set(&lo->plh_refcount, 1); INIT_LIST_HEAD(&lo->plh_layouts); INIT_LIST_HEAD(&lo->plh_segs); - INIT_LIST_HEAD(&lo->plh_bulk_recall); + INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); return lo; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index dbf7bba..97cb358 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -132,7 +132,7 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_hdr { atomic_t plh_refcount; struct list_head plh_layouts; /* other client layouts */ - struct list_head plh_bulk_recall; /* clnt list of bulk recalls */ + struct list_head plh_bulk_destroy; struct list_head plh_segs; /* layout segments list */ nfs4_stateid plh_stateid; atomic_t plh_outstanding; /* number of RPCs out */ @@ -196,6 +196,11 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); +int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool is_recall); +int pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 92acc26..a9dc5fc 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -54,7 +54,6 @@ #include <linux/parser.h> #include <linux/nsproxy.h> #include <linux/rcupdate.h> -#include <linux/kthread.h> #include <asm/uaccess.h> @@ -418,54 +417,6 @@ void nfs_sb_deactive(struct super_block *sb) } EXPORT_SYMBOL_GPL(nfs_sb_deactive); -static int nfs_deactivate_super_async_work(void *ptr) -{ - struct super_block *sb = ptr; - - deactivate_super(sb); - module_put_and_exit(0); - return 0; -} - -/* - * same effect as deactivate_super, but will do final unmount in kthread - * context - */ -static void nfs_deactivate_super_async(struct super_block *sb) -{ - struct task_struct *task; - char buf[INET6_ADDRSTRLEN + 1]; - struct nfs_server *server = NFS_SB(sb); - struct nfs_client *clp = server->nfs_client; - - if (!atomic_add_unless(&sb->s_active, -1, 1)) { - rcu_read_lock(); - snprintf(buf, sizeof(buf), - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - rcu_read_unlock(); - - __module_get(THIS_MODULE); - task = kthread_run(nfs_deactivate_super_async_work, sb, - "%s-deactivate-super", buf); - if (IS_ERR(task)) { - pr_err("%s: kthread_run: %ld\n", - __func__, PTR_ERR(task)); - /* make synchronous call and hope for the best */ - deactivate_super(sb); - module_put(THIS_MODULE); - } - } -} - -void nfs_sb_deactive_async(struct super_block *sb) -{ - struct nfs_server *server = NFS_SB(sb); - - if (atomic_dec_and_test(&server->active)) - nfs_deactivate_super_async(sb); -} -EXPORT_SYMBOL_GPL(nfs_sb_deactive_async); - /* * Deliver file system statistics to userspace */ @@ -2589,27 +2540,23 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags, struct nfs_server *server; struct dentry *mntroot = ERR_PTR(-ENOMEM); struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; - int error; - dprintk("--> nfs_xdev_mount_common()\n"); + dprintk("--> nfs_xdev_mount()\n"); mount_info.mntfh = mount_info.cloned->fh; /* create a new volume representation */ server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); - if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out_err; - } - mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, nfs_mod); - dprintk("<-- nfs_xdev_mount_common() = 0\n"); -out: - return mntroot; + if (IS_ERR(server)) + mntroot = ERR_CAST(server); + else + mntroot = nfs_fs_mount_common(server, flags, + dev_name, &mount_info, nfs_mod); -out_err: - dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error); - goto out; + dprintk("<-- nfs_xdev_mount() = %ld\n", + IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L); + return mntroot; } #if IS_ENABLED(CONFIG_NFS_V4) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 3f79c77..d26a32f 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata) nfs_dec_sillycount(data->dir); nfs_free_unlinkdata(data); - nfs_sb_deactive_async(sb); + nfs_sb_deactive(sb); } static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) @@ -268,8 +268,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) * point dentry is definitely not a root, so we won't need * that anymore. */ - if (devname_garbage) - kfree(devname_garbage); + kfree(devname_garbage); return 0; out_unlock: spin_unlock(&dentry->d_lock); diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index 6940439..ed628f7 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -38,8 +38,8 @@ struct nfsacl_encode_desc { unsigned int count; struct posix_acl *acl; int typeflag; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; }; struct nfsacl_simple_acl { @@ -60,14 +60,16 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) *p++ = htonl(entry->e_tag | nfsacl_desc->typeflag); switch(entry->e_tag) { case ACL_USER_OBJ: - *p++ = htonl(nfsacl_desc->uid); + *p++ = htonl(from_kuid(&init_user_ns, nfsacl_desc->uid)); break; case ACL_GROUP_OBJ: - *p++ = htonl(nfsacl_desc->gid); + *p++ = htonl(from_kgid(&init_user_ns, nfsacl_desc->gid)); break; case ACL_USER: + *p++ = htonl(from_kuid(&init_user_ns, entry->e_uid)); + break; case ACL_GROUP: - *p++ = htonl(entry->e_id); + *p++ = htonl(from_kgid(&init_user_ns, entry->e_gid)); break; default: /* Solaris depends on that! */ *p++ = 0; @@ -148,6 +150,7 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem) (struct nfsacl_decode_desc *) desc; __be32 *p = elem; struct posix_acl_entry *entry; + unsigned int id; if (!nfsacl_desc->acl) { if (desc->array_len > NFS_ACL_MAX_ENTRIES) @@ -160,14 +163,22 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem) entry = &nfsacl_desc->acl->a_entries[nfsacl_desc->count++]; entry->e_tag = ntohl(*p++) & ~NFS_ACL_DEFAULT; - entry->e_id = ntohl(*p++); + id = ntohl(*p++); entry->e_perm = ntohl(*p++); switch(entry->e_tag) { - case ACL_USER_OBJ: case ACL_USER: - case ACL_GROUP_OBJ: + entry->e_uid = make_kuid(&init_user_ns, id); + if (!uid_valid(entry->e_uid)) + return -EINVAL; + break; case ACL_GROUP: + entry->e_gid = make_kgid(&init_user_ns, id); + if (!gid_valid(entry->e_gid)) + return -EINVAL; + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: case ACL_OTHER: if (entry->e_perm & ~S_IRWXO) return -EINVAL; @@ -190,9 +201,13 @@ cmp_acl_entry(const void *x, const void *y) if (a->e_tag != b->e_tag) return a->e_tag - b->e_tag; - else if (a->e_id > b->e_id) + else if ((a->e_tag == ACL_USER) && uid_gt(a->e_uid, b->e_uid)) + return 1; + else if ((a->e_tag == ACL_USER) && uid_lt(a->e_uid, b->e_uid)) + return -1; + else if ((a->e_tag == ACL_GROUP) && gid_gt(a->e_gid, b->e_gid)) return 1; - else if (a->e_id < b->e_id) + else if ((a->e_tag == ACL_GROUP) && gid_lt(a->e_gid, b->e_gid)) return -1; else return 0; @@ -213,22 +228,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl) sort(acl->a_entries, acl->a_count, sizeof(struct posix_acl_entry), cmp_acl_entry, NULL); - /* Clear undefined identifier fields and find the ACL_GROUP_OBJ - and ACL_MASK entries. */ + /* Find the ACL_GROUP_OBJ and ACL_MASK entries. */ FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: - pa->e_id = ACL_UNDEFINED_ID; break; case ACL_GROUP_OBJ: - pa->e_id = ACL_UNDEFINED_ID; group_obj = pa; break; case ACL_MASK: mask = pa; /* fall through */ case ACL_OTHER: - pa->e_id = ACL_UNDEFINED_ID; break; } } diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 8df1ea4..430b687 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -65,8 +65,8 @@ config NFSD_V3_ACL If unsure, say N. config NFSD_V4 - bool "NFS server support for NFS version 4 (EXPERIMENTAL)" - depends on NFSD && PROC_FS && EXPERIMENTAL + bool "NFS server support for NFS version 4" + depends on NFSD && PROC_FS select NFSD_V3 select FS_POSIX_ACL select SUNRPC_GSS diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 34e5c40..8b186a4 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -44,8 +44,6 @@ struct nfs4_acl *nfs4_acl_new(int); int nfs4_acl_get_whotype(char *, u32); int nfs4_acl_write_who(int who, char *p); -int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, - uid_t who, u32 mask); #define NFS4_ACL_TYPE_DEFAULT 0x01 #define NFS4_ACL_DIR 0x02 diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 34a10d7..06cddd5 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -47,9 +47,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) if (!gi) goto oom; } else if (flags & NFSEXP_ROOTSQUASH) { - if (!new->fsuid) + if (uid_eq(new->fsuid, GLOBAL_ROOT_UID)) new->fsuid = exp->ex_anon_uid; - if (!new->fsgid) + if (gid_eq(new->fsgid, GLOBAL_ROOT_GID)) new->fsgid = exp->ex_anon_gid; gi = groups_alloc(rqgi->ngroups); @@ -58,7 +58,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) for (i = 0; i < rqgi->ngroups; i++) { if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i))) - GROUP_AT(gi, i) = make_kgid(&init_user_ns, exp->ex_anon_gid); + GROUP_AT(gi, i) = exp->ex_anon_gid; else GROUP_AT(gi, i) = GROUP_AT(rqgi, i); } @@ -66,9 +66,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) gi = get_group_info(rqgi); } - if (new->fsuid == (uid_t) -1) + if (uid_eq(new->fsuid, INVALID_UID)) new->fsuid = exp->ex_anon_uid; - if (new->fsgid == (gid_t) -1) + if (gid_eq(new->fsgid, INVALID_GID)) new->fsgid = exp->ex_anon_gid; ret = set_groups(new, gi); @@ -76,7 +76,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) if (ret < 0) goto error; - if (new->fsuid) + if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID)) new->cap_effective = cap_drop_nfsd_set(new->cap_effective); else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h index 78b3c0e..53325a1 100644 --- a/fs/nfsd/auth.h +++ b/fs/nfsd/auth.h @@ -1,6 +1,5 @@ /* * nfsd-specific authentication stuff. - * uid/gid mapping not yet implemented. * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ @@ -8,11 +7,6 @@ #ifndef LINUX_NFSD_AUTH_H #define LINUX_NFSD_AUTH_H -#define nfsd_luid(rq, uid) ((u32)(uid)) -#define nfsd_lgid(rq, gid) ((u32)(gid)) -#define nfsd_ruid(rq, uid) ((u32)(uid)) -#define nfsd_rgid(rq, gid) ((u32)(gid)) - /* * Set the current process's fsuid/fsgid etc to those of the NFS * client user diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index a3946cf..5681c59 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -544,13 +544,17 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = get_int(&mesg, &an_int); if (err) goto out3; - exp.ex_anon_uid= an_int; + exp.ex_anon_uid= make_kuid(&init_user_ns, an_int); + if (!uid_valid(exp.ex_anon_uid)) + goto out3; /* anon gid */ err = get_int(&mesg, &an_int); if (err) goto out3; - exp.ex_anon_gid= an_int; + exp.ex_anon_gid= make_kgid(&init_user_ns, an_int); + if (!gid_valid(exp.ex_anon_gid)) + goto out3; /* fsid */ err = get_int(&mesg, &an_int); @@ -613,7 +617,7 @@ out: } static void exp_flags(struct seq_file *m, int flag, int fsid, - uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs); + kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs); static void show_secinfo(struct seq_file *m, struct svc_export *exp); static int svc_export_show(struct seq_file *m, @@ -1179,15 +1183,17 @@ static void show_secinfo(struct seq_file *m, struct svc_export *exp) } static void exp_flags(struct seq_file *m, int flag, int fsid, - uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc) + kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc) { show_expflags(m, flag, NFSEXP_ALLFLAGS); if (flag & NFSEXP_FSID) seq_printf(m, ",fsid=%d", fsid); - if (anonu != (uid_t)-2 && anonu != (0x10000-2)) - seq_printf(m, ",anonuid=%u", anonu); - if (anong != (gid_t)-2 && anong != (0x10000-2)) - seq_printf(m, ",anongid=%u", anong); + if (!uid_eq(anonu, make_kuid(&init_user_ns, (uid_t)-2)) && + !uid_eq(anonu, make_kuid(&init_user_ns, 0x10000-2))) + seq_printf(m, ",anonuid=%u", from_kuid(&init_user_ns, anonu)); + if (!gid_eq(anong, make_kgid(&init_user_ns, (gid_t)-2)) && + !gid_eq(anong, make_kgid(&init_user_ns, 0x10000-2))) + seq_printf(m, ",anongid=%u", from_kgid(&init_user_ns, anong)); if (fsloc && fsloc->locations_count > 0) { char *loctype = (fsloc->migrated) ? "refer" : "replicas"; int i; diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h index 9d513ef..bf95f6b 100644 --- a/fs/nfsd/idmap.h +++ b/fs/nfsd/idmap.h @@ -54,9 +54,9 @@ static inline void nfsd_idmap_shutdown(struct net *net) } #endif -__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *); -__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *); -int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *); -int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *); +__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *); +__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *); +int nfsd_map_uid_to_name(struct svc_rqst *, kuid_t, char *); +int nfsd_map_gid_to_name(struct svc_rqst *, kgid_t, char *); #endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 7af9417b..14d9ecb 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -106,12 +106,14 @@ decode_sattr3(__be32 *p, struct iattr *iap) iap->ia_mode = ntohl(*p++); } if (*p++) { - iap->ia_valid |= ATTR_UID; - iap->ia_uid = ntohl(*p++); + iap->ia_uid = make_kuid(&init_user_ns, ntohl(*p++)); + if (uid_valid(iap->ia_uid)) + iap->ia_valid |= ATTR_UID; } if (*p++) { - iap->ia_valid |= ATTR_GID; - iap->ia_gid = ntohl(*p++); + iap->ia_gid = make_kgid(&init_user_ns, ntohl(*p++)); + if (gid_valid(iap->ia_gid)) + iap->ia_valid |= ATTR_GID; } if (*p++) { u64 newsize; @@ -168,8 +170,8 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); *p++ = htonl((u32) stat->mode); *p++ = htonl((u32) stat->nlink); - *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); - *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); + *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); + *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); } else { diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 9c51aff..8a50b3c 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -264,7 +264,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, ace->flag = eflag; ace->access_mask = deny_mask_from_posix(deny, flags); ace->whotype = NFS4_ACL_WHO_NAMED; - ace->who = pa->e_id; + ace->who_uid = pa->e_uid; ace++; acl->naces++; } @@ -273,7 +273,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, flags); ace->whotype = NFS4_ACL_WHO_NAMED; - ace->who = pa->e_id; + ace->who_uid = pa->e_uid; ace++; acl->naces++; pa++; @@ -300,7 +300,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, flags); ace->whotype = NFS4_ACL_WHO_NAMED; - ace->who = pa->e_id; + ace->who_gid = pa->e_gid; ace++; acl->naces++; pa++; @@ -329,7 +329,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; ace->access_mask = deny_mask_from_posix(deny, flags); ace->whotype = NFS4_ACL_WHO_NAMED; - ace->who = pa->e_id; + ace->who_gid = pa->e_gid; ace++; acl->naces++; } @@ -345,6 +345,18 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, acl->naces++; } +static bool +pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2) +{ + if (pace1->e_tag != pace2->e_tag) + return pace1->e_tag > pace2->e_tag; + if (pace1->e_tag == ACL_USER) + return uid_gt(pace1->e_uid, pace2->e_uid); + if (pace1->e_tag == ACL_GROUP) + return gid_gt(pace1->e_gid, pace2->e_gid); + return false; +} + static void sort_pacl_range(struct posix_acl *pacl, int start, int end) { int sorted = 0, i; @@ -355,8 +367,8 @@ sort_pacl_range(struct posix_acl *pacl, int start, int end) { while (!sorted) { sorted = 1; for (i = start; i < end; i++) { - if (pacl->a_entries[i].e_id - > pacl->a_entries[i+1].e_id) { + if (pace_gt(&pacl->a_entries[i], + &pacl->a_entries[i+1])) { sorted = 0; tmp = pacl->a_entries[i]; pacl->a_entries[i] = pacl->a_entries[i+1]; @@ -398,7 +410,10 @@ struct posix_ace_state { }; struct posix_user_ace_state { - uid_t uid; + union { + kuid_t uid; + kgid_t gid; + }; struct posix_ace_state perms; }; @@ -521,7 +536,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) if (error) goto out_err; low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags); - pace->e_id = ACL_UNDEFINED_ID; for (i=0; i < state->users->n; i++) { pace++; @@ -531,7 +545,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) goto out_err; low_mode_from_nfs4(state->users->aces[i].perms.allow, &pace->e_perm, flags); - pace->e_id = state->users->aces[i].uid; + pace->e_uid = state->users->aces[i].uid; add_to_mask(state, &state->users->aces[i].perms); } @@ -541,7 +555,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) if (error) goto out_err; low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags); - pace->e_id = ACL_UNDEFINED_ID; add_to_mask(state, &state->group); for (i=0; i < state->groups->n; i++) { @@ -552,14 +565,13 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) goto out_err; low_mode_from_nfs4(state->groups->aces[i].perms.allow, &pace->e_perm, flags); - pace->e_id = state->groups->aces[i].uid; + pace->e_gid = state->groups->aces[i].gid; add_to_mask(state, &state->groups->aces[i].perms); } pace++; pace->e_tag = ACL_MASK; low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); - pace->e_id = ACL_UNDEFINED_ID; pace++; pace->e_tag = ACL_OTHER; @@ -567,7 +579,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) if (error) goto out_err; low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags); - pace->e_id = ACL_UNDEFINED_ID; return pacl; out_err: @@ -587,12 +598,13 @@ static inline void deny_bits(struct posix_ace_state *astate, u32 mask) astate->deny |= mask & ~astate->allow; } -static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array *a, uid_t uid) +static int find_uid(struct posix_acl_state *state, kuid_t uid) { + struct posix_ace_state_array *a = state->users; int i; for (i = 0; i < a->n; i++) - if (a->aces[i].uid == uid) + if (uid_eq(a->aces[i].uid, uid)) return i; /* Not found: */ a->n++; @@ -603,6 +615,23 @@ static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array return i; } +static int find_gid(struct posix_acl_state *state, kgid_t gid) +{ + struct posix_ace_state_array *a = state->groups; + int i; + + for (i = 0; i < a->n; i++) + if (gid_eq(a->aces[i].gid, gid)) + return i; + /* Not found: */ + a->n++; + a->aces[i].gid = gid; + a->aces[i].perms.allow = state->everyone.allow; + a->aces[i].perms.deny = state->everyone.deny; + + return i; +} + static void deny_bits_array(struct posix_ace_state_array *a, u32 mask) { int i; @@ -636,7 +665,7 @@ static void process_one_v4_ace(struct posix_acl_state *state, } break; case ACL_USER: - i = find_uid(state, state->users, ace->who); + i = find_uid(state, ace->who_uid); if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { allow_bits(&state->users->aces[i].perms, mask); } else { @@ -658,7 +687,7 @@ static void process_one_v4_ace(struct posix_acl_state *state, } break; case ACL_GROUP: - i = find_uid(state, state->groups, ace->who); + i = find_gid(state, ace->who_gid); if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { allow_bits(&state->groups->aces[i].perms, mask); } else { diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index a1f10c0..0ce1234 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -65,7 +65,7 @@ MODULE_PARM_DESC(nfs4_disable_idmapping, struct ent { struct cache_head h; int type; /* User / Group */ - uid_t id; + u32 id; char name[IDMAP_NAMESZ]; char authname[IDMAP_NAMESZ]; }; @@ -540,7 +540,7 @@ rqst_authname(struct svc_rqst *rqstp) static __be32 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, - uid_t *id) + u32 *id) { struct ent *item, key = { .type = type, @@ -564,7 +564,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen } static int -idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) +idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name) { struct ent *item, key = { .id = id, @@ -587,7 +587,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) } static bool -numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) +numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { int ret; char buf[11]; @@ -603,7 +603,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel } static __be32 -do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) +do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) if (numeric_name_to_id(rqstp, type, name, namelen, id)) @@ -616,7 +616,7 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u } static int -do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) +do_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) return sprintf(name, "%u", id); @@ -625,26 +625,40 @@ do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) __be32 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, - __u32 *id) + kuid_t *uid) { - return do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); + __be32 status; + u32 id = -1; + status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id); + *uid = make_kuid(&init_user_ns, id); + if (!uid_valid(*uid)) + status = nfserr_badowner; + return status; } __be32 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, - __u32 *id) + kgid_t *gid) { - return do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, id); + __be32 status; + u32 id = -1; + status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id); + *gid = make_kgid(&init_user_ns, id); + if (!gid_valid(*gid)) + status = nfserr_badowner; + return status; } int -nfsd_map_uid_to_name(struct svc_rqst *rqstp, __u32 id, char *name) +nfsd_map_uid_to_name(struct svc_rqst *rqstp, kuid_t uid, char *name) { + u32 id = from_kuid(&init_user_ns, uid); return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name); } int -nfsd_map_gid_to_name(struct svc_rqst *rqstp, __u32 id, char *name) +nfsd_map_gid_to_name(struct svc_rqst *rqstp, kgid_t gid, char *name) { + u32 id = from_kgid(&init_user_ns, gid); return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name); } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index ba6fdd4..4914af4 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -73,8 +73,8 @@ nfs4_save_creds(const struct cred **original_creds) if (!new) return -ENOMEM; - new->fsuid = 0; - new->fsgid = 0; + new->fsuid = GLOBAL_ROOT_UID; + new->fsgid = GLOBAL_ROOT_GID; *original_creds = override_creds(new); put_cred(new); return 0; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ac8ed96..9e7103b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -151,7 +151,7 @@ get_nfs4_file(struct nfs4_file *fi) } static int num_delegations; -unsigned int max_delegations; +unsigned long max_delegations; /* * Open owner state (share locks) @@ -700,8 +700,8 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num) num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION); spin_lock(&nfsd_drc_lock); - avail = min_t(int, NFSD_MAX_MEM_PER_SESSION, - nfsd_drc_max_mem - nfsd_drc_mem_used); + avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, + nfsd_drc_max_mem - nfsd_drc_mem_used); num = min_t(int, num, avail / slotsize); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); @@ -1202,7 +1202,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2) if (g1->ngroups != g2->ngroups) return false; for (i=0; i<g1->ngroups; i++) - if (GROUP_AT(g1, i) != GROUP_AT(g2, i)) + if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i))) return false; return true; } @@ -1227,8 +1227,8 @@ static bool same_creds(struct svc_cred *cr1, struct svc_cred *cr2) { if ((is_gss_cred(cr1) != is_gss_cred(cr2)) - || (cr1->cr_uid != cr2->cr_uid) - || (cr1->cr_gid != cr2->cr_gid) + || (!uid_eq(cr1->cr_uid, cr2->cr_uid)) + || (!gid_eq(cr1->cr_gid, cr2->cr_gid)) || !groups_equal(cr1->cr_group_info, cr2->cr_group_info)) return false; if (cr1->cr_principal == cr2->cr_principal) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 17e70da..8ca6d17 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -293,13 +293,13 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, ace->whotype = nfs4_acl_get_whotype(buf, dummy32); status = nfs_ok; if (ace->whotype != NFS4_ACL_WHO_NAMED) - ace->who = 0; + ; else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) status = nfsd_map_name_to_gid(argp->rqstp, - buf, dummy32, &ace->who); + buf, dummy32, &ace->who_gid); else status = nfsd_map_name_to_uid(argp->rqstp, - buf, dummy32, &ace->who); + buf, dummy32, &ace->who_uid); if (status) return status; } @@ -464,9 +464,16 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ READ32(dummy); READ_BUF(dummy * 4); if (cbs->flavor == (u32)(-1)) { - cbs->uid = uid; - cbs->gid = gid; - cbs->flavor = RPC_AUTH_UNIX; + kuid_t kuid = make_kuid(&init_user_ns, uid); + kgid_t kgid = make_kgid(&init_user_ns, gid); + if (uid_valid(kuid) && gid_valid(kgid)) { + cbs->uid = kuid; + cbs->gid = kgid; + cbs->flavor = RPC_AUTH_UNIX; + } else { + dprintk("RPC_AUTH_UNIX with invalid" + "uid or gid ignoring!\n"); + } } break; case RPC_AUTH_GSS: @@ -1926,7 +1933,7 @@ static u32 nfs4_file_type(umode_t mode) } static __be32 -nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, +nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, kuid_t uid, kgid_t gid, __be32 **p, int *buflen) { int status; @@ -1935,10 +1942,10 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, return nfserr_resource; if (whotype != NFS4_ACL_WHO_NAMED) status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1)); - else if (group) - status = nfsd_map_gid_to_name(rqstp, id, (u8 *)(*p + 1)); + else if (gid_valid(gid)) + status = nfsd_map_gid_to_name(rqstp, gid, (u8 *)(*p + 1)); else - status = nfsd_map_uid_to_name(rqstp, id, (u8 *)(*p + 1)); + status = nfsd_map_uid_to_name(rqstp, uid, (u8 *)(*p + 1)); if (status < 0) return nfserrno(status); *p = xdr_encode_opaque(*p, NULL, status); @@ -1948,22 +1955,33 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, } static inline __be32 -nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, __be32 **p, int *buflen) +nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t user, __be32 **p, int *buflen) { - return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen); + return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, user, INVALID_GID, + p, buflen); } static inline __be32 -nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, __be32 **p, int *buflen) +nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t group, __be32 **p, int *buflen) { - return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen); + return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, INVALID_UID, group, + p, buflen); } static inline __be32 -nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group, +nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, __be32 **p, int *buflen) { - return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen); + kuid_t uid = INVALID_UID; + kgid_t gid = INVALID_GID; + + if (ace->whotype == NFS4_ACL_WHO_NAMED) { + if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + gid = ace->who_gid; + else + uid = ace->who_uid; + } + return nfsd4_encode_name(rqstp, ace->whotype, uid, gid, p, buflen); } #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ @@ -2224,9 +2242,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, WRITE32(ace->type); WRITE32(ace->flag); WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); - status = nfsd4_encode_aclname(rqstp, ace->whotype, - ace->who, ace->flag & NFS4_ACE_IDENTIFIER_GROUP, - &p, &buflen); + status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen); if (status == nfserr_resource) goto out_resource; if (status) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index de23db2..07a473f 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -56,8 +56,8 @@ extern struct svc_version nfsd_version2, nfsd_version3, extern u32 nfsd_supported_minorversion; extern struct mutex nfsd_mutex; extern spinlock_t nfsd_drc_lock; -extern unsigned int nfsd_drc_max_mem; -extern unsigned int nfsd_drc_mem_used; +extern unsigned long nfsd_drc_max_mem; +extern unsigned long nfsd_drc_mem_used; extern const struct seq_operations nfs_exports_op; @@ -106,7 +106,7 @@ static inline int nfsd_v4client(struct svc_rqst *rq) * NFSv4 State */ #ifdef CONFIG_NFSD_V4 -extern unsigned int max_delegations; +extern unsigned long max_delegations; void nfs4_state_init(void); int nfsd4_init_slabs(void); void nfsd4_free_slabs(void); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index cee62ab..be7af50 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -59,8 +59,8 @@ DEFINE_MUTEX(nfsd_mutex); * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. */ spinlock_t nfsd_drc_lock; -unsigned int nfsd_drc_max_mem; -unsigned int nfsd_drc_mem_used; +unsigned long nfsd_drc_max_mem; +unsigned long nfsd_drc_mem_used; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; @@ -342,7 +342,7 @@ static void set_max_drc(void) >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; nfsd_drc_mem_used = 0; spin_lock_init(&nfsd_drc_lock); - dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem); + dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem); } static int nfsd_get_default_max_blksize(void) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 96e5619..9c769a4 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -101,12 +101,14 @@ decode_sattr(__be32 *p, struct iattr *iap) iap->ia_mode = tmp; } if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_valid |= ATTR_UID; - iap->ia_uid = tmp; + iap->ia_uid = make_kuid(&init_user_ns, tmp); + if (uid_valid(iap->ia_uid)) + iap->ia_valid |= ATTR_UID; } if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_valid |= ATTR_GID; - iap->ia_gid = tmp; + iap->ia_gid = make_kgid(&init_user_ns, tmp); + if (gid_valid(iap->ia_gid)) + iap->ia_valid |= ATTR_GID; } if ((tmp = ntohl(*p++)) != (u32)-1) { iap->ia_valid |= ATTR_SIZE; @@ -152,8 +154,8 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, *p++ = htonl(nfs_ftypes[type >> 12]); *p++ = htonl((u32) stat->mode); *p++ = htonl((u32) stat->nlink); - *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); - *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); + *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); + *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { *p++ = htonl(NFS_MAXPATHLEN); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d1c229f..1a8c739 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -152,8 +152,8 @@ struct nfsd4_channel_attrs { struct nfsd4_cb_sec { u32 flavor; /* (u32)(-1) used to mean "no valid flavor" */ - u32 uid; - u32 gid; + kuid_t uid; + kgid_t gid; }; struct nfsd4_create_session { diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a94245b..2a7eb53 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -401,8 +401,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, /* Revoke setuid/setgid on chown */ if (!S_ISDIR(inode->i_mode) && - (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || - ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) { + (((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) || + ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid)))) { iap->ia_valid |= ATTR_KILL_PRIV; if (iap->ia_valid & ATTR_MODE) { /* we're setting mode too, just clear the s*id bits */ @@ -1205,7 +1205,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, * send along the gid on create when it tries to implement * setgid directories via NFS: */ - if (current_fsuid() != 0) + if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) iap->ia_valid &= ~(ATTR_UID|ATTR_GID); if (iap->ia_valid) return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); @@ -2150,7 +2150,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, * with NFSv3. */ if ((acc & NFSD_MAY_OWNER_OVERRIDE) && - inode->i_uid == current_fsuid()) + uid_eq(inode->i_uid, current_fsuid())) return 0; /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 251da07..80da8eb 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -1,6 +1,5 @@ config NILFS2_FS - tristate "NILFS2 file system support (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "NILFS2 file system support" select CRC32 help NILFS2 is a log-structured file system (LFS) supporting continuous diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 89dc088..08fdb77 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -126,7 +126,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) nilfs_transaction_commit(inode->i_sb); mapped: - wait_on_page_writeback(page); + wait_for_stable_page(page); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index ef61c74..b44bdb29 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -664,8 +664,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, if (ret < 0) printk(KERN_ERR "NILFS: GC failed during preparation: " "cannot read source blocks: err=%d\n", ret); - else + else { + if (nilfs_sb_need_update(nilfs)) + set_nilfs_discontinued(nilfs); ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + } nilfs_remove_all_gcinodes(nilfs); clear_nilfs_gc_running(nilfs); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 228a2c2..07f7a92 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -576,8 +576,6 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); - if (unlikely(!(mask & IN_ALL_EVENTS))) - return -EINVAL; fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) @@ -629,8 +627,6 @@ static int inotify_new_watch(struct fsnotify_group *group, /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); - if (unlikely(!(mask & IN_ALL_EVENTS))) - return -EINVAL; tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); if (unlikely(!tmp_i_mark)) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 260b162..8a40457 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -65,7 +65,20 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); - acl->a_entries[n].e_id = le32_to_cpu(entry->e_id); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + default: + break; + } value += sizeof(struct posix_acl_entry); } @@ -91,7 +104,21 @@ static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size) for (n = 0; n < acl->a_count; n++, entry++) { entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, + acl->a_entries[n].e_uid)); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, + acl->a_entries[n].e_gid)); + break; + default: + entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } } return ocfs2_acl; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 31b9463..b8a9d87 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6751,8 +6751,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, mlog_errno(ret); out: - if (pages) - kfree(pages); + kfree(pages); return ret; } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 50fe28b..20dfec7 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1194,6 +1194,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, goto out; } } + wait_for_stable_page(wc->w_pages[i]); if (index == target_index) wc->w_target_page = wc->w_pages[i]; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f7c648d..42252bf 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1471,8 +1471,7 @@ static void o2hb_region_release(struct config_item *item) mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); - if (reg->hr_tmp_block) - kfree(reg->hr_tmp_block); + kfree(reg->hr_tmp_block); if (reg->hr_slot_data) { for (i = 0; i < reg->hr_num_pages; i++) { @@ -1486,8 +1485,7 @@ static void o2hb_region_release(struct config_item *item) if (reg->hr_bdev) blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); - if (reg->hr_slots) - kfree(reg->hr_slots); + kfree(reg->hr_slots); kfree(reg->hr_db_regnum); kfree(reg->hr_db_livenodes); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1bfe880..0d2bf56 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -870,7 +870,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, /* we've had some trouble with handlers seemingly vanishing. */ mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p, &parent) == NULL, - "couldn't find handler we *just* registerd " + "couldn't find handler we *just* registered " "for type %u key %08x\n", msg_type, key); } write_unlock(&o2net_handler_lock); @@ -1165,10 +1165,8 @@ out: o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ if (sc) sc_put(sc); - if (vec) - kfree(vec); - if (msg) - kfree(msg); + kfree(vec); + kfree(msg); o2net_complete_nsw(nn, &nsw, 0, 0, 0); return ret; } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index c87d079..f1e1aed 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -67,7 +67,6 @@ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) static unsigned char ocfs2_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 9e89d70..dbb17c0 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -319,9 +319,7 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) if (dlm->master_hash) dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); - if (dlm->name) - kfree(dlm->name); - + kfree(dlm->name); kfree(dlm); } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4f7795f..12ae194 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2045,8 +2045,8 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb->lvb_version = OCFS2_LVB_VERSION; lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); - lvb->lvb_iuid = cpu_to_be32(inode->i_uid); - lvb->lvb_igid = cpu_to_be32(inode->i_gid); + lvb->lvb_iuid = cpu_to_be32(i_uid_read(inode)); + lvb->lvb_igid = cpu_to_be32(i_gid_read(inode)); lvb->lvb_imode = cpu_to_be16(inode->i_mode); lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); lvb->lvb_iatime_packed = @@ -2095,8 +2095,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) else inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_uid = be32_to_cpu(lvb->lvb_iuid); - inode->i_gid = be32_to_cpu(lvb->lvb_igid); + i_uid_write(inode, be32_to_cpu(lvb->lvb_iuid)); + i_gid_write(inode, be32_to_cpu(lvb->lvb_igid)); inode->i_mode = be16_to_cpu(lvb->lvb_imode); set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); ocfs2_unpack_timespec(&inode->i_atime, @@ -2545,6 +2545,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb, * everything is up to the caller :) */ status = ocfs2_should_refresh_lock_res(lockres); if (status < 0) { + ocfs2_cluster_unlock(osb, lockres, level); mlog_errno(status); goto bail; } @@ -2553,8 +2554,10 @@ int ocfs2_super_lock(struct ocfs2_super *osb, ocfs2_complete_lock_res_refresh(lockres, status); - if (status < 0) + if (status < 0) { + ocfs2_cluster_unlock(osb, lockres, level); mlog_errno(status); + } ocfs2_track_lock_refresh(lockres); } bail: diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index f487aa3..1c39efb 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -282,8 +282,7 @@ search: spin_unlock(&oi->ip_lock); out: - if (new_emi) - kfree(new_emi); + kfree(new_emi); } static int ocfs2_last_eb_is_empty(struct inode *inode, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5bcd865..6474cb4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1116,7 +1116,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) (unsigned long long)OCFS2_I(inode)->ip_blkno, dentry->d_name.len, dentry->d_name.name, attr->ia_valid, attr->ia_mode, - attr->ia_uid, attr->ia_gid); + from_kuid(&init_user_ns, attr->ia_uid), + from_kgid(&init_user_ns, attr->ia_gid)); /* ensuring we don't even attempt to truncate a symlink */ if (S_ISLNK(inode->i_mode)) @@ -1174,14 +1175,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { /* * Gather pointers to quota structures so that allocation / * freeing of quota structures happens here and not inside * dquot_transfer() where we have problems with lock ordering */ - if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid + if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid) && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); @@ -1190,7 +1191,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) goto bail_unlock; } } - if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid + if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid) && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index d89e08a..f87f9bd 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -269,8 +269,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_generation = le32_to_cpu(fe->i_generation); inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); inode->i_mode = le16_to_cpu(fe->i_mode); - inode->i_uid = le32_to_cpu(fe->i_uid); - inode->i_gid = le32_to_cpu(fe->i_gid); + i_uid_write(inode, le32_to_cpu(fe->i_uid)); + i_gid_write(inode, le32_to_cpu(fe->i_gid)); /* Fast symlinks will have i_size but no allocated clusters. */ if (S_ISLNK(inode->i_mode) && !fe->i_clusters) { @@ -1259,8 +1259,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_size = cpu_to_le64(i_size_read(inode)); ocfs2_set_links_count(fe, inode->i_nlink); - fe->i_uid = cpu_to_le32(inode->i_uid); - fe->i_gid = cpu_to_le32(inode->i_gid); + fe->i_uid = cpu_to_le32(i_uid_read(inode)); + fe->i_gid = cpu_to_le32(i_gid_read(inode)); fe->i_mode = cpu_to_le16(inode->i_mode); fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); @@ -1290,8 +1290,8 @@ void ocfs2_refresh_inode(struct inode *inode, ocfs2_set_inode_flags(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); set_nlink(inode, ocfs2_read_links_count(fe)); - inode->i_uid = le32_to_cpu(fe->i_uid); - inode->i_gid = le32_to_cpu(fe->i_gid); + i_uid_write(inode, le32_to_cpu(fe->i_uid)); + i_gid_write(inode, le32_to_cpu(fe->i_gid)); inode->i_mode = le16_to_cpu(fe->i_mode); if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) inode->i_blocks = 0; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 2dd36af..8eccfab 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1234,11 +1234,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, /* Though we wish to avoid it, we are in fact safe in * skipping local alloc cleanup as fsck.ocfs2 is more * than capable of reclaiming unused space. */ - if (la_dinode) - kfree(la_dinode); - - if (tl_dinode) - kfree(tl_dinode); + kfree(la_dinode); + kfree(tl_dinode); if (qrec) ocfs2_free_quota_recovery(qrec); @@ -1408,8 +1405,7 @@ bail: mutex_unlock(&osb->recovery_lock); - if (rm_quota) - kfree(rm_quota); + kfree(rm_quota); /* no one is callint kthread_stop() for us so the kthread() api * requires that we call do_exit(). And it isn't exported, but diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index a9f78c7..aebeacd 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -476,8 +476,7 @@ out: if (local_alloc_inode) iput(local_alloc_inode); - if (alloc_copy) - kfree(alloc_copy); + kfree(alloc_copy); } /* @@ -534,7 +533,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, mlog_errno(status); bail: - if ((status < 0) && (*alloc_copy)) { + if (status < 0) { kfree(*alloc_copy); *alloc_copy = NULL; } @@ -1290,8 +1289,7 @@ bail: if (main_bm_inode) iput(main_bm_inode); - if (alloc_copy) - kfree(alloc_copy); + kfree(alloc_copy); if (ac) ocfs2_free_alloc_context(ac); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f1fd074..04ee1b5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -512,8 +512,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_suballoc_loc = cpu_to_le64(suballoc_loc); fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); - fe->i_uid = cpu_to_le32(inode->i_uid); - fe->i_gid = cpu_to_le32(inode->i_gid); + fe->i_uid = cpu_to_le32(i_uid_read(inode)); + fe->i_gid = cpu_to_le32(i_gid_read(inode)); fe->i_mode = cpu_to_le16(inode->i_mode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 1baffaa..998b17e 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4407,7 +4407,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, * rights to do so. */ if (preserve) { - if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN)) + if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN)) return -EPERM; if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN)) return -EPERM; diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 9436801..bf1f893 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -376,7 +376,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); out_free: - if (rc && conn->cc_private) + if (rc) kfree(conn->cc_private); out: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 0e91ec2..9b6910d 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2525,8 +2525,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) mlog_errno(status); finally: - if (local_alloc) - kfree(local_alloc); + kfree(local_alloc); if (status) mlog_errno(status); @@ -2553,8 +2552,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) * we free it here. */ kfree(osb->journal); - if (osb->local_alloc_copy) - kfree(osb->local_alloc_copy); + kfree(osb->local_alloc_copy); kfree(osb->uuid_str); ocfs2_put_dlm_debug(osb->osb_dlm_debug); memset(osb, 0, sizeof(struct ocfs2_super)); diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 3d635f4..f053688 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -91,8 +91,7 @@ static struct inode **get_local_system_inode(struct ocfs2_super *osb, } else osb->local_system_inodes = local_system_inodes; spin_unlock(&osb->osb_lock); - if (unlikely(free)) - kfree(free); + kfree(free); } index = (slot * NUM_LOCAL_SYSTEM_INODES) + diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 981b056..712f24d 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -8,7 +8,8 @@ proc-y := nommu.o task_nommu.o proc-$(CONFIG_MMU) := mmu.o task_mmu.o proc-y += inode.o root.o base.o generic.o array.o \ - proc_tty.o fd.o + fd.o +proc-$(CONFIG_TTY) += proc_tty.o proc-y += cmdline.o proc-y += consoles.o proc-y += cpuinfo.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 6a91e6f..f7ed9ee 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -449,7 +449,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, do { min_flt += t->min_flt; maj_flt += t->maj_flt; - gtime += t->gtime; + gtime += task_gtime(t); t = next_thread(t); } while (t != task); @@ -472,7 +472,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt = task->min_flt; maj_flt = task->maj_flt; task_cputime_adjusted(task, &utime, &stime); - gtime = task->gtime; + gtime = task_gtime(task); } /* scale priority and nice values from timeslices to -20..20 */ diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 80e4645..1efaaa1 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) * sysctl_overcommit_ratio / 100) + total_swap_pages; cached = global_page_state(NR_FILE_PAGES) - - total_swapcache_pages - i.bufferram; + total_swapcache_pages() - i.bufferram; if (cached < 0) cached = 0; @@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeram), K(i.bufferram), K(cached), - K(total_swapcache_pages), + K(total_swapcache_pages()), K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), K(pages[LRU_ACTIVE_ANON]), @@ -158,7 +158,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) vmi.used >> 10, vmi.largest_chunk >> 10 #ifdef CONFIG_MEMORY_FAILURE - ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) + ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 75df0d7..b4ac657 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -177,20 +177,6 @@ const struct file_operations proc_net_operations = { .readdir = proc_tgid_net_readdir, }; - -struct proc_dir_entry *proc_net_fops_create(struct net *net, - const char *name, umode_t mode, const struct file_operations *fops) -{ - return proc_create(name, mode, net->proc_net, fops); -} -EXPORT_SYMBOL_GPL(proc_net_fops_create); - -void proc_net_remove(struct net *net, const char *name) -{ - remove_proc_entry(name, net->proc_net); -} -EXPORT_SYMBOL_GPL(proc_net_remove); - static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 67de74c..e4bcb2c 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -418,9 +418,25 @@ static struct file_system_type pstore_fs_type = { .kill_sb = pstore_kill_sb, }; +static struct kobject *pstore_kobj; + static int __init init_pstore_fs(void) { - return register_filesystem(&pstore_fs_type); + int err = 0; + + /* Create a convenient mount point for people to access pstore */ + pstore_kobj = kobject_create_and_add("pstore", fs_kobj); + if (!pstore_kobj) { + err = -ENOMEM; + goto out; + } + + err = register_filesystem(&pstore_fs_type); + if (err < 0) + kobject_put(pstore_kobj); + +out: + return err; } module_init(init_pstore_fs) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 5ea2e77..86d1038 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -96,6 +96,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason) } } +bool pstore_cannot_block_path(enum kmsg_dump_reason reason) +{ + /* + * In case of NMI path, pstore shouldn't be blocked + * regardless of reason. + */ + if (in_nmi()) + return true; + + switch (reason) { + /* In panic case, other cpus are stopped by smp_send_stop(). */ + case KMSG_DUMP_PANIC: + /* Emergency restart shouldn't be blocked by spin lock. */ + case KMSG_DUMP_EMERG: + return true; + default: + return false; + } +} +EXPORT_SYMBOL_GPL(pstore_cannot_block_path); + /* * callback from kmsg_dump. (s2,l2) has the most recently * written bytes, older bytes are in (s1,l1). Save as much @@ -114,10 +135,12 @@ static void pstore_dump(struct kmsg_dumper *dumper, why = get_reason_str(reason); - if (in_nmi()) { - is_locked = spin_trylock(&psinfo->buf_lock); - if (!is_locked) - pr_err("pstore dump routine blocked in NMI, may corrupt error record\n"); + if (pstore_cannot_block_path(reason)) { + is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags); + if (!is_locked) { + pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" + , in_nmi() ? "NMI" : why); + } } else spin_lock_irqsave(&psinfo->buf_lock, flags); oopscount++; @@ -143,9 +166,9 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += hsize + len; part++; } - if (in_nmi()) { + if (pstore_cannot_block_path(reason)) { if (is_locked) - spin_unlock(&psinfo->buf_lock); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); } else spin_unlock_irqrestore(&psinfo->buf_lock, flags); } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 7003e52..288f068 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -167,12 +167,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) { char *hdr; - struct timeval timestamp; + struct timespec timestamp; size_t len; - do_gettimeofday(×tamp); + /* Report zeroed timestamp if called before timekeeping has resumed. */ + if (__getnstimeofday(×tamp)) { + timestamp.tv_sec = 0; + timestamp.tv_nsec = 0; + } hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", - (long)timestamp.tv_sec, (long)timestamp.tv_usec); + (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000)); WARN_ON_ONCE(!hdr); len = hdr ? strlen(hdr) : 0; persistent_ram_write(prz, hdr, len); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index b6addf5..57199a5 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -285,7 +285,7 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { /* we got a big endian fs */ QNX6DEBUG((KERN_INFO "qnx6: fs got different" - " endianess.\n")); + " endianness.\n")); return bh; } else sbi->s_bytesex = BYTESEX_LE; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index eab8c09..c24f1e1 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -260,6 +260,7 @@ static struct file_system_type ramfs_fs_type = { .name = "ramfs", .mount = ramfs_mount, .kill_sb = ramfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type rootfs_fs_type = { .name = "rootfs", diff --git a/fs/select.c b/fs/select.c index 2ef72d9..8c1c96c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -26,6 +26,7 @@ #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> +#include <linux/sched/rt.h> #include <asm/uaccess.h> diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 2df555c..aec3d5c 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -205,6 +205,48 @@ void sysfs_unmerge_group(struct kobject *kobj, } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); +/** + * sysfs_add_link_to_group - add a symlink to an attribute group. + * @kobj: The kobject containing the group. + * @group_name: The name of the group. + * @target: The target kobject of the symlink to create. + * @link_name: The name of the symlink to create. + */ +int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, + struct kobject *target, const char *link_name) +{ + struct sysfs_dirent *dir_sd; + int error = 0; + + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); + if (!dir_sd) + return -ENOENT; + + error = sysfs_create_link_sd(dir_sd, target, link_name); + sysfs_put(dir_sd); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); + +/** + * sysfs_remove_link_from_group - remove a symlink from an attribute group. + * @kobj: The kobject containing the group. + * @group_name: The name of the group. + * @link_name: The name of the symlink to remove. + */ +void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, + const char *link_name) +{ + struct sysfs_dirent *dir_sd; + + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); + if (dir_sd) { + sysfs_hash_and_remove(dir_sd, NULL, link_name); + sysfs_put(dir_sd); + } +} +EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); EXPORT_SYMBOL_GPL(sysfs_create_group); EXPORT_SYMBOL_GPL(sysfs_update_group); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index db940a9..8d924b5 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -10,7 +10,7 @@ * Please see Documentation/filesystems/sysfs.txt for more information. */ -#define DEBUG +#define DEBUG #include <linux/fs.h> #include <linux/mount.h> diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 3c9eb56..8c940df 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -21,26 +21,17 @@ #include "sysfs.h" -static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, - const char *name, int warn) +static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, + struct kobject *target, + const char *name, int warn) { - struct sysfs_dirent *parent_sd = NULL; struct sysfs_dirent *target_sd = NULL; struct sysfs_dirent *sd = NULL; struct sysfs_addrm_cxt acxt; enum kobj_ns_type ns_type; int error; - BUG_ON(!name); - - if (!kobj) - parent_sd = &sysfs_root; - else - parent_sd = kobj->sd; - - error = -EFAULT; - if (!parent_sd) - goto out_put; + BUG_ON(!name || !parent_sd); /* target->sd can go away beneath us but is protected with * sysfs_assoc_lock. Fetch target_sd from it. @@ -96,6 +87,34 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, } /** + * sysfs_create_link_sd - create symlink to a given object. + * @sd: directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + */ +int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link_sd(sd, target, name, 1); +} + +static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, + const char *name, int warn) +{ + struct sysfs_dirent *parent_sd = NULL; + + if (!kobj) + parent_sd = &sysfs_root; + else + parent_sd = kobj->sd; + + if (!parent_sd) + return -EFAULT; + + return sysfs_do_create_link_sd(parent_sd, target, name, warn); +} + +/** * sysfs_create_link - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d73c093..d1e4043 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -240,3 +240,5 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd); * symlink.c */ extern const struct inode_operations sysfs_symlink_inode_operations; +int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, + const char *name); diff --git a/fs/timerfd.c b/fs/timerfd.c index d03822b..0e606b1 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -22,6 +22,7 @@ #include <linux/anon_inodes.h> #include <linux/timerfd.h> #include <linux/syscalls.h> +#include <linux/compat.h> #include <linux/rcupdate.h> struct timerfd_ctx { @@ -278,21 +279,17 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) return ufd; } -SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, - const struct itimerspec __user *, utmr, - struct itimerspec __user *, otmr) +static int do_timerfd_settime(int ufd, int flags, + const struct itimerspec *new, + struct itimerspec *old) { struct fd f; struct timerfd_ctx *ctx; - struct itimerspec ktmr, kotmr; int ret; - if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) - return -EFAULT; - if ((flags & ~TFD_SETTIME_FLAGS) || - !timespec_valid(&ktmr.it_value) || - !timespec_valid(&ktmr.it_interval)) + !timespec_valid(&new->it_value) || + !timespec_valid(&new->it_interval)) return -EINVAL; ret = timerfd_fget(ufd, &f); @@ -323,27 +320,23 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, if (ctx->expired && ctx->tintv.tv64) hrtimer_forward_now(&ctx->tmr, ctx->tintv); - kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); - kotmr.it_interval = ktime_to_timespec(ctx->tintv); + old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + old->it_interval = ktime_to_timespec(ctx->tintv); /* * Re-program the timer to the new value ... */ - ret = timerfd_setup(ctx, flags, &ktmr); + ret = timerfd_setup(ctx, flags, new); spin_unlock_irq(&ctx->wqh.lock); fdput(f); - if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) - return -EFAULT; - return ret; } -SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) +static int do_timerfd_gettime(int ufd, struct itimerspec *t) { struct fd f; struct timerfd_ctx *ctx; - struct itimerspec kotmr; int ret = timerfd_fget(ufd, &f); if (ret) return ret; @@ -356,11 +349,65 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; hrtimer_restart(&ctx->tmr); } - kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); - kotmr.it_interval = ktime_to_timespec(ctx->tintv); + t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + t->it_interval = ktime_to_timespec(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); fdput(f); + return 0; +} + +SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, + const struct itimerspec __user *, utmr, + struct itimerspec __user *, otmr) +{ + struct itimerspec new, old; + int ret; + + if (copy_from_user(&new, utmr, sizeof(new))) + return -EFAULT; + ret = do_timerfd_settime(ufd, flags, &new, &old); + if (ret) + return ret; + if (otmr && copy_to_user(otmr, &old, sizeof(old))) + return -EFAULT; + + return ret; +} +SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) +{ + struct itimerspec kotmr; + int ret = do_timerfd_gettime(ufd, &kotmr); + if (ret) + return ret; return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0; } +#ifdef COMPAT +COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, + const struct itimerspec __user *, utmr, + struct itimerspec __user *, otmr) +{ + struct itimerspec new, old; + int ret; + + if (get_compat_itimerspec(&new, utmr)) + return -EFAULT; + ret = do_timerfd_settime(ufd, flags, &new, &old); + if (ret) + return ret; + if (otmr && put_compat_itimerspec(otmr, &old)) + return -EFAULT; + return ret; +} + +COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd, + struct itimerspec __user *, otmr) +{ + struct itimerspec kotmr; + int ret = do_timerfd_gettime(ufd, &kotmr); + if (ret) + return ret; + return put_compat_itimerspec(otmr, &t) ? -EFAULT: 0; +} +#endif diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 12817ff..7f60e90 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2459,7 +2459,7 @@ error_dump: static inline int chance(unsigned int n, unsigned int out_of) { - return !!((random32() % out_of) + 1 <= n); + return !!((prandom_u32() % out_of) + 1 <= n); } @@ -2477,13 +2477,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) if (chance(1, 2)) { d->pc_delay = 1; /* Fail withing 1 minute */ - delay = random32() % 60000; + delay = prandom_u32() % 60000; d->pc_timeout = jiffies; d->pc_timeout += msecs_to_jiffies(delay); ubifs_warn("failing after %lums", delay); } else { d->pc_delay = 2; - delay = random32() % 10000; + delay = prandom_u32() % 10000; /* Fail within 10000 operations */ d->pc_cnt_max = delay; ubifs_warn("failing after %lu calls", delay); @@ -2563,7 +2563,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; - from = random32() % (len + 1); + from = prandom_u32() % (len + 1); /* Corruption may only span one max. write unit */ to = min(len, ALIGN(from, c->max_write_size)); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index fa5b347..f12189d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1522,6 +1522,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, ubifs_release_dirty_inode_budget(c, ui); } + wait_for_stable_page(page); unlock_page(page); return 0; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 9daaeef..4b826ab 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -2007,28 +2007,28 @@ static int dbg_populate_lsave(struct ubifs_info *c) if (!dbg_is_chk_gen(c)) return 0; - if (random32() & 3) + if (prandom_u32() & 3) return 0; for (i = 0; i < c->lsave_cnt; i++) c->lsave[i] = c->main_first; list_for_each_entry(lprops, &c->empty_list, list) - c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; list_for_each_entry(lprops, &c->freeable_list, list) - c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; list_for_each_entry(lprops, &c->frdi_idx_list, list) - c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_DIRTY - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_FREE - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; return 1; } diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 769701c..ba32da3 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -126,13 +126,14 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) else if (inum > o->inum) p = p->rb_right; else { - if (o->dnext) { + if (o->del) { spin_unlock(&c->orphan_lock); dbg_gen("deleted twice ino %lu", (unsigned long)inum); return; } - if (o->cnext) { + if (o->cmt) { + o->del = 1; o->dnext = c->orph_dnext; c->orph_dnext = o; spin_unlock(&c->orphan_lock); @@ -172,7 +173,9 @@ int ubifs_orphan_start_commit(struct ubifs_info *c) last = &c->orph_cnext; list_for_each_entry(orphan, &c->orph_new, new_list) { ubifs_assert(orphan->new); + ubifs_assert(!orphan->cmt); orphan->new = 0; + orphan->cmt = 1; *last = orphan; last = &orphan->cnext; } @@ -299,7 +302,9 @@ static int write_orph_node(struct ubifs_info *c, int atomic) cnext = c->orph_cnext; for (i = 0; i < cnt; i++) { orphan = cnext; + ubifs_assert(orphan->cmt); orph->inos[i] = cpu_to_le64(orphan->inum); + orphan->cmt = 0; cnext = orphan->cnext; orphan->cnext = NULL; } @@ -378,6 +383,7 @@ static int consolidate(struct ubifs_info *c) list_for_each_entry(orphan, &c->orph_list, list) { if (orphan->new) continue; + orphan->cmt = 1; *last = orphan; last = &orphan->cnext; cnt += 1; @@ -442,6 +448,7 @@ static void erase_deleted(struct ubifs_info *c) orphan = dnext; dnext = orphan->dnext; ubifs_assert(!orphan->new); + ubifs_assert(orphan->del); rb_erase(&orphan->rb, &c->orph_tree); list_del(&orphan->list); c->tot_orphans -= 1; @@ -531,6 +538,7 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) rb_link_node(&orphan->rb, parent, p); rb_insert_color(&orphan->rb, &c->orph_tree); list_add_tail(&orphan->list, &c->orph_list); + orphan->del = 1; orphan->dnext = c->orph_dnext; c->orph_dnext = orphan; dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum, diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 523bbad..52a6559 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -683,7 +683,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) c->ilebs[c->ileb_cnt++] = lnum; dbg_cmt("LEB %d", lnum); } - if (dbg_is_chk_index(c) && !(random32() & 7)) + if (dbg_is_chk_index(c) && !(prandom_u32() & 7)) return -ENOSPC; return 0; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index d133c27..b2babce 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -904,6 +904,8 @@ struct ubifs_budget_req { * @dnext: next orphan to delete * @inum: inode number * @new: %1 => added since the last commit, otherwise %0 + * @cmt: %1 => commit pending, otherwise %0 + * @del: %1 => delete pending, otherwise %0 */ struct ubifs_orphan { struct rb_node rb; @@ -912,7 +914,9 @@ struct ubifs_orphan { struct ubifs_orphan *cnext; struct ubifs_orphan *dnext; ino_t inum; - int new; + unsigned new:1; + unsigned cmt:1; + unsigned del:1; }; /** diff --git a/fs/udf/inode.c b/fs/udf/inode.c index cbae1ed..7a12e48 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -67,6 +67,74 @@ static void udf_update_extents(struct inode *, struct extent_position *); static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); +static void __udf_clear_extent_cache(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->cached_extent.lstart != -1) { + brelse(iinfo->cached_extent.epos.bh); + iinfo->cached_extent.lstart = -1; + } +} + +/* Invalidate extent cache */ +static void udf_clear_extent_cache(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + + spin_lock(&iinfo->i_extent_cache_lock); + __udf_clear_extent_cache(inode); + spin_unlock(&iinfo->i_extent_cache_lock); +} + +/* Return contents of extent cache */ +static int udf_read_extent_cache(struct inode *inode, loff_t bcount, + loff_t *lbcount, struct extent_position *pos) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + int ret = 0; + + spin_lock(&iinfo->i_extent_cache_lock); + if ((iinfo->cached_extent.lstart <= bcount) && + (iinfo->cached_extent.lstart != -1)) { + /* Cache hit */ + *lbcount = iinfo->cached_extent.lstart; + memcpy(pos, &iinfo->cached_extent.epos, + sizeof(struct extent_position)); + if (pos->bh) + get_bh(pos->bh); + ret = 1; + } + spin_unlock(&iinfo->i_extent_cache_lock); + return ret; +} + +/* Add extent to extent cache */ +static void udf_update_extent_cache(struct inode *inode, loff_t estart, + struct extent_position *pos, int next_epos) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + + spin_lock(&iinfo->i_extent_cache_lock); + /* Invalidate previously cached extent */ + __udf_clear_extent_cache(inode); + if (pos->bh) + get_bh(pos->bh); + memcpy(&iinfo->cached_extent.epos, pos, + sizeof(struct extent_position)); + iinfo->cached_extent.lstart = estart; + if (next_epos) + switch (iinfo->i_alloc_type) { + case ICBTAG_FLAG_AD_SHORT: + iinfo->cached_extent.epos.offset -= + sizeof(struct short_ad); + break; + case ICBTAG_FLAG_AD_LONG: + iinfo->cached_extent.epos.offset -= + sizeof(struct long_ad); + } + spin_unlock(&iinfo->i_extent_cache_lock); +} void udf_evict_inode(struct inode *inode) { @@ -90,6 +158,7 @@ void udf_evict_inode(struct inode *inode) } kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; + udf_clear_extent_cache(inode); if (want_delete) { udf_free_inode(inode); } @@ -105,6 +174,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) truncate_pagecache(inode, to, isize); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); + udf_clear_extent_cache(inode); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); } @@ -372,7 +442,7 @@ static int udf_get_block(struct inode *inode, sector_t block, iinfo->i_next_alloc_goal++; } - + udf_clear_extent_cache(inode); phys = inode_getblk(inode, block, &err, &new); if (!phys) goto abort; @@ -1171,6 +1241,7 @@ set_size: } else { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); + udf_clear_extent_cache(inode); memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize, 0x00, bsize - newsize - udf_file_entry_alloc_offset(inode)); @@ -1184,6 +1255,7 @@ set_size: if (err) return err; down_write(&iinfo->i_data_sem); + udf_clear_extent_cache(inode); truncate_setsize(inode, newsize); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); @@ -2156,11 +2228,12 @@ int8_t inode_bmap(struct inode *inode, sector_t block, struct udf_inode_info *iinfo; iinfo = UDF_I(inode); - pos->offset = 0; - pos->block = iinfo->i_location; - pos->bh = NULL; + if (!udf_read_extent_cache(inode, bcount, &lbcount, pos)) { + pos->offset = 0; + pos->block = iinfo->i_location; + pos->bh = NULL; + } *elen = 0; - do { etype = udf_next_aext(inode, pos, eloc, elen, 1); if (etype == -1) { @@ -2170,7 +2243,8 @@ int8_t inode_bmap(struct inode *inode, sector_t block, } lbcount += *elen; } while (lbcount <= bcount); - + /* update extent cache */ + udf_update_extent_cache(inode, lbcount - *elen, pos, 1); *offset = (bcount + *elen - lbcount) >> blocksize_bits; return etype; diff --git a/fs/udf/super.c b/fs/udf/super.c index e9be396..bc5b30a 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -134,6 +134,8 @@ static struct inode *udf_alloc_inode(struct super_block *sb) ei->i_next_alloc_goal = 0; ei->i_strat4096 = 0; init_rwsem(&ei->i_data_sem); + ei->cached_extent.lstart = -1; + spin_lock_init(&ei->i_extent_cache_lock); return &ei->vfs_inode; } @@ -1021,7 +1023,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) if (bitmap == NULL) return NULL; - bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); bitmap->s_nr_groups = nr_groups; return bitmap; } @@ -1079,8 +1080,6 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (!bitmap) return 1; map->s_uspace.s_bitmap = bitmap; - bitmap->s_extLength = le32_to_cpu( - phd->unallocSpaceBitmap.extLength); bitmap->s_extPosition = le32_to_cpu( phd->unallocSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; @@ -1115,8 +1114,6 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (!bitmap) return 1; map->s_fspace.s_bitmap = bitmap; - bitmap->s_extLength = le32_to_cpu( - phd->freedSpaceBitmap.extLength); bitmap->s_extPosition = le32_to_cpu( phd->freedSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; @@ -1866,6 +1863,8 @@ static void udf_open_lvid(struct super_block *sb) mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); + /* Make opening of filesystem visible on the media immediately */ + sync_dirty_buffer(bh); } static void udf_close_lvid(struct super_block *sb) @@ -1906,6 +1905,8 @@ static void udf_close_lvid(struct super_block *sb) mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); + /* Make closing of filesystem visible on the media immediately */ + sync_dirty_buffer(bh); } u64 lvid_get_unique_id(struct super_block *sb) diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index bb8309d..b5cd8ed 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -1,6 +1,19 @@ #ifndef _UDF_I_H #define _UDF_I_H +struct extent_position { + struct buffer_head *bh; + uint32_t offset; + struct kernel_lb_addr block; +}; + +struct udf_ext_cache { + /* Extent position */ + struct extent_position epos; + /* Start logical offset in bytes */ + loff_t lstart; +}; + /* * The i_data_sem and i_mutex serve for protection of allocation information * of a regular files and symlinks. This includes all extents belonging to @@ -35,6 +48,9 @@ struct udf_inode_info { __u8 *i_data; } i_ext; struct rw_semaphore i_data_sem; + struct udf_ext_cache cached_extent; + /* Spinlock for protecting extent cache */ + spinlock_t i_extent_cache_lock; struct inode vfs_inode; }; diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 5f02722..ed401e9 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -80,10 +80,9 @@ struct udf_virtual_data { }; struct udf_bitmap { - __u32 s_extLength; __u32 s_extPosition; - __u16 s_nr_groups; - struct buffer_head **s_block_bitmap; + int s_nr_groups; + struct buffer_head *s_block_bitmap[0]; }; struct udf_part_map { diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index de038da..be7dabb 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -113,11 +113,6 @@ struct ustr { uint8_t u_len; }; -struct extent_position { - struct buffer_head *bh; - uint32_t offset; - struct kernel_lb_addr block; -}; /* super.c */ diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index e4f10a4..0bf6e16 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -29,7 +29,7 @@ config UFS_FS config UFS_FS_WRITE bool "UFS file system write support (DANGEROUS)" - depends on UFS_FS && EXPERIMENTAL + depends on UFS_FS help Say Y here if you want to try writing to UFS partitions. This is experimental, so you should back up your UFS partitions beforehand. diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 5a7ffe5..cc33aaf 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -70,8 +70,8 @@ config XFS_RT If unsure, say N. config XFS_DEBUG - bool "XFS Debugging support (EXPERIMENTAL)" - depends on XFS_FS && EXPERIMENTAL + bool "XFS Debugging support" + depends on XFS_FS help Say Y here to get an XFS build with many debugging features, including ASSERT checks, function wrappers around macros, diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 393055f..0ad2325 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1925,8 +1925,6 @@ xfs_alloc_fix_freelist( targs.mp = mp; targs.agbp = agbp; targs.agno = args->agno; - targs.mod = targs.minleft = targs.wasdel = targs.userdata = - targs.minalignslop = 0; targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4111a40..5f707e5 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -86,11 +86,11 @@ xfs_destroy_ioend( } if (ioend->io_iocb) { + inode_dio_done(ioend->io_inode); if (ioend->io_isasync) { aio_complete(ioend->io_iocb, ioend->io_error ? ioend->io_error : ioend->io_result, 0); } - inode_dio_done(ioend->io_inode); } mempool_free(ioend, xfs_ioend_pool); diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index aaf4725..8886838 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -300,9 +300,12 @@ xfs_attr_set_int( if (rsvd) args.trans->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(args.trans, args.total, - XFS_ATTRSET_LOG_RES(mp, args.total), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) { + error = xfs_trans_reserve(args.trans, args.total, + XFS_ATTRSETM_LOG_RES(mp) + + XFS_ATTRSETRT_LOG_RES(mp) * args.total, + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ATTRSET_LOG_COUNT); + if (error) { xfs_trans_cancel(args.trans, 0); return(error); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 0e92d12..b44af92 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -147,7 +147,10 @@ xfs_bmap_local_to_extents( xfs_fsblock_t *firstblock, /* first block allocated in xaction */ xfs_extlen_t total, /* total blocks needed by transaction */ int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ + int whichfork, /* data or attr fork */ + void (*init_fn)(struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp)); /* * Search the extents list for the inode, for the extent containing bno. @@ -357,7 +360,42 @@ xfs_bmap_add_attrfork_extents( } /* - * Called from xfs_bmap_add_attrfork to handle local format files. + * Block initialisation functions for local to extent format conversion. + * As these get more complex, they will be moved to the relevant files, + * but for now they are too simple to worry about. + */ +STATIC void +xfs_bmap_local_to_extents_init_fn( + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + bp->b_ops = &xfs_bmbt_buf_ops; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); +} + +STATIC void +xfs_symlink_local_to_remote( + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + /* remote symlink blocks are not verifiable until CRCs come along */ + bp->b_ops = NULL; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); +} + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. Each + * different data fork content type needs a different callout to do the + * conversion. Some are basic and only require special block initialisation + * callouts for the data formating, others (directories) are so specialised they + * handle everything themselves. + * + * XXX (dgc): investigate whether directory conversion can use the generic + * formatting callout. It should be possible - it's just a very complex + * formatter. it would also require passing the transaction through to the init + * function. */ STATIC int /* error */ xfs_bmap_add_attrfork_local( @@ -368,25 +406,29 @@ xfs_bmap_add_attrfork_local( int *flags) /* inode logging flags */ { xfs_da_args_t dargs; /* args for dir/attr code */ - int error; /* error return value */ - xfs_mount_t *mp; /* mount structure pointer */ if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) return 0; + if (S_ISDIR(ip->i_d.di_mode)) { - mp = ip->i_mount; memset(&dargs, 0, sizeof(dargs)); dargs.dp = ip; dargs.firstblock = firstblock; dargs.flist = flist; - dargs.total = mp->m_dirblkfsbs; + dargs.total = ip->i_mount->m_dirblkfsbs; dargs.whichfork = XFS_DATA_FORK; dargs.trans = tp; - error = xfs_dir2_sf_to_block(&dargs); - } else - error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, - XFS_DATA_FORK); - return error; + return xfs_dir2_sf_to_block(&dargs); + } + + if (S_ISLNK(ip->i_d.di_mode)) + return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, + flags, XFS_DATA_FORK, + xfs_symlink_local_to_remote); + + return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, + XFS_DATA_FORK, + xfs_bmap_local_to_extents_init_fn); } /* @@ -3099,8 +3141,6 @@ xfs_bmap_extents_to_btree( args.fsbno = *firstblock; } args.minlen = args.maxlen = args.prod = 1; - args.total = args.minleft = args.alignment = args.mod = args.isfl = - args.minalignslop = 0; args.wasdel = wasdel; *logflagsp = 0; if ((error = xfs_alloc_vextent(&args))) { @@ -3221,7 +3261,10 @@ xfs_bmap_local_to_extents( xfs_fsblock_t *firstblock, /* first block allocated in xaction */ xfs_extlen_t total, /* total blocks needed by transaction */ int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + int whichfork, + void (*init_fn)(struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp)) { int error; /* error return value */ int flags; /* logging flags returned */ @@ -3241,12 +3284,12 @@ xfs_bmap_local_to_extents( xfs_buf_t *bp; /* buffer for extent block */ xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + ASSERT((ifp->if_flags & + (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; args.firstblock = *firstblock; - ASSERT((ifp->if_flags & - (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); /* * Allocate a block. We know we need only one, since the * file currently fits in an inode. @@ -3259,20 +3302,21 @@ xfs_bmap_local_to_extents( args.type = XFS_ALLOCTYPE_NEAR_BNO; } args.total = total; - args.mod = args.minleft = args.alignment = args.wasdel = - args.isfl = args.minalignslop = 0; args.minlen = args.maxlen = args.prod = 1; - if ((error = xfs_alloc_vextent(&args))) + error = xfs_alloc_vextent(&args); + if (error) goto done; - /* - * Can't fail, the space was reserved. - */ + + /* Can't fail, the space was reserved. */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - bp->b_ops = &xfs_bmbt_buf_ops; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + + /* initialise the block and copy the data */ + init_fn(bp, ip, ifp); + + /* account for the change in fork size and log everything */ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); @@ -4680,9 +4724,6 @@ __xfs_bmapi_allocate( return error; } - if (bma->flags & XFS_BMAPI_STACK_SWITCH) - bma->stack_switch = 1; - error = xfs_bmap_alloc(bma); if (error) return error; @@ -4922,8 +4963,32 @@ xfs_bmapi_write( XFS_STATS_INC(xs_blk_mapw); if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + /* + * XXX (dgc): This assumes we are only called for inodes that + * contain content neutral data in local format. Anything that + * contains caller-specific data in local format that needs + * transformation to move to a block format needs to do the + * conversion to extent format itself. + * + * Directory data forks and attribute forks handle this + * themselves, but with the addition of metadata verifiers every + * data fork in local format now contains caller specific data + * and as such conversion through this function is likely to be + * broken. + * + * The only likely user of this branch is for remote symlinks, + * but we cannot overwrite the data fork contents of the symlink + * (EEXIST occurs higher up the stack) and so it will never go + * from local format to extent format here. Hence I don't think + * this branch is ever executed intentionally and we should + * consider removing it and asserting that xfs_bmapi_write() + * cannot be called directly on local format forks. i.e. callers + * are completely responsible for local to extent format + * conversion, not xfs_bmapi_write(). + */ error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, - &bma.logflags, whichfork); + &bma.logflags, whichfork, + xfs_bmap_local_to_extents_init_fn); if (error) goto error0; } @@ -4956,6 +5021,9 @@ xfs_bmapi_write( bma.flist = flist; bma.firstblock = firstblock; + if (flags & XFS_BMAPI_STACK_SWITCH) + bma.stack_switch = 1; + while (bno < end && n < *nmap) { inhole = eof || bma.got.br_startoff > bno; wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 56d1614..4e8f0df 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -487,6 +487,7 @@ _xfs_buf_find( struct rb_node *parent; xfs_buf_t *bp; xfs_daddr_t blkno = map[0].bm_bn; + xfs_daddr_t eofs; int numblks = 0; int i; @@ -498,6 +499,23 @@ _xfs_buf_find( ASSERT(!(numbytes < (1 << btp->bt_sshift))); ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + /* + * Corrupted block numbers can get through to here, unfortunately, so we + * have to check that the buffer falls within the filesystem bounds. + */ + eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); + if (blkno >= eofs) { + /* + * XXX (dgc): we should really be returning EFSCORRUPTED here, + * but none of the higher level infrastructure supports + * returning a specific error on buffer lookup failures. + */ + xfs_alert(btp->bt_mount, + "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", + __func__, blkno, eofs); + return NULL; + } + /* get tree root */ pag = xfs_perag_get(btp->bt_mount, xfs_daddr_to_agno(btp->bt_mount, blkno)); @@ -933,8 +951,6 @@ xfs_buf_trylock( locked = down_trylock(&bp->b_sema) == 0; if (locked) XB_SET_OWNER(bp); - else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) - xfs_log_force(bp->b_target->bt_mount, 0); trace_xfs_buf_trylock(bp, _RET_IP_); return locked; @@ -1487,6 +1503,8 @@ restart: while (!list_empty(&btp->bt_lru)) { bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); if (atomic_read(&bp->b_hold) > 1) { + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + list_move_tail(&bp->b_lru, &btp->bt_lru); spin_unlock(&btp->bt_lru_lock); delay(100); goto restart; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 77b0975..cf26347 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -37,109 +37,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } - -#ifdef XFS_TRANS_DEBUG -/* - * This function uses an alternate strategy for tracking the bytes - * that the user requests to be logged. This can then be used - * in conjunction with the bli_orig array in the buf log item to - * catch bugs in our callers' code. - * - * We also double check the bits set in xfs_buf_item_log using a - * simple algorithm to check that every byte is accounted for. - */ -STATIC void -xfs_buf_item_log_debug( - xfs_buf_log_item_t *bip, - uint first, - uint last) -{ - uint x; - uint byte; - uint nbytes; - uint chunk_num; - uint word_num; - uint bit_num; - uint bit_set; - uint *wordp; - - ASSERT(bip->bli_logged != NULL); - byte = first; - nbytes = last - first + 1; - bfset(bip->bli_logged, first, nbytes); - for (x = 0; x < nbytes; x++) { - chunk_num = byte >> XFS_BLF_SHIFT; - word_num = chunk_num >> BIT_TO_WORD_SHIFT; - bit_num = chunk_num & (NBWORD - 1); - wordp = &(bip->__bli_format.blf_data_map[word_num]); - bit_set = *wordp & (1 << bit_num); - ASSERT(bit_set); - byte++; - } -} - -/* - * This function is called when we flush something into a buffer without - * logging it. This happens for things like inodes which are logged - * separately from the buffer. - */ -void -xfs_buf_item_flush_log_debug( - xfs_buf_t *bp, - uint first, - uint last) -{ - xfs_buf_log_item_t *bip = bp->b_fspriv; - uint nbytes; - - if (bip == NULL || (bip->bli_item.li_type != XFS_LI_BUF)) - return; - - ASSERT(bip->bli_logged != NULL); - nbytes = last - first + 1; - bfset(bip->bli_logged, first, nbytes); -} - -/* - * This function is called to verify that our callers have logged - * all the bytes that they changed. - * - * It does this by comparing the original copy of the buffer stored in - * the buf log item's bli_orig array to the current copy of the buffer - * and ensuring that all bytes which mismatch are set in the bli_logged - * array of the buf log item. - */ -STATIC void -xfs_buf_item_log_check( - xfs_buf_log_item_t *bip) -{ - char *orig; - char *buffer; - int x; - xfs_buf_t *bp; - - ASSERT(bip->bli_orig != NULL); - ASSERT(bip->bli_logged != NULL); - - bp = bip->bli_buf; - ASSERT(bp->b_length > 0); - ASSERT(bp->b_addr != NULL); - orig = bip->bli_orig; - buffer = bp->b_addr; - for (x = 0; x < BBTOB(bp->b_length); x++) { - if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { - xfs_emerg(bp->b_mount, - "%s: bip %x buffer %x orig %x index %d", - __func__, bip, bp, orig, x); - ASSERT(0); - } - } -} -#else -#define xfs_buf_item_log_debug(x,y,z) -#define xfs_buf_item_log_check(x) -#endif - STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); /* @@ -429,7 +326,6 @@ xfs_buf_item_format( * Check to make sure everything is consistent. */ trace_xfs_buf_item_format(bip); - xfs_buf_item_log_check(bip); } /* @@ -573,8 +469,18 @@ xfs_buf_item_push( if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; - if (!xfs_buf_trylock(bp)) + if (!xfs_buf_trylock(bp)) { + /* + * If we have just raced with a buffer being pinned and it has + * been marked stale, we could end up stalling until someone else + * issues a log force to unpin the stale buffer. Check for the + * race condition here so xfsaild recognizes the buffer is pinned + * and queues a log force to move it along. + */ + if (xfs_buf_ispinned(bp)) + return XFS_ITEM_PINNED; return XFS_ITEM_LOCKED; + } ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); @@ -652,7 +558,10 @@ xfs_buf_item_unlock( /* * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. + * reference we hold to it. If we are aborting the transaction, this may + * be the only reference to the buf item, so we free it anyway + * regardless of whether it is dirty or not. A dirty abort implies a + * shutdown, anyway. */ clean = 1; for (i = 0; i < bip->bli_format_count; i++) { @@ -664,7 +573,12 @@ xfs_buf_item_unlock( } if (clean) xfs_buf_item_relse(bp); - else + else if (aborted) { + if (atomic_dec_and_test(&bip->bli_refcount)) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_buf_item_relse(bp); + } + } else atomic_dec(&bip->bli_refcount); if (!hold) @@ -915,8 +829,6 @@ xfs_buf_item_log_segment( mask = (1 << end_bit) - 1; *wordp |= mask; } - - xfs_buf_item_log_debug(bip, first, last); } /* diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 16def43..ee36c88 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -98,10 +98,6 @@ typedef struct xfs_buf_log_item { unsigned int bli_flags; /* misc flags */ unsigned int bli_recur; /* lock recursion count */ atomic_t bli_refcount; /* cnt of tp refs */ -#ifdef XFS_TRANS_DEBUG - char *bli_orig; /* original buffer copy */ - char *bli_logged; /* bytes logged (bitmap) */ -#endif int bli_format_count; /* count of headers */ struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ struct xfs_buf_log_format __bli_format; /* embedded in-log header */ @@ -117,16 +113,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); -#ifdef XFS_TRANS_DEBUG -void -xfs_buf_item_flush_log_debug( - struct xfs_buf *bp, - uint first, - uint last); -#else -#define xfs_buf_item_flush_log_debug(bp, first, last) -#endif - #endif /* __KERNEL__ */ #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 75d854b..f852b08 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -246,10 +246,10 @@ xfs_swap_extents( goto out_unlock; } - error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); if (error) goto out_unlock; - truncate_pagecache_range(VFS_I(ip), 0, -1); + truncate_pagecache_range(VFS_I(tip), 0, -1); /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) { diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9e1bf52..8025eb2 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -612,15 +612,9 @@ xfs_qm_dqread( if (flags & XFS_QMOPT_DQALLOC) { tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - /* - * Round the chunklen up to the next multiple - * of 128 (buf log item chunk size)). - */ - BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); + XFS_QM_DQALLOC_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); if (error) goto error1; cancelflags = XFS_TRANS_RELEASE_LOG_RES; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 94eaeed..2866b8c 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -709,8 +709,8 @@ xfs_fs_log_dummy( int error; tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, + XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return error; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index a815412..515bf71 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -279,8 +279,6 @@ xfs_ialloc_ag_alloc( (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; - args.mod = args.total = args.wasdel = args.isfl = - args.userdata = args.minalignslop = 0; args.prod = 1; /* @@ -333,8 +331,6 @@ xfs_ialloc_ag_alloc( * Allocate a fixed-size extent of inodes. */ args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.mod = args.total = args.wasdel = args.isfl = - args.userdata = args.minalignslop = 0; args.prod = 1; /* * Allow space for the inode btree to split. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 66282dc..4f20165 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2379,9 +2379,6 @@ xfs_iflush_fork( char *cp; xfs_ifork_t *ifp; xfs_mount_t *mp; -#ifdef XFS_TRANS_DEBUG - int first; -#endif static const short brootflag[2] = { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; static const short dataflag[2] = @@ -2724,9 +2721,6 @@ xfs_iflush_int( xfs_inode_log_item_t *iip; xfs_dinode_t *dip; xfs_mount_t *mp; -#ifdef XFS_TRANS_DEBUG - int first; -#endif ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 22baf6e..237e7f6 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -419,6 +419,7 @@ static inline void xfs_iflock(struct xfs_inode *ip) static inline void xfs_ifunlock(struct xfs_inode *ip) { xfs_iflags_clear(ip, XFS_IFLOCK); + smp_mb(); wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d041d47..f034bd1 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -269,17 +269,6 @@ xfs_inode_item_format( } else { ASSERT(!(iip->ili_fields & XFS_ILOG_DBROOT)); -#ifdef XFS_TRANS_DEBUG - if (iip->ili_root_size > 0) { - ASSERT(iip->ili_root_size == - ip->i_df.if_broot_bytes); - ASSERT(memcmp(iip->ili_orig_root, - ip->i_df.if_broot, - iip->ili_root_size) == 0); - } else { - ASSERT(ip->i_df.if_broot_bytes == 0); - } -#endif iip->ili_fields &= ~XFS_ILOG_DBROOT; } break; @@ -678,11 +667,6 @@ void xfs_inode_item_destroy( xfs_inode_t *ip) { -#ifdef XFS_TRANS_DEBUG - if (ip->i_itemp->ili_root_size != 0) { - kmem_free(ip->i_itemp->ili_orig_root); - } -#endif kmem_zone_free(xfs_ili_zone, ip->i_itemp); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 376d4d0..779812f 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -148,10 +148,6 @@ typedef struct xfs_inode_log_item { data exts */ struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged attr exts */ -#ifdef XFS_TRANS_DEBUG - int ili_root_size; - char *ili_orig_root; -#endif xfs_inode_log_format_t ili_format; /* logged structure */ } xfs_inode_log_item_t; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index add06b4..912d83d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -311,6 +311,62 @@ xfs_iomap_eof_want_preallocate( } /* + * Determine the initial size of the preallocation. We are beyond the current + * EOF here, but we need to take into account whether this is a sparse write or + * an extending write when determining the preallocation size. Hence we need to + * look up the extent that ends at the current write offset and use the result + * to determine the preallocation size. + * + * If the extent is a hole, then preallocation is essentially disabled. + * Otherwise we take the size of the preceeding data extent as the basis for the + * preallocation size. If the size of the extent is greater than half the + * maximum extent length, then use the current offset as the basis. This ensures + * that for large files the preallocation size always extends to MAXEXTLEN + * rather than falling short due to things like stripe unit/width alignment of + * real extents. + */ +STATIC int +xfs_iomap_eof_prealloc_initial_size( + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_off_t offset, + xfs_bmbt_irec_t *imap, + int nimaps) +{ + xfs_fileoff_t start_fsb; + int imaps = 1; + int error; + + ASSERT(nimaps >= imaps); + + /* if we are using a specific prealloc size, return now */ + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + return 0; + + /* + * As we write multiple pages, the offset will always align to the + * start of a page and hence point to a hole at EOF. i.e. if the size is + * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) + * will return FSB 1. Hence if there are blocks in the file, we want to + * point to the block prior to the EOF block and not the hole that maps + * directly at @offset. + */ + start_fsb = XFS_B_TO_FSB(mp, offset); + if (start_fsb) + start_fsb--; + error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); + if (error) + return 0; + + ASSERT(imaps == 1); + if (imap[0].br_startblock == HOLESTARTBLOCK) + return 0; + if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) + return imap[0].br_blockcount; + return XFS_B_TO_FSB(mp, offset); +} + +/* * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. Cap the maximum size * at a single extent or less if the filesystem is near full. The closer the @@ -319,20 +375,19 @@ xfs_iomap_eof_want_preallocate( STATIC xfs_fsblock_t xfs_iomap_prealloc_size( struct xfs_mount *mp, - struct xfs_inode *ip) + struct xfs_inode *ip, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + int nimaps) { xfs_fsblock_t alloc_blocks = 0; - if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { + alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, + imap, nimaps); + if (alloc_blocks > 0) { int shift = 0; int64_t freesp; - /* - * rounddown_pow_of_two() returns an undefined result - * if we pass in alloc_blocks = 0. Hence the "+ 1" to - * ensure we always pass in a non-zero value. - */ - alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1; alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, rounddown_pow_of_two(alloc_blocks)); @@ -351,6 +406,15 @@ xfs_iomap_prealloc_size( } if (shift) alloc_blocks >>= shift; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. + */ + while (alloc_blocks >= freesp) + alloc_blocks >>= 4; } if (alloc_blocks < mp->m_writeio_blocks) @@ -390,7 +454,6 @@ xfs_iomap_write_delay( extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, imap, XFS_WRITE_IMAPS, &prealloc); if (error) @@ -398,7 +461,10 @@ xfs_iomap_write_delay( retry: if (prealloc) { - xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); + xfs_fsblock_t alloc_blocks; + + alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, + XFS_WRITE_IMAPS); aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46bd9d5..eec226f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -120,7 +120,7 @@ xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, int count, - boolean_t syncing); + bool syncing); STATIC void xlog_verify_tail_lsn( struct xlog *log, @@ -1737,7 +1737,7 @@ xlog_sync( ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); - xlog_verify_iclog(log, iclog, count, B_TRUE); + xlog_verify_iclog(log, iclog, count, true); /* account for log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); @@ -3611,7 +3611,7 @@ xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, int count, - boolean_t syncing) + bool syncing) { xlog_op_header_t *ophead; xlog_in_core_t *icptr; @@ -3659,7 +3659,7 @@ xlog_verify_iclog( /* clientid is only 1 byte */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); - if (syncing == B_FALSE || (field_offset & 0x1ff)) { + if (!syncing || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); @@ -3682,7 +3682,7 @@ xlog_verify_iclog( /* check length */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); - if (syncing == B_FALSE || (field_offset & 0x1ff)) { + if (!syncing || (field_offset & 0x1ff)) { op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((__psint_t)&ophead->oh_len - diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da50846..3806088 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -658,7 +658,7 @@ xfs_sb_quiet_read_verify( return; } /* quietly fail */ - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, EWRONGFS); } static void @@ -1109,8 +1109,8 @@ xfs_mount_reset_sbqflags( return 0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); xfs_alert(mp, "%s: Superblock update failed!", __func__); @@ -1583,8 +1583,8 @@ xfs_log_sbcount(xfs_mount_t *mp) return 0; tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, + XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return error; @@ -1945,8 +1945,8 @@ xfs_mount_log_sb( XFS_SB_VERSIONNUM)); tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, + XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bab8314..bc90706 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -34,12 +34,19 @@ typedef struct xfs_trans_reservations { uint tr_addafork; /* cvt inode to attributed trans */ uint tr_writeid; /* write setuid/setgid file */ uint tr_attrinval; /* attr fork buffer invalidation */ - uint tr_attrset; /* set/create an attribute */ + uint tr_attrsetm; /* set/create an attribute at mount time */ + uint tr_attrsetrt; /* set/create an attribute at runtime */ uint tr_attrrm; /* remove an attribute */ uint tr_clearagi; /* clear bad agi unlinked ino bucket */ uint tr_growrtalloc; /* grow realtime allocations */ uint tr_growrtzero; /* grow realtime zeroing */ uint tr_growrtfree; /* grow realtime freeing */ + uint tr_qm_sbchange; /* change quota flags */ + uint tr_qm_setqlim; /* adjust quota limits */ + uint tr_qm_dqalloc; /* allocate quota on disk */ + uint tr_qm_quotaoff; /* turn quota off */ + uint tr_qm_equotaoff;/* end of turn quota off */ + uint tr_sb; /* modify superblock */ } xfs_trans_reservations_t; #ifndef __KERNEL__ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 60eff47..e5b5cf9 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1584,10 +1584,9 @@ xfs_qm_write_sb_changes( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - if ((error = xfs_trans_reserve(tp, 0, - mp->m_sb.sb_sectsize + 128, 0, - 0, - XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { xfs_trans_cancel(tp, 0); return error; } diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 6b39115..2d02eac1 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -146,7 +146,7 @@ xfs_qm_newmount( * inode goes inactive and wants to free blocks, * or via xfs_log_mount_finish. */ - *needquotamount = B_TRUE; + *needquotamount = true; *quotaflags = mp->m_qflags; mp->m_qflags = 0; } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 8a59f85..cf9a340 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -408,10 +408,10 @@ xfs_qm_scall_getqstat( { struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_inode *uip, *gip; - boolean_t tempuqip, tempgqip; + bool tempuqip, tempgqip; uip = gip = NULL; - tempuqip = tempgqip = B_FALSE; + tempuqip = tempgqip = false; memset(out, 0, sizeof(fs_quota_stat_t)); out->qs_version = FS_QSTAT_VERSION; @@ -434,12 +434,12 @@ xfs_qm_scall_getqstat( if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &uip) == 0) - tempuqip = B_TRUE; + tempuqip = true; } if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &gip) == 0) - tempgqip = B_TRUE; + tempgqip = true; } if (uip) { out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; @@ -490,8 +490,9 @@ xfs_qm_scall_setqlim( return 0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, - 0, 0, XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { xfs_trans_cancel(tp, 0); return (error); } @@ -638,8 +639,9 @@ xfs_qm_log_quotaoff_end( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); - if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2, - 0, 0, XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_END_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { xfs_trans_cancel(tp, 0); return (error); } @@ -671,14 +673,10 @@ xfs_qm_log_quotaoff( uint oldsbqflag=0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); - if ((error = xfs_trans_reserve(tp, 0, - sizeof(xfs_qoff_logitem_t) * 2 + - mp->m_sb.sb_sectsize + 128, - 0, - 0, - XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) goto error0; - } qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ab8839b..c407121 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -139,9 +139,9 @@ static const match_table_t tokens = { STATIC unsigned long -suffix_strtoul(char *s, char **endp, unsigned int base) +suffix_kstrtoint(char *s, unsigned int base, int *res) { - int last, shift_left_factor = 0; + int last, shift_left_factor = 0, _res; char *value = s; last = strlen(value) - 1; @@ -158,7 +158,10 @@ suffix_strtoul(char *s, char **endp, unsigned int base) value[last] = '\0'; } - return simple_strtoul((const char *)s, endp, base) << shift_left_factor; + if (kstrtoint(s, base, &_res)) + return -EINVAL; + *res = _res << shift_left_factor; + return 0; } /* @@ -174,7 +177,7 @@ xfs_parseargs( char *options) { struct super_block *sb = mp->m_super; - char *this_char, *value, *eov; + char *this_char, *value; int dsunit = 0; int dswidth = 0; int iosize = 0; @@ -230,14 +233,16 @@ xfs_parseargs( this_char); return EINVAL; } - mp->m_logbufs = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &mp->m_logbufs)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } - mp->m_logbsize = suffix_strtoul(value, &eov, 10); + if (suffix_kstrtoint(value, 10, &mp->m_logbsize)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", @@ -266,7 +271,8 @@ xfs_parseargs( this_char); return EINVAL; } - iosize = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &iosize)) + return EINVAL; iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { if (!value || !*value) { @@ -274,7 +280,8 @@ xfs_parseargs( this_char); return EINVAL; } - iosize = suffix_strtoul(value, &eov, 10); + if (suffix_kstrtoint(value, 10, &iosize)) + return EINVAL; iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_GRPID) || !strcmp(this_char, MNTOPT_BSDGROUPS)) { @@ -296,14 +303,16 @@ xfs_parseargs( this_char); return EINVAL; } - dsunit = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &dsunit)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } - dswidth = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &dswidth)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { mp->m_flags |= XFS_MOUNT_SMALL_INUMS; } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2e137d4..16a8129 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -341,6 +341,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_io); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 06ed520..2fd7c1f 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -37,14 +37,45 @@ #include "xfs_extent_busy.h" #include "xfs_bmap.h" #include "xfs_quota.h" +#include "xfs_qm.h" #include "xfs_trans_priv.h" #include "xfs_trans_space.h" #include "xfs_inode_item.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" #include "xfs_trace.h" kmem_zone_t *xfs_trans_zone; kmem_zone_t *xfs_log_item_desc_zone; +/* + * A buffer has a format structure overhead in the log in addition + * to the data, so we need to take this into account when reserving + * space in a transaction for a buffer. Round the space required up + * to a multiple of 128 bytes so that we don't change the historical + * reservation that has been used for this overhead. + */ +STATIC uint +xfs_buf_log_overhead(void) +{ + return round_up(sizeof(struct xlog_op_header) + + sizeof(struct xfs_buf_log_format), 128); +} + +/* + * Calculate out transaction log reservation per item in bytes. + * + * The nbufs argument is used to indicate the number of items that + * will be changed in a transaction. size is used to tell how many + * bytes should be reserved per item. + */ +STATIC uint +xfs_calc_buf_res( + uint nbufs, + uint size) +{ + return nbufs * (size + xfs_buf_log_overhead()); +} /* * Various log reservation values. @@ -85,18 +116,15 @@ xfs_calc_write_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + - XFS_ALLOCFREE_LOG_COUNT(mp, 2))), - (2 * mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); + MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -117,18 +145,17 @@ xfs_calc_itruncate_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + - 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), - (4 * mp->m_sb.sb_sectsize + - 4 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 4) + - 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) + - 128 * 5 + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); + MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(5, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + mp->m_in_maxlevels, 0))); } /* @@ -148,14 +175,12 @@ xfs_calc_rename_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((4 * mp->m_sb.sb_inodesize + - 2 * XFS_DIROP_LOG_RES(mp) + - 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))), - (3 * mp->m_sb.sb_sectsize + - 3 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 3) + - 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))); + MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -175,15 +200,12 @@ xfs_calc_link_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - XFS_DIROP_LOG_RES(mp) + - 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), - (mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); + MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -203,15 +225,12 @@ xfs_calc_remove_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - XFS_DIROP_LOG_RES(mp) + - 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), - (2 * mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); + MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -233,18 +252,18 @@ xfs_calc_symlink_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, 1) + - XFS_DIROP_LOG_RES(mp) + - 1024 + - 128 * (4 + XFS_DIROP_LOG_COUNT(mp))), - (2 * mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + - XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); + MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(1, 1024)), + (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_in_maxlevels, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -267,18 +286,19 @@ xfs_calc_create_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + + MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + (uint)XFS_FSB_TO_B(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, 1) + - XFS_DIROP_LOG_RES(mp) + - 128 * (3 + XFS_DIROP_LOG_COUNT(mp))), - (3 * mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + - XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); + xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_in_maxlevels, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -306,16 +326,16 @@ xfs_calc_ifree_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) + - 128 * 5 + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)); + xfs_calc_buf_res(1, 0) + + xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + mp->m_in_maxlevels, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -343,9 +363,9 @@ STATIC uint xfs_calc_growdata_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_sectsize * 3 + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); + return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -362,12 +382,12 @@ STATIC uint xfs_calc_growrtalloc_reservation( struct xfs_mount *mp) { - return 2 * mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + - mp->m_sb.sb_inodesize + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)); + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -379,7 +399,7 @@ STATIC uint xfs_calc_growrtzero_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_blocksize + 128; + return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize); } /* @@ -396,11 +416,10 @@ STATIC uint xfs_calc_growrtfree_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_inodesize + - mp->m_sb.sb_blocksize + - mp->m_rsumsize + - 128 * 5; + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(1, mp->m_rsumsize); } /* @@ -411,7 +430,7 @@ STATIC uint xfs_calc_swrite_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_inodesize + 128; + return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize); } /* @@ -421,7 +440,7 @@ xfs_calc_swrite_reservation( STATIC uint xfs_calc_writeid_reservation(xfs_mount_t *mp) { - return mp->m_sb.sb_inodesize + 128; + return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize); } /* @@ -437,13 +456,13 @@ xfs_calc_addafork_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize * 2 + - mp->m_dirblksize + - XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)); + xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, mp->m_dirblksize) + + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -461,35 +480,51 @@ STATIC uint xfs_calc_attrinval_reservation( struct xfs_mount *mp) { - return MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + - 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))), - (4 * mp->m_sb.sb_sectsize + - 4 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 4) + - 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))); + return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)))); } /* - * Setting an attribute. + * Setting an attribute at mount time. * the inode getting the attribute * the superblock for allocations * the agfs extents are allocated from * the attribute btree * max depth * the inode allocation btree * Since attribute transaction space is dependent on the size of the attribute, - * the calculation is done partially at mount time and partially at runtime. + * the calculation is done partially at mount time and partially at runtime(see + * below). */ STATIC uint -xfs_calc_attrset_reservation( +xfs_calc_attrsetm_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + - 128 * (2 + XFS_DA_NODE_MAXDEPTH); + xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); +} + +/* + * Setting an attribute at runtime, transaction space unit per block. + * the superblock for allocations: sector size + * the inode bmap btree could join or split: max depth * block size + * Since the runtime attribute transaction space is dependent on the total + * blocks needed for the 1st bmap, here we calculate out the space unit for + * one block so that the caller could figure out the total space according + * to the attibute extent length in blocks by: ext * XFS_ATTRSETRT_LOG_RES(mp). + */ +STATIC uint +xfs_calc_attrsetrt_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1)); } /* @@ -508,16 +543,15 @@ xfs_calc_attrrm_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + - 128 * (1 + XFS_DA_NODE_MAXDEPTH + - XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), - (2 * mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); + MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, + XFS_FSB_TO_B(mp, 1)) + + (uint)XFS_FSB_TO_B(mp, + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -527,7 +561,78 @@ STATIC uint xfs_calc_clear_agi_bucket_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_sectsize + 128; + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Clearing the quotaflags in the superblock. + * the super block for changing quota flags: sector size + */ +STATIC uint +xfs_calc_qm_sbchange_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Adjusting quota limits. + * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) + */ +STATIC uint +xfs_calc_qm_setqlim_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); +} + +/* + * Allocating quota on disk if needed. + * the write transaction log space: XFS_WRITE_LOG_RES(mp) + * the unit of quota allocation: one system block size + */ +STATIC uint +xfs_calc_qm_dqalloc_reservation( + struct xfs_mount *mp) +{ + return XFS_WRITE_LOG_RES(mp) + + xfs_calc_buf_res(1, + XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); +} + +/* + * Turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the superblock for the quota flags: sector size + */ +STATIC uint +xfs_calc_qm_quotaoff_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2 + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * End of turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + */ +STATIC uint +xfs_calc_qm_quotaoff_end_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2; +} + +/* + * Syncing the incore super block changes to disk. + * the super block to reflect the changes: sector size + */ +STATIC uint +xfs_calc_sb_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); } /* @@ -555,12 +660,19 @@ xfs_trans_init( resp->tr_writeid = xfs_calc_writeid_reservation(mp); resp->tr_addafork = xfs_calc_addafork_reservation(mp); resp->tr_attrinval = xfs_calc_attrinval_reservation(mp); - resp->tr_attrset = xfs_calc_attrset_reservation(mp); + resp->tr_attrsetm = xfs_calc_attrsetm_reservation(mp); + resp->tr_attrsetrt = xfs_calc_attrsetrt_reservation(mp); resp->tr_attrrm = xfs_calc_attrrm_reservation(mp); resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp); resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp); resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp); resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp); + resp->tr_qm_sbchange = xfs_calc_qm_sbchange_reservation(mp); + resp->tr_qm_setqlim = xfs_calc_qm_setqlim_reservation(mp); + resp->tr_qm_dqalloc = xfs_calc_qm_dqalloc_reservation(mp); + resp->tr_qm_quotaoff = xfs_calc_qm_quotaoff_reservation(mp); + resp->tr_qm_equotaoff = xfs_calc_qm_quotaoff_end_reservation(mp); + resp->tr_sb = xfs_calc_sb_reservation(mp); } /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c6c0601..cd29f61 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -252,17 +252,19 @@ struct xfs_log_item_desc { * as long as SWRITE logs the entire inode core */ #define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) -#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) +#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) #define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) #define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) -#define XFS_ATTRSET_LOG_RES(mp, ext) \ - ((mp)->m_reservations.tr_attrset + \ - (ext * (mp)->m_sb.sb_sectsize) + \ - (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ - (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) -#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) +#define XFS_ATTRSETM_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetm) +#define XFS_ATTRSETRT_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetrt) +#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) #define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) - +#define XFS_QM_SBCHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_qm_sbchange) +#define XFS_QM_SETQLIM_LOG_RES(mp) ((mp)->m_reservations.tr_qm_setqlim) +#define XFS_QM_DQALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_qm_dqalloc) +#define XFS_QM_QUOTAOFF_LOG_RES(mp) ((mp)->m_reservations.tr_qm_quotaoff) +#define XFS_QM_QUOTAOFF_END_LOG_RES(mp) ((mp)->m_reservations.tr_qm_equotaoff) +#define XFS_SB_LOG_RES(mp) ((mp)->m_reservations.tr_sb) /* * Various log count values. diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 6011ee6..0eda725 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -55,20 +55,6 @@ xfs_ail_check( ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); -#ifdef XFS_TRANS_DEBUG - /* - * Walk the list checking lsn ordering, and that every entry has the - * XFS_LI_IN_AIL flag set. This is really expensive, so only do it - * when specifically debugging the transaction subsystem. - */ - prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); - list_for_each_entry(lip, &ailp->xa_ail, li_ail) { - if (&prev_lip->li_ail != &ailp->xa_ail) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); - ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); - prev_lip = lip; - } -#endif /* XFS_TRANS_DEBUG */ } #else /* !DEBUG */ #define xfs_ail_check(a,l) diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 0c7fa54..642c2d6e 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -516,7 +516,7 @@ xfs_trans_unreserve_and_mod_dquots( int i, j; xfs_dquot_t *dqp; xfs_dqtrx_t *qtrx, *qa; - boolean_t locked; + bool locked; if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; @@ -537,17 +537,17 @@ xfs_trans_unreserve_and_mod_dquots( * about the number of blocks used field, or deltas. * Also we don't bother to zero the fields. */ - locked = B_FALSE; + locked = false; if (qtrx->qt_blk_res) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; dqp->q_res_bcount -= (xfs_qcnt_t)qtrx->qt_blk_res; } if (qtrx->qt_ino_res) { if (!locked) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; } dqp->q_res_icount -= (xfs_qcnt_t)qtrx->qt_ino_res; @@ -556,7 +556,7 @@ xfs_trans_unreserve_and_mod_dquots( if (qtrx->qt_rtblk_res) { if (!locked) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; } dqp->q_res_rtbcount -= (xfs_qcnt_t)qtrx->qt_rtblk_res; diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index d2eee20..ac6d567 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -33,14 +33,6 @@ #include "xfs_inode_item.h" #include "xfs_trace.h" -#ifdef XFS_TRANS_DEBUG -STATIC void -xfs_trans_inode_broot_debug( - xfs_inode_t *ip); -#else -#define xfs_trans_inode_broot_debug(ip) -#endif - /* * Add a locked inode to the transaction. * @@ -67,8 +59,6 @@ xfs_trans_ijoin( * Get a log_item_desc to point at the new item. */ xfs_trans_add_item(tp, &iip->ili_item); - - xfs_trans_inode_broot_debug(ip); } /* @@ -135,34 +125,3 @@ xfs_trans_log_inode( flags |= ip->i_itemp->ili_last_fields; ip->i_itemp->ili_fields |= flags; } - -#ifdef XFS_TRANS_DEBUG -/* - * Keep track of the state of the inode btree root to make sure we - * log it properly. - */ -STATIC void -xfs_trans_inode_broot_debug( - xfs_inode_t *ip) -{ - xfs_inode_log_item_t *iip; - - ASSERT(ip->i_itemp != NULL); - iip = ip->i_itemp; - if (iip->ili_root_size != 0) { - ASSERT(iip->ili_orig_root != NULL); - kmem_free(iip->ili_orig_root); - iip->ili_root_size = 0; - iip->ili_orig_root = NULL; - } - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - ASSERT((ip->i_df.if_broot != NULL) && - (ip->i_df.if_broot_bytes > 0)); - iip->ili_root_size = ip->i_df.if_broot_bytes; - iip->ili_orig_root = - (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP); - memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot), - iip->ili_root_size); - } -} -#endif diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 7a41874..61ba1cf 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -32,7 +32,6 @@ typedef unsigned int __uint32_t; typedef signed long long int __int64_t; typedef unsigned long long int __uint64_t; -typedef enum { B_FALSE,B_TRUE } boolean_t; typedef __uint32_t prid_t; /* project ID */ typedef __uint32_t inst_t; /* an instruction */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index d95f565..77ad748 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -725,7 +725,7 @@ xfs_create( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - boolean_t unlock_dp_on_error = B_FALSE; + bool unlock_dp_on_error = false; uint cancel_flags; int committed; prid_t prid; @@ -794,7 +794,7 @@ xfs_create( } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = B_TRUE; + unlock_dp_on_error = true; xfs_bmap_init(&free_list, &first_block); @@ -830,7 +830,7 @@ xfs_create( * error path. */ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = B_FALSE; + unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, &first_block, &free_list, resblks ? @@ -1367,7 +1367,7 @@ xfs_symlink( int pathlen; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - boolean_t unlock_dp_on_error = B_FALSE; + bool unlock_dp_on_error = false; uint cancel_flags; int committed; xfs_fileoff_t first_fsb; @@ -1438,7 +1438,7 @@ xfs_symlink( } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = B_TRUE; + unlock_dp_on_error = true; /* * Check whether the directory allows new symlinks or not. @@ -1484,7 +1484,7 @@ xfs_symlink( * error path. */ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = B_FALSE; + unlock_dp_on_error = false; /* * Also attach the dquot(s) to it, if applicable. |