diff options
Diffstat (limited to 'fs')
608 files changed, 25435 insertions, 23440 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index ef96618..2b78014 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -132,21 +132,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { - int token; + int token, r; if (!*p) continue; token = match_token(p, tokens, args); - if (token < Opt_uname) { - int r = match_int(&args[0], &option); + switch (token) { + case Opt_debug: + r = match_int(&args[0], &option); if (r < 0) { P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + "integer field, but no integer?\n"); ret = r; continue; } - } - switch (token) { - case Opt_debug: v9ses->debug = option; #ifdef CONFIG_NET_9P_DEBUG p9_debug_level = option; @@ -154,12 +152,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) break; case Opt_dfltuid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->dfltuid = option; break; case Opt_dfltgid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->dfltgid = option; break; case Opt_afid: + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } v9ses->afid = option; break; case Opt_uname: diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 410ffd6..dc95a25 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -54,9 +54,9 @@ extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_destroy_inode(struct inode *inode); -struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t); +struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t); int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode, dev_t); + struct inode *inode, umode_t mode, dev_t); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 9c2bdda..598fff1 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -165,9 +165,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) } while (rdir->head < rdir->tail) { p9stat_init(&st); - err = p9stat_read(rdir->buf + rdir->head, - rdir->tail - rdir->head, &st, - fid->clnt->proto_version); + err = p9stat_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, &st); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; @@ -231,7 +230,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (err == 0) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, - filp->f_pos); + filp->f_pos); if (err <= 0) goto unlock_and_exit; @@ -241,10 +240,9 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, while (rdir->head < rdir->tail) { - err = p9dirent_read(rdir->buf + rdir->head, - rdir->tail - rdir->head, - &curdirent, - fid->clnt->proto_version); + err = p9dirent_read(fid->clnt, rdir->buf + rdir->head, + rdir->tail - rdir->head, + &curdirent); if (err < 0) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index e3c03db..e0f20de 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -59,15 +59,13 @@ static const struct inode_operations v9fs_symlink_inode_operations; * */ -static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) +static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode) { int res; res = mode & 0777; if (S_ISDIR(mode)) res |= P9_DMDIR; if (v9fs_proto_dotu(v9ses)) { - if (S_ISLNK(mode)) - res |= P9_DMSYMLINK; if (v9ses->nodev == 0) { if (S_ISSOCK(mode)) res |= P9_DMSOCKET; @@ -85,10 +83,7 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) res |= P9_DMSETGID; if ((mode & S_ISVTX) == S_ISVTX) res |= P9_DMSETVTX; - if ((mode & P9_DMLINK)) - res |= P9_DMLINK; } - return res; } @@ -99,11 +94,11 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) * @rdev: major number, minor number in case of device files. * */ -static int p9mode2unixmode(struct v9fs_session_info *v9ses, - struct p9_wstat *stat, dev_t *rdev) +static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, + struct p9_wstat *stat, dev_t *rdev) { int res; - int mode = stat->mode; + u32 mode = stat->mode; res = mode & S_IALLUGO; *rdev = 0; @@ -251,7 +246,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) static void v9fs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(v9fs_inode_cache, V9FS_I(inode)); } @@ -261,7 +255,7 @@ void v9fs_destroy_inode(struct inode *inode) } int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode, dev_t rdev) + struct inode *inode, umode_t mode, dev_t rdev) { int err = 0; @@ -278,10 +272,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, case S_IFSOCK: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - inode->i_fop = &v9fs_file_operations_dotl; } else if (v9fs_proto_dotu(v9ses)) { inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; } else { P9_DPRINTK(P9_DEBUG_ERROR, "special files without extended mode\n"); @@ -337,7 +329,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, break; default: - P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", + P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n", mode, mode & S_IFMT); err = -EINVAL; goto error; @@ -354,13 +346,13 @@ error: * */ -struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev) +struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) { int err; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; - P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); + P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode); inode = new_inode(sb); if (!inode) { @@ -494,7 +486,8 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, int new) { dev_t rdev; - int retval, umode; + int retval; + umode_t umode; unsigned long i_ino; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; @@ -705,7 +698,7 @@ error: */ static int -v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, +v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int err; @@ -788,7 +781,7 @@ error: * */ -static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int err; u32 perm; @@ -1133,14 +1126,14 @@ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb) { - mode_t mode; + umode_t mode; char ext[32]; char tag_name[14]; unsigned int i_nlink; struct v9fs_session_info *v9ses = sb->s_fs_info; struct v9fs_inode *v9inode = V9FS_I(inode); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_atime.tv_sec = stat->atime; inode->i_mtime.tv_sec = stat->mtime; @@ -1166,7 +1159,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, /* HARDLINKCOUNT %u */ sscanf(ext, "%13s %u", tag_name, &i_nlink); if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) - inode->i_nlink = i_nlink; + set_nlink(inode, i_nlink); } } mode = stat->mode & S_IALLUGO; @@ -1306,9 +1299,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) */ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, - int mode, const char *extension) + u32 perm, const char *extension) { - u32 perm; struct p9_fid *fid; struct v9fs_session_info *v9ses; @@ -1318,7 +1310,6 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, return -EPERM; } - perm = unixmode2p9mode(v9ses, mode); fid = v9fs_create(v9ses, dir, dentry, (char *) extension, perm, P9_OREAD); if (IS_ERR(fid)) @@ -1345,7 +1336,7 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, symname); - return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname); + return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname); } /** @@ -1400,13 +1391,15 @@ clunk_fid: */ static int -v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); int retval; char *name; + u32 perm; P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, + " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", dir->i_ino, dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) @@ -1429,7 +1422,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return -EINVAL; } - retval = v9fs_vfs_mkspecial(dir, dentry, mode, name); + perm = unixmode2p9mode(v9ses, mode); + retval = v9fs_vfs_mkspecial(dir, dentry, perm, name); __putname(name); return retval; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index aded79f..8ef152a 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -48,7 +48,7 @@ #include "acl.h" static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** @@ -253,7 +253,7 @@ int v9fs_open_to_dotl_flags(int flags) */ static int -v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, +v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, struct nameidata *nd) { int err = 0; @@ -284,7 +284,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, name = (char *) dentry->d_name.name; P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " - "mode:0x%x\n", name, flags, omode); + "mode:0x%hx\n", name, flags, omode); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { @@ -395,7 +395,7 @@ err_clunk_old_fid: */ static int v9fs_vfs_mkdir_dotl(struct inode *dir, - struct dentry *dentry, int omode) + struct dentry *dentry, umode_t omode) { int err; struct v9fs_session_info *v9ses; @@ -594,7 +594,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) { - mode_t mode; + umode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { @@ -606,7 +606,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) inode->i_ctime.tv_nsec = stat->st_ctime_nsec; inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; - inode->i_nlink = stat->st_nlink; + set_nlink(inode, stat->st_nlink); mode = stat->st_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -632,7 +632,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) if (stat->st_result_mask & P9_STATS_GID) inode->i_gid = stat->st_gid; if (stat->st_result_mask & P9_STATS_NLINK) - inode->i_nlink = stat->st_nlink; + set_nlink(inode, stat->st_nlink); if (stat->st_result_mask & P9_STATS_MODE) { inode->i_mode = stat->st_mode; if ((S_ISBLK(inode->i_mode)) || @@ -799,7 +799,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, * */ static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev) { int err; @@ -814,7 +814,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, struct posix_acl *dacl = NULL, *pacl = NULL; P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, + " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", dir->i_ino, dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index c70251d..f68ff65 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -117,7 +117,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; - int mode = S_IRWXUGO | S_ISVTX; + umode_t mode = S_IRWXUGO | S_ISVTX; struct p9_fid *fid; int retval = 0; @@ -109,7 +109,7 @@ source "fs/proc/Kconfig" source "fs/sysfs/Kconfig" config TMPFS - bool "Virtual memory file system support (former shm fs)" + bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM help Tmpfs is a file system which keeps all files in virtual memory. @@ -218,6 +218,8 @@ source "fs/exofs/Kconfig" endif # MISC_FILESYSTEMS +source "fs/exofs/Kconfig.ore" + menuconfig NETWORK_FILESYSTEMS bool "Network File Systems" default y @@ -266,14 +268,6 @@ source "fs/9p/Kconfig" endif # NETWORK_FILESYSTEMS -if BLOCK -menu "Partition Types" - -source "fs/partitions/Kconfig" - -endmenu -endif - source "fs/nls/Kconfig" source "fs/dlm/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index afc1096..93804d4 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -19,6 +19,8 @@ else obj-y += no-block.o endif +obj-$(CONFIG_PROC_FS) += proc_namespace.o + obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o @@ -52,7 +54,6 @@ obj-$(CONFIG_FHANDLE) += fhandle.o obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ -obj-y += partitions/ obj-$(CONFIG_SYSFS) += sysfs/ obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ @@ -120,6 +121,6 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ -obj-$(CONFIG_EXOFS_FS) += exofs/ +obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index d5250c5..1dab6a1 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -247,7 +247,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) inode->i_gid = ADFS_SB(sb)->s_gid; inode->i_ino = obj->file_id; inode->i_size = obj->size; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index c8bf36a..8e3b36a 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -126,9 +126,9 @@ static void adfs_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) +static int adfs_show_options(struct seq_file *seq, struct dentry *root) { - struct adfs_sb_info *asb = ADFS_SB(mnt->mnt_sb); + struct adfs_sb_info *asb = ADFS_SB(root->d_sb); if (asb->s_uid != 0) seq_printf(seq, ",uid=%u", asb->s_uid); diff --git a/fs/affs/affs.h b/fs/affs/affs.h index c2b9c79..45a0ce4 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -136,7 +136,7 @@ extern int affs_remove_header(struct dentry *dentry); extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh); extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void secs_to_datestamp(time_t secs, struct affs_date *ds); -extern mode_t prot_to_mode(u32 prot); +extern umode_t prot_to_mode(u32 prot); extern void mode_to_prot(struct inode *inode); extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); @@ -156,8 +156,8 @@ extern void affs_free_bitmap(struct super_block *sb); extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); extern int affs_unlink(struct inode *dir, struct dentry *dentry); -extern int affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *); -extern int affs_mkdir(struct inode *dir, struct dentry *dentry, int mode); +extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *); +extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 3a4557e..52a6407 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -215,7 +215,7 @@ affs_remove_link(struct dentry *dentry) break; default: if (!AFFS_TAIL(sb, bh)->link_chain) - inode->i_nlink = 1; + set_nlink(inode, 1); } affs_free_block(sb, link_ino); goto done; @@ -316,7 +316,7 @@ affs_remove_header(struct dentry *dentry) if (inode->i_nlink > 1) retval = affs_remove_link(dentry); else - inode->i_nlink = 0; + clear_nlink(inode); affs_unlock_link(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); @@ -390,10 +390,10 @@ secs_to_datestamp(time_t secs, struct affs_date *ds) ds->ticks = cpu_to_be32(secs * 50); } -mode_t +umode_t prot_to_mode(u32 prot) { - int mode = 0; + umode_t mode = 0; if (!(prot & FIBF_NOWRITE)) mode |= S_IWUSR; @@ -421,7 +421,7 @@ void mode_to_prot(struct inode *inode) { u32 prot = AFFS_I(inode)->i_protect; - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; if (!(mode & S_IXUSR)) prot |= FIBF_NOEXECUTE; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 5d82890..88a4b0b 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -54,7 +54,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) prot = be32_to_cpu(tail->protect); inode->i_size = 0; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mode = 0; AFFS_I(inode)->i_extcnt = 1; AFFS_I(inode)->i_ext_last = ~1; @@ -137,7 +137,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) sbi->s_hashsize + 1; } if (tail->link_chain) - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; inode->i_op = &affs_file_inode_operations; inode->i_fop = &affs_file_operations; @@ -304,7 +304,7 @@ affs_new_inode(struct inode *dir) inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_ino = block; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; @@ -387,7 +387,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block); affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); mark_buffer_dirty_inode(inode_bh, inode); - inode->i_nlink = 2; + set_nlink(inode, 2); ihold(inode); } affs_fix_checksum(sb, bh); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index e3e9efc..4780694 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -255,13 +255,13 @@ affs_unlink(struct inode *dir, struct dentry *dentry) } int -affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct super_block *sb = dir->i_sb; struct inode *inode; int error; - pr_debug("AFFS: create(%lu,\"%.*s\",0%o)\n",dir->i_ino,(int)dentry->d_name.len, + pr_debug("AFFS: create(%lu,\"%.*s\",0%ho)\n",dir->i_ino,(int)dentry->d_name.len, dentry->d_name.name,mode); inode = affs_new_inode(dir); @@ -277,7 +277,7 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; error = affs_add_entry(dir, inode, dentry, ST_FILE); if (error) { - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); return error; } @@ -285,12 +285,12 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata } int -affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; int error; - pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%o)\n",dir->i_ino, + pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%ho)\n",dir->i_ino, (int)dentry->d_name.len,dentry->d_name.name,mode); inode = affs_new_inode(dir); @@ -305,7 +305,7 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) error = affs_add_entry(dir, inode, dentry, ST_USERDIR); if (error) { - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); return error; @@ -392,7 +392,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) return 0; err: - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); return error; diff --git a/fs/affs/super.c b/fs/affs/super.c index b31507d..8ba73fe 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -98,7 +98,6 @@ static struct inode *affs_alloc_inode(struct super_block *sb) static void affs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 1b0b195..e22dc4b 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -28,9 +28,9 @@ static int afs_d_delete(const struct dentry *dentry); static void afs_d_release(struct dentry *dentry); static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_create(struct inode *dir, struct dentry *dentry, int mode, +static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd); -static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); static int afs_link(struct dentry *from, struct inode *dir, @@ -764,7 +764,7 @@ static void afs_d_release(struct dentry *dentry) /* * create a directory on an AFS filesystem */ -static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct afs_file_status status; struct afs_callback cb; @@ -777,7 +777,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode) dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%s},%o", + _enter("{%x:%u},{%s},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); ret = -ENAMETOOLONG; @@ -948,7 +948,7 @@ error: /* * create a regular file on an AFS filesystem */ -static int afs_create(struct inode *dir, struct dentry *dentry, int mode, +static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct afs_file_status status; @@ -962,7 +962,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, int mode, dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%s},%o,", + _enter("{%x:%u},{%s},%ho,", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); ret = -ENAMETOOLONG; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 346e328..2f213d1 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -90,7 +90,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, vnode->vfs_inode.i_uid = status->owner; vnode->vfs_inode.i_gid = status->group; vnode->vfs_inode.i_generation = vnode->fid.unique; - vnode->vfs_inode.i_nlink = status->nlink; + set_nlink(&vnode->vfs_inode, status->nlink); mode = vnode->vfs_inode.i_mode; mode &= ~S_IALLUGO; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0fdab6e..d890ae3 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -67,7 +67,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) fscache_attr_changed(vnode->cache); #endif - inode->i_nlink = vnode->status.nlink; + set_nlink(inode, vnode->status.nlink); inode->i_uid = vnode->status.owner; inode->i_gid = 0; inode->i_size = vnode->status.size; @@ -174,7 +174,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_op = &afs_autocell_inode_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_uid = 0; inode->i_gid = 0; inode->i_ctime.tv_sec = get_seconds(); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index aa59184..8f4ce26 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -242,7 +242,7 @@ struct vfsmount *afs_d_automount(struct path *path) { struct vfsmount *newmnt; - _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name); + _enter("{%s}", path->dentry->d_name.name); newmnt = afs_mntpt_do_automount(path->dentry); if (IS_ERR(newmnt)) @@ -252,7 +252,7 @@ struct vfsmount *afs_d_automount(struct path *path) mnt_set_expiry(newmnt, &afs_vfsmounts); queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, afs_mntpt_expiry_timeout * HZ); - _leave(" = %p {%s}", newmnt, newmnt->mnt_devname); + _leave(" = %p", newmnt); return newmnt; } diff --git a/fs/afs/super.c b/fs/afs/super.c index 356dcf0..983ec59 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -495,7 +495,6 @@ static void afs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); struct afs_vnode *vnode = AFS_FS_I(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(afs_inode_cachep, vnode); } @@ -440,8 +440,6 @@ void exit_aio(struct mm_struct *mm) static struct kiocb *__aio_get_req(struct kioctx *ctx) { struct kiocb *req = NULL; - struct aio_ring *ring; - int okay = 0; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) @@ -459,39 +457,114 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) INIT_LIST_HEAD(&req->ki_run_list); req->ki_eventfd = NULL; - /* Check if the completion queue has enough free space to - * accept an event from this io. - */ + return req; +} + +/* + * struct kiocb's are allocated in batches to reduce the number of + * times the ctx lock is acquired and released. + */ +#define KIOCB_BATCH_SIZE 32L +struct kiocb_batch { + struct list_head head; + long count; /* number of requests left to allocate */ +}; + +static void kiocb_batch_init(struct kiocb_batch *batch, long total) +{ + INIT_LIST_HEAD(&batch->head); + batch->count = total; +} + +static void kiocb_batch_free(struct kiocb_batch *batch) +{ + struct kiocb *req, *n; + + list_for_each_entry_safe(req, n, &batch->head, ki_batch) { + list_del(&req->ki_batch); + kmem_cache_free(kiocb_cachep, req); + } +} + +/* + * Allocate a batch of kiocbs. This avoids taking and dropping the + * context lock a lot during setup. + */ +static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) +{ + unsigned short allocated, to_alloc; + long avail; + bool called_fput = false; + struct kiocb *req, *n; + struct aio_ring *ring; + + to_alloc = min(batch->count, KIOCB_BATCH_SIZE); + for (allocated = 0; allocated < to_alloc; allocated++) { + req = __aio_get_req(ctx); + if (!req) + /* allocation failed, go with what we've got */ + break; + list_add(&req->ki_batch, &batch->head); + } + + if (allocated == 0) + goto out; + +retry: spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); - if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { + ring = kmap_atomic(ctx->ring_info.ring_pages[0]); + + avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; + BUG_ON(avail < 0); + if (avail == 0 && !called_fput) { + /* + * Handle a potential starvation case. It is possible that + * we hold the last reference on a struct file, causing us + * to delay the final fput to non-irq context. In this case, + * ctx->reqs_active is artificially high. Calling the fput + * routine here may free up a slot in the event completion + * ring, allowing this allocation to succeed. + */ + kunmap_atomic(ring); + spin_unlock_irq(&ctx->ctx_lock); + aio_fput_routine(NULL); + called_fput = true; + goto retry; + } + + if (avail < allocated) { + /* Trim back the number of requests. */ + list_for_each_entry_safe(req, n, &batch->head, ki_batch) { + list_del(&req->ki_batch); + kmem_cache_free(kiocb_cachep, req); + if (--allocated <= avail) + break; + } + } + + batch->count -= allocated; + list_for_each_entry(req, &batch->head, ki_batch) { list_add(&req->ki_list, &ctx->active_reqs); ctx->reqs_active++; - okay = 1; } - kunmap_atomic(ring, KM_USER0); - spin_unlock_irq(&ctx->ctx_lock); - if (!okay) { - kmem_cache_free(kiocb_cachep, req); - req = NULL; - } + kunmap_atomic(ring); + spin_unlock_irq(&ctx->ctx_lock); - return req; +out: + return allocated; } -static inline struct kiocb *aio_get_req(struct kioctx *ctx) +static inline struct kiocb *aio_get_req(struct kioctx *ctx, + struct kiocb_batch *batch) { struct kiocb *req; - /* Handle a potential starvation case -- should be exceedingly rare as - * requests will be stuck on fput_head only if the aio_fput_routine is - * delayed and the requests were the last user of the struct file. - */ - req = __aio_get_req(ctx); - if (unlikely(NULL == req)) { - aio_fput_routine(NULL); - req = __aio_get_req(ctx); - } + + if (list_empty(&batch->head)) + if (kiocb_batch_refill(ctx, batch) == 0) + return NULL; + req = list_first_entry(&batch->head, struct kiocb, ki_batch); + list_del(&req->ki_batch); return req; } @@ -1387,13 +1460,13 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) ret = compat_rw_copy_check_uvector(type, (struct compat_iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + &kiocb->ki_iovec, 1); else #endif ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + &kiocb->ki_iovec, 1); if (ret < 0) goto out; @@ -1515,7 +1588,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, bool compat) + struct iocb *iocb, struct kiocb_batch *batch, + bool compat) { struct kiocb *req; struct file *file; @@ -1541,7 +1615,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!file)) return -EBADF; - req = aio_get_req(ctx); /* returns with 2 references to req */ + req = aio_get_req(ctx, batch); /* returns with 2 references to req */ if (unlikely(!req)) { fput(file); return -EAGAIN; @@ -1621,8 +1695,9 @@ long do_io_submit(aio_context_t ctx_id, long nr, { struct kioctx *ctx; long ret = 0; - int i; + int i = 0; struct blk_plug plug; + struct kiocb_batch batch; if (unlikely(nr < 0)) return -EINVAL; @@ -1639,6 +1714,8 @@ long do_io_submit(aio_context_t ctx_id, long nr, return -EINVAL; } + kiocb_batch_init(&batch, nr); + blk_start_plug(&plug); /* @@ -1659,12 +1736,13 @@ long do_io_submit(aio_context_t ctx_id, long nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, compat); + ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); if (ret) break; } blk_finish_plug(&plug); + kiocb_batch_free(&batch); put_ioctx(ctx); return i ? i : ret; } @@ -13,6 +13,7 @@ #include <linux/fsnotify.h> #include <linux/fcntl.h> #include <linux/security.h> +#include <linux/evm.h> /** * inode_change_ok - check if attribute changes to an inode are allowed @@ -165,7 +166,7 @@ EXPORT_SYMBOL(setattr_copy); int notify_change(struct dentry * dentry, struct iattr * attr) { struct inode *inode = dentry->d_inode; - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; int error; struct timespec now; unsigned int ia_valid = attr->ia_valid; @@ -176,7 +177,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr) } if ((ia_valid & ATTR_MODE)) { - mode_t amode = attr->ia_mode; + umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ if (is_sxid(amode)) inode->i_flags &= ~S_NOSEC; @@ -237,8 +238,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr) else error = simple_setattr(dentry, attr); - if (!error) + if (!error) { fsnotify_change(dentry, ia_valid); + evm_inode_post_setattr(dentry, ia_valid); + } return error; } diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 326dc08..5869d4e 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -155,7 +155,7 @@ static inline int autofs4_ispending(struct dentry *dentry) return 0; } -struct inode *autofs4_get_inode(struct super_block *, mode_t); +struct inode *autofs4_get_inode(struct super_block *, umode_t); void autofs4_free_ino(struct autofs_info *); /* Expiration */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 509fe1e..76741d8 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -194,7 +194,7 @@ static int find_autofs_mount(const char *pathname, return err; err = -ENOENT; while (path.dentry == path.mnt->mnt_root) { - if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) { + if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { if (test(&path, data)) { path_get(&path); if (!err) /* already found some */ @@ -212,7 +212,7 @@ static int find_autofs_mount(const char *pathname, static int test_by_dev(struct path *path, void *p) { - return path->mnt->mnt_sb->s_dev == *(dev_t *)p; + return path->dentry->d_sb->s_dev == *(dev_t *)p; } static int test_by_type(struct path *path, void *p) @@ -538,11 +538,11 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = find_autofs_mount(name, &path, test_by_type, &type); if (err) goto out; - devid = new_encode_dev(path.mnt->mnt_sb->s_dev); + devid = new_encode_dev(path.dentry->d_sb->s_dev); err = 0; if (path.mnt->mnt_root == path.dentry) { err = 1; - magic = path.mnt->mnt_sb->s_magic; + magic = path.dentry->d_sb->s_magic; } } else { dev_t dev = sbi->sb->s_dev; @@ -556,7 +556,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = have_submounts(path.dentry); if (follow_down_one(&path)) - magic = path.mnt->mnt_sb->s_magic; + magic = path.dentry->d_sb->s_magic; } param->ismountpoint.out.devid = devid; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 180fa24..2ba44c7 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -70,10 +70,10 @@ out_kill_sb: kill_litter_super(sb); } -static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) +static int autofs4_show_options(struct seq_file *m, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(mnt->mnt_sb); - struct inode *root_inode = mnt->mnt_sb->s_root->d_inode; + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct inode *root_inode = root->d_sb->s_root->d_inode; if (!sbi) return 0; @@ -326,7 +326,7 @@ fail_unlock: return -EINVAL; } -struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) +struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); @@ -342,7 +342,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_op = &autofs4_dir_inode_operations; inode->i_fop = &autofs4_dir_operations; } else if (S_ISLNK(mode)) { diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index f55ae23..75e5f1c 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -26,7 +26,7 @@ static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); -static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); +static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t); static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); #ifdef CONFIG_COMPAT static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); @@ -699,7 +699,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 9205cf2..22e9a78 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -173,7 +173,7 @@ static const struct file_operations bad_file_ops = }; static int bad_inode_create (struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) + umode_t mode, struct nameidata *nd) { return -EIO; } @@ -202,7 +202,7 @@ static int bad_inode_symlink (struct inode *dir, struct dentry *dentry, } static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry, - int mode) + umode_t mode) { return -EIO; } @@ -213,7 +213,7 @@ static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) } static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { return -EIO; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 720d885..6e6d536 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -286,7 +286,6 @@ befs_alloc_inode(struct super_block *sb) static void befs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); } @@ -357,7 +356,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_gid = befs_sb->mount_opts.use_gid ? befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid); - inode->i_nlink = 1; + set_nlink(inode, 1); /* * BEFS's time is 64 bits, but current VFS is 32 bits... diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index b14cebf..d12c796 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -84,7 +84,7 @@ const struct file_operations bfs_dir_operations = { extern void dump_imap(const char *, struct super_block *); -static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int err; @@ -199,7 +199,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) printf("unlinking non-existent file %s:%lu (nlink=%d)\n", inode->i_sb->s_id, inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } de->ino = 0; mark_buffer_dirty_inode(bh, dir); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index a8e37f8..b0391bc 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -78,7 +78,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); inode->i_uid = le32_to_cpu(di->i_uid); inode->i_gid = le32_to_cpu(di->i_gid); - inode->i_nlink = le32_to_cpu(di->i_nlink); + set_nlink(inode, le32_to_cpu(di->i_nlink)); inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); @@ -251,7 +251,6 @@ static struct inode *bfs_alloc_inode(struct super_block *sb) static void bfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index dd0fdfc..21ac5ee 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -795,7 +795,16 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * might try to exec. This is because the brk will * follow the loader, and is not movable. */ #if defined(CONFIG_X86) || defined(CONFIG_ARM) - load_bias = 0; + /* Memory randomization might have been switched off + * in runtime via sysctl. + * If that is the case, retain the original non-zero + * load_bias value in order to establish proper + * non-randomized mappings. + */ + if (current->flags & PF_RANDOMIZE) + load_bias = 0; + else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #else load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #endif diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index ba1a1ae..a9198df 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -521,7 +521,7 @@ static void kill_node(Node *e) write_unlock(&entries_lock); if (dentry) { - dentry->d_inode->i_nlink--; + drop_nlink(dentry->d_inode); d_drop(dentry); dput(dentry); simple_release_fs(&bm_mnt, &entry_count); @@ -560,7 +560,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, break; case 2: set_bit(Enabled, &e->flags); break; - case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); + case 3: root = dget(file->f_path.dentry->d_sb->s_root); mutex_lock(&root->d_inode->i_mutex); kill_node(e); @@ -587,7 +587,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, Node *e; struct inode *inode; struct dentry *root, *dentry; - struct super_block *sb = file->f_path.mnt->mnt_sb; + struct super_block *sb = file->f_path.dentry->d_sb; int err = 0; e = create_entry(buffer, count); @@ -666,7 +666,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer, switch (res) { case 1: enabled = 0; break; case 2: enabled = 1; break; - case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); + case 3: root = dget(file->f_path.dentry->d_sb->s_root); mutex_lock(&root->d_inode->i_mutex); while (!list_empty(&entries)) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 9c5e6b2..c2183f3 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/mempool.h> +#include <linux/export.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/slab.h> @@ -255,7 +255,6 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; - bio->bi_comp_cpu = -1; atomic_set(&bio->bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -338,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio) * RETURNS: * Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) +struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); @@ -366,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio) * %__GFP_WAIT, the allocation is guaranteed to succeed. * **/ -struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) { struct bio *bio; @@ -697,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd) kfree(bmd); } -static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, +static struct bio_map_data *bio_alloc_map_data(int nr_segs, + unsigned int iov_count, gfp_t gfp_mask) { struct bio_map_data *bmd; diff --git a/fs/block_dev.c b/fs/block_dev.c index 95f786e..69a5b6f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -17,6 +17,7 @@ #include <linux/module.h> #include <linux/blkpg.h> #include <linux/buffer_head.h> +#include <linux/swap.h> #include <linux/pagevec.h> #include <linux/writeback.h> #include <linux/mpage.h> @@ -25,6 +26,7 @@ #include <linux/namei.h> #include <linux/log2.h> #include <linux/kmemleak.h> +#include <linux/cleancache.h> #include <asm/uaccess.h> #include "internal.h" @@ -82,13 +84,35 @@ static sector_t max_block(struct block_device *bdev) } /* Kill _all_ buffers and pagecache , dirty or not.. */ -static void kill_bdev(struct block_device *bdev) +void kill_bdev(struct block_device *bdev) { - if (bdev->bd_inode->i_mapping->nrpages == 0) + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) return; + invalidate_bh_lrus(); - truncate_inode_pages(bdev->bd_inode->i_mapping, 0); + truncate_inode_pages(mapping, 0); } +EXPORT_SYMBOL(kill_bdev); + +/* Invalidate clean unused buffers and pagecache. */ +void invalidate_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) + return; + + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_flush_inode(mapping); +} +EXPORT_SYMBOL(invalidate_bdev); int set_blocksize(struct block_device *bdev, int size) { @@ -425,7 +449,6 @@ static void bdev_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bdev_cachep, bdi); } @@ -493,7 +516,7 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -struct super_block *blockdev_superblock __read_mostly; +static struct super_block *blockdev_superblock __read_mostly; void __init bdev_cache_init(void) { @@ -639,6 +662,11 @@ static struct block_device *bd_acquire(struct inode *inode) return bdev; } +static inline int sb_is_blkdev_sb(struct super_block *sb) +{ + return sb == blockdev_superblock; +} + /* Call when you free inode */ void bd_forget(struct inode *inode) @@ -971,7 +999,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) if (!bdev->bd_disk) return; - if (disk_partitionable(bdev->bd_disk)) + if (disk_part_scan_enabled(bdev->bd_disk)) bdev->bd_invalidated = 1; } @@ -1085,6 +1113,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk; + struct module *owner; int ret; int partno; int perm = 0; @@ -1110,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk = get_gendisk(bdev->bd_dev, &partno); if (!disk) goto out; + owner = disk->fops->owner; disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); @@ -1137,8 +1167,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); goto restart; } } @@ -1194,8 +1224,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_unlock_bdev; } /* only one opener holds refs to the module and disk */ - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); } bdev->bd_openers++; if (for_part) @@ -1215,8 +1245,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); out: bdput(bdev); @@ -1442,14 +1472,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (!bdev->bd_openers) { struct module *owner = disk->fops->owner; - put_disk(disk); - module_put(owner); disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; + + put_disk(disk); + module_put(owner); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 40e6ac0..c0ddfd2 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ - compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o + compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ + reada.o backref.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index eb159aa..89b156d 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) if (!value) return ERR_PTR(-ENOMEM); size = __btrfs_getxattr(inode, name, value, size); - if (size > 0) { - acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) { - kfree(value); - return acl; - } - set_cached_acl(inode, type, acl); - } - kfree(value); + } + if (size > 0) { + acl = posix_acl_from_xattr(value, size); } else if (size == -ENOENT || size == -ENODATA || size == 0) { /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; - set_cached_acl(inode, type, acl); } else { acl = ERR_PTR(-EIO); } + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); return acl; } diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 7ec1409..0cc20b3 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -64,6 +64,8 @@ struct btrfs_worker_thread { int idle; }; +static int __btrfs_start_workers(struct btrfs_workers *workers); + /* * btrfs_start_workers uses kthread_run, which can block waiting for memory * for a very long time. It will actually throttle on page writeback, @@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work) { struct worker_start *start; start = container_of(work, struct worker_start, work); - btrfs_start_workers(start->queue, 1); + __btrfs_start_workers(start->queue); kfree(start); } -static int start_new_worker(struct btrfs_workers *queue) -{ - struct worker_start *start; - int ret; - - start = kzalloc(sizeof(*start), GFP_NOFS); - if (!start) - return -ENOMEM; - - start->work.func = start_new_worker_func; - start->queue = queue; - ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); - if (ret) - kfree(start); - return ret; -} - /* * helper function to move a thread onto the idle list after it * has finished some requests. @@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) static void check_pending_worker_creates(struct btrfs_worker_thread *worker) { struct btrfs_workers *workers = worker->workers; + struct worker_start *start; unsigned long flags; rmb(); if (!workers->atomic_start_pending) return; + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return; + + start->work.func = start_new_worker_func; + start->queue = workers; + spin_lock_irqsave(&workers->lock, flags); if (!workers->atomic_start_pending) goto out; @@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker) workers->num_workers_starting += 1; spin_unlock_irqrestore(&workers->lock, flags); - start_new_worker(workers); + btrfs_queue_worker(workers->atomic_worker_start, &start->work); return; out: + kfree(start); spin_unlock_irqrestore(&workers->lock, flags); } @@ -331,7 +325,7 @@ again: run_ordered_completions(worker->workers, work); check_pending_worker_creates(worker); - + cond_resched(); } spin_lock_irq(&worker->lock); @@ -340,7 +334,7 @@ again: if (freezing(current)) { worker->working = 0; spin_unlock_irq(&worker->lock); - refrigerator(); + try_to_freeze(); } else { spin_unlock_irq(&worker->lock); if (!kthread_should_stop()) { @@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, * starts new worker threads. This does not enforce the max worker * count in case you need to temporarily go past it. */ -static int __btrfs_start_workers(struct btrfs_workers *workers, - int num_workers) +static int __btrfs_start_workers(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; int ret = 0; - int i; - for (i = 0; i < num_workers; i++) { - worker = kzalloc(sizeof(*worker), GFP_NOFS); - if (!worker) { - ret = -ENOMEM; - goto fail; - } + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } - INIT_LIST_HEAD(&worker->pending); - INIT_LIST_HEAD(&worker->prio_pending); - INIT_LIST_HEAD(&worker->worker_list); - spin_lock_init(&worker->lock); - - atomic_set(&worker->num_pending, 0); - atomic_set(&worker->refs, 1); - worker->workers = workers; - worker->task = kthread_run(worker_loop, worker, - "btrfs-%s-%d", workers->name, - workers->num_workers + i); - if (IS_ERR(worker->task)) { - ret = PTR_ERR(worker->task); - kfree(worker); - goto fail; - } - spin_lock_irq(&workers->lock); - list_add_tail(&worker->worker_list, &workers->idle_list); - worker->idle = 1; - workers->num_workers++; - workers->num_workers_starting--; - WARN_ON(workers->num_workers_starting < 0); - spin_unlock_irq(&workers->lock); + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->prio_pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + + atomic_set(&worker->num_pending, 0); + atomic_set(&worker->refs, 1); + worker->workers = workers; + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + 1); + if (IS_ERR(worker->task)) { + ret = PTR_ERR(worker->task); + kfree(worker); + goto fail; } + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); + spin_unlock_irq(&workers->lock); + return 0; fail: - btrfs_stop_workers(workers); + spin_lock_irq(&workers->lock); + workers->num_workers_starting--; + spin_unlock_irq(&workers->lock); return ret; } -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +int btrfs_start_workers(struct btrfs_workers *workers) { spin_lock_irq(&workers->lock); - workers->num_workers_starting += num_workers; + workers->num_workers_starting++; spin_unlock_irq(&workers->lock); - return __btrfs_start_workers(workers, num_workers); + return __btrfs_start_workers(workers); } /* @@ -568,9 +561,10 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) struct btrfs_worker_thread *worker; unsigned long flags; struct list_head *fallback; + int ret; -again: spin_lock_irqsave(&workers->lock, flags); +again: worker = next_worker(workers); if (!worker) { @@ -584,7 +578,10 @@ again: workers->num_workers_starting++; spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ - __btrfs_start_workers(workers, 1); + ret = __btrfs_start_workers(workers); + spin_lock_irqsave(&workers->lock, flags); + if (ret) + goto fallback; goto again; } } @@ -665,7 +662,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work) /* * places a struct btrfs_work into the pending queue of one of the kthreads */ -int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) { struct btrfs_worker_thread *worker; unsigned long flags; @@ -673,7 +670,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) /* don't requeue something already on a list */ if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - goto out; + return; worker = find_worker(workers); if (workers->ordered) { @@ -712,7 +709,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) if (wake) wake_up_process(worker->task); spin_unlock_irqrestore(&worker->lock, flags); - -out: - return 0; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 5077746..f34cc31 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -109,8 +109,8 @@ struct btrfs_workers { char *name; }; -int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers); int btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, struct btrfs_workers *async_starter); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c new file mode 100644 index 0000000..22c64ff --- /dev/null +++ b/fs/btrfs/backref.c @@ -0,0 +1,776 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "backref.h" + +struct __data_ref { + struct list_head list; + u64 inum; + u64 root; + u64 extent_data_item_offset; +}; + +struct __shared_ref { + struct list_head list; + u64 disk_byte; +}; + +static int __inode_info(u64 inum, u64 ioff, u8 key_type, + struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + + key.type = key_type; + key.objectid = inum; + key.offset = ioff; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || found_key->objectid != key.objectid) + return 1; + + return 0; +} + +/* + * this makes the path point to (inum INODE_ITEM ioff) + */ +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct btrfs_key key; + return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, + &key); +} + +static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path, + struct btrfs_key *found_key) +{ + return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, + found_key); +} + +/* + * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements + * of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! + */ +static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_inode_ref *iref, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) +{ + u32 len; + int slot; + u64 next_inum; + int ret; + s64 bytes_left = size - 1; + struct extent_buffer *eb = eb_in; + struct btrfs_key found_key; + + if (bytes_left >= 0) + dest[bytes_left] = '\0'; + + while (1) { + len = btrfs_inode_ref_name_len(eb, iref); + bytes_left -= len; + if (bytes_left >= 0) + read_extent_buffer(eb, dest + bytes_left, + (unsigned long)(iref + 1), len); + if (eb != eb_in) + free_extent_buffer(eb); + ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + if (ret) + break; + next_inum = found_key.offset; + + /* regular exit ahead */ + if (parent == next_inum) + break; + + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + if (eb != eb_in) + atomic_inc(&eb->refs); + btrfs_release_path(path); + + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + parent = next_inum; + --bytes_left; + if (bytes_left >= 0) + dest[bytes_left] = '/'; + } + + btrfs_release_path(path); + + if (ret) + return ERR_PTR(ret); + + return dest + bytes_left; +} + +/* + * this makes the path point to (logical EXTENT_ITEM *) + * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for + * tree blocks and <0 on error. + */ +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key) +{ + int ret; + u64 flags; + u32 item_size; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logical; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + ret = btrfs_previous_item(fs_info->extent_root, path, + 0, BTRFS_EXTENT_ITEM_KEY); + if (ret < 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); + if (found_key->type != BTRFS_EXTENT_ITEM_KEY || + found_key->objectid > logical || + found_key->objectid + found_key->offset <= logical) + return -ENOENT; + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + return BTRFS_EXTENT_FLAG_TREE_BLOCK; + if (flags & BTRFS_EXTENT_FLAG_DATA) + return BTRFS_EXTENT_FLAG_DATA; + + return -EIO; +} + +/* + * helper function to iterate extent inline refs. ptr must point to a 0 value + * for the first call and may be modified. it is used to track state. + * if more refs exist, 0 is returned and the next call to + * __get_extent_inline_ref must pass the modified ptr parameter to get the + * next ref. after the last ref was processed, 1 is returned. + * returns <0 on error + */ +static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) +{ + unsigned long end; + u64 flags; + struct btrfs_tree_block_info *info; + + if (!*ptr) { + /* first call */ + flags = btrfs_extent_flags(eb, ei); + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_eiref = + (struct btrfs_extent_inline_ref *)(info + 1); + } else { + *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + *ptr = (unsigned long)*out_eiref; + if ((void *)*ptr >= (void *)ei + item_size) + return -ENOENT; + } + + end = (unsigned long)ei + item_size; + *out_eiref = (struct btrfs_extent_inline_ref *)*ptr; + *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); + + *ptr += btrfs_extent_inline_ref_size(*out_type); + WARN_ON(*ptr > end); + if (*ptr == end) + return 1; /* last */ + + return 0; +} + +/* + * reads the tree block backref for an extent. tree level and root are returned + * through out_level and out_root. ptr must point to a 0 value for the first + * call and may be modified (see __get_extent_inline_ref comment). + * returns 0 if data was provided, 1 if there was no more data to provide or + * <0 on error. + */ +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + u64 *out_root, u8 *out_level) +{ + int ret; + int type; + struct btrfs_tree_block_info *info; + struct btrfs_extent_inline_ref *eiref; + + if (*ptr == (unsigned long)-1) + return 1; + + while (1) { + ret = __get_extent_inline_ref(ptr, eb, ei, item_size, + &eiref, &type); + if (ret < 0) + return ret; + + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + break; + + if (ret == 1) + return 1; + } + + /* we can treat both ref types equally here */ + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_root = btrfs_extent_inline_ref_offset(eb, eiref); + *out_level = btrfs_tree_block_level(eb, info); + + if (ret == 1) + *ptr = (unsigned long)-1; + + return 0; +} + +static int __data_list_add(struct list_head *head, u64 inum, + u64 extent_data_item_offset, u64 root) +{ + struct __data_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->inum = inum; + ref->extent_data_item_offset = extent_data_item_offset; + ref->root = root; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, + struct btrfs_extent_data_ref *dref) +{ + return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), + btrfs_extent_data_ref_offset(eb, dref), + btrfs_extent_data_ref_root(eb, dref)); +} + +static int __shared_list_add(struct list_head *head, u64 disk_byte) +{ + struct __shared_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->disk_byte = disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, + u64 logical, u64 inum, + u64 extent_data_item_offset, + u64 extent_offset, + struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) +{ + u64 ref_root; + u32 item_size; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *eiref; + struct __data_ref *ref; + int ret; + int type; + int last; + unsigned long ptr = 0; + + WARN_ON(!list_empty(data_refs)); + ret = extent_from_logical(fs_info, logical, path, &key); + if (ret & BTRFS_EXTENT_FLAG_DATA) + ret = -EIO; + if (ret < 0) + goto out; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + ret = 0; + ref_root = 0; + /* + * as done in iterate_extent_inodes, we first build a list of refs to + * iterate, then free the path and then iterate them to avoid deadlocks. + */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last < 0) { + ret = last; + goto out; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) { + ref_root = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __data_list_add(data_refs, inum, + extent_data_item_offset, + ref_root); + } + } while (!ret && !last); + + btrfs_release_path(path); + + if (ref_root == 0) { + printk(KERN_ERR "btrfs: failed to find tree block ref " + "for shared data backref %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + +out: + while (!list_empty(data_refs)) { + ref = list_first_entry(data_refs, struct __data_ref, list); + list_del(&ref->list); + if (!ret) + ret = iterate(ref->inum, extent_offset + + ref->extent_data_item_offset, + ref->root, ctx); + kfree(ref); + } + + return ret; +} + +static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, + u64 logical, u64 orig_extent_item_objectid, + u64 extent_offset, struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) +{ + u64 disk_byte; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *eb; + int slot; + int nritems; + int ret; + int found = 0; + + eb = read_tree_block(fs_info->tree_root, logical, + fs_info->tree_root->leafsize, 0); + if (!eb) + return -EIO; + + /* + * from the shared data ref, we only have the leaf but we need + * the key. thus, we must look into all items and see that we + * find one (some) with a reference to our extent item. + */ + nritems = btrfs_header_nritems(eb); + for (slot = 0; slot < nritems; ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + if (!fi) { + free_extent_buffer(eb); + return -EIO; + } + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte != orig_extent_item_objectid) { + if (found) + break; + else + continue; + } + ++found; + ret = __iter_shared_inline_ref_inodes(fs_info, logical, + key.objectid, + key.offset, + extent_offset, path, + data_refs, + iterate, ctx); + if (ret) + break; + } + + if (!found) { + printk(KERN_ERR "btrfs: failed to follow shared data backref " + "to parent %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + + free_extent_buffer(eb); + return ret; +} + +/* + * calls iterate() for every inode that references the extent identified by + * the given parameters. will use the path given as a parameter and return it + * released. + * when the iterator function returns a non-zero value, iteration stops. + */ +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 extent_item_objectid, + u64 extent_offset, + iterate_extent_inodes_t *iterate, void *ctx) +{ + unsigned long ptr = 0; + int last; + int ret; + int type; + u64 logical; + u32 item_size; + struct btrfs_extent_inline_ref *eiref; + struct btrfs_extent_data_ref *dref; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + struct list_head data_refs = LIST_HEAD_INIT(data_refs); + struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); + struct __data_ref *ref_d; + struct __shared_ref *ref_s; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + /* first we iterate the inline refs, ... */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last == -ENOENT) { + ret = 0; + break; + } + if (last < 0) { + ret = last; + break; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&eiref->offset); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + logical = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __shared_list_add(&shared_refs, logical); + } + } while (!ret && !last); + + /* ... then we proceed to in-tree references and ... */ + while (!ret) { + ++path->slots[0]; + if (path->slots[0] > btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_info->extent_root, path); + if (ret) { + if (ret == 1) + ret = 0; /* we're done */ + break; + } + eb = path->nodes[0]; + } + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_item_objectid) + break; + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ret = __shared_list_add(&shared_refs, key.offset); + } + } + + btrfs_release_path(path); + + /* + * ... only at the very end we can process the refs we found. this is + * because the iterator function we call is allowed to make tree lookups + * and we have to avoid deadlocks. additionally, we need more tree + * lookups ourselves for shared data refs. + */ + while (!list_empty(&data_refs)) { + ref_d = list_first_entry(&data_refs, struct __data_ref, list); + list_del(&ref_d->list); + if (!ret) + ret = iterate(ref_d->inum, extent_offset + + ref_d->extent_data_item_offset, + ref_d->root, ctx); + kfree(ref_d); + } + + while (!list_empty(&shared_refs)) { + ref_s = list_first_entry(&shared_refs, struct __shared_ref, + list); + list_del(&ref_s->list); + if (!ret) + ret = __iter_shared_inline_ref(fs_info, + ref_s->disk_byte, + extent_item_objectid, + extent_offset, path, + &data_refs, + iterate, ctx); + kfree(ref_s); + } + + return ret; +} + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx) +{ + int ret; + u64 offset; + struct btrfs_key found_key; + + ret = extent_from_logical(fs_info, logical, path, + &found_key); + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = -EINVAL; + if (ret < 0) + return ret; + + offset = logical - found_key.objectid; + ret = iterate_extent_inodes(fs_info, path, found_key.objectid, + offset, iterate, ctx); + + return ret; +} + +static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) +{ + int ret; + int slot; + u32 cur; + u32 len; + u32 name_len; + u64 parent = 0; + int found = 0; + struct extent_buffer *eb; + struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_key found_key; + + while (1) { + ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, + &found_key); + if (ret < 0) + break; + if (ret) { + ret = found ? 0 : -ENOENT; + break; + } + ++found; + + parent = found_key.offset; + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + atomic_inc(&eb->refs); + btrfs_release_path(path); + + item = btrfs_item_nr(eb, slot); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { + name_len = btrfs_inode_ref_name_len(eb, iref); + /* path must be released before calling iterate()! */ + ret = iterate(parent, iref, eb, ctx); + if (ret) { + free_extent_buffer(eb); + break; + } + len = sizeof(*iref) + name_len; + iref = (struct btrfs_inode_ref *)((char *)iref + len); + } + free_extent_buffer(eb); + } + + btrfs_release_path(path); + + return ret; +} + +/* + * returns 0 if the path could be dumped (probably truncated) + * returns <0 in case of an error + */ +static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, + struct extent_buffer *eb, void *ctx) +{ + struct inode_fs_paths *ipath = ctx; + char *fspath; + char *fspath_min; + int i = ipath->fspath->elem_cnt; + const int s_ptr = sizeof(char *); + u32 bytes_left; + + bytes_left = ipath->fspath->bytes_left > s_ptr ? + ipath->fspath->bytes_left - s_ptr : 0; + + fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; + fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, + inum, fspath_min, bytes_left); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); + + if (fspath > fspath_min) { + ipath->fspath->val[i] = (u64)(unsigned long)fspath; + ++ipath->fspath->elem_cnt; + ipath->fspath->bytes_left = fspath - fspath_min; + } else { + ++ipath->fspath->elem_missed; + ipath->fspath->bytes_missing += fspath_min - fspath; + ipath->fspath->bytes_left = 0; + } + + return 0; +} + +/* + * this dumps all file system paths to the inode into the ipath struct, provided + * is has been created large enough. each path is zero-terminated and accessed + * from ipath->fspath->val[i]. + * when it returns, there are ipath->fspath->elem_cnt number of paths available + * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the + * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, + * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would + * have been needed to return all paths. + */ +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) +{ + return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, + inode_to_path, ipath); +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct btrfs_data_container *init_data_container(u32 total_bytes) +{ + struct btrfs_data_container *data; + size_t alloc_bytes; + + alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); + data = kmalloc(alloc_bytes, GFP_NOFS); + if (!data) + return ERR_PTR(-ENOMEM); + + if (total_bytes >= sizeof(*data)) { + data->bytes_left = total_bytes - sizeof(*data); + data->bytes_missing = 0; + } else { + data->bytes_missing = sizeof(*data) - total_bytes; + data->bytes_left = 0; + } + + data->elem_cnt = 0; + data->elem_missed = 0; + + return data; +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct inode_fs_paths *ifp; + struct btrfs_data_container *fspath; + + fspath = init_data_container(total_bytes); + if (IS_ERR(fspath)) + return (void *)fspath; + + ifp = kmalloc(sizeof(*ifp), GFP_NOFS); + if (!ifp) { + kfree(fspath); + return ERR_PTR(-ENOMEM); + } + + ifp->btrfs_path = path; + ifp->fspath = fspath; + ifp->fs_root = fs_root; + + return ifp; +} + +void free_ipath(struct inode_fs_paths *ipath) +{ + kfree(ipath); +} diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h new file mode 100644 index 0000000..9261883 --- /dev/null +++ b/fs/btrfs/backref.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_BACKREF__ +#define __BTRFS_BACKREF__ + +#include "ioctl.h" + +struct inode_fs_paths { + struct btrfs_path *btrfs_path; + struct btrfs_root *fs_root; + struct btrfs_data_container *fspath; +}; + +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, + void *ctx); +typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref, + struct extent_buffer *eb, void *ctx); + +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path); + +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key); + +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + u64 *out_root, u8 *out_level); + +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 extent_item_objectid, + u64 extent_offset, + iterate_extent_inodes_t *iterate, void *ctx); + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx); + +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); + +struct btrfs_data_container *init_data_container(u32 total_bytes); +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path); +void free_ipath(struct inode_fs_paths *ipath); + +#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d9f99a1..634608d2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -103,11 +103,6 @@ struct btrfs_inode { */ u64 delalloc_bytes; - /* total number of bytes that may be used for this inode for - * delalloc - */ - u64 reserved_bytes; - /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk @@ -115,9 +110,6 @@ struct btrfs_inode { */ u64 disk_i_size; - /* flags field from the on disk inode */ - u32 flags; - /* * if this is a directory then index_cnt is the counter for the index * number for new files that are created @@ -132,6 +124,15 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * Number of bytes outstanding that are going to need csums. This is + * used in ENOSPC accounting. + */ + u64 csum_bytes; + + /* flags field from the on disk inode */ + u32 flags; + + /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent * items we think we'll end up using, and reserved_extents is the number @@ -146,14 +147,12 @@ struct btrfs_inode { * the btrfs file release call will add this inode to the * ordered operations list so that we make sure to flush out any * new data the application may have written before commit. - * - * yes, its silly to have a single bitflag, but we might grow more - * of these. */ unsigned ordered_data_close:1; unsigned orphan_meta_reserved:1; unsigned dummy_inode:1; unsigned in_defrag:1; + unsigned delalloc_meta_reserved:1; /* * always compress this one file diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8ec5d86..14f1c5a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -85,7 +85,8 @@ struct compressed_bio { static inline int compressed_bio_size(struct btrfs_root *root, unsigned long disk_size) { - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + return sizeof(struct compressed_bio) + ((disk_size + root->sectorsize - 1) / root->sectorsize) * csum_size; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 011cab3..dede441 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + /* ensure we can see the force_cow */ + smp_rmb(); + + /* + * We do not need to cow a block if + * 1) this block is not created or changed in this transaction; + * 2) this block does not belong to TREE_RELOC tree; + * 3) the root is not forced COW. + * + * What is forced COW: + * when we create snapshot during commiting the transaction, + * after we've finished coping src root, we must COW the shared + * block to ensure the metadata consistency. + */ if (btrfs_header_generation(buf) == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && + !root->force_cow) return 0; return 1; } @@ -902,9 +917,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, orig_ptr = btrfs_node_blockptr(mid, orig_slot); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } /* * deal with the case where there is only one pointer in the root @@ -1107,9 +1123,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, mid = path->nodes[level]; WARN_ON(btrfs_header_generation(mid) != trans->transid); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } if (!parent) return 1; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03912c5..6738503 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -30,6 +30,7 @@ #include <linux/kobject.h> #include <trace/events/btrfs.h> #include <asm/kmap_types.h> +#include <linux/pagemap.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -360,6 +361,47 @@ struct btrfs_header { #define BTRFS_LABEL_SIZE 256 /* + * just in case we somehow lose the roots and are not able to mount, + * we store an array of the roots from previous transactions + * in the super. + */ +#define BTRFS_NUM_BACKUP_ROOTS 4 +struct btrfs_root_backup { + __le64 tree_root; + __le64 tree_root_gen; + + __le64 chunk_root; + __le64 chunk_root_gen; + + __le64 extent_root; + __le64 extent_root_gen; + + __le64 fs_root; + __le64 fs_root_gen; + + __le64 dev_root; + __le64 dev_root_gen; + + __le64 csum_root; + __le64 csum_root_gen; + + __le64 total_bytes; + __le64 bytes_used; + __le64 num_devices; + /* future */ + __le64 unsed_64[4]; + + u8 tree_root_level; + u8 chunk_root_level; + u8 extent_root_level; + u8 fs_root_level; + u8 dev_root_level; + u8 csum_root_level; + /* future and to align */ + u8 unused_8[10]; +} __attribute__ ((__packed__)); + +/* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc */ @@ -405,6 +447,7 @@ struct btrfs_super_block { /* future expansion */ __le64 reserved[31]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; + struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); /* @@ -772,14 +815,8 @@ struct btrfs_space_info { struct btrfs_block_rsv { u64 size; u64 reserved; - u64 freed[2]; struct btrfs_space_info *space_info; - struct list_head list; spinlock_t lock; - atomic_t usage; - unsigned int priority:8; - unsigned int durable:1; - unsigned int refill_used:1; unsigned int full:1; }; @@ -811,7 +848,8 @@ struct btrfs_free_cluster { enum btrfs_caching_type { BTRFS_CACHE_NO = 0, BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FINISHED = 2, + BTRFS_CACHE_FAST = 2, + BTRFS_CACHE_FINISHED = 3, }; enum btrfs_disk_cache_state { @@ -840,10 +878,10 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; - u64 reserved_pinned; u64 bytes_super; u64 flags; u64 sectorsize; + u64 cache_generation; unsigned int ro:1; unsigned int dirty:1; unsigned int iref:1; @@ -899,6 +937,10 @@ struct btrfs_fs_info { spinlock_t block_group_cache_lock; struct rb_root block_group_cache_tree; + /* keep track of unallocated space */ + spinlock_t free_chunk_lock; + u64 free_chunk_space; + struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; @@ -916,14 +958,11 @@ struct btrfs_fs_info { struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; + /* block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; struct btrfs_block_rsv empty_block_rsv; - /* list of block reservations that cross multiple transactions */ - struct list_head durable_block_rsv_list; - - struct mutex durable_block_rsv_mutex; - u64 generation; u64 last_trans_committed; @@ -942,8 +981,8 @@ struct btrfs_fs_info { wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; - struct btrfs_super_block super_copy; - struct btrfs_super_block super_for_commit; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; struct block_device *__bdev; struct super_block *sb; struct inode *btree_inode; @@ -1036,6 +1075,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_freespace_worker; struct btrfs_workers submit_workers; struct btrfs_workers caching_workers; + struct btrfs_workers readahead_workers; /* * fixup workers take dirty pages that didn't properly go through @@ -1119,6 +1159,13 @@ struct btrfs_fs_info { u64 fs_state; struct btrfs_delayed_root *delayed_root; + + /* readahead tree */ + spinlock_t reada_lock; + struct radix_tree_root reada_tree; + + /* next backup root to be overwritten */ + int backup_root_index; }; /* @@ -1225,6 +1272,8 @@ struct btrfs_root { * for stat. It may be used for more later */ dev_t anon_dev; + + int force_cow; }; struct btrfs_ioctl_defrag_range_args { @@ -1363,6 +1412,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) +#define BTRFS_MOUNT_RECOVERY (1 << 18) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1978,6 +2028,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root) return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; } +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); @@ -2129,6 +2228,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); } +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_mask(mapping) & ~__GFP_FS; +} + /* extent-tree.c */ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) @@ -2137,6 +2241,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 3 * num_items; } +/* + * Doing a truncate won't result in new nodes or leaves, just what we need for + * COW. + */ +static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, + unsigned num_items) +{ + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + num_items; +} + void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); @@ -2146,6 +2261,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); @@ -2196,8 +2314,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo); +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len); int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, @@ -2240,25 +2358,26 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor); + u64 min_reserved); +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *rsv); int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); int btrfs_set_block_group_rw(struct btrfs_root *root, @@ -2379,6 +2498,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) smp_mb(); return fs_info->closing; } +static inline void free_fs_info(struct btrfs_fs_info *fs_info) +{ + kfree(fs_info->delayed_root); + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kfree(fs_info); +} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, @@ -2561,7 +2692,8 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); -void btrfs_dirty_inode(struct inode *inode, int flags); +int btrfs_dirty_inode(struct inode *inode); +int btrfs_update_time(struct file *file); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); @@ -2579,11 +2711,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve); -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); @@ -2697,4 +2824,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); +/* reada.c */ +struct reada_control { + struct btrfs_root *root; /* tree to prefetch */ + struct btrfs_key key_start; + struct btrfs_key key_end; /* exclusive */ + atomic_t elems; + struct kref refcnt; + wait_queue_head_t wait; +}; +struct reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *start, struct btrfs_key *end); +int btrfs_reada_wait(void *handle); +void btrfs_reada_detach(void *handle); +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err); + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index b52c672..9c1eccc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, return 0; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); @@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, if (!item->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct inode *inode, struct btrfs_delayed_node *node) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - - if (!trans->bytes_reserved) - return 0; + int release = false; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); + + /* + * btrfs_dirty_inode will update the inode under btrfs_join_transaction + * which doesn't reserve space for speed. This is a problem since we + * still need to reserve space for this update, so try to reserve the + * space. + * + * Now if src_rsv == delalloc_block_rsv we'll let it just steal since + * we're accounted for. + */ + if (!src_rsv || (!trans->bytes_reserved && + src_rsv != &root->fs_info->delalloc_block_rsv)) { + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + /* + * Since we're under a transaction reserve_metadata_bytes could + * try to commit the transaction which will make it return + * EAGAIN to make us stop the transaction we have, so return + * ENOSPC instead so that btrfs_dirty_inode knows what to do. + */ + if (ret == -EAGAIN) + ret = -ENOSPC; + if (!ret) + node->bytes_reserved = num_bytes; + return ret; + } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { + spin_lock(&BTRFS_I(inode)->lock); + if (BTRFS_I(inode)->delalloc_meta_reserved) { + BTRFS_I(inode)->delalloc_meta_reserved = 0; + spin_unlock(&BTRFS_I(inode)->lock); + release = true; + goto migrate; + } + spin_unlock(&BTRFS_I(inode)->lock); + + /* Ok we didn't have space pre-reserved. This shouldn't happen + * too often but it can happen if we do delalloc to an existing + * inode which gets dirtied because of the time update, and then + * isn't touched again until after the transaction commits and + * then we try to write out the data. First try to be nice and + * reserve something strictly for us. If not be a pain and try + * to steal from the delalloc block rsv. + */ + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + if (!ret) + goto out; + + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + goto out; + + /* + * Ok this is a problem, let's just steal from the global rsv + * since this really shouldn't happen that often. + */ + WARN_ON(1); + ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, + dst_rsv, num_bytes); + goto out; + } + +migrate: ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + +out: + /* + * Migrate only takes a reservation, it doesn't touch the size of the + * block_rsv. This is to simplify people who don't normally have things + * migrated from their block rsv. If they go to release their + * reservation, that will decrease the size as well, so if migrate + * reduced size we'd end up with a negative size. But for the + * delalloc_meta_reserved stuff we will only know to drop 1 reservation, + * but we could in fact do this reserve/migrate dance several times + * between the time we did the original reservation and we'd clean it + * up. So to take care of this, release the space for the meta + * reservation here. I think it may be time for a documentation page on + * how block rsvs. work. + */ if (!ret) node->bytes_reserved = num_bytes; + if (release) + btrfs_block_rsv_release(root, src_rsv, num_bytes); + return ret; } @@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, if (!node->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; delayed_root = btrfs_get_delayed_root(root); @@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &node->root->fs_info->global_block_rsv; + trans->block_rsv = &node->root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) @@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) goto free_path; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) @@ -1641,7 +1719,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode->i_gid = btrfs_stack_inode_gid(inode_item); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); inode->i_mode = btrfs_stack_inode_mode(inode_item); - inode->i_nlink = btrfs_stack_inode_nlink(inode_item); + set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); @@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); - /* - * we must reserve enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, + delayed_node); + if (ret) + goto release_node; fill_stack_inode_item(trans, &delayed_node->inode_item, inode); delayed_node->inode_dirty = 1; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac6..f99a099 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result) static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int verify) { - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); char *result = NULL; unsigned long len; unsigned long cur_len; @@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; while (1) { - ret = read_extent_buffer_pages(io_tree, eb, start, 1, + ret = read_extent_buffer_pages(io_tree, eb, start, + WAIT_COMPLETE, btree_get_extent, mirror_num); if (!ret && !verify_parent_transid(io_tree, eb, parent_transid)) @@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, end = min_t(u64, eb->len, PAGE_CACHE_SIZE); end = eb->start + end - 1; err: + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + btree_readahead_hook(root, eb, eb->start, ret); + } + free_extent_buffer(eb); out: return ret; } +static int btree_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + int mirror_num, struct extent_state *state) +{ + struct extent_io_tree *tree; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) + goto out; + + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + btree_readahead_hook(root, eb, eb->start, -EIO); + } + free_extent_buffer(eb); + +out: + return -EIO; /* we fixed nothing */ +} + static void end_workqueue_bio(struct bio *bio, int err) { struct end_io_wq *end_io_wq = bio->bi_private; @@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btree_get_extent); + return extent_read_full_page(tree, page, btree_get_extent, 0); } static int btree_releasepage(struct page *page, gfp_t gfp_flags) @@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (!buf) return 0; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, 0, btree_get_extent, 0); + buf, 0, WAIT_NONE, btree_get_extent, 0); free_extent_buffer(buf); return ret; } +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; + int ret; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + + set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); + + ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK, + btree_get_extent, mirror_num); + if (ret) { + free_extent_buffer(buf); + return ret; + } + + if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { + free_extent_buffer(buf); + return -EIO; + } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { + *eb = buf; + } else { + free_extent_buffer(buf); + } + return 0; +} + struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { @@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->commit_root = NULL; root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { free_extent_buffer(root->node); + root->node = NULL; return -EIO; } root->commit_root = btrfs_root_node(root); @@ -1508,9 +1579,7 @@ static int cleaner_kthread(void *arg) btrfs_run_defrag_inodes(root->fs_info); } - if (freezing(current)) { - refrigerator(); - } else { + if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) schedule(); @@ -1564,9 +1633,7 @@ sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); - if (freezing(current)) { - refrigerator(); - } else { + if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && !btrfs_transaction_blocked(root->fs_info)) @@ -1577,6 +1644,235 @@ sleep: return 0; } +/* + * this will find the highest generation in the array of + * root backups. The index of the highest array is returned, + * or -1 if we can't find anything. + * + * We check to make sure the array is valid by comparing the + * generation of the latest root in the array with the generation + * in the super block. If they don't match we pitch it. + */ +static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) +{ + u64 cur; + int newest_index = -1; + struct btrfs_root_backup *root_backup; + int i; + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { + root_backup = info->super_copy->super_roots + i; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = i; + } + + /* check to see if we actually wrapped around */ + if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { + root_backup = info->super_copy->super_roots; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = 0; + } + return newest_index; +} + + +/* + * find the oldest backup so we know where to store new entries + * in the backup array. This will set the backup_root_index + * field in the fs_info struct + */ +static void find_oldest_super_backup(struct btrfs_fs_info *info, + u64 newest_gen) +{ + int newest_index = -1; + + newest_index = find_newest_super_backup(info, newest_gen); + /* if there was garbage in there, just move along */ + if (newest_index == -1) { + info->backup_root_index = 0; + } else { + info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; + } +} + +/* + * copy all the root pointers into the super backup array. + * this will bump the backup pointer by one when it is + * done + */ +static void backup_super_roots(struct btrfs_fs_info *info) +{ + int next_backup; + struct btrfs_root_backup *root_backup; + int last_backup; + + next_backup = info->backup_root_index; + last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + + /* + * just overwrite the last backup if we're at the same generation + * this happens only at umount + */ + root_backup = info->super_for_commit->super_roots + last_backup; + if (btrfs_backup_tree_root_gen(root_backup) == + btrfs_header_generation(info->tree_root->node)) + next_backup = last_backup; + + root_backup = info->super_for_commit->super_roots + next_backup; + + /* + * make sure all of our padding and empty slots get zero filled + * regardless of which ones we use today + */ + memset(root_backup, 0, sizeof(*root_backup)); + + info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; + + btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); + btrfs_set_backup_tree_root_gen(root_backup, + btrfs_header_generation(info->tree_root->node)); + + btrfs_set_backup_tree_root_level(root_backup, + btrfs_header_level(info->tree_root->node)); + + btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); + btrfs_set_backup_chunk_root_gen(root_backup, + btrfs_header_generation(info->chunk_root->node)); + btrfs_set_backup_chunk_root_level(root_backup, + btrfs_header_level(info->chunk_root->node)); + + btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(info->extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(info->extent_root->node)); + + /* + * we might commit during log recovery, which happens before we set + * the fs_root. Make sure it is valid before we fill it in. + */ + if (info->fs_root && info->fs_root->node) { + btrfs_set_backup_fs_root(root_backup, + info->fs_root->node->start); + btrfs_set_backup_fs_root_gen(root_backup, + btrfs_header_generation(info->fs_root->node)); + btrfs_set_backup_fs_root_level(root_backup, + btrfs_header_level(info->fs_root->node)); + } + + btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); + btrfs_set_backup_dev_root_gen(root_backup, + btrfs_header_generation(info->dev_root->node)); + btrfs_set_backup_dev_root_level(root_backup, + btrfs_header_level(info->dev_root->node)); + + btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(info->csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(info->csum_root->node)); + + btrfs_set_backup_total_bytes(root_backup, + btrfs_super_total_bytes(info->super_copy)); + btrfs_set_backup_bytes_used(root_backup, + btrfs_super_bytes_used(info->super_copy)); + btrfs_set_backup_num_devices(root_backup, + btrfs_super_num_devices(info->super_copy)); + + /* + * if we don't copy this out to the super_copy, it won't get remembered + * for the next commit + */ + memcpy(&info->super_copy->super_roots, + &info->super_for_commit->super_roots, + sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); +} + +/* + * this copies info out of the root backup array and back into + * the in-memory super block. It is meant to help iterate through + * the array, so you send it the number of backups you've already + * tried and the last backup index you used. + * + * this returns -1 when it has tried all the backups + */ +static noinline int next_root_backup(struct btrfs_fs_info *info, + struct btrfs_super_block *super, + int *num_backups_tried, int *backup_index) +{ + struct btrfs_root_backup *root_backup; + int newest = *backup_index; + + if (*num_backups_tried == 0) { + u64 gen = btrfs_super_generation(super); + + newest = find_newest_super_backup(info, gen); + if (newest == -1) + return -1; + + *backup_index = newest; + *num_backups_tried = 1; + } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { + /* we've tried all the backups, all done */ + return -1; + } else { + /* jump to the next oldest backup */ + newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + *backup_index = newest; + *num_backups_tried += 1; + } + root_backup = super->super_roots + newest; + + btrfs_set_super_generation(super, + btrfs_backup_tree_root_gen(root_backup)); + btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); + btrfs_set_super_root_level(super, + btrfs_backup_tree_root_level(root_backup)); + btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); + + /* + * fixme: the total bytes and num_devices need to match or we should + * need a fsck + */ + btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); + btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); + return 0; +} + +/* helper to cleanup tree roots */ +static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) +{ + free_extent_buffer(info->tree_root->node); + free_extent_buffer(info->tree_root->commit_root); + free_extent_buffer(info->dev_root->node); + free_extent_buffer(info->dev_root->commit_root); + free_extent_buffer(info->extent_root->node); + free_extent_buffer(info->extent_root->commit_root); + free_extent_buffer(info->csum_root->node); + free_extent_buffer(info->csum_root->commit_root); + + info->tree_root->node = NULL; + info->tree_root->commit_root = NULL; + info->dev_root->node = NULL; + info->dev_root->commit_root = NULL; + info->extent_root->node = NULL; + info->extent_root->commit_root = NULL; + info->csum_root->node = NULL; + info->csum_root->commit_root = NULL; + + if (chunk_root) { + free_extent_buffer(info->chunk_root->node); + free_extent_buffer(info->chunk_root->commit_root); + info->chunk_root->node = NULL; + info->chunk_root->commit_root = NULL; + } +} + + struct btrfs_root *open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -1590,29 +1886,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, u64 features; struct btrfs_key location; struct buffer_head *bh; - struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_super_block *disk_super; struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = NULL; - struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *extent_root; + struct btrfs_root *csum_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; struct btrfs_root *log_tree_root; - int ret; int err = -EINVAL; - - struct btrfs_super_block *disk_super; - - if (!extent_root || !tree_root || !tree_root->fs_info || - !chunk_root || !dev_root || !csum_root) { + int num_backups_tried = 0; + int backup_index = 0; + + extent_root = fs_info->extent_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + csum_root = fs_info->csum_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + chunk_root = fs_info->chunk_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + dev_root = fs_info->dev_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + + if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } - fs_info = tree_root->fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -1648,15 +1947,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + spin_lock_init(&fs_info->free_chunk_lock); mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); - fs_info->tree_root = tree_root; - fs_info->extent_root = extent_root; - fs_info->csum_root = csum_root; - fs_info->chunk_root = chunk_root; - fs_info->dev_root = dev_root; - fs_info->fs_devices = fs_devices; INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); @@ -1665,8 +1959,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->trans_block_rsv); btrfs_init_block_rsv(&fs_info->chunk_block_rsv); btrfs_init_block_rsv(&fs_info->empty_block_rsv); - INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); - mutex_init(&fs_info->durable_block_rsv_mutex); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); @@ -1677,6 +1970,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; + fs_info->free_chunk_space = 0; + + /* readahead state */ + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); + spin_lock_init(&fs_info->reada_lock); fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1705,7 +2003,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, sb->s_bdi = &fs_info->bdi; fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; - fs_info->btree_inode->i_nlink = 1; + set_nlink(fs_info->btree_inode, 1); /* * we set the i_size on the btree inode to the max possible int. * the real end of the address space is determined by all of @@ -1766,14 +2064,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_alloc; } - memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); - memcpy(&fs_info->super_for_commit, &fs_info->super_copy, - sizeof(fs_info->super_for_commit)); + memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); brelse(bh); - memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -1783,6 +2081,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); /* + * run through our array of backup supers and setup + * our ring pointer to the oldest one + */ + generation = btrfs_super_generation(disk_super); + find_oldest_super_backup(fs_info, generation); + + /* * In the long term, we'll store the compression type in the super * block, and it'll be used for per file compression control. */ @@ -1870,6 +2175,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->readahead_workers, "readahead", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1880,19 +2188,29 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; + fs_info->readahead_workers.idle_thresh = 2; - btrfs_start_workers(&fs_info->workers, 1); - btrfs_start_workers(&fs_info->generic_worker, 1); - btrfs_start_workers(&fs_info->submit_workers, 1); - btrfs_start_workers(&fs_info->delalloc_workers, 1); - btrfs_start_workers(&fs_info->fixup_workers, 1); - btrfs_start_workers(&fs_info->endio_workers, 1); - btrfs_start_workers(&fs_info->endio_meta_workers, 1); - btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); - btrfs_start_workers(&fs_info->endio_write_workers, 1); - btrfs_start_workers(&fs_info->endio_freespace_worker, 1); - btrfs_start_workers(&fs_info->delayed_workers, 1); - btrfs_start_workers(&fs_info->caching_workers, 1); + /* + * btrfs_start_workers can really only fail because of ENOMEM so just + * return -ENOMEM if any of these fail. + */ + ret = btrfs_start_workers(&fs_info->workers); + ret |= btrfs_start_workers(&fs_info->generic_worker); + ret |= btrfs_start_workers(&fs_info->submit_workers); + ret |= btrfs_start_workers(&fs_info->delalloc_workers); + ret |= btrfs_start_workers(&fs_info->fixup_workers); + ret |= btrfs_start_workers(&fs_info->endio_workers); + ret |= btrfs_start_workers(&fs_info->endio_meta_workers); + ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); + ret |= btrfs_start_workers(&fs_info->endio_write_workers); + ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); + ret |= btrfs_start_workers(&fs_info->delayed_workers); + ret |= btrfs_start_workers(&fs_info->caching_workers); + ret |= btrfs_start_workers(&fs_info->readahead_workers); + if (ret) { + ret = -ENOMEM; + goto fail_sb_buffer; + } fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -1939,7 +2257,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); chunk_root->commit_root = btrfs_root_node(chunk_root); @@ -1954,11 +2272,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_close_extra_devices(fs_devices); +retry_root_backup: blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); generation = btrfs_super_generation(disk_super); @@ -1966,32 +2285,33 @@ struct btrfs_root *open_ctree(struct super_block *sb, tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), blocksize, generation); - if (!tree_root->node) - goto fail_chunk_root; - if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + if (!tree_root->node || + !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", sb->s_id); - goto fail_tree_root; + + goto recovery_tree_root; } + btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); if (ret) - goto fail_tree_root; + goto recovery_tree_root; extent_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); if (ret) - goto fail_extent_root; + goto recovery_tree_root; dev_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) - goto fail_dev_root; + goto recovery_tree_root; csum_root->track_dirty = 1; @@ -2124,22 +2444,13 @@ fail_cleaner: fail_block_groups: btrfs_free_block_groups(fs_info); - free_extent_buffer(csum_root->node); - free_extent_buffer(csum_root->commit_root); -fail_dev_root: - free_extent_buffer(dev_root->node); - free_extent_buffer(dev_root->commit_root); -fail_extent_root: - free_extent_buffer(extent_root->node); - free_extent_buffer(extent_root->commit_root); -fail_tree_root: - free_extent_buffer(tree_root->node); - free_extent_buffer(tree_root->commit_root); -fail_chunk_root: - free_extent_buffer(chunk_root->node); - free_extent_buffer(chunk_root->commit_root); + +fail_tree_roots: + free_root_pointers(fs_info, 1); + fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->readahead_workers); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -2152,25 +2463,37 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: - kfree(fs_info->delayed_root); fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); - - btrfs_close_devices(fs_info->fs_devices); - btrfs_mapping_tree_free(&fs_info->mapping_tree); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: - kfree(extent_root); - kfree(tree_root); - kfree(fs_info); - kfree(chunk_root); - kfree(dev_root); - kfree(csum_root); + btrfs_close_devices(fs_info->fs_devices); + free_fs_info(fs_info); return ERR_PTR(err); + +recovery_tree_root: + if (!btrfs_test_opt(tree_root, RECOVERY)) + goto fail_tree_roots; + + free_root_pointers(fs_info, 0); + + /* don't use the log in recovery mode, it won't be valid */ + btrfs_set_super_log_root(disk_super, 0); + + /* we can't trust the free space cache either */ + btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + + ret = next_root_backup(fs_info, fs_info->super_copy, + &num_backups_tried, &backup_index); + if (ret == -1) + goto fail_block_groups; + goto retry_root_backup; } static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) @@ -2254,22 +2577,10 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; - int last_barrier = 0; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - /* make sure only the last submit_bh does a barrier */ - if (do_barriers) { - for (i = 0; i < max_mirrors; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - device->total_bytes) - break; - last_barrier = i; - } - } - for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) @@ -2315,17 +2626,136 @@ static int write_dev_supers(struct btrfs_device *device, bh->b_end_io = btrfs_end_buffer_write_sync; } - if (i == last_barrier && do_barriers) - ret = submit_bh(WRITE_FLUSH_FUA, bh); - else - ret = submit_bh(WRITE_SYNC, bh); - + /* + * we fua the first super. The others we allow + * to go down lazy. + */ + ret = submit_bh(WRITE_FUA, bh); if (ret) errors++; } return errors < i ? 0 : -1; } +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/* + * trigger flushes for one the devices. If you pass wait == 0, the flushes are + * sent down. With wait == 1, it waits for the previous flush. + * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ + struct bio *bio; + int ret = 0; + + if (device->nobarriers) + return 0; + + if (wait) { + bio = device->flush_bio; + if (!bio) + return 0; + + wait_for_completion(&device->flush_wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + device->nobarriers = 1; + } + if (!bio_flagged(bio, BIO_UPTODATE)) { + ret = -EIO; + } + + /* drop the reference from the wait == 0 run */ + bio_put(bio); + device->flush_bio = NULL; + + return ret; + } + + /* + * one reference for us, and we leave it for the + * caller + */ + device->flush_bio = NULL;; + bio = bio_alloc(GFP_NOFS, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + device->flush_bio = bio; + + bio_get(bio); + submit_bio(WRITE_FLUSH, bio); + + return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors = 0; + int ret; + + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 0); + if (ret) + errors++; + } + + /* wait for all the barriers */ + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 1); + if (ret) + errors++; + } + if (errors) + return -EIO; + return 0; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2338,14 +2768,19 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); + backup_super_roots(root->fs_info); - sb = &root->fs_info->super_for_commit; + sb = root->fs_info->super_for_commit; dev_item = &sb->dev_item; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; + + if (do_barriers) + barrier_all_devices(root->fs_info); + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { total_errors++; @@ -2545,8 +2980,6 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_run_defrag_inodes(root->fs_info); - btrfs_put_block_group_cache(fs_info); - /* * Here come 2 situations when btrfs is broken to flip readonly: * @@ -2572,6 +3005,8 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + btrfs_put_block_group_cache(fs_info); + kthread_stop(root->fs_info->transaction_kthread); kthread_stop(root->fs_info->cleaner_kthread); @@ -2603,7 +3038,6 @@ int close_ctree(struct btrfs_root *root) del_fs_roots(fs_info); iput(fs_info->btree_inode); - kfree(fs_info->delayed_root); btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); @@ -2617,6 +3051,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); + btrfs_stop_workers(&fs_info->readahead_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -2624,12 +3059,7 @@ int close_ctree(struct btrfs_root *root) bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); - kfree(fs_info); + free_fs_info(fs_info); return 0; } @@ -2735,7 +3165,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return ret; } -int btree_lock_page_hook(struct page *page) +static int btree_lock_page_hook(struct page *page, void *data, + void (*flush_fn)(void *)) { struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2752,7 +3183,10 @@ int btree_lock_page_hook(struct page *page) if (!eb) goto out; - btrfs_tree_lock(eb); + if (!btrfs_try_tree_write_lock(eb)) { + flush_fn(data); + btrfs_tree_lock(eb); + } btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { @@ -2767,7 +3201,10 @@ int btree_lock_page_hook(struct page *page) btrfs_tree_unlock(eb); free_extent_buffer(eb); out: - lock_page(page); + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } return 0; } @@ -3123,6 +3560,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, + .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bec3ea4..c99d0a8f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, @@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btree_lock_page_hook(struct page *page); - #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a..f5fbe57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,6 +23,7 @@ #include <linux/rcupdate.h> #include <linux/kthread.h> #include <linux/slab.h> +#include <linux/ratelimit.h> #include "compat.h" #include "hash.h" #include "ctree.h" @@ -52,6 +53,21 @@ enum { CHUNK_ALLOC_LIMITED = 2, }; +/* + * Control how reservations are dealt with. + * + * RESERVE_FREE - freeing a reservation. + * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for + * ENOSPC accounting + * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update + * bytes_may_use as the ENOSPC accounting is done elsewhere + */ +enum { + RESERVE_FREE = 0, + RESERVE_ALLOC = 1, + RESERVE_ALLOC_NO_ACCOUNT = 2, +}; + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); - WARN_ON(cache->reserved_pinned > 0); kfree(cache->free_space_ctl); kfree(cache); } @@ -450,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_root *root, int load_cache_only) { + DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; int ret = 0; - smp_mb(); - if (cache->cached != BTRFS_CACHE_NO) + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + atomic_set(&caching_ctl->count, 1); + caching_ctl->work.func = caching_thread; + + spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. + */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + put_caching_control(ctl); + spin_lock(&cache->lock); + } + + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); return 0; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); /* * We can't do the read from on-disk cache during a commit since we need @@ -465,57 +528,53 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, * we likely hold important locks. */ if (trans && (!trans->transaction->in_commit) && - (root && root != root->fs_info->tree_root)) { - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - return 0; - } - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); - + (root && root != root->fs_info->tree_root) && + btrfs_test_opt(root, SPACE_CACHE)) { ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); if (ret == 1) { + cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; } else { - cache->cached = BTRFS_CACHE_NO; + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } } spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); if (ret == 1) { + put_caching_control(caching_ctl); free_excluded_extents(fs_info->extent_root, cache); return 0; } + } else { + /* + * We are not going to do the fast caching, set cached to the + * appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); } - if (load_cache_only) - return 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - /* one for caching kthread, one for caching block group list */ - atomic_set(&caching_ctl->count, 2); - caching_ctl->work.func = caching_thread; - - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); + if (load_cache_only) { + put_caching_control(caching_ctl); return 0; } - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); down_write(&fs_info->extent_commit_sem); + atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->extent_commit_sem); @@ -1770,18 +1829,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, { int ret; u64 discarded_bytes = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, - bytenr, &num_bytes, &multi, 0); + bytenr, &num_bytes, &bbio, 0); if (!ret) { - struct btrfs_bio_stripe *stripe = multi->stripes; + struct btrfs_bio_stripe *stripe = bbio->stripes; int i; - for (i = 0; i < multi->num_stripes; i++, stripe++) { + for (i = 0; i < bbio->num_stripes; i++, stripe++) { if (!stripe->dev->can_discard) continue; @@ -1800,7 +1859,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, */ ret = 0; } - kfree(multi); + kfree(bbio); } if (actual_bytes) @@ -2700,6 +2759,13 @@ again: goto again; } + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next @@ -2749,12 +2815,15 @@ again: if (!ret) dcs = BTRFS_DC_SETUP; btrfs_free_reserved_data_space(inode, num_pages); + out_put: iput(inode); out_free: btrfs_release_path(path); out: spin_lock(&block_group->lock); + if (!ret && dcs == BTRFS_DC_SETUP) + block_group->cache_generation = trans->transid; block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); @@ -3122,16 +3191,13 @@ commit_trans: return -ENOSPC; } data_sinfo->bytes_may_use += bytes; - BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); return 0; } /* - * called when we are clearing an delalloc extent from the - * inode's io_tree or there was an error for whatever reason - * after calling btrfs_check_data_free_space + * Called if we need to clear a data reservation for this inode. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { @@ -3144,7 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; spin_unlock(&data_sinfo->lock); } @@ -3165,6 +3230,7 @@ static int should_alloc_chunk(struct btrfs_root *root, struct btrfs_space_info *sinfo, u64 alloc_bytes, int force) { + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; u64 thresh; @@ -3173,11 +3239,18 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; /* + * We need to take into account the global rsv because for all intents + * and purposes it's used space. Don't worry about locking the + * global_rsv, it doesn't change except when the transaction commits. + */ + num_allocated += global_rsv->size; + + /* * in limited mode, we want to have some free space up to * about 1% of the FS size. */ if (force == CHUNK_ALLOC_LIMITED) { - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); thresh = max_t(u64, 64 * 1024 * 1024, div_factor_fine(thresh, 1)); @@ -3199,7 +3272,7 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) return 0; - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); /* 256MB or 5% of the FS */ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); @@ -3302,24 +3375,26 @@ out: /* * shrink metadata reservation for delalloc */ -static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim, int sync) +static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, + bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; long time_left; - int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; unsigned long progress; + trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; smp_mb(); - reserved = space_info->bytes_reserved; + reserved = space_info->bytes_may_use; progress = space_info->reservation_progress; if (reserved == 0) @@ -3334,18 +3409,20 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } max_reclaim = min(reserved, to_reclaim); - + nr_pages = max_t(unsigned long, nr_pages, + max_reclaim >> PAGE_CACHE_SHIFT); while (loops < 1024) { /* have the flusher threads jump in and do some IO */ smp_mb(); nr_pages = min_t(unsigned long, nr_pages, root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); + writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, + WB_REASON_FS_FREE_SPACE); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) - reclaimed += reserved - space_info->bytes_reserved; - reserved = space_info->bytes_reserved; + if (reserved > space_info->bytes_may_use) + reclaimed += reserved - space_info->bytes_may_use; + reserved = space_info->bytes_may_use; spin_unlock(&space_info->lock); loops++; @@ -3356,11 +3433,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (trans && trans->transaction->blocked) return -EAGAIN; - time_left = schedule_timeout_interruptible(1); + if (wait_ordered && !trans) { + btrfs_wait_ordered_extents(root, 0, 0); + } else { + time_left = schedule_timeout_interruptible(1); - /* We were interrupted, exit */ - if (time_left) - break; + /* We were interrupted, exit */ + if (time_left) + break; + } /* we've kicked the IO a few times, if anything has been freed, * exit. There is no sense in looping here for a long time @@ -3375,34 +3456,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } - if (reclaimed >= to_reclaim && !trans) - btrfs_wait_ordered_extents(root, 0, 0); + return reclaimed >= to_reclaim; } -/* - * Retries tells us how many times we've called reserve_metadata_bytes. The - * idea is if this is the first call (retries == 0) then we will add to our - * reserved count if we can't make the allocation in order to hold our place - * while we go and try and free up space. That way for retries > 1 we don't try - * and add space, we just check to see if the amount of unused space is >= the - * total space, meaning that our reservation is valid. +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit * - * However if we don't intend to retry this reservation, pass -1 as retries so - * that it short circuits this logic. + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. */ -static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int may_commit_transaction(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 bytes, int force) +{ + struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; + struct btrfs_trans_handle *trans; + + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + if (force) + goto commit; + + /* See if there is enough pinned space to make this reservation */ + spin_lock(&space_info->lock); + if (space_info->bytes_pinned >= bytes) { + spin_unlock(&space_info->lock); + goto commit; + } + spin_unlock(&space_info->lock); + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. + */ + if (space_info != delayed_rsv->space_info) + return -ENOSPC; + + spin_lock(&delayed_rsv->lock); + if (delayed_rsv->size < bytes) { + spin_unlock(&delayed_rsv->lock); + return -ENOSPC; + } + spin_unlock(&delayed_rsv->lock); + +commit: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return -ENOSPC; + + return btrfs_commit_transaction(trans, root); +} + +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - wether or not we can flush to make our reservation + * + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; - u64 unused; + u64 used; u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; bool committed = false; bool flushing = false; + bool wait_ordered = false; again: ret = 0; @@ -3419,7 +3556,7 @@ again: * deadlock since we are waiting for the flusher to finish, but * hold the current transaction open. */ - if (trans) + if (current->journal_info) return -EAGAIN; ret = wait_event_interruptible(space_info->wait, !space_info->flush); @@ -3431,9 +3568,9 @@ again: } ret = -ENOSPC; - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; /* * The idea here is that we've not already over-reserved the block group @@ -3442,10 +3579,9 @@ again: * lets start flushing stuff first and then come back and try to make * our reservation. */ - if (unused <= space_info->total_bytes) { - unused = space_info->total_bytes - unused; - if (unused >= num_bytes) { - space_info->bytes_reserved += orig_bytes; + if (used <= space_info->total_bytes) { + if (used + orig_bytes <= space_info->total_bytes) { + space_info->bytes_may_use += orig_bytes; ret = 0; } else { /* @@ -3461,10 +3597,64 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ - num_bytes = unused - space_info->total_bytes + + wait_ordered = true; + num_bytes = used - space_info->total_bytes + (orig_bytes * (retries + 1)); } + if (ret) { + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 avail; + + /* + * If we have a lot of space that's pinned, don't bother doing + * the overcommit dance yet and just commit the transaction. + */ + avail = (space_info->total_bytes - space_info->bytes_used) * 8; + do_div(avail, 10); + if (space_info->bytes_pinned >= avail && flush && !committed) { + space_info->flush = 1; + flushing = true; + spin_unlock(&space_info->lock); + ret = may_commit_transaction(root, space_info, + orig_bytes, 1); + if (ret) + goto out; + committed = true; + goto again; + } + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we aren't flushing don't let us overcommit too much, say + * 1/8th of the space. If we can flush, let it overcommit up to + * 1/2 of the space. + */ + if (flush) + avail >>= 3; + else + avail >>= 1; + spin_unlock(&root->fs_info->free_chunk_lock); + + if (used + num_bytes < space_info->total_bytes + avail) { + space_info->bytes_may_use += orig_bytes; + ret = 0; + } else { + wait_ordered = true; + } + } + /* * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else @@ -3484,7 +3674,7 @@ again: * We do synchronous shrinking since we don't actually unreserve * metadata until after the IO is completed. */ - ret = shrink_delalloc(trans, root, num_bytes, 1); + ret = shrink_delalloc(root, num_bytes, wait_ordered); if (ret < 0) goto out; @@ -3496,35 +3686,17 @@ again: * so go back around and try again. */ if (retries < 2) { + wait_ordered = true; retries++; goto again; } - /* - * Not enough space to be reclaimed, don't bother committing the - * transaction. - */ - spin_lock(&space_info->lock); - if (space_info->bytes_pinned < orig_bytes) - ret = -ENOSPC; - spin_unlock(&space_info->lock); - if (ret) - goto out; - - ret = -EAGAIN; - if (trans) - goto out; - ret = -ENOSPC; if (committed) goto out; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto out; - ret = btrfs_commit_transaction(trans, root); + ret = may_commit_transaction(root, space_info, orig_bytes, 0); if (!ret) { - trans = NULL; committed = true; goto again; } @@ -3542,10 +3714,12 @@ out: static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; - if (root->ref_cows) + struct btrfs_block_rsv *block_rsv = NULL; + + if (root->ref_cows || root == root->fs_info->csum_root) block_rsv = trans->block_rsv; - else + + if (!block_rsv) block_rsv = root->block_rsv; if (!block_rsv) @@ -3616,7 +3790,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, } if (num_bytes) { spin_lock(&space_info->lock); - space_info->bytes_reserved -= num_bytes; + space_info->bytes_may_use -= num_bytes; space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -3640,9 +3814,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); - atomic_set(&rsv->usage, 1); - rsv->priority = 6; - INIT_LIST_HEAD(&rsv->list); } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3663,38 +3834,20 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - if (rsv && atomic_dec_and_test(&rsv->usage)) { - btrfs_block_rsv_release(root, rsv, (u64)-1); - if (!rsv->durable) - kfree(rsv); - } -} - -/* - * make the block_rsv struct be able to capture freed space. - * the captured space will re-add to the the block_rsv struct - * after transaction commit - */ -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv) -{ - block_rsv->durable = 1; - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); - mutex_unlock(&fs_info->durable_block_rsv_mutex); + btrfs_block_rsv_release(root, rsv, (u64)-1); + kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +static inline int __block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int flush) { int ret; if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3703,55 +3856,80 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, return ret; } -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor) +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + return __block_rsv_add(root, block_rsv, num_bytes, 1); +} + +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + return __block_rsv_add(root, block_rsv, num_bytes, 0); +} + +int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor) { u64 num_bytes = 0; - int commit_trans = 0; int ret = -ENOSPC; if (!block_rsv) return 0; spin_lock(&block_rsv->lock); - if (min_factor > 0) - num_bytes = div_factor(block_rsv->size, min_factor); - if (min_reserved > num_bytes) - num_bytes = min_reserved; + num_bytes = div_factor(block_rsv->size, min_factor); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); - if (block_rsv->reserved >= num_bytes) { + return ret; +} + +static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int flush) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = min_reserved; + if (block_rsv->reserved >= num_bytes) ret = 0; - } else { + else num_bytes -= block_rsv->reserved; - if (block_rsv->durable && - block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) - commit_trans = 1; - } spin_unlock(&block_rsv->lock); + if (!ret) return 0; - if (block_rsv->refill_used) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - num_bytes, 0); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); - return 0; - } - } - - if (commit_trans) { - if (trans) - return -EAGAIN; - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; } - return -ENOSPC; + return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); +} + +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); } int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, @@ -3783,7 +3961,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) u64 num_bytes; u64 meta_used; u64 data_used; - int csum_size = btrfs_super_csum_size(&fs_info->super_copy); + int csum_size = btrfs_super_csum_size(fs_info->super_copy); sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); spin_lock(&sinfo->lock); @@ -3827,12 +4005,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (sinfo->total_bytes > num_bytes) { num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; - sinfo->bytes_reserved += num_bytes; + sinfo->bytes_may_use += num_bytes; } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - sinfo->bytes_reserved -= num_bytes; + sinfo->bytes_may_use -= num_bytes; sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -3848,16 +4026,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); fs_info->chunk_block_rsv.space_info = space_info; - fs_info->chunk_block_rsv.priority = 10; space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; - fs_info->global_block_rsv.priority = 10; - fs_info->global_block_rsv.refill_used = 1; fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; - fs_info->empty_block_rsv.priority = 10; + fs_info->delayed_block_rsv.space_info = space_info; fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; @@ -3865,10 +4040,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); - - btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); - update_global_block_rsv(fs_info); } @@ -3881,37 +4052,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); -} - -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *rsv) -{ - struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; - u64 num_bytes; - int ret; - - /* - * Truncate should be freeing data, but give us 2 items just in case it - * needs to use some space. We may want to be smarter about this in the - * future. - */ - num_bytes = btrfs_calc_trans_metadata_size(root, 2); - - /* We already have enough bytes, just return */ - if (rsv->reserved >= num_bytes) - return 0; - - num_bytes -= rsv->reserved; - - /* - * You should have reserved enough space before hand to do this, so this - * should not fail. - */ - ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); - BUG_ON(ret); - - return 0; + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); } void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -3920,9 +4062,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, if (!trans->bytes_reserved) return; - BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); - btrfs_block_rsv_release(root, trans->block_rsv, - trans->bytes_reserved); + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } @@ -3964,33 +4104,99 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } +/** + * drop_outstanding_extent - drop an outstanding extent + * @inode: the inode we're dropping the extent for + * + * This is called when we are freeing up an outstanding extent, either called + * after an error or after an extent is written. This will return the number of + * reserved extents that need to be freed. This must be called with + * BTRFS_I(inode)->lock held. + */ static unsigned drop_outstanding_extent(struct inode *inode) { + unsigned drop_inode_space = 0; unsigned dropped_extents = 0; - spin_lock(&BTRFS_I(inode)->lock); BUG_ON(!BTRFS_I(inode)->outstanding_extents); BTRFS_I(inode)->outstanding_extents--; + if (BTRFS_I(inode)->outstanding_extents == 0 && + BTRFS_I(inode)->delalloc_meta_reserved) { + drop_inode_space = 1; + BTRFS_I(inode)->delalloc_meta_reserved = 0; + } + /* * If we have more or the same amount of outsanding extents than we have * reserved then we need to leave the reserved extents count alone. */ if (BTRFS_I(inode)->outstanding_extents >= BTRFS_I(inode)->reserved_extents) - goto out; + return drop_inode_space; dropped_extents = BTRFS_I(inode)->reserved_extents - BTRFS_I(inode)->outstanding_extents; BTRFS_I(inode)->reserved_extents -= dropped_extents; -out: - spin_unlock(&BTRFS_I(inode)->lock); - return dropped_extents; + return dropped_extents + drop_inode_space; } -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +/** + * calc_csum_metadata_size - return the amount of metada space that must be + * reserved/free'd for the given bytes. + * @inode: the inode we're manipulating + * @num_bytes: the number of bytes in question + * @reserve: 1 if we are reserving space, 0 if we are freeing space + * + * This adjusts the number of csum_bytes in the inode and then returns the + * correct amount of metadata that must either be reserved or freed. We + * calculate how many checksums we can fit into one leaf and then divide the + * number of bytes that will need to be checksumed by this value to figure out + * how many checksums will be required. If we are adding bytes then the number + * may go up and we will return the number of additional bytes that must be + * reserved. If it is going down we will return the number of bytes that must + * be freed. + * + * This must be called with BTRFS_I(inode)->lock held. + */ +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, + int reserve) { - return num_bytes >>= 3; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 csum_size; + int num_csums_per_leaf; + int num_csums; + int old_csums; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && + BTRFS_I(inode)->csum_bytes == 0) + return 0; + + old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + if (reserve) + BTRFS_I(inode)->csum_bytes += num_bytes; + else + BTRFS_I(inode)->csum_bytes -= num_bytes; + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = (int)div64_u64(csum_size, + sizeof(struct btrfs_csum_item) + + sizeof(struct btrfs_disk_key)); + num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + num_csums = num_csums + num_csums_per_leaf - 1; + num_csums = num_csums / num_csums_per_leaf; + + old_csums = old_csums + num_csums_per_leaf - 1; + old_csums = old_csums / num_csums_per_leaf; + + /* No change, no need to reserve more */ + if (old_csums == num_csums) + return 0; + + if (reserve) + return btrfs_calc_trans_metadata_size(root, + num_csums - old_csums); + + return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); } int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) @@ -3998,10 +4204,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve = 0; + u64 csum_bytes; unsigned nr_extents = 0; + int extra_reserve = 0; + int flush = 1; int ret; - if (btrfs_transaction_in_commit(root->fs_info)) + /* Need to be holding the i_mutex here if we aren't free space cache */ + if (btrfs_is_free_space_inode(root, inode)) + flush = 0; + else + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + + if (flush && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); num_bytes = ALIGN(num_bytes, root->sectorsize); @@ -4010,33 +4225,74 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) BTRFS_I(inode)->outstanding_extents++; if (BTRFS_I(inode)->outstanding_extents > - BTRFS_I(inode)->reserved_extents) { + BTRFS_I(inode)->reserved_extents) nr_extents = BTRFS_I(inode)->outstanding_extents - BTRFS_I(inode)->reserved_extents; - BTRFS_I(inode)->reserved_extents += nr_extents; - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + /* + * Add an item to reserve for updating the inode when we complete the + * delalloc io. + */ + if (!BTRFS_I(inode)->delalloc_meta_reserved) { + nr_extents++; + extra_reserve = 1; } + + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); + csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); - to_reserve += calc_csum_metadata_size(inode, num_bytes); - ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { + u64 to_free = 0; unsigned dropped; + + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode); /* - * We don't need the return value since our reservation failed, - * we just need to clean up our counter. + * If the inodes csum_bytes is the same as the original + * csum_bytes then we know we haven't raced with any free()ers + * so we can just reduce our inodes csum bytes and carry on. + * Otherwise we have to do the normal free thing to account for + * the case that the free side didn't free up its reserve + * because of this outstanding reservation. */ - dropped = drop_outstanding_extent(inode); - WARN_ON(dropped > 1); + if (BTRFS_I(inode)->csum_bytes == csum_bytes) + calc_csum_metadata_size(inode, num_bytes, 0); + else + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); + if (dropped) + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + if (to_free) + btrfs_block_rsv_release(root, block_rsv, to_free); return ret; } + spin_lock(&BTRFS_I(inode)->lock); + if (extra_reserve) { + BTRFS_I(inode)->delalloc_meta_reserved = 1; + nr_extents--; + } + BTRFS_I(inode)->reserved_extents += nr_extents; + spin_unlock(&BTRFS_I(inode)->lock); + block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; } +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for + * @num_bytes: the number of bytes we're releasing + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations. + */ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4044,9 +4300,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) unsigned dropped; num_bytes = ALIGN(num_bytes, root->sectorsize); + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes); + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -4054,6 +4312,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) to_free); } +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * @inode: inode we're writing to + * @num_bytes: the number of bytes we want to allocate + * + * This will do the following things + * + * o reserve space in the data space info for num_bytes + * o reserve space in the metadata space info based on number of outstanding + * extents and how much csums will be needed + * o add to the inodes ->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * + * This will return 0 for success and -ENOSPC if there is no space left. + */ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) { int ret; @@ -4071,6 +4344,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) return 0; } +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @num_bytes: the number of bytes we want to free up + * + * This must be matched with a call to btrfs_delalloc_reserve_space. This is + * called in the case that we don't need the metadata AND data reservations + * anymore. So if there is an error or we insert an inline extent. + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + */ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) { btrfs_delalloc_release_metadata(inode, num_bytes); @@ -4090,12 +4376,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, /* block accounting for super block */ spin_lock(&info->delalloc_lock); - old_val = btrfs_super_bytes_used(&info->super_copy); + old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; - btrfs_set_super_bytes_used(&info->super_copy, old_val); + btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_lock); while (total) { @@ -4123,7 +4409,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_super_cache_generation(&info->super_copy) != 0 && + if (btrfs_test_opt(root, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -4135,7 +4421,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4187,7 +4472,6 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4215,45 +4499,82 @@ int btrfs_pin_extent(struct btrfs_root *root, } /* - * update size of reserved extents. this function may return -EAGAIN - * if 'reserve' is true or 'sinfo' is false. + * this function must be called within transaction + */ +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + /* + * pull in the free space cache (if any) so that our pin + * removes the free space from the cache. We have load_only set + * to one because the slow code to read in the free extents does check + * the pinned extents. + */ + cache_block_group(cache, trans, root, 1); + + pin_down_extent(root, cache, bytenr, num_bytes, 0); + + /* remove us from the free space cache (if we're there at all) */ + btrfs_remove_free_space(cache, bytenr, num_bytes); + btrfs_put_block_group(cache); + return 0; +} + +/** + * btrfs_update_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @reserve: One of the reservation enums + * + * This is called by the allocator when it reserves space, or by somebody who is + * freeing space that was never actually used on disk. For example if you + * reserve some space for a new leaf in transaction A and before transaction A + * commits you free that leaf, you call this with reserve set to 0 in order to + * clear the reservation. + * + * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * ENOSPC accounting. For data we handle the reservation through clearing the + * delalloc bits in the io_tree. We have to do this since we could end up + * allocating less disk space for the amount of data we have reserved in the + * case of compression. + * + * If this is a reservation and the block group has become read only we cannot + * make the reservation and return -EAGAIN, otherwise this function always + * succeeds. */ -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo) +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve) { + struct btrfs_space_info *space_info = cache->space_info; int ret = 0; - if (sinfo) { - struct btrfs_space_info *space_info = cache->space_info; - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - } - } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->reservation_progress++; - } - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - } else { - spin_lock(&cache->lock); + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve != RESERVE_FREE) { if (cache->ro) { ret = -EAGAIN; } else { - if (reserve) - cache->reserved += num_bytes; - else - cache->reserved -= num_bytes; + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + if (reserve == RESERVE_ALLOC) { + BUG_ON(space_info->bytes_may_use < num_bytes); + space_info->bytes_may_use -= num_bytes; + } } - spin_unlock(&cache->lock); + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); return ret; } @@ -4319,13 +4640,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; cache->space_info->bytes_pinned -= len; - if (cache->ro) { + if (cache->ro) cache->space_info->bytes_readonly += len; - } else if (cache->reserved_pinned > 0) { - len = min(len, cache->reserved_pinned); - cache->reserved_pinned -= len; - cache->space_info->bytes_reserved += len; - } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } @@ -4340,11 +4656,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *next_rsv; u64 start; u64 end; - int idx; int ret; if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@ -4367,30 +4680,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_for_each_entry_safe(block_rsv, next_rsv, - &fs_info->durable_block_rsv_list, list) { - - idx = trans->transid & 0x1; - if (block_rsv->freed[idx] > 0) { - block_rsv_add_bytes(block_rsv, - block_rsv->freed[idx], 0); - block_rsv->freed[idx] = 0; - } - if (atomic_read(&block_rsv->usage) == 0) { - btrfs_block_rsv_release(root, block_rsv, (u64)-1); - - if (block_rsv->freed[0] == 0 && - block_rsv->freed[1] == 0) { - list_del_init(&block_rsv->list); - kfree(block_rsv); - } - } else { - btrfs_block_rsv_release(root, block_rsv, 0); - } - } - mutex_unlock(&fs_info->durable_block_rsv_mutex); - return 0; } @@ -4668,7 +4957,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_rsv *block_rsv; struct btrfs_block_group_cache *cache = NULL; int ret; @@ -4683,64 +4971,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (!last_ref) return; - block_rsv = get_block_rsv(trans, root); cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (block_rsv->space_info != cache->space_info) - goto out; if (btrfs_header_generation(buf) == trans->transid) { if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) - goto pin; + goto out; } if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(root, cache, buf->start, buf->len, 1); - goto pin; + goto out; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); - if (ret == -EAGAIN) { - /* block group became read-only */ - btrfs_update_reserved_bytes(cache, buf->len, 0, 1); - goto out; - } - - ret = 1; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) { - block_rsv->reserved += buf->len; - ret = 0; - } - spin_unlock(&block_rsv->lock); - - if (ret) { - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_reserved -= buf->len; - cache->space_info->reservation_progress++; - spin_unlock(&cache->space_info->lock); - } - goto out; - } -pin: - if (block_rsv->durable && !cache->ro) { - ret = 0; - spin_lock(&cache->lock); - if (!cache->ro) { - cache->reserved_pinned += buf->len; - ret = 1; - } - spin_unlock(&cache->lock); - - if (ret) { - spin_lock(&block_rsv->lock); - block_rsv->freed[trans->transid & 0x1] += buf->len; - spin_unlock(&block_rsv->lock); - } + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); } out: /* @@ -4876,17 +5124,20 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; + struct btrfs_block_group_cache *used_block_group; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; int done_chunk_alloc = 0; struct btrfs_space_info *space_info; - int last_ptr_loop = 0; int loop = 0; int index = 0; + int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? + RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; + bool have_caching_bg = false; u64 ideal_cache_percent = 0; u64 ideal_cache_offset = 0; @@ -4939,6 +5190,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); + used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -4969,12 +5221,14 @@ ideal_cache: } } search: + have_caching_bg = false; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { u64 offset; int cached; + used_block_group = block_group; btrfs_get_block_group(block_group); search_start = block_group->key.objectid; @@ -4998,13 +5252,15 @@ search: } have_block_group: - if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { u64 free_percent; + found_uncached_bg = true; ret = cache_block_group(block_group, trans, orig_root, 1); if (block_group->cached == BTRFS_CACHE_FINISHED) - goto have_block_group; + goto alloc; free_percent = btrfs_block_group_used(&block_group->item); free_percent *= 100; @@ -5026,7 +5282,6 @@ have_block_group: orig_root, 0); BUG_ON(ret); } - found_uncached_bg = true; /* * If loop is set for cached only, try the next block @@ -5036,94 +5291,80 @@ have_block_group: goto loop; } - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) - found_uncached_bg = true; - +alloc: if (unlikely(block_group->ro)) goto loop; spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < - num_bytes + empty_size) { + num_bytes + empty_cluster + empty_size) { spin_unlock(&block_group->free_space_ctl->tree_lock); goto loop; } spin_unlock(&block_group->free_space_ctl->tree_lock); /* - * Ok we want to try and use the cluster allocator, so lets look - * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will - * have tried the cluster allocator plenty of times at this - * point and not have found anything, so we are likely way too - * fragmented for the clustering stuff to find anything, so lets - * just skip it and let the allocator find whatever block it can - * find + * Ok we want to try and use the cluster allocator, so + * lets look there */ - if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + if (last_ptr) { /* * the refill lock keeps out other * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); - if (last_ptr->block_group && - (last_ptr->block_group->ro || - !block_group_bits(last_ptr->block_group, data))) { - offset = 0; + used_block_group = last_ptr->block_group; + if (used_block_group != block_group && + (!used_block_group || + used_block_group->ro || + !block_group_bits(used_block_group, data))) { + used_block_group = block_group; goto refill_cluster; } - offset = btrfs_alloc_from_cluster(block_group, last_ptr, - num_bytes, search_start); + if (used_block_group != block_group) + btrfs_get_block_group(used_block_group); + + offset = btrfs_alloc_from_cluster(used_block_group, + last_ptr, num_bytes, used_block_group->key.objectid); if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); goto checks; } - spin_lock(&last_ptr->lock); - /* - * whoops, this cluster doesn't actually point to - * this block group. Get a ref on the block - * group is does point to and try again - */ - if (!last_ptr_loop && last_ptr->block_group && - last_ptr->block_group != block_group && - index <= - get_block_group_index(last_ptr->block_group)) { - - btrfs_put_block_group(block_group); - block_group = last_ptr->block_group; - btrfs_get_block_group(block_group); - spin_unlock(&last_ptr->lock); - spin_unlock(&last_ptr->refill_lock); - - last_ptr_loop = 1; - search_start = block_group->key.objectid; - /* - * we know this block group is properly - * in the list because - * btrfs_remove_block_group, drops the - * cluster before it removes the block - * group from the list - */ - goto have_block_group; + WARN_ON(last_ptr->block_group != used_block_group); + if (used_block_group != block_group) { + btrfs_put_block_group(used_block_group); + used_block_group = block_group; } - spin_unlock(&last_ptr->lock); refill_cluster: + BUG_ON(used_block_group != block_group); + /* If we are on LOOP_NO_EMPTY_SIZE, we can't + * set up a new clusters, so lets just skip it + * and let the allocator find whatever block + * it can find. If we reach this point, we + * will have tried the cluster allocator + * plenty of times and not have found + * anything, so we are likely way too + * fragmented for the clustering stuff to find + * anything. */ + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + /* * this cluster didn't work out, free it and * start over */ btrfs_return_cluster_to_free_space(NULL, last_ptr); - last_ptr_loop = 0; - /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, - offset, num_bytes, + search_start, num_bytes, empty_cluster + empty_size); if (ret == 0) { /* @@ -5159,6 +5400,7 @@ refill_cluster: goto loop; } +unclustered_alloc: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* @@ -5177,20 +5419,22 @@ refill_cluster: failed_alloc = true; goto have_block_group; } else if (!offset) { + if (!cached) + have_caching_bg = true; goto loop; } checks: search_start = stripe_align(root, offset); /* move on to the next group */ if (search_start + num_bytes >= search_end) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } /* move on to the next group */ if (search_start + num_bytes > - block_group->key.objectid + block_group->key.offset) { - btrfs_add_free_space(block_group, offset, num_bytes); + used_block_group->key.objectid + used_block_group->key.offset) { + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } @@ -5198,14 +5442,14 @@ checks: ins->offset = num_bytes; if (offset < search_start) - btrfs_add_free_space(block_group, offset, + btrfs_add_free_space(used_block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, - (data & BTRFS_BLOCK_GROUP_DATA)); + ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, + alloc_type); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } @@ -5214,19 +5458,26 @@ checks: ins->offset = num_bytes; if (offset < search_start) - btrfs_add_free_space(block_group, offset, + btrfs_add_free_space(used_block_group, offset, search_start - offset); BUG_ON(offset > search_start); + if (used_block_group != block_group) + btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); + if (used_block_group != block_group) + btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) + goto search; + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; @@ -5325,7 +5576,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info has %llu free, is %sfull\n", + printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", + (unsigned long long)info->flags, (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - info->bytes_readonly), @@ -5411,7 +5663,8 @@ again: return ret; } -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +static int __btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len, int pin) { struct btrfs_block_group_cache *cache; int ret = 0; @@ -5426,8 +5679,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); - btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, 0, 1); + if (pin) + pin_down_extent(root, cache, start, len, 1); + else { + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + } btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -5435,6 +5692,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) return ret; } +int btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 0); +} + +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 1); +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -5630,7 +5899,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, put_caching_control(caching_ctl); } - ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); + ret = btrfs_update_reserved_bytes(block_group, ins->offset, + RESERVE_ALLOC_NO_ACCOUNT); BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, @@ -5687,8 +5957,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. @@ -5708,13 +5977,15 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; if (ret) { - WARN_ON(1); - ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, - 0); + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + /*DEFAULT_RATELIMIT_BURST*/ 2); + if (__ratelimit(&_rs)) { + printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); + WARN_ON(1); + } + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { - spin_lock(&block_rsv->lock); - block_rsv->size += blocksize; - spin_unlock(&block_rsv->lock); return block_rsv; } else if (ret && block_rsv != global_rsv) { ret = block_rsv_use_bytes(global_rsv, blocksize); @@ -6592,12 +6863,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache->bytes_super - btrfs_block_group_used(&cache->item); if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + - sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { + sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - sinfo->bytes_reserved += cache->reserved_pinned; - cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } @@ -6964,7 +7232,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_space_info, list); if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0) { + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0) { WARN_ON(1); dump_space_info(space_info, 0, 0); } @@ -7006,14 +7275,12 @@ int btrfs_read_block_groups(struct btrfs_root *root) return -ENOMEM; path->reada = 1; - cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); - if (cache_gen != 0 && - btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); + if (btrfs_test_opt(root, SPACE_CACHE) && + btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; if (btrfs_test_opt(root, CLEAR_CACHE)) need_clear = 1; - if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); while (1) { ret = find_first_block_group(root, path, &key); @@ -7252,7 +7519,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; } - inode = lookup_free_space_inode(root, block_group, path); + inode = lookup_free_space_inode(tree_root, block_group, path); if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); BUG_ON(ret); @@ -7268,7 +7535,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); } /* One for our lookup ref */ - iput(inode); + btrfs_add_delayed_iput(inode); } key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -7339,7 +7606,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) int mixed = 0; int ret; - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) return 1; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d418164..49f3c9d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,6 +17,7 @@ #include "compat.h" #include "ctree.h" #include "btrfs_inode.h" +#include "volumes.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -894,6 +895,202 @@ search_again: goto again; } +/** + * convert_extent - convert all bits in a given range from one bit to another + * @tree: the io tree to search + * @start: the start offset in bytes + * @end: the end offset in bytes (inclusive) + * @bits: the bits to set in this range + * @clear_bits: the bits to clear in this range + * @mask: the allocation mask + * + * This will go through and set bits for the given range. If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits. This is only meant to be used by things that are mergeable, ie + * converting from say DELALLOC to DIRTY. This is not meant to be used with + * boundary bits like LOCK. + */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int clear_bits, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + u64 last_start; + u64 last_end; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = insert_state(tree, prealloc, start, end, &bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + struct rb_node *next_node; + + set_state_bits(tree, state, &bits); + clear_state_bit(tree, state, &clear_bits, 0); + + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + + start = last_end + 1; + next_node = rb_next(&state->rb_node); + if (next_node && start < end && prealloc && !need_resched()) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, &bits); + clear_state_bit(tree, state, &clear_bits, 0); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + /* + * Avoid to free 'prealloc' if it can be merged with + * the later extent. + */ + err = insert_state(tree, prealloc, start, this_end, + &bits); + BUG_ON(err == -EEXIST); + if (err) { + free_extent_state(prealloc); + prealloc = NULL; + goto out; + } + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + set_state_bits(tree, prealloc, &bits); + clear_state_bit(tree, prealloc, &clear_bits, 0); + + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + /* wrappers around set/clear extent bit */ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) @@ -919,7 +1116,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, NULL, cached_state, mask); } @@ -1599,6 +1796,368 @@ static int check_page_writeback(struct extent_io_tree *tree, return 0; } +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int this_mirror; + int failed_mirror; + int in_validation; +}; + +static int free_io_failure(struct inode *inode, struct io_failure_record *rec, + int did_repair) +{ + int ret; + int err = 0; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + + set_state_private(failure_tree, rec->start, 0); + ret = clear_extent_bits(failure_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret) + err = ret; + + if (did_repair) { + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_DAMAGED, GFP_NOFS); + if (ret && !err) + err = ret; + } + + kfree(rec); + return err; +} + +static void repair_io_failure_callback(struct bio *bio, int err) +{ + complete(bio->bi_private); +} + +/* + * this bypasses the standard btrfs submit functions deliberately, as + * the standard behavior is to write all copies in a raid setup. here we only + * want to write the one bad copy. so we do the mapping for ourselves and issue + * submit_bio directly. + * to avoid any synchonization issues, wait for the data after writing, which + * actually prevents the read that triggered the error from finishing. + * currently, there can be no more than two copies of every data bit. thus, + * exactly one rewrite is required. + */ +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num) +{ + struct bio *bio; + struct btrfs_device *dev; + DECLARE_COMPLETION_ONSTACK(compl); + u64 map_length = 0; + u64 sector; + struct btrfs_bio *bbio = NULL; + int ret; + + BUG_ON(!mirror_num); + + bio = bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; + bio->bi_private = &compl; + bio->bi_end_io = repair_io_failure_callback; + bio->bi_size = 0; + map_length = length; + + ret = btrfs_map_block(map_tree, WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); + sector = bbio->stripes[mirror_num-1].physical >> 9; + bio->bi_sector = sector; + dev = bbio->stripes[mirror_num-1].dev; + kfree(bbio); + if (!dev || !dev->bdev || !dev->writeable) { + bio_put(bio); + return -EIO; + } + bio->bi_bdev = dev->bdev; + bio_add_page(bio, page, length, start-page_offset(page)); + submit_bio(WRITE_SYNC, bio); + wait_for_completion(&compl); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + /* try to remap that extent elsewhere? */ + bio_put(bio); + return -EIO; + } + + printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " + "sector %llu)\n", page->mapping->host->i_ino, start, + dev->name, sector); + + bio_put(bio); + return 0; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int clean_io_failure(u64 start, struct page *page) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failrec; + struct btrfs_mapping_tree *map_tree; + struct extent_state *state; + int num_copies; + int did_repair = 0; + int ret; + struct inode *inode = page->mapping->host; + + private = 0; + ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY, 0); + if (!ret) + return 0; + + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, + &private_failure); + if (ret) + return 0; + + failrec = (struct io_failure_record *)(unsigned long) private_failure; + BUG_ON(!failrec->this_mirror); + + if (failrec->in_validation) { + /* there was no real error, just free the record */ + pr_debug("clean_io_failure: freeing dummy error at %llu\n", + failrec->start); + did_repair = 1; + goto out; + } + + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + + if (state && state->start == failrec->start) { + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; + num_copies = btrfs_num_copies(map_tree, failrec->logical, + failrec->len); + if (num_copies > 1) { + ret = repair_io_failure(map_tree, start, failrec->len, + failrec->logical, page, + failrec->failed_mirror); + did_repair = !ret; + } + } + +out: + if (!ret) + ret = free_io_failure(inode, failrec, did_repair); + + return ret; +} + +/* + * this is a generic handler for readpage errors (default + * readpage_io_failed_hook). if other copies exist, read those and write back + * good data to the failed position. does not investigate in remapping the + * failed extent elsewhere, hoping the device will be smart enough to do this as + * needed + */ + +static int bio_readpage_error(struct bio *failed_bio, struct page *page, + u64 start, u64 end, int failed_mirror, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int read_mode; + u64 logical; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kzalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->this_mirror = 0; + failrec->bio_flags = 0; + failrec->in_validation = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (!em) { + read_unlock(&em_tree->lock); + kfree(failrec); + return -EIO; + } + + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); + } + pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " + "len=%llu\n", logical, start, failrec->len); + failrec->logical = logical; + free_extent_map(em); + + /* set the bits in the private failure tree */ + ret = set_extent_bits(failure_tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret >= 0) + ret = set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + /* set the bits in the inode's tree */ + if (ret >= 0) + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, + GFP_NOFS); + if (ret < 0) { + kfree(failrec); + return ret; + } + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + pr_debug("bio_readpage_error: (found) logical=%llu, " + "start=%llu, len=%llu, validation=%d\n", + failrec->logical, failrec->start, failrec->len, + failrec->in_validation); + /* + * when data can be on disk more than twice, add to failrec here + * (e.g. with a list for failed_mirror) to make + * clean_io_failure() clean all those errors at once. + */ + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. + */ + pr_debug("bio_readpage_error: cannot repair, num_copies == 1. " + "state=%p, num_copies=%d, next_mirror %d, " + "failed_mirror %d\n", state, num_copies, + failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + if (!state) { + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&tree->lock); + } + + /* + * there are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + */ + if (failed_bio->bi_vcnt > 1) { + /* + * to fulfill b), we need to know the exact failing sectors, as + * we don't want to rewrite any more than the failed ones. thus, + * we need separate read requests for the failed bio + * + * if the following BUG_ON triggers, our validation request got + * merged. we need separate requests for our algorithm to work. + */ + BUG_ON(failrec->in_validation); + failrec->in_validation = 1; + failrec->this_mirror = failed_mirror; + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + } else { + /* + * we're ready to fulfill a) and b) alongside. get a good copy + * of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + if (failrec->in_validation) { + BUG_ON(failrec->this_mirror != failed_mirror); + failrec->in_validation = 0; + failrec->this_mirror = 0; + } + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + read_mode = READ_SYNC; + } + + if (!state || failrec->this_mirror > num_copies) { + pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " + "next_mirror %d, failed_mirror %d\n", state, + num_copies, failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + + pr_debug("bio_readpage_error: submitting new read[%#x] to " + "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, + failrec->this_mirror, num_copies, failrec->in_validation); + + tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, + failrec->bio_flags, 0); + return 0; +} + /* lots and lots of room for performance fixes in the end_bio funcs */ /* @@ -1697,6 +2256,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_state *cached = NULL; struct extent_state *state; + pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " + "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, + (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; start = ((u64)page->index << PAGE_CACHE_SHIFT) + @@ -1727,12 +2289,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err) state); if (ret) uptodate = 0; + else + clean_io_failure(start, page); } - if (!uptodate && tree->ops && - tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook(bio, page, - start, end, NULL); + if (!uptodate) { + int failed_mirror; + failed_mirror = (int)(unsigned long)bio->bi_bdev; + /* + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, not + * in the !uptodate case). In that case it returns 0 and + * we just go on with the next page in our bio. If it + * can't handle the error it will return -EIO and we + * remain responsible for that page. + */ + ret = bio_readpage_error(bio, page, start, end, + failed_mirror, NULL); if (ret == 0) { +error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -1740,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) uncache_state(&cached); continue; } + if (tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook( + bio, page, start, end, + failed_mirror, state); + if (ret == 0) + goto error_handled; + } } if (uptodate) { @@ -1811,6 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, mirror_num, bio_flags, start); else submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; bio_put(bio); @@ -2076,16 +2660,16 @@ out: } int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent) + get_extent_t *get_extent, int mirror_num) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags); if (bio) - ret = submit_one_bio(READ, bio, 0, bio_flags); + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); return ret; } @@ -2136,6 +2720,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, int compressed; int write_flags; unsigned long nr_written = 0; + bool fill_delalloc = true; if (wbc->sync_mode == WB_SYNC_ALL) write_flags = WRITE_SYNC; @@ -2145,6 +2730,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, trace___extent_writepage(page, inode, wbc); WARN_ON(!PageLocked(page)); + + ClearPageError(page); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || (page->index == end_index && !pg_offset)) { @@ -2166,10 +2754,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); + if (!tree->ops || !tree->ops->fill_delalloc) + fill_delalloc = false; + delalloc_start = start; delalloc_end = 0; page_started = 0; - if (!epd->extent_locked) { + if (!epd->extent_locked && fill_delalloc) { u64 delalloc_to_write = 0; /* * make sure the wbc mapping index is at least updated @@ -2421,10 +3012,16 @@ retry: * swizzled back from swapper_space to tmpfs file * mapping */ - if (tree->ops && tree->ops->write_cache_pages_lock_hook) - tree->ops->write_cache_pages_lock_hook(page); - else - lock_page(page); + if (tree->ops && + tree->ops->write_cache_pages_lock_hook) { + tree->ops->write_cache_pages_lock_hook(page, + data, flush_fn); + } else { + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } + } if (unlikely(page->mapping != mapping)) { unlock_page(page); @@ -2790,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); + len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + /* * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size @@ -2837,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent_skip_holes(inode, off, last_for_get_extent, + em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); if (!em) goto out; @@ -2926,7 +3526,7 @@ out: return ret; } -static inline struct page *extent_buffer_page(struct extent_buffer *eb, +inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { struct page *p; @@ -2951,7 +3551,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb, return p; } -static inline unsigned long num_extent_pages(u64 start, u64 len) +inline unsigned long num_extent_pages(u64 start, u64 len) { return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT); @@ -3204,6 +3804,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, PAGECACHE_TAG_DIRTY); } spin_unlock_irq(&page->mapping->tree_lock); + ClearPageError(page); unlock_page(page); } return 0; @@ -3349,8 +3950,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, } int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, - u64 start, int wait, + struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num) { unsigned long i; @@ -3386,7 +3986,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!wait) { + if (wait == WAIT_NONE) { if (!trylock_page(page)) goto unlock_exit; } else { @@ -3430,7 +4030,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (bio) submit_one_bio(READ, bio, mirror_num, bio_flags); - if (ret || !wait) + if (ret || wait != WAIT_COMPLETE) return ret; for (i = start_i; i < num_pages; i++) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7b2f0c3..7604c30 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -17,6 +17,8 @@ #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) #define EXTENT_FIRST_DELALLOC (1 << 12) +#define EXTENT_NEED_WAIT (1 << 13) +#define EXTENT_DAMAGED (1 << 14) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -32,6 +34,7 @@ #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 +#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -67,7 +70,7 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, + u64 start, u64 end, int failed_mirror, struct extent_state *state); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, @@ -85,7 +88,8 @@ struct extent_io_ops { struct extent_state *other); void (*split_extent_hook)(struct inode *inode, struct extent_state *orig, u64 split); - int (*write_cache_pages_lock_hook)(struct page *page); + int (*write_cache_pages_lock_hook)(struct page *page, void *data, + void (*flush_fn)(void *)); }; struct extent_io_tree { @@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent); + get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); @@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int clear_bits, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, @@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); +#define WAIT_NONE 0 +#define WAIT_COMPLETE 1 +#define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); +unsigned long num_extent_pages(u64 start, u64 len); +struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); static inline void extent_buffer_get(struct extent_buffer *eb) { @@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); + +struct btrfs_mapping_tree; + +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a1cb782..c7fb3a4 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 item_last_offset = 0; u64 disk_bytenr; u32 diff; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int ret; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; @@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int ret; size_t size; u64 csum_end; - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) @@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, u64 bytenr, u64 len) { struct extent_buffer *leaf; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; @@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 csum_end; struct extent_buffer *leaf; int ret; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int blocksize_bits = root->fs_info->sb->s_blocksize_bits; root = root->fs_info->csum_root; @@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_sector_sum *sector_sum; u32 nritems; u32 ins_size; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4e57d5..97fbe93 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = fdentry(file)->d_inode; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; int faili = 0; u64 start_pos; @@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, - GFP_NOFS); + mask); if (!pages[i]) { faili = i - 1; err = -ENOMEM; @@ -1166,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / (sizeof(struct page *))); + nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); + nrptrs = max(nrptrs, 8); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM; @@ -1386,7 +1389,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } - file_update_time(file); + err = btrfs_update_time(file); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } BTRFS_I(inode)->sequence++; start_pos = round_down(pos, root->sectorsize); @@ -1615,10 +1622,6 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; } - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -1664,11 +1667,27 @@ static long btrfs_fallocate(struct file *file, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + + /* + * Make sure we have enough space before we do the + * allocation. + */ + ret = btrfs_check_data_free_space(inode, last_byte - + cur_offset); + if (ret) { + free_extent_map(em); + break; + } + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, last_byte - cur_offset, 1 << inode->i_blkbits, offset + len, &alloc_hint); + + /* Let go of our reservation. */ + btrfs_free_reserved_data_space(inode, last_byte - + cur_offset); if (ret < 0) { free_extent_map(em); break; @@ -1694,8 +1713,6 @@ static long btrfs_fallocate(struct file *file, int mode, } unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; @@ -1821,7 +1838,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) switch (origin) { case SEEK_END: case SEEK_CUR: - offset = generic_file_llseek_unlocked(file, offset, origin); + offset = generic_file_llseek(file, offset, origin); goto out; case SEEK_DATA: case SEEK_HOLE: diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 41ac927..9a897bf 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -20,6 +20,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/math64.h> +#include <linux/ratelimit.h> #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, *block_group, struct btrfs_path *path) { struct inode *inode = NULL; + u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; spin_lock(&block_group->lock); if (block_group->inode) @@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(&block_group->lock); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { + if (!((BTRFS_I(inode)->flags & flags) == flags)) { printk(KERN_INFO "Old style space inode found, converting.\n"); - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | + BTRFS_INODE_NODATACOW; block_group->disk_cache_state = BTRFS_DC_CLEAR; } - if (!btrfs_fs_closing(root->fs_info)) { + if (!block_group->iref) { block_group->inode = igrab(inode); block_group->iref = 1; } @@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header *header; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; ret = btrfs_insert_empty_inode(trans, root, path, ino); if (ret) return ret; + /* We inline crc's for the free disk space cache */ + if (ino != BTRFS_FREE_INO_OBJECTID) + flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; + leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_uid(leaf, inode_item, 0); btrfs_set_inode_gid(leaf, inode_item, 0); btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC); + btrfs_set_inode_flags(leaf, inode_item, flags); btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); @@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct inode *inode) { struct btrfs_block_rsv *rsv; + u64 needed_bytes; loff_t oldsize; int ret = 0; rsv = trans->block_rsv; - trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, - 0, 5); - if (ret) - return ret; + trans->block_rsv = &root->fs_info->global_block_rsv; + + /* 1 for slack space, 1 for updating the inode */ + needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + + btrfs_calc_trans_metadata_size(root, 1); + + spin_lock(&trans->block_rsv->lock); + if (trans->block_rsv->reserved < needed_bytes) { + spin_unlock(&trans->block_rsv->lock); + trans->block_rsv = rsv; + return -ENOSPC; + } + spin_unlock(&trans->block_rsv->lock); oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); @@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - trans->block_rsv = rsv; if (ret) { + trans->block_rsv = rsv; WARN_ON(1); return ret; } ret = btrfs_update_inode(trans, root, inode); + trans->block_rsv = rsv; + return ret; } @@ -242,26 +259,348 @@ static int readahead_cache(struct inode *inode) return 0; } +struct io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_root *root; + unsigned long size; + int index; + int num_pages; + unsigned check_crcs:1; +}; + +static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, + struct btrfs_root *root) +{ + memset(io_ctl, 0, sizeof(struct io_ctl)); + io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages, + GFP_NOFS); + if (!io_ctl->pages) + return -ENOMEM; + io_ctl->root = root; + if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) + io_ctl->check_crcs = 1; + return 0; +} + +static void io_ctl_free(struct io_ctl *io_ctl) +{ + kfree(io_ctl->pages); +} + +static void io_ctl_unmap_page(struct io_ctl *io_ctl) +{ + if (io_ctl->cur) { + kunmap(io_ctl->page); + io_ctl->cur = NULL; + io_ctl->orig = NULL; + } +} + +static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) +{ + WARN_ON(io_ctl->cur); + BUG_ON(io_ctl->index >= io_ctl->num_pages); + io_ctl->page = io_ctl->pages[io_ctl->index++]; + io_ctl->cur = kmap(io_ctl->page); + io_ctl->orig = io_ctl->cur; + io_ctl->size = PAGE_CACHE_SIZE; + if (clear) + memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); +} + +static void io_ctl_drop_pages(struct io_ctl *io_ctl) +{ + int i; + + io_ctl_unmap_page(io_ctl); + + for (i = 0; i < io_ctl->num_pages; i++) { + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); + } +} + +static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, + int uptodate) +{ + struct page *page; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + int i; + + for (i = 0; i < io_ctl->num_pages; i++) { + page = find_or_create_page(inode->i_mapping, i, mask); + if (!page) { + io_ctl_drop_pages(io_ctl); + return -ENOMEM; + } + io_ctl->pages[i] = page; + if (uptodate && !PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + printk(KERN_ERR "btrfs: error reading free " + "space cache\n"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } + } + } + + for (i = 0; i < io_ctl->num_pages; i++) { + clear_page_dirty_for_io(io_ctl->pages[i]); + set_page_extent_mapped(io_ctl->pages[i]); + } + + return 0; +} + +static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) +{ + u64 *val; + + io_ctl_map_page(io_ctl, 1); + + /* + * Skip the csum areas. If we don't check crcs then we just have a + * 64bit chunk at the front of the first page. + */ + if (io_ctl->check_crcs) { + io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } + + val = io_ctl->cur; + *val = cpu_to_le64(generation); + io_ctl->cur += sizeof(u64); +} + +static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) +{ + u64 *gen; + + /* + * Skip the crc area. If we don't check crcs then we just have a 64bit + * chunk at the front of the first page. + */ + if (io_ctl->check_crcs) { + io_ctl->cur += sizeof(u32) * io_ctl->num_pages; + io_ctl->size -= sizeof(u64) + + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } + + gen = io_ctl->cur; + if (le64_to_cpu(*gen) != generation) { + printk_ratelimited(KERN_ERR "btrfs: space cache generation " + "(%Lu) does not match inode (%Lu)\n", *gen, + generation); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + io_ctl->cur += sizeof(u64); + return 0; +} + +static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_unmap_page(io_ctl); + return; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + io_ctl_unmap_page(io_ctl); + tmp = kmap(io_ctl->pages[0]); + tmp += index; + *tmp = crc; + kunmap(io_ctl->pages[0]); +} + +static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp, val; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_map_page(io_ctl, 0); + return 0; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + tmp = kmap(io_ctl->pages[0]); + tmp += index; + val = *tmp; + kunmap(io_ctl->pages[0]); + + io_ctl_map_page(io_ctl, 0); + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + if (val != crc) { + printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " + "space cache\n"); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + + return 0; +} + +static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, + void *bitmap) +{ + struct btrfs_free_space_entry *entry; + + if (!io_ctl->cur) + return -ENOSPC; + + entry = io_ctl->cur; + entry->offset = cpu_to_le64(offset); + entry->bytes = cpu_to_le64(bytes); + entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : + BTRFS_FREE_SPACE_EXTENT; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + + /* No more pages to map */ + if (io_ctl->index >= io_ctl->num_pages) + return 0; + + /* map the next page */ + io_ctl_map_page(io_ctl, 1); + return 0; +} + +static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) +{ + if (!io_ctl->cur) + return -ENOSPC; + + /* + * If we aren't at the start of the current page, unmap this one and + * map the next one if there is any left. + */ + if (io_ctl->cur != io_ctl->orig) { + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + if (io_ctl->index >= io_ctl->num_pages) + return -ENOSPC; + io_ctl_map_page(io_ctl, 0); + } + + memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + if (io_ctl->index < io_ctl->num_pages) + io_ctl_map_page(io_ctl, 0); + return 0; +} + +static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) +{ + /* + * If we're not on the boundary we know we've modified the page and we + * need to crc the page. + */ + if (io_ctl->cur != io_ctl->orig) + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + else + io_ctl_unmap_page(io_ctl); + + while (io_ctl->index < io_ctl->num_pages) { + io_ctl_map_page(io_ctl, 1); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + } +} + +static int io_ctl_read_entry(struct io_ctl *io_ctl, + struct btrfs_free_space *entry, u8 *type) +{ + struct btrfs_free_space_entry *e; + int ret; + + if (!io_ctl->cur) { + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + } + + e = io_ctl->cur; + entry->offset = le64_to_cpu(e->offset); + entry->bytes = le64_to_cpu(e->bytes); + *type = e->type; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + io_ctl_unmap_page(io_ctl); + + return 0; +} + +static int io_ctl_read_bitmap(struct io_ctl *io_ctl, + struct btrfs_free_space *entry) +{ + int ret; + + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + + memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); + io_ctl_unmap_page(io_ctl); + + return 0; +} + int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) { struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct page *page; + struct io_ctl io_ctl; struct btrfs_key key; + struct btrfs_free_space *e, *n; struct list_head bitmaps; u64 num_entries; u64 num_bitmaps; u64 generation; - pgoff_t index = 0; + u8 type; int ret = 0; INIT_LIST_HEAD(&bitmaps); /* Nothing in the space cache, goodbye */ if (!i_size_read(inode)) - goto out; + return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; @@ -269,11 +608,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return 0; else if (ret > 0) { btrfs_release_path(path); - ret = 0; - goto out; + return 0; } ret = -1; @@ -291,169 +629,102 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, " not match free space cache generation (%llu)\n", (unsigned long long)BTRFS_I(inode)->generation, (unsigned long long)generation); - goto out; + return 0; } if (!num_entries) - goto out; + return 0; + io_ctl_init(&io_ctl, inode, root); ret = readahead_cache(inode); if (ret) goto out; - while (1) { - struct btrfs_free_space_entry *entry; - struct btrfs_free_space *e; - void *addr; - unsigned long offset = 0; - int need_loop = 0; + ret = io_ctl_prepare_pages(&io_ctl, inode, 1); + if (ret) + goto out; - if (!num_entries && !num_bitmaps) - break; + ret = io_ctl_check_crc(&io_ctl, 0); + if (ret) + goto free_cache; + + ret = io_ctl_check_generation(&io_ctl, generation); + if (ret) + goto free_cache; - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if (!page) + while (num_entries) { + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!e) goto free_cache; - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - printk(KERN_ERR "btrfs: error reading free " - "space cache\n"); - goto free_cache; - } + ret = io_ctl_read_entry(&io_ctl, e, &type); + if (ret) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; } - addr = kmap(page); - if (index == 0) { - u64 *gen; + if (!e->bytes) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } - /* - * We put a bogus crc in the front of the first page in - * case old kernels try to mount a fs with the new - * format to make sure they discard the cache. - */ - addr += sizeof(u64); - offset += sizeof(u64); - - gen = addr; - if (*gen != BTRFS_I(inode)->generation) { - printk(KERN_ERR "btrfs: space cache generation" - " (%llu) does not match inode (%llu)\n", - (unsigned long long)*gen, - (unsigned long long) - BTRFS_I(inode)->generation); - kunmap(page); - unlock_page(page); - page_cache_release(page); + if (type == BTRFS_FREE_SPACE_EXTENT) { + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } - addr += sizeof(u64); - offset += sizeof(u64); - } - entry = addr; - - while (1) { - if (!num_entries) - break; - - need_loop = 1; - e = kmem_cache_zalloc(btrfs_free_space_cachep, - GFP_NOFS); - if (!e) { - kunmap(page); - unlock_page(page); - page_cache_release(page); + } else { + BUG_ON(!num_bitmaps); + num_bitmaps--; + e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!e->bitmap) { + kmem_cache_free( + btrfs_free_space_cachep, e); goto free_cache; } - - e->offset = le64_to_cpu(entry->offset); - e->bytes = le64_to_cpu(entry->bytes); - if (!e->bytes) { - kunmap(page); + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + ctl->total_bitmaps++; + ctl->op->recalc_thresholds(ctl); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); kmem_cache_free(btrfs_free_space_cachep, e); - unlock_page(page); - page_cache_release(page); goto free_cache; } - - if (entry->type == BTRFS_FREE_SPACE_EXTENT) { - spin_lock(&ctl->tree_lock); - ret = link_free_space(ctl, e); - spin_unlock(&ctl->tree_lock); - if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - } else { - e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!e->bitmap) { - kunmap(page); - kmem_cache_free( - btrfs_free_space_cachep, e); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - spin_lock(&ctl->tree_lock); - ret = link_free_space(ctl, e); - ctl->total_bitmaps++; - ctl->op->recalc_thresholds(ctl); - spin_unlock(&ctl->tree_lock); - if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - list_add_tail(&e->list, &bitmaps); - } - - num_entries--; - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - break; - entry++; + list_add_tail(&e->list, &bitmaps); } - /* - * We read an entry out of this page, we need to move on to the - * next page. - */ - if (need_loop) { - kunmap(page); - goto next; - } + num_entries--; + } - /* - * We add the bitmaps at the end of the entries in order that - * the bitmap entries are added to the cache. - */ - e = list_entry(bitmaps.next, struct btrfs_free_space, list); + io_ctl_unmap_page(&io_ctl); + + /* + * We add the bitmaps at the end of the entries in order that + * the bitmap entries are added to the cache. + */ + list_for_each_entry_safe(e, n, &bitmaps, list) { list_del_init(&e->list); - memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); - kunmap(page); - num_bitmaps--; -next: - unlock_page(page); - page_cache_release(page); - index++; + ret = io_ctl_read_bitmap(&io_ctl, e); + if (ret) + goto free_cache; } + io_ctl_drop_pages(&io_ctl); ret = 1; out: + io_ctl_free(&io_ctl); return ret; free_cache: + io_ctl_drop_pages(&io_ctl); __btrfs_remove_free_space_cache(ctl); goto out; } @@ -465,7 +736,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root = fs_info->tree_root; struct inode *inode; struct btrfs_path *path; - int ret; + int ret = 0; bool matched; u64 used = btrfs_block_group_used(&block_group->item); @@ -497,6 +768,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, return 0; } + /* We may have converted the inode and made the cache invalid. */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + goto out; + } + spin_unlock(&block_group->lock); + ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, path, block_group->key.objectid); btrfs_free_path(path); @@ -530,6 +809,19 @@ out: return ret; } +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. + */ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, @@ -540,42 +832,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct extent_buffer *leaf; struct rb_node *node; struct list_head *pos, *n; - struct page **pages; - struct page *page; struct extent_state *cached_state = NULL; struct btrfs_free_cluster *cluster = NULL; struct extent_io_tree *unpin = NULL; + struct io_ctl io_ctl; struct list_head bitmap_list; struct btrfs_key key; u64 start, end, len; - u64 bytes = 0; - u32 crc = ~(u32)0; - int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; - int ret = -1; - bool next_page = false; - bool out_of_space = false; + int ret; + int err = -1; INIT_LIST_HEAD(&bitmap_list); - node = rb_first(&ctl->free_space_offset); - if (!node) - return 0; - if (!i_size_read(inode)) return -1; - num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - - filemap_write_and_wait(inode->i_mapping); - btrfs_wait_ordered_range(inode, inode->i_size & - ~(root->sectorsize - 1), (u64)-1); - - pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); - if (!pages) - return -1; + io_ctl_init(&io_ctl, inode, root); /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) @@ -589,30 +863,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, */ unpin = root->fs_info->pinned_extents; - /* - * Lock all pages first so we can lock the extent safely. - * - * NOTE: Because we hold the ref the entire time we're going to write to - * the page find_get_page should never fail, so we don't do a check - * after find_get_page at this point. Just putting this here so people - * know and don't freak out. - */ - while (index < num_pages) { - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if (!page) { - int i; + /* Lock all pages first so we can lock the extent safely. */ + io_ctl_prepare_pages(&io_ctl, inode, 0); - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - goto out; - } - pages[index] = page; - index++; - } - - index = 0; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); @@ -623,189 +876,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (block_group) start = block_group->key.objectid; - /* Write out the extent entries */ - do { - struct btrfs_free_space_entry *entry; - void *addr, *orig; - unsigned long offset = 0; + node = rb_first(&ctl->free_space_offset); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; + } - next_page = false; + /* Make sure we can fit our crcs into the first page */ + if (io_ctl.check_crcs && + (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { + WARN_ON(1); + goto out_nospc; + } - if (index >= num_pages) { - out_of_space = true; - break; - } + io_ctl_set_generation(&io_ctl, trans->transid); - page = pages[index]; + /* Write out the extent entries */ + while (node) { + struct btrfs_free_space *e; - orig = addr = kmap(page); - if (index == 0) { - u64 *gen; + e = rb_entry(node, struct btrfs_free_space, offset_index); + entries++; - /* - * We're going to put in a bogus crc for this page to - * make sure that old kernels who aren't aware of this - * format will be sure to discard the cache. - */ - addr += sizeof(u64); - offset += sizeof(u64); + ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, + e->bitmap); + if (ret) + goto out_nospc; - gen = addr; - *gen = trans->transid; - addr += sizeof(u64); - offset += sizeof(u64); + if (e->bitmap) { + list_add_tail(&e->list, &bitmap_list); + bitmaps++; } - entry = addr; - - memset(addr, 0, PAGE_CACHE_SIZE - offset); - while (node && !next_page) { - struct btrfs_free_space *e; - - e = rb_entry(node, struct btrfs_free_space, offset_index); - entries++; - - entry->offset = cpu_to_le64(e->offset); - entry->bytes = cpu_to_le64(e->bytes); - if (e->bitmap) { - entry->type = BTRFS_FREE_SPACE_BITMAP; - list_add_tail(&e->list, &bitmap_list); - bitmaps++; - } else { - entry->type = BTRFS_FREE_SPACE_EXTENT; - } - node = rb_next(node); - if (!node && cluster) { - node = rb_first(&cluster->root); - cluster = NULL; - } - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - next_page = true; - entry++; + node = rb_next(node); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; } + } - /* - * We want to add any pinned extents to our free space cache - * so we don't leak the space - */ - while (block_group && !next_page && - (start < block_group->key.objectid + - block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, &start, &end, - EXTENT_DIRTY); - if (ret) { - ret = 0; - break; - } - - /* This pinned extent is out of our range */ - if (start >= block_group->key.objectid + - block_group->key.offset) - break; - - len = block_group->key.objectid + - block_group->key.offset - start; - len = min(len, end + 1 - start); - - entries++; - entry->offset = cpu_to_le64(start); - entry->bytes = cpu_to_le64(len); - entry->type = BTRFS_FREE_SPACE_EXTENT; - - start = end + 1; - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - next_page = true; - entry++; + /* + * We want to add any pinned extents to our free space cache + * so we don't leak the space + */ + while (block_group && (start < block_group->key.objectid + + block_group->key.offset)) { + ret = find_first_extent_bit(unpin, start, &start, &end, + EXTENT_DIRTY); + if (ret) { + ret = 0; + break; } - /* Generate bogus crc value */ - if (index == 0) { - u32 *tmp; - crc = btrfs_csum_data(root, orig + sizeof(u64), crc, - PAGE_CACHE_SIZE - sizeof(u64)); - btrfs_csum_final(crc, (char *)&crc); - crc++; - tmp = orig; - *tmp = crc; - } + /* This pinned extent is out of our range */ + if (start >= block_group->key.objectid + + block_group->key.offset) + break; - kunmap(page); + len = block_group->key.objectid + + block_group->key.offset - start; + len = min(len, end + 1 - start); - bytes += PAGE_CACHE_SIZE; + entries++; + ret = io_ctl_add_entry(&io_ctl, start, len, NULL); + if (ret) + goto out_nospc; - index++; - } while (node || next_page); + start = end + 1; + } /* Write out the bitmaps */ list_for_each_safe(pos, n, &bitmap_list) { - void *addr; struct btrfs_free_space *entry = list_entry(pos, struct btrfs_free_space, list); - if (index >= num_pages) { - out_of_space = true; - break; - } - page = pages[index]; - - addr = kmap(page); - memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); - kunmap(page); - bytes += PAGE_CACHE_SIZE; - + ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); + if (ret) + goto out_nospc; list_del_init(&entry->list); - index++; - } - - if (out_of_space) { - btrfs_drop_pages(pages, num_pages); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, - GFP_NOFS); - ret = 0; - goto out; } /* Zero out the rest of the pages just to make sure */ - while (index < num_pages) { - void *addr; + io_ctl_zero_remaining_pages(&io_ctl); - page = pages[index]; - addr = kmap(page); - memset(addr, 0, PAGE_CACHE_SIZE); - kunmap(page); - bytes += PAGE_CACHE_SIZE; - index++; - } - - ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, - bytes, &cached_state); - btrfs_drop_pages(pages, num_pages); + ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + 0, i_size_read(inode), &cached_state); + io_ctl_drop_pages(&io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - if (ret) { - ret = 0; + if (ret) goto out; - } - BTRFS_I(inode)->generation = trans->transid; - filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + goto out; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; key.type = 0; - ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - ret = -1; - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); goto out; } leaf = path->nodes[0]; @@ -816,15 +991,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - ret = -1; - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, - GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, GFP_NOFS); btrfs_release_path(path); goto out; } } + + BTRFS_I(inode)->generation = trans->transid; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); btrfs_set_free_space_entries(leaf, header, entries); @@ -833,16 +1009,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - ret = 1; - + err = 0; out: - kfree(pages); - if (ret != 1) { - invalidate_inode_pages2_range(inode->i_mapping, 0, index); + io_ctl_free(&io_ctl); + if (err) { + invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); - return ret; + return err; + +out_nospc: + list_for_each_safe(pos, n, &bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + list_del_init(&entry->list); + } + io_ctl_drop_pages(&io_ctl); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + goto out; } int btrfs_write_out_cache(struct btrfs_root *root, @@ -869,14 +1055,15 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); - if (ret < 0) { + if (ret) { spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); ret = 0; - +#ifdef DEBUG printk(KERN_ERR "btrfs: failed to write free space cace " "for block group %llu\n", block_group->key.objectid); +#endif } iput(inode); @@ -1283,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, { info->offset = offset_to_bitmap(ctl, offset); info->bytes = 0; + INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; @@ -1662,7 +1850,13 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - WARN_ON(1); + /* the tree logging code might be calling us before we + * have fully loaded the free space rbtree for this + * block group. So it is possible the entry won't + * be in the rbtree yet at all. The caching code + * will make sure not to put it in the rbtree if + * the logging code has pinned it. + */ goto out_lock; } } @@ -1701,6 +1895,7 @@ again: ctl->total_bitmaps--; } kmem_cache_free(btrfs_free_space_cachep, info); + ret = 0; goto out_lock; } @@ -1708,7 +1903,8 @@ again: unlink_free_space(ctl, info); info->offset += bytes; info->bytes -= bytes; - link_free_space(ctl, info); + ret = link_free_space(ctl, info); + WARN_ON(ret); goto out_lock; } @@ -2124,6 +2320,7 @@ again: if (!found) { start = i; + cluster->max_size = 0; found = true; } @@ -2267,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; - struct rb_node *node; int ret = -ENOSPC; + u64 bitmap_offset = offset_to_bitmap(ctl, offset); if (ctl->total_bitmaps == 0) return -ENOSPC; /* - * First check our cached list of bitmaps and see if there is an entry - * here that will work. + * The bitmap that covers offset won't be in the list unless offset + * is just its start offset. */ + entry = list_first_entry(bitmaps, struct btrfs_free_space, list); + if (entry->offset != bitmap_offset) { + entry = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (entry && list_empty(&entry->list)) + list_add(&entry->list, bitmaps); + } + list_for_each_entry(entry, bitmaps, list) { if (entry->bytes < min_bytes) continue; @@ -2287,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, } /* - * If we do have entries on our list and we are here then we didn't find - * anything, so go ahead and get the next entry after the last entry in - * this list and start the search from there. + * The bitmaps list has all the bitmaps that record free space + * starting after offset, so no more search is required. */ - if (!list_empty(bitmaps)) { - entry = list_entry(bitmaps->prev, struct btrfs_free_space, - list); - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; - entry = rb_entry(node, struct btrfs_free_space, offset_index); - goto search; - } - - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); - if (!entry) - return -ENOSPC; - -search: - node = &entry->offset_index; - do { - entry = rb_entry(node, struct btrfs_free_space, offset_index); - node = rb_next(&entry->offset_index); - if (!entry->bitmap) - continue; - if (entry->bytes < min_bytes) - continue; - ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); - } while (ret && node); - - return ret; + return -ENOSPC; } /* @@ -2336,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct list_head bitmaps; struct btrfs_free_space *entry, *tmp; + LIST_HEAD(bitmaps); u64 min_bytes; int ret; @@ -2376,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes, min_bytes); if (ret) @@ -2472,9 +2647,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, spin_unlock(&ctl->tree_lock); if (bytes >= minlen) { - int update_ret; - update_ret = btrfs_update_reserved_bytes(block_group, - bytes, 1, 1); + struct btrfs_space_info *space_info; + int update = 0; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += bytes; + space_info->bytes_reserved += bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); ret = btrfs_error_discard_extent(fs_info->extent_root, start, @@ -2482,9 +2667,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, &actually_trimmed); btrfs_add_free_space(block_group, start, bytes); - if (!update_ret) - btrfs_update_reserved_bytes(block_group, - bytes, 0, 1); + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += bytes; + block_group->reserved -= bytes; + space_info->bytes_reserved -= bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } if (ret) break; @@ -2643,9 +2835,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, return 0; ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); - if (ret < 0) + if (ret) { + btrfs_delalloc_release_metadata(inode, inode->i_size); +#ifdef DEBUG printk(KERN_ERR "btrfs: failed to write free ino cache " "for root %llu\n", root->root_key.objectid); +#endif + } iput(inode); return ret; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b4087e0..f8962a9 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_path *path; struct inode *inode; + struct btrfs_block_rsv *rsv; + u64 num_bytes; u64 alloc_hint = 0; int ret; int prealloc; @@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (!path) return -ENOMEM; + rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->trans_block_rsv; + + num_bytes = trans->bytes_reserved; + /* + * 1 item for inode item insertion if need + * 3 items for inode item update (in the worst case) + * 1 item for free space object + * 3 items for pre-allocation + */ + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); + ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, + trans->bytes_reserved); + if (ret) + goto out; again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ret = PTR_ERR(inode); - goto out; + goto out_release; } if (IS_ERR(inode)) { @@ -434,7 +451,7 @@ again: ret = create_free_ino_inode(root, trans, path); if (ret) - goto out; + goto out_release; goto again; } @@ -465,21 +482,26 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, prealloc); + ret = btrfs_delalloc_reserve_space(inode, prealloc); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); - if (ret) + if (ret) { + btrfs_delalloc_release_space(inode, prealloc); goto out_put; + } btrfs_free_reserved_data_space(inode, prealloc); + ret = btrfs_write_out_ino_cache(root, trans, path); out_put: iput(inode); +out_release: + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: - if (ret == 0) - ret = btrfs_write_out_ino_cache(root, trans, path); + trans->block_rsv = rsv; + trans->bytes_reserved = num_bytes; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2d004a..81b235a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -38,6 +38,7 @@ #include <linux/falloc.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/mount.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -45,10 +46,10 @@ #include "btrfs_inode.h" #include "ioctl.h" #include "print-tree.h" -#include "volumes.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" +#include "volumes.h" #include "compression.h" #include "locking.h" #include "free-space-cache.h" @@ -93,6 +94,8 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -393,7 +396,10 @@ again: (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); - BUG_ON(!pages); + if (!pages) { + /* just bail out to the uncompressed code */ + goto cont; + } if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; @@ -424,6 +430,7 @@ again: will_compress = 1; } } +cont: if (start == 0) { trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); @@ -820,7 +827,7 @@ static noinline int cow_file_range(struct inode *inode, } BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(&root->fs_info->super_copy)); + btrfs_super_total_bytes(root->fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); @@ -1737,7 +1744,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } goto out; @@ -1787,17 +1794,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } ret = 0; out: - if (nolock) { - if (trans) - btrfs_end_transaction_nolock(trans, root); - } else { + if (root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, ordered_extent->len); - if (trans) + if (trans) { + if (nolock) + btrfs_end_transaction_nolock(trans, root); + else btrfs_end_transaction(trans, root); } @@ -1819,153 +1826,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, } /* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the page is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. - */ -struct io_failure_record { - struct page *page; - u64 start; - u64 len; - u64 logical; - unsigned long bio_flags; - int last_mirror; -}; - -static int btrfs_io_failed_hook(struct bio *failed_bio, - struct page *page, u64 start, u64 end, - struct extent_state *state) -{ - struct io_failure_record *failrec = NULL; - u64 private; - struct extent_map *em; - struct inode *inode = page->mapping->host; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct bio *bio; - int num_copies; - int ret; - int rw; - u64 logical; - - ret = get_state_private(failure_tree, start, &private); - if (ret) { - failrec = kmalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return -ENOMEM; - failrec->start = start; - failrec->len = end - start + 1; - failrec->last_mirror = 0; - failrec->bio_flags = 0; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (em->start > start || em->start + em->len < start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - - if (IS_ERR_OR_NULL(em)) { - kfree(failrec); - return -EIO; - } - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, - em->compress_type); - } - failrec->logical = logical; - free_extent_map(em); - set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | - EXTENT_DIRTY, GFP_NOFS); - set_state_private(failure_tree, start, - (u64)(unsigned long)failrec); - } else { - failrec = (struct io_failure_record *)(unsigned long)private; - } - num_copies = btrfs_num_copies( - &BTRFS_I(inode)->root->fs_info->mapping_tree, - failrec->logical, failrec->len); - failrec->last_mirror++; - if (!state) { - spin_lock(&BTRFS_I(inode)->io_tree.lock); - state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, - failrec->start, - EXTENT_LOCKED); - if (state && state->start != failrec->start) - state = NULL; - spin_unlock(&BTRFS_I(inode)->io_tree.lock); - } - if (!state || failrec->last_mirror > num_copies) { - set_state_private(failure_tree, failrec->start, 0); - clear_extent_bits(failure_tree, failrec->start, - failrec->start + failrec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); - kfree(failrec); - return -EIO; - } - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_private = state; - bio->bi_end_io = failed_bio->bi_end_io; - bio->bi_sector = failrec->logical >> 9; - bio->bi_bdev = failed_bio->bi_bdev; - bio->bi_size = 0; - - bio_add_page(bio, page, failrec->len, start - page_offset(page)); - if (failed_bio->bi_rw & REQ_WRITE) - rw = WRITE; - else - rw = READ; - - ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, - failrec->last_mirror, - failrec->bio_flags, 0); - return ret; -} - -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -static int btrfs_clean_io_failures(struct inode *inode, u64 start) -{ - u64 private; - u64 private_failure; - struct io_failure_record *failure; - int ret; - - private = 0; - if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY, 0)) { - ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, - start, &private_failure); - if (ret == 0) { - failure = (struct io_failure_record *)(unsigned long) - private_failure; - set_state_private(&BTRFS_I(inode)->io_failure_tree, - failure->start, 0); - clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, - failure->start, - failure->start + failure->len - 1, - EXTENT_DIRTY | EXTENT_LOCKED, - GFP_NOFS); - kfree(failure); - } - } - return 0; -} - -/* * when reads are done, we need to check csums to verify the data is correct - * if there's a match, we allow the bio to finish. If not, we go through - * the io_failure_record routines to find good copies + * if there's a match, we allow the bio to finish. If not, the code in + * extent_io.c will try to find good copies for us. */ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) @@ -2011,10 +1874,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, kunmap_atomic(kaddr, KM_USER0); good: - /* if the io failure tree for this inode is non-empty, - * check to see if we've recovered from a failed IO - */ - btrfs_clean_io_failures(inode, start); return 0; zeroit: @@ -2079,96 +1938,13 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) up_read(&root->fs_info->cleanup_work_sem); } -/* - * calculate extra metadata reservation when snapshotting a subvolume - * contains orphan files. - */ -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve) -{ - struct btrfs_root *root; - struct btrfs_block_rsv *block_rsv; - u64 num_bytes; - int index; - - root = pending->root; - if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) - return; - - block_rsv = root->orphan_block_rsv; - - /* orphan block reservation for the snapshot */ - num_bytes = block_rsv->size; - - /* - * after the snapshot is created, COWing tree blocks may use more - * space than it frees. So we should make sure there is enough - * reserved space. - */ - index = trans->transid & 0x1; - if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { - num_bytes += block_rsv->size - - (block_rsv->reserved + block_rsv->freed[index]); - } - - *bytes_to_reserve += num_bytes; -} - -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending) -{ - struct btrfs_root *root = pending->root; - struct btrfs_root *snap = pending->snap; - struct btrfs_block_rsv *block_rsv; - u64 num_bytes; - int index; - int ret; - - if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) - return; - - /* refill source subvolume's orphan block reservation */ - block_rsv = root->orphan_block_rsv; - index = trans->transid & 0x1; - if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { - num_bytes = block_rsv->size - - (block_rsv->reserved + block_rsv->freed[index]); - ret = btrfs_block_rsv_migrate(&pending->block_rsv, - root->orphan_block_rsv, - num_bytes); - BUG_ON(ret); - } - - /* setup orphan block reservation for the snapshot */ - block_rsv = btrfs_alloc_block_rsv(snap); - BUG_ON(!block_rsv); - - btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - snap->orphan_block_rsv = block_rsv; - - num_bytes = root->orphan_block_rsv->size; - ret = btrfs_block_rsv_migrate(&pending->block_rsv, - block_rsv, num_bytes); - BUG_ON(ret); - -#if 0 - /* insert orphan item for the snapshot */ - WARN_ON(!root->orphan_item_inserted); - ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, - snap->root_key.objectid); - BUG_ON(ret); - snap->orphan_item_inserted = 1; -#endif -} - enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, }; /* - * This is called in transaction commmit time. If there are no orphan + * This is called in transaction commit time. If there are no orphan * files in the subvolume, it removes orphan item and frees block_rsv * structure. */ @@ -2247,9 +2023,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) } spin_unlock(&root->orphan_lock); - if (block_rsv) - btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); @@ -2259,7 +2032,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) /* insert an orphan item to track this unlinked/truncated file */ if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); - BUG_ON(ret); + BUG_ON(ret && ret != -EEXIST); } /* insert an orphan item to track subvolume contains orphan files */ @@ -2316,6 +2089,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; struct inode *inode; + u64 last_objectid = 0; int ret = 0, nr_unlink = 0, nr_truncate = 0; if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) @@ -2367,41 +2141,81 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * crossing root thing. we store the inode number in the * offset of the orphan item. */ + + if (found_key.offset == last_objectid) { + printk(KERN_ERR "btrfs: Error removing orphan entry, " + "stopping orphan cleanup\n"); + ret = -EINVAL; + goto out; + } + + last_objectid = found_key.offset; + found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); + ret = PTR_RET(inode); + if (ret && ret != -ESTALE) goto out; - } - /* - * add this inode to the orphan list so btrfs_orphan_del does - * the proper thing when we hit it - */ - spin_lock(&root->orphan_lock); - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->orphan_lock); + if (ret == -ESTALE && root == root->fs_info->tree_root) { + struct btrfs_root *dead_root; + struct btrfs_fs_info *fs_info = root->fs_info; + int is_dead_root = 0; + /* + * this is an orphan in the tree root. Currently these + * could come from 2 sources: + * a) a snapshot deletion in progress + * b) a free space cache inode + * We need to distinguish those two, as the snapshot + * orphan must not get deleted. + * find_dead_roots already ran before us, so if this + * is a snapshot deletion, we should find the root + * in the dead_roots list + */ + spin_lock(&fs_info->trans_lock); + list_for_each_entry(dead_root, &fs_info->dead_roots, + root_list) { + if (dead_root->root_key.objectid == + found_key.objectid) { + is_dead_root = 1; + break; + } + } + spin_unlock(&fs_info->trans_lock); + if (is_dead_root) { + /* prevent this orphan from being found again */ + key.offset = found_key.objectid - 1; + continue; + } + } /* - * if this is a bad inode, means we actually succeeded in - * removing the inode, but not the orphan record, which means - * we need to manually delete the orphan since iput will just - * do a destroy_inode + * Inode is already gone but the orphan item is still there, + * kill the orphan item. */ - if (is_bad_inode(inode)) { - trans = btrfs_start_transaction(root, 0); + if (ret == -ESTALE) { + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } - btrfs_orphan_del(trans, inode); + ret = btrfs_del_orphan_item(trans, root, + found_key.objectid); + BUG_ON(ret); btrfs_end_transaction(trans, root); - iput(inode); continue; } + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->orphan_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->orphan_lock); + /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { if (!S_ISREG(inode->i_mode)) { @@ -2410,7 +2224,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; + /* + * Need to hold the imutex for reservation purposes, not + * a huge deal here but I have a WARN_ON in + * btrfs_delalloc_reserve_space to catch offenders. + */ + mutex_lock(&inode->i_mutex); ret = btrfs_truncate(inode); + mutex_unlock(&inode->i_mutex); } else { nr_unlink++; } @@ -2420,6 +2241,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret) goto out; } + /* release the path since we're done with it */ + btrfs_release_path(path); + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; if (root->orphan_block_rsv) @@ -2534,7 +2358,7 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); inode->i_mode = btrfs_inode_mode(leaf, inode_item); - inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); inode->i_uid = btrfs_inode_uid(leaf, inode_item); inode->i_gid = btrfs_inode_gid(leaf, inode_item); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); @@ -2647,7 +2471,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, /* * copy everything in the in-memory inode into the btree. */ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, +static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { struct btrfs_inode_item *inode_item; @@ -2655,21 +2479,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int ret; - /* - * If the inode is a free space inode, we can deadlock during commit - * if we put it into the delayed code. - * - * The data relocation inode should also be directly updated - * without delay - */ - if (!btrfs_is_free_space_inode(root, inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { - ret = btrfs_delayed_update_inode(trans, root, inode); - if (!ret) - btrfs_set_inode_last_trans(trans, inode); - return ret; - } - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2698,6 +2507,43 @@ failed: } /* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + /* + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. + * + * The data relocation inode should also be directly updated + * without delay + */ + if (!btrfs_is_free_space_inode(root, inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_delayed_update_inode(trans, root, inode); + if (!ret) + btrfs_set_inode_last_trans(trans, inode); + return ret; + } + + return btrfs_update_inode_item(trans, root, inode); +} + +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + ret = btrfs_update_inode(trans, root, inode); + if (ret == -ENOSPC) + return btrfs_update_inode_item(trans, root, inode); + return ret; +} + +/* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and * also drops the back refs in the inode to the directory @@ -2835,7 +2681,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); - trans = btrfs_start_transaction(root, 10); + /* + * 1 for the possible orphan item + * 1 for the dir item + * 1 for the dir index + * 1 for the inode ref + * 1 for the inode ref in the tree log + * 2 for the dir entries in the log + * 1 for the inode + */ + trans = btrfs_start_transaction(root, 8); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; @@ -2858,7 +2713,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, return ERR_PTR(-ENOMEM); } - trans = btrfs_start_transaction(root, 0); + /* 1 for the orphan item */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_free_path(path); root->fs_info->enospc_unlink = 0; @@ -2963,6 +2819,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, err = 0; out: btrfs_free_path(path); + /* Migrate the orphan reservation over */ + if (!err) + err = btrfs_block_rsv_migrate(trans->block_rsv, + &root->fs_info->global_block_rsv, + trans->bytes_reserved); + if (err) { btrfs_end_transaction(trans, root); root->fs_info->enospc_unlink = 0; @@ -2977,6 +2839,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (trans->block_rsv == &root->fs_info->global_block_rsv) { + btrfs_block_rsv_release(root, trans->block_rsv, + trans->bytes_reserved); + trans->block_rsv = &root->fs_info->trans_block_rsv; BUG_ON(!root->fs_info->enospc_unlink); root->fs_info->enospc_unlink = 0; } @@ -3368,6 +3233,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) pgoff_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); struct page *page; + gfp_t mask = btrfs_alloc_write_mask(mapping); int ret = 0; u64 page_start; u64 page_end; @@ -3380,7 +3246,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) ret = -ENOMEM; again: - page = find_or_create_page(mapping, index, GFP_NOFS); + page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto out; @@ -3501,7 +3367,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) u64 hint_byte = 0; hole_size = last_byte - cur_offset; - trans = btrfs_start_transaction(root, 2); + trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { err = PTR_ERR(trans); break; @@ -3511,6 +3377,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) cur_offset + hole_size, &hint_byte, 1); if (err) { + btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); break; } @@ -3520,6 +3387,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 0, hole_size, 0, hole_size, 0, 0, 0); if (err) { + btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); break; } @@ -3527,6 +3395,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); + btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); } free_extent_map(em); @@ -3544,6 +3413,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) static int btrfs_setsize(struct inode *inode, loff_t newsize) { + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); int ret; @@ -3551,16 +3422,19 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) return 0; if (newsize > oldsize) { - i_size_write(inode, newsize); - btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); truncate_pagecache(inode, oldsize, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); - if (ret) { - btrfs_setsize(inode, oldsize); + if (ret) return ret; - } - mark_inode_dirty(inode); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + i_size_write(inode, newsize); + btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + ret = btrfs_update_inode(trans, root, inode); + btrfs_end_transaction_throttle(trans, root); } else { /* @@ -3600,9 +3474,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid) { setattr_copy(inode, attr); - mark_inode_dirty(inode); + err = btrfs_dirty_inode(inode); - if (attr->ia_valid & ATTR_MODE) + if (!err && attr->ia_valid & ATTR_MODE) err = btrfs_acl_chmod(inode); } @@ -3613,6 +3487,8 @@ void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv, *global_rsv; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); unsigned long nr; int ret; @@ -3640,22 +3516,55 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + rsv = btrfs_alloc_block_rsv(root); + if (!rsv) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + rsv->size = min_size; + global_rsv = &root->fs_info->global_block_rsv; + btrfs_i_size_write(inode, 0); + /* + * This is a bit simpler than btrfs_truncate since + * + * 1) We've already reserved our space for our orphan item in the + * unlink. + * 2) We're going to delete the inode item, so we don't need to update + * it at all. + * + * So we just need to reserve some slack space in case we add bytes when + * doing the truncate. + */ while (1) { - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - trans->block_rsv = root->orphan_block_rsv; + ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); + + /* + * Try and steal from the global reserve since we will + * likely not use this space anyway, we want to try as + * hard as possible to get this to work. + */ + if (ret) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, 0, 5); if (ret) { - BUG_ON(ret != -EAGAIN); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); - continue; + printk(KERN_WARNING "Could not get space for a " + "delete, will truncate on mount %d\n", ret); + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; } + trans->block_rsv = rsv; + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); if (ret != -EAGAIN) break; @@ -3664,14 +3573,17 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root, nr); - } + btrfs_free_block_rsv(root, rsv); + if (ret == 0) { + trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, inode); BUG_ON(ret); } + trans->block_rsv = &root->fs_info->trans_block_rsv; if (!(root == root->fs_info->tree_root || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) btrfs_return_ino(root, btrfs_ino(inode)); @@ -4340,42 +4252,80 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ -void btrfs_dirty_inode(struct inode *inode, int flags) +int btrfs_dirty_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; int ret; if (BTRFS_I(inode)->dummy_inode) - return; + return 0; trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_update_inode(trans, root, inode); if (ret && ret == -ENOSPC) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans, root); trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - printk_ratelimited(KERN_ERR "btrfs: fail to " - "dirty inode %llu error %ld\n", - (unsigned long long)btrfs_ino(inode), - PTR_ERR(trans)); - return; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_update_inode(trans, root, inode); - if (ret) { - printk_ratelimited(KERN_ERR "btrfs: fail to " - "dirty inode %llu error %d\n", - (unsigned long long)btrfs_ino(inode), - ret); - } } btrfs_end_transaction(trans, root); if (BTRFS_I(inode)->delayed_node) btrfs_balance_delayed_items(root); + + return ret; +} + +/* + * This is a copy of file_update_time. We need this so we can return error on + * ENOSPC for updating the inode in the case of file write and mmap writes. + */ +int btrfs_update_time(struct file *file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct timespec now; + int ret; + enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; + + /* First try to exhaust all avenues to not sync */ + if (IS_NOCMTIME(inode)) + return 0; + + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_mtime, &now)) + sync_it = S_MTIME; + + if (!timespec_equal(&inode->i_ctime, &now)) + sync_it |= S_CTIME; + + if (IS_I_VERSION(inode)) + sync_it |= S_VERSION; + + if (!sync_it) + return 0; + + /* Finally allowed to write? Takes lock. */ + if (mnt_want_write_file(file)) + return 0; + + /* Only change inode inside the lock region */ + if (sync_it & S_VERSION) + inode_inc_iversion(inode); + if (sync_it & S_CTIME) + inode->i_ctime = now; + if (sync_it & S_MTIME) + inode->i_mtime = now; + ret = btrfs_dirty_inode(inode); + if (!ret) + mark_inode_dirty_sync(inode); + mnt_drop_write(file->f_path.mnt); + return ret; } /* @@ -4462,8 +4412,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, const char *name, int name_len, - u64 ref_objectid, u64 objectid, int mode, - u64 *index) + u64 ref_objectid, u64 objectid, + umode_t mode, u64 *index) { struct inode *inode; struct btrfs_inode_item *inode_item; @@ -4640,17 +4590,13 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans, int err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, backref, index); - if (!err) { - d_instantiate(dentry, inode); - return 0; - } if (err > 0) err = -EEXIST; return err; } static int btrfs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -4691,13 +4637,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } + /* + * If the active LSM wants to access the inode during + * d_instantiate it needs these. Smack checks to see + * if the filesystem supports xattrs by looking at the + * ops vector. + */ + + inode->i_op = &btrfs_special_inode_operations; err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { - inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); + d_instantiate(dentry, inode); } out_unlock: nr = trans->blocks_used; @@ -4711,7 +4665,7 @@ out_unlock: } static int btrfs_create(struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) + umode_t mode, struct nameidata *nd) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -4749,15 +4703,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock; } + /* + * If the active LSM wants to access the inode during + * d_instantiate it needs these. Smack checks to see + * if the filesystem supports xattrs by looking at the + * ops vector. + */ + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { inode->i_mapping->a_ops = &btrfs_aops; inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + d_instantiate(dentry, inode); } out_unlock: nr = trans->blocks_used; @@ -4815,6 +4777,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, root, inode); BUG_ON(err); + d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } @@ -4829,7 +4792,7 @@ fail: return err; } -static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; @@ -5795,8 +5758,7 @@ again: if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret) - ret = btrfs_update_inode(trans, root, inode); - err = ret; + err = btrfs_update_inode_fallback(trans, root, inode); goto out; } @@ -5834,7 +5796,7 @@ again: add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) - btrfs_update_inode(trans, root, inode); + btrfs_update_inode_fallback(trans, root, inode); ret = 0; out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, @@ -6289,7 +6251,7 @@ int btrfs_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btrfs_get_extent); + return extent_read_full_page(tree, page, btrfs_get_extent, 0); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -6440,7 +6402,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; + /* Need this to keep space reservations serialized */ + mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + mutex_unlock(&inode->i_mutex); + if (!ret) + ret = btrfs_update_time(vma->vm_file); if (ret) { if (ret == -ENOMEM) ret = VM_FAULT_OOM; @@ -6541,6 +6508,7 @@ static int btrfs_truncate(struct inode *inode) struct btrfs_trans_handle *trans; unsigned long nr; u64 mask = root->sectorsize - 1; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) @@ -6588,19 +6556,23 @@ static int btrfs_truncate(struct inode *inode) rsv = btrfs_alloc_block_rsv(root); if (!rsv) return -ENOMEM; - btrfs_add_durable_block_rsv(root->fs_info, rsv); + rsv->size = min_size; + /* + * 1 for the truncate slack space + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion + * 1 for updating the inode. + */ trans = btrfs_start_transaction(root, 4); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; } - /* - * Reserve space for the truncate process. Truncate should be adding - * space, but if there are snapshots it may end up using space. - */ - ret = btrfs_truncate_reserve_metadata(trans, root, rsv); + /* Migrate the slack space for the truncate to our reserve */ + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + min_size); BUG_ON(ret); ret = btrfs_orphan_add(trans, inode); @@ -6609,21 +6581,6 @@ static int btrfs_truncate(struct inode *inode) goto out; } - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - - /* - * Ok so we've already migrated our bytes over for the truncate, so here - * just reserve the one slot we need for updating the inode. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out; - } - trans->block_rsv = rsv; - /* * setattr is responsible for setting the ordered_data_close flag, * but that is only tested during the last file release. That @@ -6645,20 +6602,31 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { + ret = btrfs_block_rsv_refill(root, rsv, min_size); + if (ret) { + /* + * This can only happen with the original transaction we + * started above, every other time we shouldn't have a + * transaction started yet. + */ + if (ret == -EAGAIN) + goto end_trans; + err = ret; + break; + } + if (!trans) { - trans = btrfs_start_transaction(root, 3); + /* Just need the 1 for updating the inode */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out; + ret = err = PTR_ERR(trans); + trans = NULL; + break; } - - ret = btrfs_truncate_reserve_metadata(trans, root, - rsv); - BUG_ON(ret); - - trans->block_rsv = rsv; } + trans->block_rsv = rsv; + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); @@ -6673,7 +6641,7 @@ static int btrfs_truncate(struct inode *inode) err = ret; break; } - +end_trans: nr = trans->blocks_used; btrfs_end_transaction(trans, root); trans = NULL; @@ -6693,14 +6661,16 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(NULL, inode); } - trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - if (ret && !err) - err = ret; + if (trans) { + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret && !err) + err = ret; - nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + } out: btrfs_free_block_rsv(root, rsv); @@ -6728,7 +6698,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - inode->i_nlink = 1; + set_nlink(inode, 1); btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, new_root, inode); @@ -6755,9 +6725,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; - ei->reserved_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; + ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; @@ -6769,6 +6739,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; ei->in_defrag = 0; + ei->delalloc_meta_reserved = 0; ei->force_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; @@ -6790,7 +6761,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) static void btrfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } @@ -6803,6 +6773,8 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(inode->i_data.nrpages); WARN_ON(BTRFS_I(inode)->outstanding_extents); WARN_ON(BTRFS_I(inode)->reserved_extents); + WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->csum_bytes); /* * This can happen where we create an inode, but somebody else also @@ -6926,11 +6898,13 @@ static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; + u32 blocksize = inode->i_sb->s_blocksize; + generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; - stat->blocks = (inode_get_bytes(inode) + - BTRFS_I(inode)->delalloc_bytes) >> 9; + stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; return 0; } @@ -7206,14 +7180,21 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; } + /* + * If the active LSM wants to access the inode during + * d_instantiate it needs these. Smack checks to see + * if the filesystem supports xattrs by looking at the + * ops vector. + */ + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { inode->i_mapping->a_ops = &btrfs_aops; inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } if (drop_inode) @@ -7262,6 +7243,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, drop_inode = 1; out_unlock: + if (!err) + d_instantiate(dentry, inode); nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); if (drop_inode) { @@ -7420,7 +7403,6 @@ static struct extent_io_ops btrfs_extent_io_ops = { .readpage_end_io_hook = btrfs_readpage_end_io_hook, .writepage_end_io_hook = btrfs_writepage_end_io_hook, .writepage_start_hook = btrfs_writepage_start_hook, - .readpage_io_failed_hook = btrfs_io_failed_hook, .set_bit_hook = btrfs_set_bit_hook, .clear_bit_hook = btrfs_clear_bit_hook, .merge_extent_hook = btrfs_merge_extent_hook, @@ -7484,6 +7466,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .follow_link = page_follow_link_light, .put_link = page_put_link, .getattr = btrfs_getattr, + .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, .getxattr = btrfs_getxattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dae5dfe..5441ff1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -51,6 +51,7 @@ #include "volumes.h" #include "locking.h" #include "inode-map.h" +#include "backref.h" /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode) /* * Inherit flags from the parent inode. * - * Unlike extN we don't have any flags we don't want to inherit currently. + * Currently only the compression flags and the cow flags are inherited. */ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { @@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) flags = BTRFS_I(dir)->flags; - if (S_ISREG(inode->i_mode)) - flags &= ~BTRFS_INODE_DIRSYNC; - else if (!S_ISDIR(inode->i_mode)) - flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + if (flags & BTRFS_INODE_NOCOMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & BTRFS_INODE_COMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + } + + if (flags & BTRFS_INODE_NODATACOW) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - BTRFS_I(inode)->flags = flags; btrfs_update_iflags(inode); } @@ -195,7 +201,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) goto out_unlock; @@ -246,14 +252,14 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); + btrfs_update_iflags(inode); + inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_update_iflags(inode); - inode->i_ctime = CURRENT_TIME; btrfs_end_transaction(trans, root); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); ret = 0; out_unlock: @@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; + u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) } } rcu_read_unlock(); + if (!num_devices) return -EOPNOTSUPP; - if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; + if (range.start > total_bytes) + return -EINVAL; + range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(root, &range); if (ret < 0) @@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, int ret = 1; /* - * make sure that once we start defragging and extent, we keep on + * make sure that once we start defragging an extent, we keep on * defragging it */ if (start < *defrag_end) @@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, * extent will force at least part of that big extent to be defragged. */ if (ret) { - *last_len += len; *defrag_end = extent_map_end(em); } else { *last_len = 0; @@ -843,13 +852,16 @@ static int cluster_pages_for_defrag(struct inode *inode, int i_done; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); if (isize == 0) return 0; file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, num_pages << PAGE_CACHE_SHIFT); + mutex_unlock(&inode->i_mutex); if (ret) return ret; again: @@ -860,7 +872,7 @@ again: for (i = 0; i < num_pages; i++) { struct page *page; page = find_or_create_page(inode->i_mapping, - start_index + i, GFP_NOFS); + start_index + i, mask); if (!page) break; @@ -972,18 +984,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_super_block *disk_super; struct file_ra_state *ra = NULL; unsigned long last_index; + u64 isize = i_size_read(inode); u64 features; u64 last_len = 0; u64 skip = 0; u64 defrag_end = 0; u64 newer_off = range->start; - int newer_left = 0; unsigned long i; + unsigned long ra_index = 0; int ret; int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; int extent_thresh = range->extent_thresh; - int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int cluster = max_cluster; u64 new_align = ~((u64)128 * 1024 - 1); struct page **pages = NULL; @@ -997,7 +1011,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, compress_type = range->compress_type; } - if (inode->i_size == 0) + if (isize == 0) return 0; /* @@ -1013,7 +1027,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra = &file->f_ra; } - pages = kmalloc(sizeof(struct page *) * newer_cluster, + pages = kmalloc(sizeof(struct page *) * max_cluster, GFP_NOFS); if (!pages) { ret = -ENOMEM; @@ -1022,10 +1036,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, /* find the last page to defrag */ if (range->start + range->len > range->start) { - last_index = min_t(u64, inode->i_size - 1, + last_index = min_t(u64, isize - 1, range->start + range->len - 1) >> PAGE_CACHE_SHIFT; } else { - last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + last_index = (isize - 1) >> PAGE_CACHE_SHIFT; } if (newer_than) { @@ -1038,14 +1052,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * the extents in the file evenly spaced */ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - newer_left = newer_cluster; } else goto out_ra; } else { i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index - 1; + max_to_defrag = last_index; /* * make writeback starts from i, so the defrag range can be @@ -1079,18 +1092,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = max(i + 1, next); continue; } + + if (!newer_than) { + cluster = (PAGE_CACHE_ALIGN(defrag_end) >> + PAGE_CACHE_SHIFT) - i; + cluster = min(cluster, max_cluster); + } else { + cluster = max_cluster; + } + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; - btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); + if (i + cluster > ra_index) { + ra_index = max(i, ra_index); + btrfs_force_ra(inode->i_mapping, ra, file, ra_index, + cluster); + ra_index += max_cluster; + } - ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); + ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) goto out_ra; defrag_count += ret; balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); - i += ret; if (newer_than) { if (newer_off == (u64)-1) @@ -1105,12 +1131,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!ret) { range->start = newer_off; i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - newer_left = newer_cluster; } else { break; } } else { - i++; + if (ret > 0) { + i += ret; + last_len += ret << PAGE_CACHE_SHIFT; + } else { + i++; + last_len = 0; + } } } @@ -1136,16 +1167,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, mutex_unlock(&inode->i_mutex); } - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (range->compress_type == BTRFS_COMPRESS_LZO) { features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; btrfs_set_super_incompat_flags(disk_super, features); } - if (!file) - kfree(ra); - return defrag_count; + ret = defrag_count; out_ra: if (!file) @@ -1189,12 +1218,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", + printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", + printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_unlock; @@ -1240,7 +1269,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk(KERN_INFO "new size for %s is %llu\n", + printk(KERN_INFO "btrfs: new size for %s is %llu\n", device->name, (unsigned long long)new_size); if (new_size > old_size) { @@ -1251,7 +1280,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); - } else { + } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); } @@ -1826,7 +1855,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; } - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) goto out; @@ -1942,7 +1971,7 @@ out_dput: dput(dentry); out_unlock_dir: mutex_unlock(&dir->i_mutex); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); out: kfree(vol_args); return err; @@ -1958,7 +1987,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (btrfs_root_readonly(root)) return -EROFS; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) return ret; @@ -2011,7 +2040,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return ret; } @@ -2166,7 +2195,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (btrfs_root_readonly(root)) return -EROFS; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) return ret; @@ -2481,7 +2510,7 @@ out_unlock: out_fput: fput(src_file); out_drop_write: - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return ret; } @@ -2520,7 +2549,7 @@ static long btrfs_ioctl_trans_start(struct file *file) if (btrfs_root_readonly(root)) goto out; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) goto out; @@ -2536,7 +2565,7 @@ static long btrfs_ioctl_trans_start(struct file *file) out_drop: atomic_dec(&root->fs_info->open_ioctl_trans); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); out: return ret; } @@ -2587,7 +2616,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) return PTR_ERR(trans); } - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, dir_id, "default", 7, 1); if (IS_ERR_OR_NULL(di)) { @@ -2603,7 +2632,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; @@ -2771,7 +2800,7 @@ long btrfs_ioctl_trans_end(struct file *file) atomic_dec(&root->fs_info->open_ioctl_trans); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return 0; } @@ -2864,6 +2893,147 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) +{ + int ret = 0; + int i; + u64 rel_ptr; + int size; + struct btrfs_ioctl_ino_path_args *ipa = NULL; + struct inode_fs_paths *ipath = NULL; + struct btrfs_path *path; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ipa = memdup_user(arg, sizeof(*ipa)); + if (IS_ERR(ipa)) { + ret = PTR_ERR(ipa); + ipa = NULL; + goto out; + } + + size = min_t(u32, ipa->size, 4096); + ipath = init_ipath(size, root, path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto out; + } + + ret = paths_from_inode(ipa->inum, ipath); + if (ret < 0) + goto out; + + for (i = 0; i < ipath->fspath->elem_cnt; ++i) { + rel_ptr = ipath->fspath->val[i] - + (u64)(unsigned long)ipath->fspath->val; + ipath->fspath->val[i] = rel_ptr; + } + + ret = copy_to_user((void *)(unsigned long)ipa->fspath, + (void *)(unsigned long)ipath->fspath, size); + if (ret) { + ret = -EFAULT; + goto out; + } + +out: + btrfs_free_path(path); + free_ipath(ipath); + kfree(ipa); + + return ret; +} + +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + +static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, + void __user *arg) +{ + int ret = 0; + int size; + u64 extent_offset; + struct btrfs_ioctl_logical_ino_args *loi; + struct btrfs_data_container *inodes = NULL; + struct btrfs_path *path = NULL; + struct btrfs_key key; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + loi = memdup_user(arg, sizeof(*loi)); + if (IS_ERR(loi)) { + ret = PTR_ERR(loi); + loi = NULL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + size = min_t(u32, loi->size, 4096); + inodes = init_data_container(size); + if (IS_ERR(inodes)) { + ret = PTR_ERR(inodes); + inodes = NULL; + goto out; + } + + ret = extent_from_logical(root->fs_info, loi->logical, path, &key); + + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = -ENOENT; + if (ret < 0) + goto out; + + extent_offset = loi->logical - key.objectid; + ret = iterate_extent_inodes(root->fs_info, path, key.objectid, + extent_offset, build_ino_list, inodes); + + if (ret < 0) + goto out; + + ret = copy_to_user((void *)(unsigned long)loi->inodes, + (void *)(unsigned long)inodes, size); + if (ret) + ret = -EFAULT; + +out: + btrfs_free_path(path); + kfree(inodes); + kfree(loi); + + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2921,6 +3091,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_tree_search(file, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(file, argp); + case BTRFS_IOC_INO_PATHS: + return btrfs_ioctl_ino_to_path(root, argp); + case BTRFS_IOC_LOGICAL_INO: + return btrfs_ioctl_logical_to_ino(root, argp); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(root, argp); case BTRFS_IOC_SYNC: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ad1ea78..252ae99 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_space_info spaces[0]; }; +struct btrfs_data_container { + __u32 bytes_left; /* out -- bytes not needed to deliver output */ + __u32 bytes_missing; /* out -- additional bytes needed for result */ + __u32 elem_cnt; /* out */ + __u32 elem_missed; /* out */ + __u64 val[0]; /* out */ +}; + +struct btrfs_ioctl_ino_path_args { + __u64 inum; /* in */ + __u32 size; /* in */ + __u64 reserved[4]; + /* struct btrfs_data_container *fspath; out */ + __u64 fspath; /* out */ +}; + +struct btrfs_ioctl_logical_ino_args { + __u64 logical; /* in */ + __u32 size; /* in */ + __u64 reserved[4]; + /* struct btrfs_data_container *inodes; out */ + __u64 inodes; +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ + struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ + struct btrfs_ioctl_ino_path_args) + #endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index fb2605d..f38e452 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot) void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; - u32 type; - u32 nr = btrfs_header_nritems(l); + u32 type, nr; struct btrfs_item *item; struct btrfs_root_item *ri; struct btrfs_dir_item *di; @@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) struct btrfs_key key; struct btrfs_key found_key; + if (!l) + return; + + nr = btrfs_header_nritems(l); + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", (unsigned long long)btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c new file mode 100644 index 0000000..2373b39 --- /dev/null +++ b/fs/btrfs/reada.c @@ -0,0 +1,951 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include "ctree.h" +#include "volumes.h" +#include "disk-io.h" +#include "transaction.h" + +#undef DEBUG + +/* + * This is the implementation for the generic read ahead framework. + * + * To trigger a readahead, btrfs_reada_add must be called. It will start + * a read ahead for the given range [start, end) on tree root. The returned + * handle can either be used to wait on the readahead to finish + * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). + * + * The read ahead works as follows: + * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. + * reada_start_machine will then search for extents to prefetch and trigger + * some reads. When a read finishes for a node, all contained node/leaf + * pointers that lie in the given range will also be enqueued. The reads will + * be triggered in sequential order, thus giving a big win over a naive + * enumeration. It will also make use of multi-device layouts. Each disk + * will have its on read pointer and all disks will by utilized in parallel. + * Also will no two disks read both sides of a mirror simultaneously, as this + * would waste seeking capacity. Instead both disks will read different parts + * of the filesystem. + * Any number of readaheads can be started in parallel. The read order will be + * determined globally, i.e. 2 parallel readaheads will normally finish faster + * than the 2 started one after another. + */ + +#define MAX_MIRRORS 2 +#define MAX_IN_FLIGHT 6 + +struct reada_extctl { + struct list_head list; + struct reada_control *rc; + u64 generation; +}; + +struct reada_extent { + u64 logical; + struct btrfs_key top; + u32 blocksize; + int err; + struct list_head extctl; + struct kref refcnt; + spinlock_t lock; + struct reada_zone *zones[MAX_MIRRORS]; + int nzones; + struct btrfs_device *scheduled_for; +}; + +struct reada_zone { + u64 start; + u64 end; + u64 elems; + struct list_head list; + spinlock_t lock; + int locked; + struct btrfs_device *device; + struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */ + int ndevs; + struct kref refcnt; +}; + +struct reada_machine_work { + struct btrfs_work work; + struct btrfs_fs_info *fs_info; +}; + +static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); +static void reada_control_release(struct kref *kref); +static void reada_zone_release(struct kref *kref); +static void reada_start_machine(struct btrfs_fs_info *fs_info); +static void __reada_start_machine(struct btrfs_fs_info *fs_info); + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation); + +/* recurses */ +/* in case of err, eb might be NULL */ +static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int level = 0; + int nritems; + int i; + u64 bytenr; + u64 generation; + struct reada_extent *re; + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head list; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct btrfs_device *for_dev; + + if (eb) + level = btrfs_header_level(eb); + + /* find extent */ + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + kref_get(&re->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (!re) + return -1; + + spin_lock(&re->lock); + /* + * just take the full list from the extent. afterwards we + * don't need the lock anymore + */ + list_replace_init(&re->extctl, &list); + for_dev = re->scheduled_for; + re->scheduled_for = NULL; + spin_unlock(&re->lock); + + if (err == 0) { + nritems = level ? btrfs_header_nritems(eb) : 0; + generation = btrfs_header_generation(eb); + /* + * FIXME: currently we just set nritems to 0 if this is a leaf, + * effectively ignoring the content. In a next step we could + * trigger more readahead depending from the content, e.g. + * fetch the checksums for the extents in the leaf. + */ + } else { + /* + * this is the error case, the extent buffer has not been + * read correctly. We won't access anything from it and + * just cleanup our data structures. Effectively this will + * cut the branch below this node from read ahead. + */ + nritems = 0; + generation = 0; + } + + for (i = 0; i < nritems; i++) { + struct reada_extctl *rec; + u64 n_gen; + struct btrfs_key key; + struct btrfs_key next_key; + + btrfs_node_key_to_cpu(eb, &key, i); + if (i + 1 < nritems) + btrfs_node_key_to_cpu(eb, &next_key, i + 1); + else + next_key = re->top; + bytenr = btrfs_node_blockptr(eb, i); + n_gen = btrfs_node_ptr_generation(eb, i); + + list_for_each_entry(rec, &list, list) { + struct reada_control *rc = rec->rc; + + /* + * if the generation doesn't match, just ignore this + * extctl. This will probably cut off a branch from + * prefetch. Alternatively one could start a new (sub-) + * prefetch for this branch, starting again from root. + * FIXME: move the generation check out of this loop + */ +#ifdef DEBUG + if (rec->generation != generation) { + printk(KERN_DEBUG "generation mismatch for " + "(%llu,%d,%llu) %llu != %llu\n", + key.objectid, key.type, key.offset, + rec->generation, generation); + } +#endif + if (rec->generation == generation && + btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && + btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) + reada_add_block(rc, bytenr, &next_key, + level - 1, n_gen); + } + } + /* + * free extctl records + */ + while (!list_empty(&list)) { + struct reada_control *rc; + struct reada_extctl *rec; + + rec = list_first_entry(&list, struct reada_extctl, list); + list_del(&rec->list); + rc = rec->rc; + kfree(rec); + + kref_get(&rc->refcnt); + if (atomic_dec_and_test(&rc->elems)) { + kref_put(&rc->refcnt, reada_control_release); + wake_up(&rc->wait); + } + kref_put(&rc->refcnt, reada_control_release); + + reada_extent_put(fs_info, re); /* one ref for each entry */ + } + reada_extent_put(fs_info, re); /* our ref */ + if (for_dev) + atomic_dec(&for_dev->reada_in_flight); + + return 0; +} + +/* + * start is passed separately in case eb in NULL, which may be the case with + * failed I/O + */ +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int ret; + + ret = __readahead_hook(root, eb, start, err); + + reada_start_machine(root->fs_info); + + return ret; +} + +static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, u64 logical, + struct btrfs_bio *bbio) +{ + int ret; + int looped = 0; + struct reada_zone *zone; + struct btrfs_block_group_cache *cache = NULL; + u64 start; + u64 end; + int i; + +again: + zone = NULL; + spin_lock(&fs_info->reada_lock); + ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, + logical >> PAGE_CACHE_SHIFT, 1); + if (ret == 1) + kref_get(&zone->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (ret == 1) { + if (logical >= zone->start && logical < zone->end) + return zone; + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + + if (looped) + return NULL; + + cache = btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return NULL; + + start = cache->key.objectid; + end = start + cache->key.offset - 1; + btrfs_put_block_group(cache); + + zone = kzalloc(sizeof(*zone), GFP_NOFS); + if (!zone) + return NULL; + + zone->start = start; + zone->end = end; + INIT_LIST_HEAD(&zone->list); + spin_lock_init(&zone->lock); + zone->locked = 0; + kref_init(&zone->refcnt); + zone->elems = 0; + zone->device = dev; /* our device always sits at index 0 */ + for (i = 0; i < bbio->num_stripes; ++i) { + /* bounds have already been checked */ + zone->devs[i] = bbio->stripes[i].dev; + } + zone->ndevs = bbio->num_stripes; + + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&dev->reada_zones, + (unsigned long)zone->end >> PAGE_CACHE_SHIFT, + zone); + spin_unlock(&fs_info->reada_lock); + + if (ret) { + kfree(zone); + looped = 1; + goto again; + } + + return zone; +} + +static struct reada_extent *reada_find_extent(struct btrfs_root *root, + u64 logical, + struct btrfs_key *top, int level) +{ + int ret; + int looped = 0; + struct reada_extent *re = NULL; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct btrfs_bio *bbio = NULL; + struct btrfs_device *dev; + u32 blocksize; + u64 length; + int nzones = 0; + int i; + unsigned long index = logical >> PAGE_CACHE_SHIFT; + +again: + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + kref_get(&re->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (re || looped) + return re; + + re = kzalloc(sizeof(*re), GFP_NOFS); + if (!re) + return NULL; + + blocksize = btrfs_level_size(root, level); + re->logical = logical; + re->blocksize = blocksize; + re->top = *top; + INIT_LIST_HEAD(&re->extctl); + spin_lock_init(&re->lock); + kref_init(&re->refcnt); + + /* + * map block + */ + length = blocksize; + ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); + if (ret || !bbio || length < blocksize) + goto error; + + if (bbio->num_stripes > MAX_MIRRORS) { + printk(KERN_ERR "btrfs readahead: more than %d copies not " + "supported", MAX_MIRRORS); + goto error; + } + + for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { + struct reada_zone *zone; + + dev = bbio->stripes[nzones].dev; + zone = reada_find_zone(fs_info, dev, logical, bbio); + if (!zone) + break; + + re->zones[nzones] = zone; + spin_lock(&zone->lock); + if (!zone->elems) + kref_get(&zone->refcnt); + ++zone->elems; + spin_unlock(&zone->lock); + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + re->nzones = nzones; + if (nzones == 0) { + /* not a single zone found, error and out */ + goto error; + } + + /* insert extent in reada_tree + all per-device trees, all or nothing */ + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&fs_info->reada_tree, index, re); + if (ret) { + spin_unlock(&fs_info->reada_lock); + if (ret != -ENOMEM) { + /* someone inserted the extent in the meantime */ + looped = 1; + } + goto error; + } + for (i = 0; i < nzones; ++i) { + dev = bbio->stripes[i].dev; + ret = radix_tree_insert(&dev->reada_extents, index, re); + if (ret) { + while (--i >= 0) { + dev = bbio->stripes[i].dev; + BUG_ON(dev == NULL); + radix_tree_delete(&dev->reada_extents, index); + } + BUG_ON(fs_info == NULL); + radix_tree_delete(&fs_info->reada_tree, index); + spin_unlock(&fs_info->reada_lock); + goto error; + } + } + spin_unlock(&fs_info->reada_lock); + + kfree(bbio); + return re; + +error: + while (nzones) { + struct reada_zone *zone; + + --nzones; + zone = re->zones[nzones]; + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* + * no fs_info->reada_lock needed, as this can't be + * the last ref + */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + kfree(bbio); + kfree(re); + if (looped) + goto again; + return NULL; +} + +static void reada_kref_dummy(struct kref *kr) +{ +} + +static void reada_extent_put(struct btrfs_fs_info *fs_info, + struct reada_extent *re) +{ + int i; + unsigned long index = re->logical >> PAGE_CACHE_SHIFT; + + spin_lock(&fs_info->reada_lock); + if (!kref_put(&re->refcnt, reada_kref_dummy)) { + spin_unlock(&fs_info->reada_lock); + return; + } + + radix_tree_delete(&fs_info->reada_tree, index); + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + radix_tree_delete(&zone->device->reada_extents, index); + } + + spin_unlock(&fs_info->reada_lock); + + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* no fs_info->reada_lock needed, as this can't be + * the last ref */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + if (re->scheduled_for) + atomic_dec(&re->scheduled_for->reada_in_flight); + + kfree(re); +} + +static void reada_zone_release(struct kref *kref) +{ + struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + + radix_tree_delete(&zone->device->reada_zones, + zone->end >> PAGE_CACHE_SHIFT); + + kfree(zone); +} + +static void reada_control_release(struct kref *kref) +{ + struct reada_control *rc = container_of(kref, struct reada_control, + refcnt); + + kfree(rc); +} + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation) +{ + struct btrfs_root *root = rc->root; + struct reada_extent *re; + struct reada_extctl *rec; + + re = reada_find_extent(root, logical, top, level); /* takes one ref */ + if (!re) + return -1; + + rec = kzalloc(sizeof(*rec), GFP_NOFS); + if (!rec) { + reada_extent_put(root->fs_info, re); + return -1; + } + + rec->rc = rc; + rec->generation = generation; + atomic_inc(&rc->elems); + + spin_lock(&re->lock); + list_add_tail(&rec->list, &re->extctl); + spin_unlock(&re->lock); + + /* leave the ref on the extent */ + + return 0; +} + +/* + * called with fs_info->reada_lock held + */ +static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) +{ + int i; + unsigned long index = zone->end >> PAGE_CACHE_SHIFT; + + for (i = 0; i < zone->ndevs; ++i) { + struct reada_zone *peer; + peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index); + if (peer && peer->device != zone->device) + peer->locked = lock; + } +} + +/* + * called with fs_info->reada_lock held + */ +static int reada_pick_zone(struct btrfs_device *dev) +{ + struct reada_zone *top_zone = NULL; + struct reada_zone *top_locked_zone = NULL; + u64 top_elems = 0; + u64 top_locked_elems = 0; + unsigned long index = 0; + int ret; + + if (dev->reada_curr_zone) { + reada_peer_zones_set_lock(dev->reada_curr_zone, 0); + kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); + dev->reada_curr_zone = NULL; + } + /* pick the zone with the most elements */ + while (1) { + struct reada_zone *zone; + + ret = radix_tree_gang_lookup(&dev->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + if (zone->locked) { + if (zone->elems > top_locked_elems) { + top_locked_elems = zone->elems; + top_locked_zone = zone; + } + } else { + if (zone->elems > top_elems) { + top_elems = zone->elems; + top_zone = zone; + } + } + } + if (top_zone) + dev->reada_curr_zone = top_zone; + else if (top_locked_zone) + dev->reada_curr_zone = top_locked_zone; + else + return 0; + + dev->reada_next = dev->reada_curr_zone->start; + kref_get(&dev->reada_curr_zone->refcnt); + reada_peer_zones_set_lock(dev->reada_curr_zone, 1); + + return 1; +} + +static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev) +{ + struct reada_extent *re = NULL; + int mirror_num = 0; + struct extent_buffer *eb = NULL; + u64 logical; + u32 blocksize; + int ret; + int i; + int need_kick = 0; + + spin_lock(&fs_info->reada_lock); + if (dev->reada_curr_zone == NULL) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + } + /* + * FIXME currently we issue the reads one extent at a time. If we have + * a contiguous block of extents, we could also coagulate them or use + * plugging to speed things up + */ + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + re = NULL; + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + } + if (ret == 0) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + dev->reada_next = re->logical + re->blocksize; + kref_get(&re->refcnt); + + spin_unlock(&fs_info->reada_lock); + + /* + * find mirror num + */ + for (i = 0; i < re->nzones; ++i) { + if (re->zones[i]->device == dev) { + mirror_num = i + 1; + break; + } + } + logical = re->logical; + blocksize = re->blocksize; + + spin_lock(&re->lock); + if (re->scheduled_for == NULL) { + re->scheduled_for = dev; + need_kick = 1; + } + spin_unlock(&re->lock); + + reada_extent_put(fs_info, re); + + if (!need_kick) + return 0; + + atomic_inc(&dev->reada_in_flight); + ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, + mirror_num, &eb); + if (ret) + __readahead_hook(fs_info->extent_root, NULL, logical, ret); + else if (eb) + __readahead_hook(fs_info->extent_root, eb, eb->start, ret); + + if (eb) + free_extent_buffer(eb); + + return 1; + +} + +static void reada_start_machine_worker(struct btrfs_work *work) +{ + struct reada_machine_work *rmw; + struct btrfs_fs_info *fs_info; + + rmw = container_of(work, struct reada_machine_work, work); + fs_info = rmw->fs_info; + + kfree(rmw); + + __reada_start_machine(fs_info); +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 enqueued; + u64 total = 0; + int i; + + do { + enqueued = 0; + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (atomic_read(&device->reada_in_flight) < + MAX_IN_FLIGHT) + enqueued += reada_start_machine_dev(fs_info, + device); + } + total += enqueued; + } while (enqueued && total < 10000); + + if (enqueued == 0) + return; + + /* + * If everything is already in the cache, this is effectively single + * threaded. To a) not hold the caller for too long and b) to utilize + * more cores, we broke the loop above after 10000 iterations and now + * enqueue to workers to finish it. This will distribute the load to + * the cores. + */ + for (i = 0; i < 2; ++i) + reada_start_machine(fs_info); +} + +static void reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct reada_machine_work *rmw; + + rmw = kzalloc(sizeof(*rmw), GFP_NOFS); + if (!rmw) { + /* FIXME we cannot handle this properly right now */ + BUG(); + } + rmw->work.func = reada_start_machine_worker; + rmw->fs_info = fs_info; + + btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); +} + +#ifdef DEBUG +static void dump_devs(struct btrfs_fs_info *fs_info, int all) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + unsigned long index; + int ret; + int i; + int j; + int cnt; + + spin_lock(&fs_info->reada_lock); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid, + atomic_read(&device->reada_in_flight)); + index = 0; + while (1) { + struct reada_zone *zone; + ret = radix_tree_gang_lookup(&device->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG " zone %llu-%llu elems %llu locked " + "%d devs", zone->start, zone->end, zone->elems, + zone->locked); + for (j = 0; j < zone->ndevs; ++j) { + printk(KERN_CONT " %lld", + zone->devs[j]->devid); + } + if (device->reada_curr_zone == zone) + printk(KERN_CONT " curr off %llu", + device->reada_next - zone->start); + printk(KERN_CONT "\n"); + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + } + cnt = 0; + index = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&device->reada_extents, + (void **)&re, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG + " re: logical %llu size %u empty %d for %lld", + re->logical, re->blocksize, + list_empty(&re->extctl), re->scheduled_for ? + re->scheduled_for->devid : -1); + + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + if (++cnt > 15) + break; + } + } + + index = 0; + cnt = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, + index, 1); + if (ret == 0) + break; + if (!re->scheduled_for) { + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + continue; + } + printk(KERN_DEBUG + "re: logical %llu size %u list empty %d for %lld", + re->logical, re->blocksize, list_empty(&re->extctl), + re->scheduled_for ? re->scheduled_for->devid : -1); + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + } + spin_unlock(&fs_info->reada_lock); +} +#endif + +/* + * interface + */ +struct reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *key_start, struct btrfs_key *key_end) +{ + struct reada_control *rc; + u64 start; + u64 generation; + int level; + struct extent_buffer *node; + static struct btrfs_key max_key = { + .objectid = (u64)-1, + .type = (u8)-1, + .offset = (u64)-1 + }; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return ERR_PTR(-ENOMEM); + + rc->root = root; + rc->key_start = *key_start; + rc->key_end = *key_end; + atomic_set(&rc->elems, 0); + init_waitqueue_head(&rc->wait); + kref_init(&rc->refcnt); + kref_get(&rc->refcnt); /* one ref for having elements */ + + node = btrfs_root_node(root); + start = node->start; + level = btrfs_header_level(node); + generation = btrfs_header_generation(node); + free_extent_buffer(node); + + reada_add_block(rc, start, &max_key, level, generation); + + reada_start_machine(root->fs_info); + + return rc; +} + +#ifdef DEBUG +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, + 5 * HZ); + dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0); + } + + dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0); + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#else +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event(rc->wait, atomic_read(&rc->elems) == 0); + } + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#endif + +void btrfs_reada_detach(void *handle) +{ + struct reada_control *rc = handle; + + kref_put(&rc->refcnt, reada_control_release); +} diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 59bb176..cfb5543 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, list_add_tail(&new_edge->list[UPPER], &new_node->lower); } + } else { + list_add_tail(&new_node->lower, &cache->leaves); } rb_node = tree_insert(&cache->rb_root, new_node->bytenr, @@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, - min_reserved, 0); + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) again: if (!err) { num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, - num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); if (ret) err = ret; } @@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; @@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode, unsigned long last_index; struct page *page; struct file_ra_state *ra; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int nr = 0; int ret = 0; @@ -2946,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode, index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; while (index <= last_index) { + mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + mutex_unlock(&inode->i_mutex); if (ret) goto out; @@ -2956,7 +2959,7 @@ static int relocate_file_extent_cluster(struct inode *inode, ra, NULL, index, last_index + 1 - index); page = find_or_create_page(inode->i_mapping, index, - GFP_NOFS); + mask); if (!page) { btrfs_delalloc_release_metadata(inode, PAGE_CACHE_SIZE); @@ -3323,8 +3326,11 @@ static int find_data_references(struct reloc_control *rc, } key.objectid = ref_objectid; - key.offset = ref_offset; key.type = BTRFS_EXTENT_DATA_KEY; + if (ref_offset > ((u64)-1 << 32)) + key.offset = 0; + else + key.offset = ref_offset; path->search_commit_root = 1; path->skip_locking = 1; @@ -3645,14 +3651,11 @@ int prepare_to_relocate(struct reloc_control *rc) * btrfs_init_reloc_root will use them when there * is no reservation in transaction handle. */ - ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, + ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, rc->extent_root->nodesize * 256); if (ret) return ret; - rc->block_rsv->refill_used = 1; - btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); - memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; rc->extents_found = 0; @@ -3777,8 +3780,7 @@ restart: } } - ret = btrfs_block_rsv_check(trans, rc->extent_root, - rc->block_rsv, 0, 5); + ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); if (ret < 0) { if (ret != -EAGAIN) { err = ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5..ddf2c90 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -17,10 +17,14 @@ */ #include <linux/blkdev.h> +#include <linux/ratelimit.h> #include "ctree.h" #include "volumes.h" #include "disk-io.h" #include "ordered-data.h" +#include "transaction.h" +#include "backref.h" +#include "extent_io.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -29,15 +33,12 @@ * any can be found. * * Future enhancements: - * - To enhance the performance, better read-ahead strategies for the - * extent-tree can be employed. * - In case an unrepairable extent is encountered, track which files are * affected and report them * - In case of a read error on files with nodatasum, map the file and read * the extent to trigger a writeback of the good copy * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space - * - make the prefetch cancellable */ struct scrub_bio; @@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix); struct scrub_page { u64 flags; /* extent flags */ u64 generation; - u64 mirror_num; + int mirror_num; int have_csum; u8 csum[BTRFS_CSUM_SIZE]; }; @@ -87,6 +88,7 @@ struct scrub_dev { int first_free; int curr; atomic_t in_flight; + atomic_t fixup_cnt; spinlock_t list_lock; wait_queue_head_t list_wait; u16 csum_size; @@ -100,6 +102,27 @@ struct scrub_dev { spinlock_t stat_lock; }; +struct scrub_fixup_nodatasum { + struct scrub_dev *sdev; + u64 logical; + struct btrfs_root *root; + struct btrfs_work work; + int mirror_num; +}; + +struct scrub_warning { + struct btrfs_path *path; + u64 extent_item_size; + char *scratch_buf; + char *msg_buf; + const char *errstr; + sector_t sector; + u64 logical; + struct btrfs_device *dev; + int msg_bufsize; + int scratch_bufsize; +}; + static void scrub_free_csums(struct scrub_dev *sdev) { while (!list_empty(&sdev->csum_list)) { @@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; - else + else sdev->bios[i]->next_free = -1; } sdev->first_free = 0; sdev->curr = -1; atomic_set(&sdev->in_flight, 0); + atomic_set(&sdev->fixup_cnt, 0); atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); + sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); INIT_LIST_HEAD(&sdev->csum_list); spin_lock_init(&sdev->list_lock); @@ -195,24 +219,366 @@ nomem: return ERR_PTR(-ENOMEM); } +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) +{ + u64 isize; + u32 nlink; + int ret; + int i; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct scrub_warning *swarn = ctx; + struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + struct btrfs_key root_key; + + root_key.objectid = root; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + ret = inode_item_info(inum, 0, local_root, swarn->path); + if (ret) { + btrfs_release_path(swarn->path); + goto err; + } + + eb = swarn->path->nodes[0]; + inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], + struct btrfs_inode_item); + isize = btrfs_inode_size(eb, inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(swarn->path); + + ipath = init_ipath(4096, local_root, swarn->path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto err; + } + ret = paths_from_inode(inum, ipath); + + if (ret < 0) + goto err; + + /* + * we deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (i = 0; i < ipath->fspath->elem_cnt; ++i) + printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu, " + "length %llu, links %u (path: %s)\n", swarn->errstr, + swarn->logical, swarn->dev->name, + (unsigned long long)swarn->sector, root, inum, offset, + min(isize - offset, (u64)PAGE_SIZE), nlink, + (char *)(unsigned long)ipath->fspath->val[i]); + + free_ipath(ipath); + return 0; + +err: + printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu: path " + "resolving failed with ret=%d\n", swarn->errstr, + swarn->logical, swarn->dev->name, + (unsigned long long)swarn->sector, root, inum, offset, ret); + + free_ipath(ipath); + return 0; +} + +static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, + int ix) +{ + struct btrfs_device *dev = sbio->sdev->dev; + struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct scrub_warning swarn; + u32 item_size; + int ret; + u64 ref_root; + u8 ref_level; + unsigned long ptr = 0; + const int bufsize = 4096; + u64 extent_offset; + + path = btrfs_alloc_path(); + + swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); + swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); + swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + swarn.logical = sbio->logical + ix * PAGE_SIZE; + swarn.errstr = errstr; + swarn.dev = dev; + swarn.msg_bufsize = bufsize; + swarn.scratch_bufsize = bufsize; + + if (!path || !swarn.scratch_buf || !swarn.msg_buf) + goto out; + + ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); + if (ret < 0) + goto out; + + extent_offset = swarn.logical - found_key.objectid; + swarn.extent_item_size = found_key.offset; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + do { + ret = tree_backref_for_extent(&ptr, eb, ei, item_size, + &ref_root, &ref_level); + printk(KERN_WARNING "%s at logical %llu on dev %s, " + "sector %llu: metadata %s (level %d) in tree " + "%llu\n", errstr, swarn.logical, dev->name, + (unsigned long long)swarn.sector, + ref_level ? "node" : "leaf", + ret < 0 ? -1 : ref_level, + ret < 0 ? -1 : ref_root); + } while (ret != 1); + } else { + swarn.path = path; + iterate_extent_inodes(fs_info, path, found_key.objectid, + extent_offset, + scrub_print_warning_inode, &swarn); + } + +out: + btrfs_free_path(path); + kfree(swarn.scratch_buf); + kfree(swarn.msg_buf); +} + +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct page *page = NULL; + unsigned long index; + struct scrub_fixup_nodatasum *fixup = ctx; + int ret; + int corrected = 0; + struct btrfs_key key; + struct inode *inode = NULL; + u64 end = offset + PAGE_SIZE - 1; + struct btrfs_root *local_root; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); + if (IS_ERR(local_root)) + return PTR_ERR(local_root); + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + index = offset >> PAGE_CACHE_SHIFT; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) { + struct btrfs_mapping_tree *map_tree; + if (PageDirty(page)) { + /* + * we need to write the data to the defect sector. the + * data that was in that sector is not in memory, + * because the page was modified. we must not write the + * modified page to that sector. + * + * TODO: what could be done here: wait for the delalloc + * runner to write out that page (might involve + * COW) and see whether the sector is still + * referenced afterwards. + * + * For the meantime, we'll treat this error + * incorrectable, although there is a chance that a + * later scrub will find the bad sector again and that + * there's no dirty page in memory, then. + */ + ret = -EIO; + goto out; + } + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; + ret = repair_io_failure(map_tree, offset, PAGE_SIZE, + fixup->logical, page, + fixup->mirror_num); + unlock_page(page); + corrected = !ret; + } else { + /* + * we need to get good data first. the general readpage path + * will call repair_io_failure for us, we just have to make + * sure we read the bad mirror. + */ + ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + if (ret) { + /* set_extent_bits should give proper error */ + WARN_ON(ret > 0); + if (ret > 0) + ret = -EFAULT; + goto out; + } + + ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, + btrfs_get_extent, + fixup->mirror_num); + wait_on_page_locked(page); + + corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, + end, EXTENT_DAMAGED, 0, NULL); + if (!corrected) + clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + } + +out: + if (page) + put_page(page); + if (inode) + iput(inode); + + if (ret < 0) + return ret; + + if (ret == 0 && corrected) { + /* + * we only need to call readpage for one of the inodes belonging + * to this extent. so make iterate_extent_inodes stop + */ + return 1; + } + + return -EIO; +} + +static void scrub_fixup_nodatasum(struct btrfs_work *work) +{ + int ret; + struct scrub_fixup_nodatasum *fixup; + struct scrub_dev *sdev; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + int uncorrectable = 0; + + fixup = container_of(work, struct scrub_fixup_nodatasum, work); + sdev = fixup->sdev; + fs_info = fixup->root->fs_info; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.malloc_errors; + spin_unlock(&sdev->stat_lock); + uncorrectable = 1; + goto out; + } + + trans = btrfs_join_transaction(fixup->root); + if (IS_ERR(trans)) { + uncorrectable = 1; + goto out; + } + + /* + * the idea is to trigger a regular read through the standard path. we + * read a page from the (failed) logical address by specifying the + * corresponding copynum of the failed sector. thus, that readpage is + * expected to fail. + * that is the point where on-the-fly error correction will kick in + * (once it's finished) and rewrite the failed sector if a good copy + * can be found. + */ + ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, + path, scrub_fixup_readpage, + fixup); + if (ret < 0) { + uncorrectable = 1; + goto out; + } + WARN_ON(ret != 1); + + spin_lock(&sdev->stat_lock); + ++sdev->stat.corrected_errors; + spin_unlock(&sdev->stat_lock); + +out: + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, fixup->root); + if (uncorrectable) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.uncorrectable_errors; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup " + "(nodatasum) error at logical %llu\n", + fixup->logical); + } + + btrfs_free_path(path); + kfree(fixup); + + /* see caller why we're pretending to be paused in the scrub counters */ + mutex_lock(&fs_info->scrub_lock); + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_dec(&sdev->fixup_cnt); + wake_up(&fs_info->scrub_pause_wait); + wake_up(&sdev->list_wait); +} + /* * scrub_recheck_error gets called when either verification of the page * failed or the bio failed to read, e.g. with EIO. In the latter case, * recheck_error gets called for every page in the bio, even though only * one may be bad */ -static void scrub_recheck_error(struct scrub_bio *sbio, int ix) +static int scrub_recheck_error(struct scrub_bio *sbio, int ix) { + struct scrub_dev *sdev = sbio->sdev; + u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + if (sbio->err) { - if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, - (sbio->physical + ix * PAGE_SIZE) >> 9, + if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, sbio->bio->bi_io_vec[ix].bv_page) == 0) { if (scrub_fixup_check(sbio, ix) == 0) - return; + return 0; } + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sbio, ix); + } else { + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sbio, ix); } + spin_lock(&sdev->stat_lock); + ++sdev->stat.read_errors; + spin_unlock(&sdev->stat_lock); + scrub_fixup(sbio, ix); + return 1; } static int scrub_fixup_check(struct scrub_bio *sbio, int ix) @@ -250,7 +616,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) struct scrub_dev *sdev = sbio->sdev; struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; + struct scrub_fixup_nodatasum *fixup; u64 logical = sbio->logical + ix * PAGE_SIZE; u64 length; int i; @@ -259,38 +626,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && (sbio->spag[ix].have_csum == 0)) { + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + goto uncorrectable; + fixup->sdev = sdev; + fixup->logical = logical; + fixup->root = fs_info->extent_root; + fixup->mirror_num = sbio->spag[ix].mirror_num; /* - * nodatasum, don't try to fix anything - * FIXME: we can do better, open the inode and trigger a - * writeback + * increment scrubs_running to prevent cancel requests from + * completing as long as a fixup worker is running. we must also + * increment scrubs_paused to prevent deadlocking on pause + * requests used for transactions commits (as the worker uses a + * transaction context). it is safe to regard the fixup worker + * as paused for all matters practical. effectively, we only + * avoid cancellation requests from completing. */ - goto uncorrectable; + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_inc(&sdev->fixup_cnt); + fixup->work.func = scrub_fixup_nodatasum; + btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); + return; } length = PAGE_SIZE; ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, - &multi, 0); - if (ret || !multi || length < PAGE_SIZE) { + &bbio, 0); + if (ret || !bbio || length < PAGE_SIZE) { printk(KERN_ERR "scrub_fixup: btrfs_map_block failed us for %llu\n", (unsigned long long)logical); WARN_ON(1); + kfree(bbio); return; } - if (multi->num_stripes == 1) + if (bbio->num_stripes == 1) /* there aren't any replicas */ goto uncorrectable; /* * first find a good copy */ - for (i = 0; i < multi->num_stripes; ++i) { - if (i == sbio->spag[ix].mirror_num) + for (i = 0; i < bbio->num_stripes; ++i) { + if (i + 1 == sbio->spag[ix].mirror_num) continue; - if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, - multi->stripes[i].physical >> 9, + if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, + bbio->stripes[i].physical >> 9, sbio->bio->bi_io_vec[ix].bv_page)) { /* I/O-error, this is not a good copy */ continue; @@ -299,7 +685,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if (scrub_fixup_check(sbio, ix) == 0) break; } - if (i == multi->num_stripes) + if (i == bbio->num_stripes) goto uncorrectable; if (!sdev->readonly) { @@ -314,25 +700,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) } } - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.corrected_errors; spin_unlock(&sdev->stat_lock); - if (printk_ratelimit()) - printk(KERN_ERR "btrfs: fixed up at %llu\n", - (unsigned long long)logical); + printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", + (unsigned long long)logical); return; uncorrectable: - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - if (printk_ratelimit()) - printk(KERN_ERR "btrfs: unable to fixup at %llu\n", - (unsigned long long)logical); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " + "logical %llu\n", (unsigned long long)logical); } static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, @@ -382,8 +766,14 @@ static void scrub_checksum(struct btrfs_work *work) int ret; if (sbio->err) { + ret = 0; for (i = 0; i < sbio->count; ++i) - scrub_recheck_error(sbio, i); + ret |= scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); sbio->bio->bi_flags |= 1 << BIO_UPTODATE; @@ -396,10 +786,6 @@ static void scrub_checksum(struct btrfs_work *work) bi->bv_offset = 0; bi->bv_len = PAGE_SIZE; } - - spin_lock(&sdev->stat_lock); - ++sdev->stat.read_errors; - spin_unlock(&sdev->stat_lock); goto out; } for (i = 0; i < sbio->count; ++i) { @@ -420,8 +806,14 @@ static void scrub_checksum(struct btrfs_work *work) WARN_ON(1); } kunmap_atomic(buffer, KM_USER0); - if (ret) - scrub_recheck_error(sbio, i); + if (ret) { + ret = scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } + } } out: @@ -557,57 +949,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) static int scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; - struct bio *bio; - int i; if (sdev->curr == -1) return 0; sbio = sdev->bios[sdev->curr]; - - bio = bio_alloc(GFP_NOFS, sbio->count); - if (!bio) - goto nomem; - - bio->bi_private = sbio; - bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = sbio->physical >> 9; - - for (i = 0; i < sbio->count; ++i) { - struct page *page; - int ret; - - page = alloc_page(GFP_NOFS); - if (!page) - goto nomem; - - ret = bio_add_page(bio, page, PAGE_SIZE, 0); - if (!ret) { - __free_page(page); - goto nomem; - } - } - sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(READ, bio); + submit_bio(READ, sbio->bio); return 0; - -nomem: - scrub_free_bio(bio); - - return -ENOMEM; } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, u64 mirror_num, + u64 physical, u64 flags, u64 gen, int mirror_num, u8 *csum, int force) { struct scrub_bio *sbio; + struct page *page; + int ret; again: /* @@ -628,12 +990,22 @@ again: } sbio = sdev->bios[sdev->curr]; if (sbio->count == 0) { + struct bio *bio; + sbio->physical = physical; sbio->logical = logical; + bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); + if (!bio) + return -ENOMEM; + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sdev->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + sbio->err = 0; + sbio->bio = bio; } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || sbio->logical + sbio->count * PAGE_SIZE != logical) { - int ret; - ret = scrub_submit(sdev); if (ret) return ret; @@ -643,6 +1015,20 @@ again: sbio->spag[sbio->count].generation = gen; sbio->spag[sbio->count].have_csum = 0; sbio->spag[sbio->count].mirror_num = mirror_num; + + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + + ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + ret = scrub_submit(sdev); + if (ret) + return ret; + goto again; + } + if (csum) { sbio->spag[sbio->count].have_csum = 1; memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); @@ -701,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, /* scrub extent tries to collect up to 64 kB for each bio */ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, u64 mirror_num) + u64 physical, u64 flags, u64 gen, int mirror_num) { int ret; u8 csum[BTRFS_CSUM_SIZE]; @@ -741,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, int slot; int i; u64 nstripes; - int start_stripe; struct extent_buffer *l; struct btrfs_key key; u64 physical; u64 logical; u64 generation; - u64 mirror_num; + int mirror_num; + struct reada_control *reada1; + struct reada_control *reada2; + struct btrfs_key key_start; + struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; @@ -758,102 +1147,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; - mirror_num = 0; + mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; - mirror_num = num % map->sub_stripes; + mirror_num = num % map->sub_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { increment = map->stripe_len; - mirror_num = num % map->num_stripes; + mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; - mirror_num = num % map->num_stripes; + mirror_num = num % map->num_stripes + 1; } else { increment = map->stripe_len; - mirror_num = 0; + mirror_num = 1; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; path->search_commit_root = 1; path->skip_locking = 1; /* - * find all extents for each stripe and just read them to get - * them into the page cache - * FIXME: we can do better. build a more intelligent prefetching + * trigger the readahead for extent tree csum tree and wait for + * completion. During readahead, the scrub is officially paused + * to not hold off transaction commits */ logical = base + offset; - physical = map->stripes[num].physical; - ret = 0; - for (i = 0; i < nstripes; ++i) { - key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out_noplug; - /* - * we might miss half an extent here, but that doesn't matter, - * as it's only the prefetch - */ - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out_noplug; - - break; - } - btrfs_item_key_to_cpu(l, &key, slot); + wait_event(sdev->list_wait, + atomic_read(&sdev->in_flight) == 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); - if (key.objectid >= logical + map->stripe_len) - break; + /* FIXME it might be better to start readahead at commit root */ + key_start.objectid = logical; + key_start.type = BTRFS_EXTENT_ITEM_KEY; + key_start.offset = (u64)0; + key_end.objectid = base + offset + nstripes * increment; + key_end.type = BTRFS_EXTENT_ITEM_KEY; + key_end.offset = (u64)0; + reada1 = btrfs_reada_add(root, &key_start, &key_end); + + key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_start.type = BTRFS_EXTENT_CSUM_KEY; + key_start.offset = logical; + key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_end.type = BTRFS_EXTENT_CSUM_KEY; + key_end.offset = base + offset + nstripes * increment; + reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + + if (!IS_ERR(reada1)) + btrfs_reada_wait(reada1); + if (!IS_ERR(reada2)) + btrfs_reada_wait(reada2); - path->slots[0]++; - } - btrfs_release_path(path); - logical += increment; - physical += map->stripe_len; - cond_resched(); + mutex_lock(&fs_info->scrub_lock); + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); } + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); /* * collect all data csums for the stripe to avoid seeking during * the scrub. This might currently (crc32) end up to be about 1MB */ - start_stripe = 0; blk_start_plug(&plug); -again: - logical = base + offset + start_stripe * increment; - for (i = start_stripe; i < nstripes; ++i) { - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sdev->csum_list, 1); - if (ret) - goto out; - logical += increment; - cond_resched(); - } /* * now find all extents for each stripe and scrub them */ - logical = base + offset + start_stripe * increment; - physical = map->stripes[num].physical + start_stripe * map->stripe_len; + logical = base + offset; + physical = map->stripes[num].physical; ret = 0; - for (i = start_stripe; i < nstripes; ++i) { + for (i = 0; i < nstripes; ++i) { /* * canceled? */ @@ -882,11 +1257,14 @@ again: atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); wake_up(&fs_info->scrub_pause_wait); - scrub_free_csums(sdev); - start_stripe = i; - goto again; } + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sdev->csum_list, 1); + if (ret) + goto out; + key.objectid = logical; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)0; @@ -982,7 +1360,6 @@ next: out: blk_finish_plug(&plug); -out_noplug: btrfs_free_path(path); return ret < 0 ? ret : 0; } @@ -1158,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; mutex_lock(&fs_info->scrub_lock); if (fs_info->scrub_workers_refcnt == 0) { btrfs_init_workers(&fs_info->scrub_workers, "scrub", fs_info->thread_pool_size, &fs_info->generic_worker); fs_info->scrub_workers.idle_thresh = 4; - btrfs_start_workers(&fs_info->scrub_workers, 1); + ret = btrfs_start_workers(&fs_info->scrub_workers); + if (ret) + goto out; } ++fs_info->scrub_workers_refcnt; +out: mutex_unlock(&fs_info->scrub_lock); - return 0; + return ret; } static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) @@ -1253,10 +1634,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, ret = scrub_enumerate_chunks(sdev, start, end); wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); - atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); + wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); + if (progress) memcpy(progress, &sdev->stat, sizeof(*progress)); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 15634d4..ae488aa 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -40,6 +40,7 @@ #include <linux/magic.h> #include <linux/slab.h> #include <linux/cleancache.h> +#include <linux/ratelimit.h> #include "compat.h" #include "delayed-inode.h" #include "ctree.h" @@ -58,6 +59,7 @@ #include <trace/events/btrfs.h> static const struct super_operations btrfs_super_ops; +static struct file_system_type btrfs_fs_type; static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, char nbuf[16]) @@ -162,7 +164,7 @@ enum { Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_err, + Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, }; static match_table_t tokens = { @@ -195,6 +197,8 @@ static match_table_t tokens = { {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, + {Opt_no_space_cache, "nospace_cache"}, + {Opt_recovery, "recovery"}, {Opt_err, NULL}, }; @@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) { struct btrfs_fs_info *info = root->fs_info; substring_t args[MAX_OPT_ARGS]; - char *p, *num, *orig; + char *p, *num, *orig = NULL; + u64 cache_gen; int intarg; int ret = 0; char *compress_type; bool compress_force = false; + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); + if (cache_gen) + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + if (!options) - return 0; + goto out; /* * strsep changes the string, duplicate it because parse_options @@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, DISCARD); break; case Opt_space_cache: - printk(KERN_INFO "btrfs: enabling disk space caching\n"); btrfs_set_opt(info->mount_opt, SPACE_CACHE); break; + case Opt_no_space_cache: + printk(KERN_INFO "btrfs: disabling disk space caching\n"); + btrfs_clear_opt(info->mount_opt, SPACE_CACHE); + break; case Opt_inode_cache: printk(KERN_INFO "btrfs: enabling inode map caching\n"); btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); @@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto defrag"); btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); break; + case Opt_recovery: + printk(KERN_INFO "btrfs: enabling auto recovery"); + btrfs_set_opt(info->mount_opt, RECOVERY); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } } out: + if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + printk(KERN_INFO "btrfs: disk space caching is enabled\n"); kfree(orig); return ret; } @@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; - char *opts, *orig, *p; + char *device_name, *opts, *orig, *p; int error = 0; int intarg; if (!options) - goto out; + return 0; /* * strsep changes the string, duplicate it because parse_options @@ -430,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, token = match_token(p, tokens, args); switch (token) { case Opt_subvol: + kfree(*subvol_name); *subvol_name = match_strdup(&args[0]); break; case Opt_subvolid: @@ -457,29 +476,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, } break; case Opt_device: - error = btrfs_scan_one_device(match_strdup(&args[0]), + device_name = match_strdup(&args[0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + error = btrfs_scan_one_device(device_name, flags, holder, fs_devices); + kfree(device_name); if (error) - goto out_free_opts; + goto out; break; default: break; } } - out_free_opts: +out: kfree(orig); - out: - /* - * If no subvolume name is specified we use the default one. Allocate - * a copy of the string "." here so that code later in the - * mount path doesn't care if it's the default volume or another one. - */ - if (!*subvol_name) { - *subvol_name = kstrdup(".", GFP_KERNEL); - if (!*subvol_name) - return -ENOMEM; - } return error; } @@ -492,7 +506,6 @@ static struct dentry *get_default_root(struct super_block *sb, struct btrfs_path *path; struct btrfs_key location; struct inode *inode; - struct dentry *dentry; u64 dir_id; int new = 0; @@ -517,7 +530,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. */ - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -566,29 +579,7 @@ setup_root: return dget(sb->s_root); } - if (new) { - const struct qstr name = { .name = "/", .len = 1 }; - - /* - * New inode, we need to make the dentry a sibling of s_root so - * everything gets cleaned up properly on unmount. - */ - dentry = d_alloc(sb->s_root, &name); - if (!dentry) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - d_splice_alias(inode, dentry); - } else { - /* - * We found the inode in cache, just find a dentry for it and - * put the reference to the inode we just got. - */ - dentry = d_find_alias(inode); - iput(inode); - } - - return dentry; + return d_obtain_alias(inode); } static int btrfs_fill_super(struct super_block *sb, @@ -670,9 +661,9 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return ret; } -static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { - struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); + struct btrfs_root *root = btrfs_sb(dentry->d_sb); struct btrfs_fs_info *info = root->fs_info; char *compress_type; @@ -719,6 +710,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noacl"); if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); + else + seq_puts(seq, ",nospace_cache"); if (btrfs_test_opt(root, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) @@ -753,6 +746,111 @@ static int btrfs_set_super(struct super_block *s, void *data) return set_anon_super(s, data); } +/* + * subvolumes are identified by ino 256 + */ +static inline int is_subvolume_inode(struct inode *inode) +{ + if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} + +/* + * This will strip out the subvol=%s argument for an argument string and add + * subvolid=0 to make sure we get the actual tree root for path walking to the + * subvol we want. + */ +static char *setup_root_args(char *args) +{ + unsigned copied = 0; + unsigned len = strlen(args) + 2; + char *pos; + char *ret; + + /* + * We need the same args as before, but minus + * + * subvol=a + * + * and add + * + * subvolid=0 + * + * which is a difference of 2 characters, so we allocate strlen(args) + + * 2 characters. + */ + ret = kzalloc(len * sizeof(char), GFP_NOFS); + if (!ret) + return NULL; + pos = strstr(args, "subvol="); + + /* This shouldn't happen, but just in case.. */ + if (!pos) { + kfree(ret); + return NULL; + } + + /* + * The subvol=<> arg is not at the front of the string, copy everybody + * up to that into ret. + */ + if (pos != args) { + *pos = '\0'; + strcpy(ret, args); + copied += strlen(args); + pos++; + } + + strncpy(ret + copied, "subvolid=0", len - copied); + + /* Length of subvolid=0 */ + copied += 10; + + /* + * If there is no , after the subvol= option then we know there's no + * other options and we can just return. + */ + pos = strchr(pos, ','); + if (!pos) + return ret; + + /* Copy the rest of the arguments into our buffer */ + strncpy(ret + copied, pos, len - copied); + copied += strlen(pos); + + return ret; +} + +static struct dentry *mount_subvol(const char *subvol_name, int flags, + const char *device_name, char *data) +{ + struct dentry *root; + struct vfsmount *mnt; + char *newargs; + + newargs = setup_root_args(data); + if (!newargs) + return ERR_PTR(-ENOMEM); + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, + newargs); + kfree(newargs); + if (IS_ERR(mnt)) + return ERR_CAST(mnt); + + root = mount_subtree(mnt, subvol_name); + + if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + struct super_block *s = root->d_sb; + dput(root); + root = ERR_PTR(-EINVAL); + deactivate_locked_super(s); + printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", + subvol_name); + } + + return root; +} /* * Find a superblock for the given device / mount point. @@ -767,7 +865,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; - struct btrfs_root *tree_root = NULL; struct btrfs_fs_info *fs_info = NULL; fmode_t mode = FMODE_READ; char *subvol_name = NULL; @@ -781,21 +878,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name, &subvol_objectid, &subvol_rootid, &fs_devices); - if (error) + if (error) { + kfree(subvol_name); return ERR_PTR(error); + } - error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); - if (error) - goto error_free_subvol_name; + if (subvol_name) { + root = mount_subvol(subvol_name, flags, device_name, data); + kfree(subvol_name); + return root; + } - error = btrfs_open_devices(fs_devices, mode, fs_type); + error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); if (error) - goto error_free_subvol_name; - - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; - } + return ERR_PTR(error); /* * Setup a dummy root and fs_info for test/set super. This is because @@ -804,19 +900,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, * then open_ctree will properly initialize everything later. */ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); - tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!fs_info || !tree_root) { + if (!fs_info) + return ERR_PTR(-ENOMEM); + + fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info->tree_root) { error = -ENOMEM; - goto error_close_devices; + goto error_fs_info; } - fs_info->tree_root = tree_root; + fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; - tree_root->fs_info = fs_info; + + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + error = -ENOMEM; + goto error_fs_info; + } + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_fs_info; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); - if (IS_ERR(s)) - goto error_s; + s = sget(fs_type, btrfs_test_super, btrfs_set_super, + fs_info->tree_root); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto error_close_devices; + } if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { @@ -826,75 +943,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } btrfs_close_devices(fs_devices); - kfree(fs_info); - kfree(tree_root); + free_fs_info(fs_info); } else { char b[BDEVNAME_SIZE]; s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + btrfs_sb(s)->fs_info->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); - goto error_free_subvol_name; + return ERR_PTR(error); } - btrfs_sb(s)->fs_info->bdev_holder = fs_type; s->s_flags |= MS_ACTIVE; } - /* if they gave us a subvolume name bind mount into that */ - if (strcmp(subvol_name, ".")) { - struct dentry *new_root; - - root = get_default_root(s, subvol_rootid); - if (IS_ERR(root)) { - error = PTR_ERR(root); - deactivate_locked_super(s); - goto error_free_subvol_name; - } - - mutex_lock(&root->d_inode->i_mutex); - new_root = lookup_one_len(subvol_name, root, - strlen(subvol_name)); - mutex_unlock(&root->d_inode->i_mutex); - - if (IS_ERR(new_root)) { - dput(root); - deactivate_locked_super(s); - error = PTR_ERR(new_root); - goto error_free_subvol_name; - } - if (!new_root->d_inode) { - dput(root); - dput(new_root); - deactivate_locked_super(s); - error = -ENXIO; - goto error_free_subvol_name; - } - dput(root); - root = new_root; - } else { - root = get_default_root(s, subvol_objectid); - if (IS_ERR(root)) { - error = PTR_ERR(root); - deactivate_locked_super(s); - goto error_free_subvol_name; - } + root = get_default_root(s, subvol_objectid); + if (IS_ERR(root)) { + deactivate_locked_super(s); + return root; } - kfree(subvol_name); return root; -error_s: - error = PTR_ERR(s); error_close_devices: btrfs_close_devices(fs_devices); - kfree(fs_info); - kfree(tree_root); -error_free_subvol_name: - kfree(subvol_name); +error_fs_info: + free_fs_info(fs_info); return ERR_PTR(error); } @@ -919,7 +996,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (root->fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + if (btrfs_super_log_root(root->fs_info->super_copy) != 0) return -EINVAL; ret = btrfs_cleanup_fs_roots(root->fs_info); @@ -976,11 +1053,11 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) u64 avail_space; u64 used_space; u64 min_stripe_size; - int min_stripes = 1; + int min_stripes = 1, num_stripes = 1; int i = 0, nr_devices; int ret; - nr_devices = fs_info->fs_devices->rw_devices; + nr_devices = fs_info->fs_devices->open_devices; BUG_ON(!nr_devices); devices_info = kmalloc(sizeof(*devices_info) * nr_devices, @@ -990,20 +1067,24 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) /* calc min stripe number for data space alloction */ type = btrfs_get_alloc_profile(root, 1); - if (type & BTRFS_BLOCK_GROUP_RAID0) + if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; - else if (type & BTRFS_BLOCK_GROUP_RAID1) + num_stripes = nr_devices; + } else if (type & BTRFS_BLOCK_GROUP_RAID1) { min_stripes = 2; - else if (type & BTRFS_BLOCK_GROUP_RAID10) + num_stripes = 2; + } else if (type & BTRFS_BLOCK_GROUP_RAID10) { min_stripes = 4; + num_stripes = 4; + } if (type & BTRFS_BLOCK_GROUP_DUP) min_stripe_size = 2 * BTRFS_STRIPE_LEN; else min_stripe_size = BTRFS_STRIPE_LEN; - list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - if (!device->in_fs_metadata) + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->in_fs_metadata || !device->bdev) continue; avail_space = device->total_bytes - device->bytes_used; @@ -1064,13 +1145,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) i = nr_devices - 1; avail_space = 0; while (nr_devices >= min_stripes) { + if (num_stripes > nr_devices) + num_stripes = nr_devices; + if (devices_info[i].max_avail >= min_stripe_size) { int j; u64 alloc_size; - avail_space += devices_info[i].max_avail * min_stripes; + avail_space += devices_info[i].max_avail * num_stripes; alloc_size = devices_info[i].max_avail; - for (j = i + 1 - min_stripes; j <= i; j++) + for (j = i + 1 - num_stripes; j <= i; j++) devices_info[j].max_avail -= alloc_size; } i--; @@ -1085,7 +1169,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; @@ -1187,6 +1271,16 @@ static int btrfs_unfreeze(struct super_block *sb) return 0; } +static void btrfs_fs_dirty_inode(struct inode *inode, int flags) +{ + int ret; + + ret = btrfs_dirty_inode(inode); + if (ret) + printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu " + "error %d\n", btrfs_ino(inode), ret); +} + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, @@ -1194,7 +1288,7 @@ static const struct super_operations btrfs_super_ops = { .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, .write_inode = btrfs_write_inode, - .dirty_inode = btrfs_dirty_inode, + .dirty_inode = btrfs_fs_dirty_inode, .alloc_inode = btrfs_alloc_inode, .destroy_inode = btrfs_destroy_inode, .statfs = btrfs_statfs, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e24b796..81376d9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) struct btrfs_transaction *cur_trans; spin_lock(&root->fs_info->trans_lock); +loop: if (root->fs_info->trans_no_join) { if (!nofail) { spin_unlock(&root->fs_info->trans_lock); @@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; + spin_lock(&root->fs_info->trans_lock); if (root->fs_info->running_transaction) { + /* + * someone started a transaction after we unlocked. Make sure + * to redo the trans_no_join checks above + */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); cur_trans = root->fs_info->running_transaction; - atomic_inc(&cur_trans->use_count); - atomic_inc(&cur_trans->num_writers); - cur_trans->num_joined++; - spin_unlock(&root->fs_info->trans_lock); - return 0; + goto loop; } + atomic_set(&cur_trans->num_writers, 1); cur_trans->num_joined = 0; init_waitqueue_head(&cur_trans->writer_wait); @@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, */ if (num_items > 0 && root != root->fs_info->chunk_root) { num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_block_rsv_add(NULL, root, + ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, num_bytes); if (ret) @@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - ret = btrfs_block_rsv_check(trans, root, - &root->fs_info->global_block_rsv, 0, 5); + + ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); return ret ? 1 : 0; } @@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_block_rsv *rsv = trans->block_rsv; int updates; smp_mb(); if (cur_trans->blocked || cur_trans->delayed_refs.flushing) return 1; + /* + * We need to do this in case we're deleting csums so the global block + * rsv get's used instead of the csum block rsv. + */ + trans->block_rsv = NULL; + updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) btrfs_run_delayed_refs(trans, root, updates); + trans->block_rsv = rsv; + return should_end_transaction(trans, root); } @@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; while (count < 4) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } - btrfs_trans_release_metadata(trans, root); - if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { trans->transaction->blocked = 1; @@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { - int ret; int err = 0; int werr = 0; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; u64 start = 0; u64 end; - unsigned long index; - - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - while (start <= end) { - cond_resched(); - - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - - btree_lock_page_hook(page); - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - continue; - } - if (PageWriteback(page)) { - if (PageDirty(page)) - wait_on_page_writeback(page); - else { - unlock_page(page); - page_cache_release(page); - continue; - } - } - err = write_one_page(page, 0); - if (err) - werr = err; - page_cache_release(page); - } + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + mark)) { + convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, + GFP_NOFS); + err = filemap_fdatawrite_range(mapping, start, end); + if (err) + werr = err; + cond_resched(); + start = end + 1; } if (err) werr = err; @@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root, int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { - int ret; int err = 0; int werr = 0; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; u64 start = 0; u64 end; - unsigned long index; - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - - clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); - while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - if (PageDirty(page)) { - btree_lock_page_hook(page); - wait_on_page_writeback(page); - err = write_one_page(page, 0); - if (err) - werr = err; - } - wait_on_page_writeback(page); - page_cache_release(page); - cond_resched(); - } + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT)) { + clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); + err = filemap_fdatawait_range(mapping, start, end); + if (err) + werr = err; + cond_resched(); + start = end + 1; } if (err) werr = err; @@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, ret = btrfs_write_marked_extents(root, dirty_pages, mark); ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); - return ret || ret2; + + if (ret) + return ret; + if (ret2) + return ret2; + return 0; } int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, @@ -816,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_save_ino_cache(root, trans); + /* see comments in should_cow_block() */ + root->force_cow = 0; + smp_wmb(); + if (root->commit_root != root->node) { mutex_lock(&root->fs_commit_mutex); switch_commit_root(root); @@ -911,11 +884,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); - btrfs_orphan_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, - to_reserve); + ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, + to_reserve); if (ret) { pending->error = ret; goto fail; @@ -979,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(old); free_extent_buffer(old); + /* see comments in should_cow_block() */ + root->force_cow = 1; + smp_wmb(); + btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ key.offset = trans->transid; @@ -1002,7 +978,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BUG_ON(IS_ERR(pending->snap)); btrfs_reloc_post_snapshot(trans, pending); - btrfs_orphan_post_snapshot(trans, pending); fail: kfree(new_root_item); trans->block_rsv = rsv; @@ -1032,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root) struct btrfs_root_item *root_item; struct btrfs_super_block *super; - super = &root->fs_info->super_copy; + super = root->fs_info->super_copy; root_item = &root->fs_info->chunk_root->root_item; super->chunk_root = root_item->bytenr; @@ -1043,7 +1018,7 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; - if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root, SPACE_CACHE)) super->cache_generation = root_item->generation; } @@ -1168,14 +1143,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_ordered_operations(root, 0); + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); - btrfs_trans_release_metadata(trans, root); - cur_trans = trans->transaction; /* * set the flushing flag so procs in this transaction have to @@ -1341,12 +1317,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, update_super_roots(root); if (!root->fs_info->log_root_recovering) { - btrfs_set_super_log_root(&root->fs_info->super_copy, 0); - btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root(root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); } - memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, - sizeof(root->fs_info->super_copy)); + memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, + sizeof(*root->fs_info->super_copy)); trans->transaction->blocked = 0; spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 786639f..3568374 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log, struct walk_control *wc, u64 gen) { if (wc->pin) - btrfs_pin_extent(log->fs_info->extent_root, - eb->start, eb->len, 0); + btrfs_pin_extent_for_log_replay(wc->trans, + log->fs_info->extent_root, + eb->start, eb->len); if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -1030,7 +1031,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, } btrfs_release_path(path); if (nlink != inode->i_nlink) { - inode->i_nlink = nlink; + set_nlink(inode, nlink); btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, bytenr, blocksize); BUG_ON(ret); } @@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(next); WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, path->nodes[*level]->start, path->nodes[*level]->len); BUG_ON(ret); @@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(log, next->start, + ret = btrfs_free_and_pin_reserved_extent(log, next->start, next->len); BUG_ON(ret); } @@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); - while (1) { unsigned long batch = root->log_batch; - if (root->log_multiple_pids) { + /* when we're on an ssd, just kick the log commit out */ + if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); @@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, BUG_ON(ret); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_set_super_log_root(&root->fs_info->super_for_commit, + btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); - btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_set_super_log_root_level(root->fs_info->super_for_commit, btrfs_header_level(log_root_tree->node)); log_root_tree->log_batch = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc7..f4b839f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -295,6 +295,12 @@ loop_lock: btrfs_requeue_work(&device->work); goto done; } + /* unplug every 64 requests just for good measure */ + if (batch_run % 64 == 0) { + blk_finish_plug(&plug); + blk_start_plug(&plug); + sync_pending = 0; + } } cond_resched(); @@ -366,6 +372,14 @@ static noinline int device_list_add(const char *path, } INIT_LIST_HEAD(&device->dev_alloc_list); + /* init readahead state */ + spin_lock_init(&device->reada_lock); + device->reada_curr_zone = NULL; + atomic_set(&device->reada_in_flight, 0); + device->reada_next = 0; + INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); + mutex_lock(&fs_devices->device_list_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); mutex_unlock(&fs_devices->device_list_mutex); @@ -597,10 +611,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; + if (!bh) goto error_close; - } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -655,7 +667,7 @@ error: continue; } if (fs_devices->open_devices == 0) { - ret = -EIO; + ret = -EINVAL; goto out; } fs_devices->seeding = seeding; @@ -993,7 +1005,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, key.objectid = device->devid; key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; - +again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, @@ -1006,6 +1018,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_dev_extent); BUG_ON(found_key.offset > start || found_key.offset + btrfs_dev_extent_length(leaf, extent) < start); + key = found_key; + btrfs_release_path(path); + goto again; } else if (ret == 0) { leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], @@ -1013,8 +1028,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, } BUG_ON(ret); - if (device->bytes_used > 0) - device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + if (device->bytes_used > 0) { + u64 len = btrfs_dev_extent_length(leaf, extent); + device->bytes_used -= len; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += len; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = btrfs_del_item(trans, root, path); out: @@ -1356,6 +1376,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_undo; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space = device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + device->in_fs_metadata = 0; btrfs_scrub_cancel_dev(root, device); @@ -1387,8 +1412,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) call_rcu(&device->rcu, free_device); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; - btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1450,7 +1475,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct btrfs_device *device; u64 super_flags; @@ -1592,7 +1617,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -1691,15 +1716,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes; + spin_unlock(&root->fs_info->free_chunk_lock); + if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; - total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); - btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + btrfs_set_super_total_bytes(root->fs_info->super_copy, total_bytes + device->total_bytes); - total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); - btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); + btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); @@ -1790,7 +1819,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_super_block *super_copy = - &device->dev_root->fs_info->super_copy; + device->dev_root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 diff = new_size - device->total_bytes; @@ -1849,7 +1878,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 chunk_offset) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; u8 *ptr; @@ -2175,7 +2204,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) bool retried = false; struct extent_buffer *l; struct btrfs_key key; - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; @@ -2192,8 +2221,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) lock_chunks(root); device->total_bytes = new_size; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space -= diff; + spin_unlock(&root->fs_info->free_chunk_lock); + } unlock_chunks(root); again: @@ -2257,6 +2290,9 @@ again: device->total_bytes = old_size; if (device->writeable) device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); unlock_chunks(root); goto done; } @@ -2292,7 +2328,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key disk_key; u32 array_size; u8 *ptr; @@ -2615,6 +2651,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, index++; } + spin_lock(&extent_root->fs_info->free_chunk_lock); + extent_root->fs_info->free_chunk_space -= (stripe_size * + map->num_stripes); + spin_unlock(&extent_root->fs_info->free_chunk_lock); + index = 0; stripe = &chunk->stripe; while (index < map->num_stripes) { @@ -2848,7 +2889,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num, static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, + struct btrfs_bio **bbio_ret, int mirror_num) { struct extent_map *em; @@ -2866,18 +2907,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int i; int num_stripes; int max_errors = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; - if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) + if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) stripes_allocated = 1; again: - if (multi_ret) { - multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + if (bbio_ret) { + bbio = kzalloc(btrfs_bio_size(stripes_allocated), GFP_NOFS); - if (!multi) + if (!bbio) return -ENOMEM; - atomic_set(&multi->error, 0); + atomic_set(&bbio->error, 0); } read_lock(&em_tree->lock); @@ -2898,7 +2939,7 @@ again: if (mirror_num > map->num_stripes) mirror_num = 0; - /* if our multi bio struct is too small, back off and try again */ + /* if our btrfs_bio struct is too small, back off and try again */ if (rw & REQ_WRITE) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) { @@ -2917,11 +2958,11 @@ again: stripes_required = map->num_stripes; } } - if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && + if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); - kfree(multi); + kfree(bbio); goto again; } stripe_nr = offset; @@ -2950,7 +2991,7 @@ again: *length = em->len - offset; } - if (!multi_ret) + if (!bbio_ret) goto out; num_stripes = 1; @@ -2975,13 +3016,17 @@ again: stripe_index = find_live_mirror(map, 0, map->num_stripes, current->pid % map->num_stripes); + mirror_num = stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (rw & (REQ_WRITE | REQ_DISCARD)) { num_stripes = map->num_stripes; - else if (mirror_num) + } else if (mirror_num) { stripe_index = mirror_num - 1; + } else { + mirror_num = 1; + } } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; @@ -3001,6 +3046,7 @@ again: stripe_index = find_live_mirror(map, stripe_index, map->sub_stripes, stripe_index + current->pid % map->sub_stripes); + mirror_num = stripe_index + 1; } } else { /* @@ -3009,15 +3055,16 @@ again: * stripe_index is the number of our device in the stripe array */ stripe_index = do_div(stripe_nr, map->num_stripes); + mirror_num = stripe_index + 1; } BUG_ON(stripe_index >= map->num_stripes); if (rw & REQ_DISCARD) { for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = map->stripes[stripe_index].dev; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { u64 stripes; @@ -3038,16 +3085,16 @@ again: } stripes = stripe_nr_end - 1 - j; do_div(stripes, map->num_stripes); - multi->stripes[i].length = map->stripe_len * + bbio->stripes[i].length = map->stripe_len * (stripes - stripe_nr + 1); if (i == 0) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_offset; stripe_offset = 0; } if (stripe_index == last_stripe) - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_end_offset; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { u64 stripes; @@ -3072,11 +3119,11 @@ again: } stripes = stripe_nr_end - 1 - j; do_div(stripes, factor); - multi->stripes[i].length = map->stripe_len * + bbio->stripes[i].length = map->stripe_len * (stripes - stripe_nr + 1); if (i < map->sub_stripes) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_offset; if (i == map->sub_stripes - 1) stripe_offset = 0; @@ -3084,11 +3131,11 @@ again: if (stripe_index >= last_stripe && stripe_index <= (last_stripe + map->sub_stripes - 1)) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_end_offset; } } else - multi->stripes[i].length = *length; + bbio->stripes[i].length = *length; stripe_index++; if (stripe_index == map->num_stripes) { @@ -3099,19 +3146,20 @@ again: } } else { for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = + bbio->stripes[i].dev = map->stripes[stripe_index].dev; stripe_index++; } } - if (multi_ret) { - *multi_ret = multi; - multi->num_stripes = num_stripes; - multi->max_errors = max_errors; + if (bbio_ret) { + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; } out: free_extent_map(em); @@ -3120,9 +3168,9 @@ out: int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num) + struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, mirror_num); } @@ -3191,30 +3239,32 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static void end_bio_multi_stripe(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_multi_bio *multi = bio->bi_private; + struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; if (err) - atomic_inc(&multi->error); + atomic_inc(&bbio->error); - if (bio == multi->orig_bio) + if (bio == bbio->orig_bio) is_orig_bio = 1; - if (atomic_dec_and_test(&multi->stripes_pending)) { + if (atomic_dec_and_test(&bbio->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); - bio = multi->orig_bio; + bio = bbio->orig_bio; } - bio->bi_private = multi->private; - bio->bi_end_io = multi->end_io; + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + bio->bi_bdev = (struct block_device *) + (unsigned long)bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the multi-bio */ - if (atomic_read(&multi->error) > multi->max_errors) { + if (atomic_read(&bbio->error) > bbio->max_errors) { err = -EIO; - } else if (err) { + } else { /* * this bio is actually up to date, we didn't * go over the max number of errors @@ -3222,7 +3272,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err) set_bit(BIO_UPTODATE, &bio->bi_flags); err = 0; } - kfree(multi); + kfree(bbio); bio_endio(bio, err); } else if (!is_orig_bio) { @@ -3302,20 +3352,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_sector << 9; u64 length = 0; u64 map_length; - struct btrfs_multi_bio *multi = NULL; int ret; int dev_nr = 0; int total_devs = 1; + struct btrfs_bio *bbio = NULL; length = bio->bi_size; map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, mirror_num); BUG_ON(ret); - total_devs = multi->num_stripes; + total_devs = bbio->num_stripes; if (map_length < length) { printk(KERN_CRIT "mapping failed logical %llu bio len %llu " "len %llu\n", (unsigned long long)logical, @@ -3323,25 +3373,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, (unsigned long long)map_length); BUG(); } - multi->end_io = first_bio->bi_end_io; - multi->private = first_bio->bi_private; - multi->orig_bio = first_bio; - atomic_set(&multi->stripes_pending, multi->num_stripes); + + bbio->orig_bio = first_bio; + bbio->private = first_bio->bi_private; + bbio->end_io = first_bio->bi_end_io; + atomic_set(&bbio->stripes_pending, bbio->num_stripes); while (dev_nr < total_devs) { - if (total_devs > 1) { - if (dev_nr < total_devs - 1) { - bio = bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); - } else { - bio = first_bio; - } - bio->bi_private = multi; - bio->bi_end_io = end_bio_multi_stripe; + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; } - bio->bi_sector = multi->stripes[dev_nr].physical >> 9; - dev = multi->stripes[dev_nr].dev; + bio->bi_private = bbio; + bio->bi_end_io = btrfs_end_bio; + bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; + dev = bbio->stripes[dev_nr].dev; if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { + pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + "(%s id %llu), size=%u\n", rw, + (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, + dev->name, dev->devid, bio->bi_size); bio->bi_bdev = dev->bdev; if (async_submit) schedule_bio(root, dev, rw, bio); @@ -3354,8 +3407,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } dev_nr++; } - if (total_devs == 1) - kfree(multi); return 0; } @@ -3616,15 +3667,20 @@ static int read_one_dev(struct btrfs_root *root, fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = 0; return ret; } int btrfs_read_sys_array(struct btrfs_root *root) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6d866db..78f2d4d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -92,6 +92,20 @@ struct btrfs_device { struct btrfs_work work; struct rcu_head rcu; struct work_struct rcu_work; + + /* readahead state */ + spinlock_t reada_lock; + atomic_t reada_in_flight; + u64 reada_next; + struct reada_zone *reada_curr_zone; + struct radix_tree_root reada_zones; + struct radix_tree_root reada_extents; + + /* for sending down flush barriers */ + struct bio *flush_bio; + struct completion flush_wait; + int nobarriers; + }; struct btrfs_fs_devices { @@ -136,7 +150,10 @@ struct btrfs_bio_stripe { u64 length; /* only used for discard mappings */ }; -struct btrfs_multi_bio { +struct btrfs_bio; +typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); + +struct btrfs_bio { atomic_t stripes_pending; bio_end_io_t *end_io; struct bio *orig_bio; @@ -144,6 +161,7 @@ struct btrfs_multi_bio { atomic_t error; int max_errors; int num_stripes; + int mirror_num; struct btrfs_bio_stripe stripes[]; }; @@ -171,7 +189,7 @@ struct map_lookup { int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); -#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ +#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, @@ -180,7 +198,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 start, u64 num_bytes); int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num); + struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 69565e5..3848b04 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans, again: ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), name, name_len, value, size); + /* + * If we're setting an xattr to a new value but the new value is say + * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting + * back from split_leaf. This is because it thinks we'll be extending + * the existing item size, but we're asking for enough space to add the + * item itself. So if we get EOVERFLOW just set ret to EEXIST and let + * the rest of the function figure it out. + */ + if (ret == -EOVERFLOW) + ret = -EEXIST; + if (ret == -EEXIST) { if (flags & XATTR_CREATE) goto out; @@ -383,36 +394,36 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) XATTR_REPLACE); } -int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *suffix; + const struct xattr *xattr; + struct btrfs_trans_handle *trans = fs_info; char *name; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &suffix, &value, - &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, - GFP_NOFS); - if (!name) { - err = -ENOMEM; - } else { + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - err = __btrfs_setxattr(trans, inode, name, value, len, 0); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + err = __btrfs_setxattr(trans, inode, name, + xattr->value, xattr->value_len, 0); kfree(name); + if (err < 0) + break; } - - kfree(suffix); - kfree(value); return err; } + +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &btrfs_initxattrs, trans); +} diff --git a/fs/buffer.c b/fs/buffer.c index 1a80b04..1a30db7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -41,7 +41,6 @@ #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> -#include <linux/cleancache.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -213,13 +212,16 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) * elsewhere, don't buffer_error if we had some unmapped buffers */ if (all_mapped) { + char b[BDEVNAME_SIZE]; + printk("__find_get_block_slow() failed. " "block=%llu, b_blocknr=%llu\n", (unsigned long long)block, (unsigned long long)bh->b_blocknr); printk("b_state=0x%08lx, b_size=%zu\n", bh->b_state, bh->b_size); - printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); + printk("device %s blocksize: %d\n", bdevname(bdev, b), + 1 << bd_inode->i_blkbits); } out_unlock: spin_unlock(&bd_mapping->private_lock); @@ -228,55 +230,6 @@ out: return ret; } -/* If invalidate_buffers() will trash dirty buffers, it means some kind - of fs corruption is going on. Trashing dirty data always imply losing - information that was supposed to be just stored on the physical layer - by the user. - - Thus invalidate_buffers in general usage is not allwowed to trash - dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to - be preserved. These buffers are simply skipped. - - We also skip buffers which are still in use. For example this can - happen if a userspace program is reading the block device. - - NOTE: In the case where the user removed a removable-media-disk even if - there's still dirty data not synced on disk (due a bug in the device driver - or due an error of the user), by not destroying the dirty buffers we could - generate corruption also on the next media inserted, thus a parameter is - necessary to handle this case in the most safe way possible (trying - to not corrupt also the new disk inserted with the data belonging to - the old now corrupted disk). Also for the ramdisk the natural thing - to do in order to release the ramdisk memory is to destroy dirty buffers. - - These are two special cases. Normal usage imply the device driver - to issue a sync on the device (without waiting I/O completion) and - then an invalidate_buffers call that doesn't trash dirty buffers. - - For handling cache coherency with the blkdev pagecache the 'update' case - is been introduced. It is needed to re-read from disk any pinned - buffer. NOTE: re-reading from disk is destructive so we can do it only - when we assume nobody is changing the buffercache under our I/O and when - we think the disk contains more recent information than the buffercache. - The update == 1 pass marks the buffers we need to update, the update == 2 - pass does the actual I/O. */ -void invalidate_bdev(struct block_device *bdev) -{ - struct address_space *mapping = bdev->bd_inode->i_mapping; - - if (mapping->nrpages == 0) - return; - - invalidate_bh_lrus(); - lru_add_drain_all(); /* make sure all lru add caches are flushed */ - invalidate_mapping_pages(mapping, 0, -1); - /* 99% of the time, we don't need to flush the cleancache on the bdev. - * But, for the strange corners, lets be cautious - */ - cleancache_flush_inode(mapping); -} -EXPORT_SYMBOL(invalidate_bdev); - /* * Kick the writeback threads then try to free up some ZONE_NORMAL memory. */ @@ -285,7 +238,7 @@ static void free_more_memory(void) struct zone *zone; int nid; - wakeup_flusher_threads(1024); + wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { @@ -1470,13 +1423,13 @@ static void discard_buffer(struct buffer_head * bh) } /** - * block_invalidatepage - invalidate part of all of a buffer-backed page + * block_invalidatepage - invalidate part or all of a buffer-backed page * * @page: the page which is affected * @offset: the index of the truncation point * * block_invalidatepage() is called when all or part of the page has become - * invalidatedby a truncate operation. + * invalidated by a truncate operation. * * block_invalidatepage() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 1064805..67bef6d 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -11,7 +11,6 @@ #include <linux/slab.h> #include <linux/mount.h> -#include <linux/buffer_head.h> #include "internal.h" #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5a3953d..173b1d2 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page) snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); /* dirty the head */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_head_snapc == NULL) ci->i_head_snapc = ceph_get_snap_context(snapc); ++ci->i_wrbuffer_ref_head; @@ -100,7 +100,7 @@ static int ceph_set_page_dirty(struct page *page) ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, snapc, snapc->seq, snapc->num_snaps); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); /* now adjust page */ spin_lock_irq(&mapping->tree_lock); @@ -228,102 +228,155 @@ static int ceph_readpage(struct file *filp, struct page *page) } /* - * Build a vector of contiguous pages from the provided page list. + * Finish an async read(ahead) op. */ -static struct page **page_vector_from_list(struct list_head *page_list, - unsigned *nr_pages) +static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { - struct page **pages; - struct page *page; - int next_index, contig_pages = 0; + struct inode *inode = req->r_inode; + struct ceph_osd_reply_head *replyhead; + int rc, bytes; + int i; - /* build page vector */ - pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); - if (!pages) - return ERR_PTR(-ENOMEM); + /* parse reply */ + replyhead = msg->front.iov_base; + WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); + rc = le32_to_cpu(replyhead->result); + bytes = le32_to_cpu(msg->hdr.data_len); - BUG_ON(list_empty(page_list)); - next_index = list_entry(page_list->prev, struct page, lru)->index; - list_for_each_entry_reverse(page, page_list, lru) { - if (page->index == next_index) { - dout("readpages page %d %p\n", contig_pages, page); - pages[contig_pages] = page; - contig_pages++; - next_index++; - } else { - break; + dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); + + /* unlock all pages, zeroing any data we didn't read */ + for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { + struct page *page = req->r_pages[i]; + + if (bytes < (int)PAGE_CACHE_SIZE) { + /* zero (remainder of) page */ + int s = bytes < 0 ? 0 : bytes; + zero_user_segment(page, s, PAGE_CACHE_SIZE); } + dout("finish_read %p uptodate %p idx %lu\n", inode, page, + page->index); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); } - *nr_pages = contig_pages; - return pages; + kfree(req->r_pages); } /* - * Read multiple pages. Leave pages we don't read + unlock in page_list; - * the caller (VM) cleans them up. + * start an async read(ahead) operation. return nr_pages we submitted + * a read for on success, or negative error code. */ -static int ceph_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned nr_pages) +static int start_read(struct inode *inode, struct list_head *page_list, int max) { - struct inode *inode = file->f_dentry->d_inode; - struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; - int rc = 0; - struct page **pages; - loff_t offset; + struct ceph_inode_info *ci = ceph_inode(inode); + struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_osd_request *req; + u64 off; u64 len; + int i; + struct page **pages; + pgoff_t next_index; + int nr_pages = 0; + int ret; - dout("readpages %p file %p nr_pages %d\n", - inode, file, nr_pages); - - pages = page_vector_from_list(page_list, &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); + off = page->index << PAGE_CACHE_SHIFT; - /* guess read extent */ - offset = pages[0]->index << PAGE_CACHE_SHIFT; + /* count pages */ + next_index = page->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index != next_index) + break; + nr_pages++; + next_index++; + if (max && nr_pages == max) + break; + } len = nr_pages << PAGE_CACHE_SHIFT; - rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - offset, &len, - ci->i_truncate_seq, ci->i_truncate_size, - pages, nr_pages, 0); - if (rc == -ENOENT) - rc = 0; - if (rc < 0) - goto out; - - for (; !list_empty(page_list) && len > 0; - rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { - struct page *page = - list_entry(page_list->prev, struct page, lru); + dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, + off, len); + + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), + off, &len, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, 0, + ci->i_truncate_seq, ci->i_truncate_size, + NULL, false, 1, 0); + if (!req) + return -ENOMEM; + /* build page vector */ + nr_pages = len >> PAGE_CACHE_SHIFT; + pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); + ret = -ENOMEM; + if (!pages) + goto out; + for (i = 0; i < nr_pages; ++i) { + page = list_entry(page_list->prev, struct page, lru); + BUG_ON(PageLocked(page)); list_del(&page->lru); - - if (rc < (int)PAGE_CACHE_SIZE) { - /* zero (remainder of) page */ - int s = rc < 0 ? 0 : rc; - zero_user_segment(page, s, PAGE_CACHE_SIZE); - } - - if (add_to_page_cache_lru(page, mapping, page->index, + + dout("start_read %p adding %p idx %lu\n", inode, page, + page->index); + if (add_to_page_cache_lru(page, &inode->i_data, page->index, GFP_NOFS)) { page_cache_release(page); - dout("readpages %p add_to_page_cache failed %p\n", + dout("start_read %p add_to_page_cache failed %p\n", inode, page); - continue; + nr_pages = i; + goto out_pages; } - dout("readpages %p adding %p idx %lu\n", inode, page, - page->index); - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - page_cache_release(page); + pages[i] = page; } - rc = 0; + req->r_pages = pages; + req->r_num_pages = nr_pages; + req->r_callback = finish_read; + req->r_inode = inode; + + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); + ret = ceph_osdc_start_request(osdc, req, false); + if (ret < 0) + goto out_pages; + ceph_osdc_put_request(req); + return nr_pages; +out_pages: + ceph_release_page_vector(pages, nr_pages); +out: + ceph_osdc_put_request(req); + return ret; +} + + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. + */ +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + int rc = 0; + int max = 0; + + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + >> PAGE_SHIFT; + + dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, + max); + while (!list_empty(page_list)) { + rc = start_read(inode, page_list, max); + if (rc < 0) + goto out; + BUG_ON(rc == 0); + } out: - kfree(pages); + dout("readpages %p file %p ret %d\n", inode, file, rc); return rc; } @@ -338,7 +391,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, struct ceph_snap_context *snapc = NULL; struct ceph_cap_snap *capsnap = NULL; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, capsnap->context, capsnap->dirty_pages); @@ -354,7 +407,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return snapc; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8d74ad7..b60fc8bf 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -309,7 +309,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, /* * Find ceph_cap for given mds, if any. * - * Called with i_lock held. + * Called with i_ceph_lock held. */ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) { @@ -332,9 +332,9 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); return cap; } @@ -361,15 +361,16 @@ static int __ceph_get_cap_mds(struct ceph_inode_info *ci) int ceph_get_cap_mds(struct inode *inode) { + struct ceph_inode_info *ci = ceph_inode(inode); int mds; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); mds = __ceph_get_cap_mds(ceph_inode(inode)); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return mds; } /* - * Called under i_lock. + * Called under i_ceph_lock. */ static void __insert_cap_node(struct ceph_inode_info *ci, struct ceph_cap *new) @@ -415,7 +416,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, * * If I_FLUSH is set, leave the inode at the front of the list. * - * Caller holds i_lock + * Caller holds i_ceph_lock * -> we take mdsc->cap_delay_lock */ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, @@ -457,7 +458,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, /* * Cancel delayed work on cap. * - * Caller must hold i_lock. + * Caller must hold i_ceph_lock. */ static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) @@ -487,17 +488,15 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, ci->i_rdcache_gen++; /* - * if we are newly issued FILE_SHARED, clear I_COMPLETE; we + * if we are newly issued FILE_SHARED, clear D_COMPLETE; we * don't know what happened to this directory while we didn't * have the cap. */ if ((issued & CEPH_CAP_FILE_SHARED) && (had & CEPH_CAP_FILE_SHARED) == 0) { ci->i_shared_gen++; - if (S_ISDIR(ci->vfs_inode.i_mode)) { - dout(" marking %p NOT complete\n", &ci->vfs_inode); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - } + if (S_ISDIR(ci->vfs_inode.i_mode)) + ceph_dir_clear_complete(&ci->vfs_inode); } } @@ -534,14 +533,14 @@ int ceph_add_cap(struct inode *inode, wanted |= ceph_caps_for_mode(fmode); retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { if (new_cap) { cap = new_cap; new_cap = NULL; } else { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); new_cap = get_cap(mdsc, caps_reservation); if (new_cap == NULL) return -ENOMEM; @@ -627,7 +626,7 @@ retry: if (fmode >= 0) __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); wake_up_all(&ci->i_cap_wq); return 0; } @@ -794,7 +793,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) struct rb_node *p; int ret = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (__cap_is_valid(cap) && @@ -803,7 +802,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) break; } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("ceph_caps_revoking %p %s = %d\n", inode, ceph_cap_string(mask), ret); return ret; @@ -857,7 +856,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) } /* - * called under i_lock + * called under i_ceph_lock */ static int __ceph_is_any_caps(struct ceph_inode_info *ci) { @@ -867,7 +866,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci) /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * - * caller should hold i_lock. + * caller should hold i_ceph_lock. * caller will not hold session s_mutex if called from destroy_inode. */ void __ceph_remove_cap(struct ceph_cap *cap) @@ -929,7 +928,7 @@ static int send_cap_msg(struct ceph_mds_session *session, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, - uid_t uid, gid_t gid, mode_t mode, + uid_t uid, gid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, u64 follows) @@ -945,7 +944,7 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); if (!msg) return -ENOMEM; @@ -1030,7 +1029,7 @@ static void __queue_cap_release(struct ceph_mds_session *session, /* * Queue cap releases when an inode is dropped from our cache. Since - * inode is about to be destroyed, there is no need for i_lock. + * inode is about to be destroyed, there is no need for i_ceph_lock. */ void ceph_queue_caps_release(struct inode *inode) { @@ -1051,7 +1050,7 @@ void ceph_queue_caps_release(struct inode *inode) /* * Send a cap msg on the given inode. Update our caps state, then - * drop i_lock and send the message. + * drop i_ceph_lock and send the message. * * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. @@ -1063,13 +1062,13 @@ void ceph_queue_caps_release(struct inode *inode) * Return non-zero if delayed release, or we experienced an error * such that the caller should requeue + retry later. * - * called with i_lock, then drops it. + * called with i_ceph_lock, then drops it. * caller should hold snap_rwsem (read), s_mutex. */ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, int op, int used, int want, int retain, int flushing, unsigned *pflush_tid) - __releases(cap->ci->vfs_inode->i_lock) + __releases(cap->ci->i_ceph_lock) { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; @@ -1079,7 +1078,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, u64 size, max_size; struct timespec mtime, atime; int wake = 0; - mode_t mode; + umode_t mode; uid_t uid; gid_t gid; struct ceph_mds_session *session; @@ -1172,7 +1171,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, xattr_version = ci->i_xattrs.version; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, @@ -1200,13 +1199,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * Unless @again is true, skip cap_snaps that were already sent to * the MDS (i.e., during this session). * - * Called under i_lock. Takes s_mutex as needed. + * Called under i_ceph_lock. Takes s_mutex as needed. */ void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession, int again) - __releases(ci->vfs_inode->i_lock) - __acquires(ci->vfs_inode->i_lock) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->vfs_inode; int mds; @@ -1263,7 +1262,7 @@ retry: session = NULL; } if (!session) { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); mutex_lock(&mdsc->mutex); session = __ceph_lookup_mds_session(mdsc, mds); mutex_unlock(&mdsc->mutex); @@ -1277,7 +1276,7 @@ retry: * deletion or migration. retry, and we'll * get a better @mds value next time. */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); goto retry; } @@ -1287,7 +1286,7 @@ retry: list_del_init(&capsnap->flushing_item); list_add_tail(&capsnap->flushing_item, &session->s_cap_snaps_flushing); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", inode, capsnap, capsnap->follows, capsnap->flush_tid); @@ -1304,7 +1303,7 @@ retry: next_follows = capsnap->follows + 1; ceph_put_cap_snap(capsnap); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); goto retry; } @@ -1324,11 +1323,9 @@ out: static void ceph_flush_snaps(struct ceph_inode_info *ci) { - struct inode *inode = &ci->vfs_inode; - - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __ceph_flush_snaps(ci, NULL, 0); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -1375,7 +1372,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * - * Called under i_lock. + * Called under i_ceph_lock. */ static int __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session) @@ -1423,9 +1420,9 @@ static int try_nonblocking_invalidate(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); invalidate_mapping_pages(&inode->i_data, 0, -1); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { @@ -1472,7 +1469,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, if (mdsc->stopping) is_delayed = 1; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; @@ -1482,7 +1479,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, __ceph_flush_snaps(ci, &session, 0); goto retry_locked; retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); retry_locked: file_wanted = __ceph_caps_file_wanted(ci); used = __ceph_caps_used(ci); @@ -1636,7 +1633,7 @@ ack: if (mutex_trylock(&session->s_mutex) == 0) { dout("inverting session/ino locks on %p\n", session); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (took_snap_rwsem) { up_read(&mdsc->snap_rwsem); took_snap_rwsem = 0; @@ -1650,7 +1647,7 @@ ack: if (down_read_trylock(&mdsc->snap_rwsem) == 0) { dout("inverting snap/in locks on %p\n", inode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); down_read(&mdsc->snap_rwsem); took_snap_rwsem = 1; goto retry; @@ -1666,10 +1663,10 @@ ack: mds = cap->mds; /* remember mds, so we don't repeat */ sent++; - /* __send_cap drops i_lock */ + /* __send_cap drops i_ceph_lock */ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, retain, flushing, NULL); - goto retry; /* retake i_lock and restart our cap scan. */ + goto retry; /* retake i_ceph_lock and restart our cap scan. */ } /* @@ -1683,7 +1680,7 @@ ack: else if (!is_delayed || force_requeue) __cap_delay_requeue(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (queue_invalidate) ceph_queue_invalidate(inode); @@ -1706,7 +1703,7 @@ static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, int flushing = 0; retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); goto out; @@ -1718,7 +1715,7 @@ retry: int delayed; if (!session) { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); session = cap->session; mutex_lock(&session->s_mutex); goto retry; @@ -1729,18 +1726,18 @@ retry: flushing = __mark_caps_flushing(inode, session); - /* __send_cap drops i_lock */ + /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, cap->issued | cap->implemented, flushing, flush_tid); if (!delayed) goto out_unlocked; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); out_unlocked: if (session && unlock_session) mutex_unlock(&session->s_mutex); @@ -1755,7 +1752,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid) struct ceph_inode_info *ci = ceph_inode(inode); int i, ret = 1; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_CAP_BITS; i++) if ((ci->i_flushing_caps & (1 << i)) && ci->i_cap_flush_tid[i] <= tid) { @@ -1763,7 +1760,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid) ret = 0; break; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ret; } @@ -1870,10 +1867,10 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (__ceph_caps_dirty(ci)) __cap_delay_requeue_front(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } return err; } @@ -1896,7 +1893,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (cap && cap->session == session) { dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, @@ -1906,7 +1903,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } @@ -1923,7 +1920,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_cap *cap; int delayed = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (cap && cap->session == session) { dout("kick_flushing_caps %p cap %p %s\n", inode, @@ -1934,14 +1931,14 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, cap->issued | cap->implemented, ci->i_flushing_caps, NULL); if (delayed) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } else { pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } } @@ -1954,7 +1951,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, struct ceph_cap *cap; int delayed = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); @@ -1966,12 +1963,12 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, cap->issued | cap->implemented, ci->i_flushing_caps, NULL); if (delayed) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } else { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } @@ -1980,7 +1977,7 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, * Take references to capabilities we hold, so that we don't release * them to the MDS prematurely. * - * Protected by i_lock. + * Protected by i_ceph_lock. */ static void __take_cap_refs(struct ceph_inode_info *ci, int got) { @@ -2018,7 +2015,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); @@ -2079,7 +2076,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, ceph_cap_string(have), ceph_cap_string(need)); } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("get_cap_refs %p ret %d got %s\n", inode, ret, ceph_cap_string(*got)); return ret; @@ -2096,7 +2093,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) int check = 0; /* do we need to explicitly request a larger max_size? */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if ((endoff >= ci->i_max_size || endoff > (inode->i_size << 1)) && endoff > ci->i_wanted_max_size) { @@ -2105,7 +2102,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) ci->i_wanted_max_size = endoff; check = 1; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } @@ -2142,9 +2139,9 @@ retry: */ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); __take_cap_refs(ci, caps); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -2162,7 +2159,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) int last = 0, put = 0, flushsnaps = 0, wake = 0; struct ceph_cap_snap *capsnap; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) --ci->i_pin_ref; if (had & CEPH_CAP_FILE_RD) @@ -2195,7 +2192,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) } } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); @@ -2227,7 +2224,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, int found = 0; struct ceph_cap_snap *capsnap = NULL; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_wrbuffer_ref -= nr; last = !ci->i_wrbuffer_ref; @@ -2276,7 +2273,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (last) { ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); @@ -2293,7 +2290,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * - * caller holds s_mutex and i_lock, we drop both. + * caller holds s_mutex and i_ceph_lock, we drop both. * * return value: * 0 - ok @@ -2304,7 +2301,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, struct ceph_mds_session *session, struct ceph_cap *cap, struct ceph_buffer *xattr_buf) - __releases(inode->i_lock) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2363,7 +2360,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(grant->nlink); + set_nlink(inode, le32_to_cpu(grant->nlink)); if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); @@ -2455,7 +2452,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } BUG_ON(cap->issued & ~cap->implemented); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (writeback) /* * queue inode for writeback: we can't actually call @@ -2485,7 +2482,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_caps *m, struct ceph_mds_session *session, struct ceph_cap *cap) - __releases(inode->i_lock) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; @@ -2541,7 +2538,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, wake_up_all(&ci->i_cap_wq); out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (drop) iput(inode); } @@ -2564,7 +2561,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", inode, ci, session->s_mds, follows); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->follows == follows) { if (capsnap->flush_tid != flush_tid) { @@ -2587,7 +2584,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, capsnap, capsnap->follows); } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (drop) iput(inode); } @@ -2600,7 +2597,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, static void handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, struct ceph_mds_session *session) - __releases(inode->i_lock) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2619,7 +2616,7 @@ static void handle_cap_trunc(struct inode *inode, inode, mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (queue_trunc) ceph_queue_vmtruncate(inode); @@ -2648,7 +2645,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", inode, ci, mds, mseq); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); /* make sure we haven't seen a higher mseq */ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { @@ -2692,7 +2689,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, } /* else, we already released it */ - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -2747,9 +2744,9 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, up_read(&mdsc->snap_rwsem); /* make sure we re-request max_size, if necessary */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_requested_max_size = 0; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -2764,6 +2761,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_mds_client *mdsc = session->s_mdsc; struct super_block *sb = mdsc->fsc->sb; struct inode *inode; + struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; int mds = session->s_mds; @@ -2817,6 +2815,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, /* lookup ino */ inode = ceph_find_inode(sb, vino); + ci = ceph_inode(inode); dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); if (!inode) { @@ -2846,16 +2845,16 @@ void ceph_handle_caps(struct ceph_mds_session *session, } /* the rest require a cap */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ceph_inode(inode), mds); if (!cap) { dout(" no cap on %p ino %llx.%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), mds); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); goto flush_cap_releases; } - /* note that each of these drops i_lock for us */ + /* note that each of these drops i_ceph_lock for us */ switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: @@ -2871,7 +2870,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, break; default: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, ceph_cap_op_name(op)); } @@ -2964,13 +2963,13 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) struct inode *inode = &ci->vfs_inode; int last = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); BUG_ON(ci->i_nr_by_mode[fmode] == 0); if (--ci->i_nr_by_mode[fmode] == 0) last++; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (last && ci->i_vino.snap == CEPH_NOSNAP) ceph_check_caps(ci, 0, NULL); @@ -2993,7 +2992,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, int used, dirty; int ret = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); @@ -3048,7 +3047,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, inode, cap, ceph_cap_string(cap->issued)); } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ret; } @@ -3063,7 +3062,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, /* * force an record for the directory caps if we have a dentry lease. - * this is racy (can't take i_lock and d_lock together), but it + * this is racy (can't take i_ceph_lock and d_lock together), but it * doesn't have to be perfect; the mds will revoke anything we don't * release. */ diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 382abc9..74fd747 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -108,7 +108,7 @@ static unsigned fpos_off(loff_t p) * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * - * I_COMPLETE tells indicates we have all dentries in the dir. It is + * D_COMPLETE tells indicates we have all dentries in the dir. It is * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ @@ -199,8 +199,8 @@ more: filp->f_pos++; /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { - dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); + if (!ceph_dir_test_complete(dir)) { + dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); err = -EAGAIN; goto out; } @@ -281,18 +281,18 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) } /* can we use the dcache? */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if ((filp->f_pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - (ci->i_ceph_flags & CEPH_I_COMPLETE) && + ceph_dir_test_complete(inode) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); err = __dcache_readdir(filp, dirent, filldir); if (err != -EAGAIN) return err; } else { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } if (fi->dentry) { err = note_last_dentry(fi, fi->dentry->d_name.name, @@ -351,7 +351,7 @@ more: if (!req->r_did_prepopulate) { dout("readdir !did_prepopulate"); - fi->dir_release_count--; /* preclude I_COMPLETE */ + fi->dir_release_count--; /* preclude D_COMPLETE */ } /* note next offset and last dentry name */ @@ -428,13 +428,12 @@ more: * were released during the whole readdir, and we should have * the complete dir contents in our cache. */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_release_count == fi->dir_release_count) { - dout(" marking %p complete\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ceph_dir_set_complete(inode); ci->i_max_offset = filp->f_pos; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("readdir %p filp %p done.\n", inode, filp); return 0; @@ -608,21 +607,21 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); - spin_lock(&dir->i_lock); + spin_lock(&ci->i_ceph_lock); dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); if (strncmp(dentry->d_name.name, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && - (ci->i_ceph_flags & CEPH_I_COMPLETE) && + ceph_dir_test_complete(dir) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { - spin_unlock(&dir->i_lock); + spin_unlock(&ci->i_ceph_lock); dout(" dir %p complete, -ENOENT\n", dir); d_add(dentry, NULL); di->lease_shared_gen = ci->i_shared_gen; return NULL; } - spin_unlock(&dir->i_lock); + spin_unlock(&ci->i_ceph_lock); } op = ceph_snap(dir) == CEPH_SNAPDIR ? @@ -667,7 +666,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) } static int ceph_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -677,7 +676,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; - dout("mknod in dir %p dentry %p mode 0%o rdev %d\n", + dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", dir, dentry, mode, rdev); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); if (IS_ERR(req)) { @@ -700,7 +699,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, return err; } -static int ceph_create(struct inode *dir, struct dentry *dentry, int mode, +static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { dout("create in dir %p dentry %p name '%.*s'\n", @@ -754,7 +753,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, return err; } -static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -768,7 +767,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) dout("mksnap dir %p snap '%.*s' dn %p\n", dir, dentry->d_name.len, dentry->d_name.name, dentry); } else if (ceph_snap(dir) == CEPH_NOSNAP) { - dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode); + dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); op = CEPH_MDS_OP_MKDIR; } else { goto out; @@ -842,12 +841,12 @@ static int drop_caps_for_unlink(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (inode->i_nlink == 1) { drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); ci->i_ceph_flags |= CEPH_I_NODELAY; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return drop; } @@ -871,7 +870,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) } else if (ceph_snap(dir) == CEPH_NOSNAP) { dout("unlink/rmdir dir %p dn %p inode %p\n", dir, dentry, inode); - op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ? + op = S_ISDIR(dentry->d_inode->i_mode) ? CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; } else goto out; @@ -934,7 +933,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, */ /* d_move screws up d_subdirs order */ - ceph_i_clear(new_dir, CEPH_I_COMPLETE); + ceph_dir_clear_complete(new_dir); d_move(old_dentry, new_dentry); @@ -1016,10 +1015,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) struct ceph_dentry_info *di = ceph_dentry(dentry); int valid = 0; - spin_lock(&dir->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_shared_gen == di->lease_shared_gen) valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); - spin_unlock(&dir->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", dir, (unsigned)ci->i_shared_gen, dentry, (unsigned)di->lease_shared_gen, valid); @@ -1092,7 +1091,52 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, return 1; } +/* + * Set/clear/test dir complete flag on the dir's dentry. + */ +void ceph_dir_set_complete(struct inode *inode) +{ + /* not yet implemented */ +} + +void ceph_dir_clear_complete(struct inode *inode) +{ + /* not yet implemented */ +} + +bool ceph_dir_test_complete(struct inode *inode) +{ + /* not yet implemented */ + return false; +} + +/* + * When the VFS prunes a dentry from the cache, we need to clear the + * complete flag on the parent directory. + * + * Called under dentry->d_lock. + */ +static void ceph_d_prune(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + dout("ceph_d_prune %p\n", dentry); + /* do we have a valid parent? */ + if (!dentry->d_parent || IS_ROOT(dentry)) + return; + + /* if we are not hashed, we don't affect D_COMPLETE */ + if (d_unhashed(dentry)) + return; + + /* + * we hold d_lock, so d_parent is stable, and d_fsdata is never + * cleared until d_release + */ + di = ceph_dentry(dentry->d_parent); + clear_bit(CEPH_D_COMPLETE, &di->flags); +} /* * read() on a dir. This weird interface hack only works if mounted @@ -1306,6 +1350,7 @@ const struct inode_operations ceph_dir_iops = { const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, .d_release = ceph_d_release, + .d_prune = ceph_d_prune, }; const struct dentry_operations ceph_snapdir_dentry_ops = { @@ -1315,4 +1360,5 @@ const struct dentry_operations ceph_snapdir_dentry_ops = { const struct dentry_operations ceph_snap_dentry_ops = { .d_release = ceph_d_release, + .d_prune = ceph_d_prune, }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce549d3..ed72428 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -147,9 +147,9 @@ int ceph_open(struct inode *inode, struct file *file) /* trivially open snapdir */ if (ceph_snap(inode) == CEPH_SNAPDIR) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } @@ -158,7 +158,7 @@ int ceph_open(struct inode *inode, struct file *file) * write) or any MDS (for read). Update wanted set * asynchronously. */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (__ceph_is_any_real_caps(ci) && (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { int mds_wanted = __ceph_caps_mds_wanted(ci); @@ -168,7 +168,7 @@ int ceph_open(struct inode *inode, struct file *file) inode, fmode, ceph_cap_string(wanted), ceph_cap_string(issued)); __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); /* adjust wanted? */ if ((issued & wanted) != wanted && @@ -180,10 +180,10 @@ int ceph_open(struct inode *inode, struct file *file) } else if (ceph_snap(inode) != CEPH_NOSNAP && (ci->i_snap_caps & wanted) == wanted) { __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); req = prepare_open_request(inode->i_sb, flags, 0); @@ -743,9 +743,9 @@ retry_snap: */ int dirty; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); ceph_put_cap_refs(ci, got); ret = generic_file_aio_write(iocb, iov, nr_segs, pos); @@ -764,9 +764,9 @@ retry_snap: if (ret >= 0) { int dirty; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); } @@ -797,7 +797,8 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) mutex_lock(&inode->i_mutex); __ceph_do_pending_vmtruncate(inode); - if (origin != SEEK_CUR || origin != SEEK_SET) { + + if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); if (ret < 0) { offset = ret; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 095799b..25283e7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -9,7 +9,6 @@ #include <linux/namei.h> #include <linux/writeback.h> #include <linux/vmalloc.h> -#include <linux/pagevec.h> #include "super.h" #include "mds_client.h" @@ -298,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) dout("alloc_inode %p\n", &ci->vfs_inode); + spin_lock_init(&ci->i_ceph_lock); + ci->i_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; @@ -383,7 +384,6 @@ static void ceph_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct ceph_inode_info *ci = ceph_inode(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ceph_inode_cachep, ci); } @@ -584,7 +584,7 @@ static int fill_inode(struct inode *inode, iinfo->xattr_len); } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); /* * provided version will be odd if inode value is projected, @@ -619,7 +619,7 @@ static int fill_inode(struct inode *inode, } if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(info->nlink); + set_nlink(inode, le32_to_cpu(info->nlink)); /* be careful with mtime, atime, size */ ceph_decode_timespec(&atime, &info->atime); @@ -681,7 +681,7 @@ static int fill_inode(struct inode *inode, char *sym; BUG_ON(symlen != inode->i_size); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); err = -ENOMEM; sym = kmalloc(symlen+1, GFP_NOFS); @@ -690,7 +690,7 @@ static int fill_inode(struct inode *inode, memcpy(sym, iinfo->symlink, symlen); sym[symlen] = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (!ci->i_symlink) ci->i_symlink = sym; else @@ -716,7 +716,7 @@ static int fill_inode(struct inode *inode, } no_change: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); /* queue truncate if we saw i_size decrease */ if (queue_trunc) @@ -751,13 +751,13 @@ no_change: info->cap.flags, caps_reservation); } else { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout(" %p got snap_caps %s\n", inode, ceph_cap_string(le32_to_cpu(info->cap.caps))); ci->i_snap_caps |= le32_to_cpu(info->cap.caps); if (cap_fmode >= 0) __ceph_get_fmode(ci, cap_fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } else if (cap_fmode >= 0) { pr_warning("mds issued no caps on %llx.%llx\n", @@ -772,9 +772,9 @@ no_change: ceph_snap(inode) == CEPH_NOSNAP && (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0 && - (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + !ceph_dir_test_complete(inode)) { dout(" marking %p complete (empty)\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ceph_dir_set_complete(inode); ci->i_max_offset = 2; } @@ -850,19 +850,20 @@ static void ceph_set_dentry_offset(struct dentry *dn) { struct dentry *dir = dn->d_parent; struct inode *inode = dir->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_dentry_info *di; BUG_ON(!inode); di = ceph_dentry(dn); - spin_lock(&inode->i_lock); - if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { - spin_unlock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); + if (!ceph_dir_test_complete(inode)) { + spin_unlock(&ci->i_ceph_lock); return; } di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); spin_lock(&dir->d_lock); spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); @@ -1057,7 +1058,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, * d_move() puts the renamed dentry at the end of * d_subdirs. We need to assign it an appropriate * directory offset so we can behave when holding - * I_COMPLETE. + * D_COMPLETE. */ ceph_set_dentry_offset(req->r_old_dentry); dout("dn %p gets new offset %lld\n", req->r_old_dentry, @@ -1309,7 +1310,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) struct ceph_inode_info *ci = ceph_inode(inode); int ret = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); inode->i_size = size; inode->i_blocks = (size + (1 << 9) - 1) >> 9; @@ -1319,7 +1320,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) (ci->i_reported_size << 1) < ci->i_max_size) ret = 1; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ret; } @@ -1329,12 +1330,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) */ void ceph_queue_writeback(struct inode *inode) { + ihold(inode); if (queue_work(ceph_inode_to_client(inode)->wb_wq, &ceph_inode(inode)->i_wb_work)) { dout("ceph_queue_writeback %p\n", inode); - ihold(inode); } else { dout("ceph_queue_writeback %p failed\n", inode); + iput(inode); } } @@ -1354,55 +1356,13 @@ static void ceph_writeback_work(struct work_struct *work) */ void ceph_queue_invalidate(struct inode *inode) { + ihold(inode); if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, &ceph_inode(inode)->i_pg_inv_work)) { dout("ceph_queue_invalidate %p\n", inode); - ihold(inode); } else { dout("ceph_queue_invalidate %p failed\n", inode); - } -} - -/* - * invalidate any pages that are not dirty or under writeback. this - * includes pages that are clean and mapped. - */ -static void ceph_invalidate_nondirty_pages(struct address_space *mapping) -{ - struct pagevec pvec; - pgoff_t next = 0; - int i; - - pagevec_init(&pvec, 0); - while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - pgoff_t index; - int skip_page = - (PageDirty(page) || PageWriteback(page)); - - if (!skip_page) - skip_page = !trylock_page(page); - - /* - * We really shouldn't be looking at the ->index of an - * unlocked page. But we're not allowed to lock these - * pages. So we rely upon nobody altering the ->index - * of this (pinned-by-us) page. - */ - index = page->index; - if (index > next) - next = index; - next++; - - if (skip_page) - continue; - - generic_error_remove_page(mapping, page); - unlock_page(page); - } - pagevec_release(&pvec); - cond_resched(); + iput(inode); } } @@ -1418,20 +1378,20 @@ static void ceph_invalidate_work(struct work_struct *work) u32 orig_gen; int check = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { /* nevermind! */ - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); goto out; } orig_gen = ci->i_rdcache_gen; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); - ceph_invalidate_nondirty_pages(inode->i_mapping); + truncate_inode_pages(&inode->i_data, 0); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && orig_gen == ci->i_rdcache_revoking) { dout("invalidate_pages %p gen %d successful\n", inode, @@ -1443,7 +1403,7 @@ static void ceph_invalidate_work(struct work_struct *work) inode, orig_gen, ci->i_rdcache_gen, ci->i_rdcache_revoking); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, 0, NULL); @@ -1478,13 +1438,14 @@ void ceph_queue_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + ihold(inode); if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); - ihold(inode); } else { dout("ceph_queue_vmtruncate %p failed, pending=%d\n", inode, ci->i_truncate_pending); + iput(inode); } } @@ -1501,10 +1462,10 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) int wrbuffer_refs, wake = 0; retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_truncate_pending == 0) { dout("__do_pending_vmtruncate %p none pending\n", inode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return; } @@ -1515,7 +1476,7 @@ retry: if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { dout("__do_pending_vmtruncate %p flushing snaps first\n", inode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; @@ -1525,15 +1486,15 @@ retry: wrbuffer_refs = ci->i_wrbuffer_ref; dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, ci->i_truncate_pending, to); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); truncate_inode_pages(inode->i_mapping, to); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_truncate_pending--; if (ci->i_truncate_pending == 0) wake = 1; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); @@ -1588,7 +1549,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (IS_ERR(req)) return PTR_ERR(req); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); @@ -1736,7 +1697,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) } release &= issued; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); @@ -1758,7 +1719,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) __ceph_do_pending_vmtruncate(inode); return err; out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 3b256b5..790914a59 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -42,17 +42,39 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; + struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_ioctl_layout nl; int err, i; - /* copy and validate */ if (copy_from_user(&l, arg, sizeof(l))) return -EFAULT; - if ((l.object_size & ~PAGE_MASK) || - (l.stripe_unit & ~PAGE_MASK) || - !l.stripe_unit || - (l.object_size && - (unsigned)l.object_size % (unsigned)l.stripe_unit)) + /* validate changed params against current layout */ + err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); + if (!err) { + nl.stripe_unit = ceph_file_layout_su(ci->i_layout); + nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + nl.object_size = ceph_file_layout_object_size(ci->i_layout); + nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); + nl.preferred_osd = + (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); + } else + return err; + + if (l.stripe_count) + nl.stripe_count = l.stripe_count; + if (l.stripe_unit) + nl.stripe_unit = l.stripe_unit; + if (l.object_size) + nl.object_size = l.object_size; + if (l.data_pool) + nl.data_pool = l.data_pool; + if (l.preferred_osd) + nl.preferred_osd = l.preferred_osd; + + if ((nl.object_size & ~PAGE_MASK) || + (nl.stripe_unit & ~PAGE_MASK) || + ((unsigned)nl.object_size % (unsigned)nl.stripe_unit)) return -EINVAL; /* make sure it's a valid data pool */ @@ -219,11 +241,11 @@ static long ceph_ioctl_lazyio(struct file *file) struct ceph_inode_info *ci = ceph_inode(inode); if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_nr_by_mode[fi->fmode]--; fi->fmode |= CEPH_FILE_MODE_LAZY; ci->i_nr_by_mode[fi->fmode]++; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("ioctl_layzio: file %p marked lazy\n", file); ceph_check_caps(ci, 0, NULL); diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 0c5167e..be4a604 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -6,7 +6,31 @@ #define CEPH_IOCTL_MAGIC 0x97 -/* just use u64 to align sanely on all archs */ +/* + * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy + * CEPH_IOC_SET_LAYOUT - set file layout + * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy + * + * The file layout specifies how file data is striped over objects in + * the distributed object store, which object pool they belong to (if + * it differs from the default), and an optional 'preferred osd' to + * store them on. + * + * Files get a new layout based on the policy set on the containing + * directory or one of its ancestors. The GET_LAYOUT ioctl will let + * you examine the layout for a file or the policy on a directory. + * + * SET_LAYOUT will let you set a layout on a newly created file. This + * only works immediately after the file is created and before any + * data is written to it. + * + * SET_LAYOUT_POLICY will let you set a layout policy (default layout) + * on a directory that will apply to any new files created in that + * directory (or any child directory that doesn't specify a layout of + * its own). + */ + +/* use u64 to align sanely on all archs */ struct ceph_ioctl_layout { __u64 stripe_unit, stripe_count, object_size; __u64 data_pool; @@ -21,6 +45,8 @@ struct ceph_ioctl_layout { struct ceph_ioctl_layout) /* + * CEPH_IOC_GET_DATALOC - get location of file data in the cluster + * * Extract identity, address of the OSD and object storing a given * file offset. */ @@ -39,7 +65,34 @@ struct ceph_ioctl_dataloc { #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ struct ceph_ioctl_dataloc) +/* + * CEPH_IOC_LAZYIO - relax consistency + * + * Normally Ceph switches to synchronous IO when multiple clients have + * the file open (and or more for write). Reads and writes bypass the + * page cache and go directly to the OSD. Setting this flag on a file + * descriptor will allow buffered IO for this file in cases where the + * application knows it won't interfere with other nodes (or doesn't + * care). + */ #define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) + +/* + * CEPH_IOC_SYNCIO - force synchronous IO + * + * This ioctl sets a file flag that forces the synchronous IO that + * bypasses the page cache, even if it is not necessary. This is + * essentially the opposite behavior of IOC_LAZYIO. This forces the + * same read/write path as a file opened by multiple clients when one + * or more of those clients is opened for write. + * + * Note that this type of sync IO takes a different path than a file + * opened with O_SYNC/D_SYNC (writes hit the page cache and are + * immediately flushed on page boundaries). It is very similar to + * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes + * are not copied (user page must remain stable) and O_DIRECT writes + * have alignment restrictions (on the buffer and file offset). + */ #define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) #endif diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 86c59e1..6203d80 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -619,7 +619,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ -struct dentry *get_nonsnap_parent(struct dentry *dentry) +static struct dentry *get_nonsnap_parent(struct dentry *dentry) { /* * we don't need to worry about protecting the d_parent access @@ -732,21 +732,21 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = NULL; if (mode == USE_AUTH_MDS) cap = ci->i_auth_cap; if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); if (!cap) { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); goto random; } mds = cap->session->s_mds; dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", inode, ceph_vinop(inode), mds, cap == ci->i_auth_cap ? "auth " : "", cap); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return mds; random: @@ -764,7 +764,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) struct ceph_msg *msg; struct ceph_mds_session_head *h; - msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, + false); if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); return NULL; @@ -950,7 +951,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __ceph_remove_cap(cap); if (!__ceph_is_any_real_caps(ci)) { struct ceph_mds_client *mdsc = @@ -983,7 +984,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, } spin_unlock(&mdsc->cap_dirty_lock); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); while (drop--) iput(inode); return 0; @@ -1014,10 +1015,10 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, wake_up_all(&ci->i_cap_wq); if (arg) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } return 0; } @@ -1150,7 +1151,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if (session->s_trim_caps <= 0) return -1; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); oissued = __ceph_caps_issued_other(ci, cap); @@ -1169,7 +1170,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) __ceph_remove_cap(cap); } else { /* try to drop referring dentries */ - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); d_prune_aliases(inode); dout("trim_caps_cb %p cap %p pruned, count now %d\n", inode, cap, atomic_read(&inode->i_count)); @@ -1177,7 +1178,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return 0; } @@ -1240,7 +1241,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, while (session->s_num_cap_releases < session->s_nr_caps + extra) { spin_unlock(&session->s_cap_lock); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - GFP_NOFS); + GFP_NOFS, false); if (!msg) goto out_unlocked; dout("add_cap_releases %p msg %p now %d\n", session, msg, @@ -1295,7 +1296,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) i_flushing_item); struct inode *inode = &ci->vfs_inode; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_cap_flush_seq <= want_flush_seq) { dout("check_cap_flush still flushing %p " "seq %lld <= %lld to mds%d\n", inode, @@ -1303,7 +1304,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) session->s_mds); ret = 0; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); @@ -1494,6 +1495,7 @@ retry: pos, temp); } else if (stop_on_nosnap && inode && ceph_snap(inode) == CEPH_NOSNAP) { + spin_unlock(&temp->d_lock); break; } else { pos -= temp->d_name.len; @@ -1652,7 +1654,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_old_dentry_drop) len += req->r_old_dentry->d_name.len; - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); goto out_free2; @@ -2001,7 +2003,7 @@ out: } /* - * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS * namespace request. */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) @@ -2009,11 +2011,11 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req) struct inode *inode = req->r_locked_dir; struct ceph_inode_info *ci = ceph_inode(inode); - dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); - spin_lock(&inode->i_lock); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); + spin_lock(&ci->i_ceph_lock); + ceph_dir_clear_complete(inode); ci->i_release_count++; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (req->r_dentry) ceph_invalidate_dentry_lease(req->r_dentry); @@ -2421,7 +2423,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (err) goto out_free; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ @@ -2444,7 +2446,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v1.pathbase = cpu_to_le64(pathbase); reclen = sizeof(rec.v1); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (recon_state->flock) { int num_fcntl_locks, num_flock_locks; @@ -2518,7 +2520,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail_nopagelist; ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); if (!reply) goto fail_nomsg; @@ -2831,7 +2833,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, dnamelen = dentry->d_name.len; len += dnamelen; - msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); if (!msg) return; lease = msg->front.iov_base; @@ -3153,7 +3155,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) /* * true if all sessions are closed, or we force unmount */ -bool done_closing_sessions(struct ceph_mds_client *mdsc) +static bool done_closing_sessions(struct ceph_mds_client *mdsc) { int i, n = 0; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4bb2399..a50ca0e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -20,7 +20,7 @@ * * mdsc->snap_rwsem * - * inode->i_lock + * ci->i_ceph_lock * mdsc->snap_flush_lock * mdsc->cap_delay_lock * diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index e264371..a559c80 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -446,7 +446,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) return; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); @@ -528,7 +528,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) kfree(capsnap); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -537,7 +537,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) * * If capsnap can now be flushed, add to snap_flush list, and return 1. * - * Caller must hold i_lock. + * Caller must hold i_ceph_lock. */ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) @@ -739,9 +739,9 @@ static void flush_snaps(struct ceph_mds_client *mdsc) inode = &ci->vfs_inode; ihold(inode); spin_unlock(&mdsc->snap_flush_lock); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __ceph_flush_snaps(ci, &session, 0); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); iput(inode); spin_lock(&mdsc->snap_flush_lock); } @@ -847,7 +847,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, continue; ci = ceph_inode(inode); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (!ci->i_snap_realm) goto skip_inode; /* @@ -876,7 +876,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, oldrealm = ci->i_snap_realm; ci->i_snap_realm = realm; spin_unlock(&realm->inodes_with_caps_lock); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); ceph_get_snap_realm(mdsc, realm); ceph_put_snap_realm(mdsc, oldrealm); @@ -885,7 +885,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, continue; skip_inode: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); iput(inode); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 88bacaf..48f61a1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -114,6 +114,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) enum { Opt_wsize, Opt_rsize, + Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, Opt_cap_release_safety, @@ -136,6 +137,7 @@ enum { static match_table_t fsopt_tokens = { {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, + {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, {Opt_cap_release_safety, "cap_release_safety=%d"}, @@ -196,6 +198,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_rsize: fsopt->rsize = intval; break; + case Opt_rasize: + fsopt->rasize = intval; + break; case Opt_caps_wanted_delay_min: fsopt->caps_wanted_delay_min = intval; break; @@ -289,28 +294,29 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); - fsopt->sb_flags = flags; - fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + fsopt->sb_flags = flags; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - fsopt->rsize = CEPH_RSIZE_DEFAULT; - fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->rasize = CEPH_RASIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; - fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; - fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - fsopt->congestion_kb = default_congestion_kb(); - - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ - err = -EINVAL; - if (!dev_name) - goto out; - *path = strstr(dev_name, ":/"); - if (*path == NULL) { - pr_err("device name is missing path (no :/ in %s)\n", - dev_name); - goto out; - } + fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + err = -EINVAL; + if (!dev_name) + goto out; + *path = strstr(dev_name, ":/"); + if (*path == NULL) { + pr_err("device name is missing path (no :/ in %s)\n", + dev_name); + goto out; + } dev_name_end = *path; dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); @@ -335,11 +341,11 @@ out: /** * ceph_show_options - Show mount options in /proc/mounts * @m: seq_file to write to - * @mnt: mount descriptor + * @root: root of that (sub)tree */ -static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) +static int ceph_show_options(struct seq_file *m, struct dentry *root) { - struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); struct ceph_mount_options *fsopt = fsc->mount_options; struct ceph_options *opt = fsc->client->options; @@ -376,6 +382,8 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) seq_printf(m, ",rsize=%d", fsopt->rsize); + if (fsopt->rasize != CEPH_RASIZE_DEFAULT) + seq_printf(m, ",rasize=%d", fsopt->rasize); if (fsopt->congestion_kb != default_congestion_kb()) seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) @@ -418,24 +426,27 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) /* * create a new fs client */ -struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, +static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; + const unsigned supported_features = + CEPH_FEATURE_FLOCK | + CEPH_FEATURE_DIRLAYOUTHASH; + const unsigned required_features = 0; int err = -ENOMEM; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); if (!fsc) return ERR_PTR(-ENOMEM); - fsc->client = ceph_create_client(opt, fsc); + fsc->client = ceph_create_client(opt, fsc, supported_features, + required_features); if (IS_ERR(fsc->client)) { err = PTR_ERR(fsc->client); goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->supported_features |= CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH; fsc->client->monc.want_mdsmap = 1; fsc->mount_options = fsopt; @@ -491,7 +502,7 @@ fail: return ERR_PTR(err); } -void destroy_fs_client(struct ceph_fs_client *fsc) +static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); @@ -625,17 +636,26 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err == 0) { - dout("open_root_inode success\n"); - if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && - fsc->sb->s_root == NULL) - root = d_alloc_root(req->r_target_inode); - else - root = d_obtain_alias(req->r_target_inode); + struct inode *inode = req->r_target_inode; req->r_target_inode = NULL; + dout("open_root_inode success\n"); + if (ceph_ino(inode) == CEPH_INO_ROOT && + fsc->sb->s_root == NULL) { + root = d_alloc_root(inode); + if (!root) { + iput(inode); + root = ERR_PTR(-ENOMEM); + goto out; + } + ceph_init_dentry(root); + } else { + root = d_obtain_alias(inode); + } dout("open_root_inode success, root dentry is %p\n", root); } else { root = ERR_PTR(err); } +out: ceph_mdsc_put_request(req); return root; } @@ -774,10 +794,10 @@ static int ceph_register_bdi(struct super_block *sb, { int err; - /* set ra_pages based on rsize mount option? */ - if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + /* set ra_pages based on rasize mount option? */ + if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE) fsc->backing_dev_info.ra_pages = - (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; else fsc->backing_dev_info.ra_pages = diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a23eed5..cb3652b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -36,7 +36,8 @@ #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */ +#define CEPH_RSIZE_DEFAULT 0 /* max read size */ +#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" @@ -45,8 +46,9 @@ struct ceph_mount_options { int flags; int sb_flags; - int wsize; - int rsize; /* max readahead */ + int wsize; /* max write size */ + int rsize; /* max read size */ + int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; int cap_release_safety; @@ -134,7 +136,7 @@ struct ceph_cap_snap { int issued, dirty; struct ceph_snap_context *context; - mode_t mode; + umode_t mode; uid_t uid; gid_t gid; @@ -201,6 +203,7 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { + unsigned long flags; struct ceph_mds_session *lease_session; u32 lease_gen, lease_shared_gen; u32 lease_seq; @@ -211,6 +214,18 @@ struct ceph_dentry_info { u64 offset; }; +/* + * dentry flags + * + * The locking for D_COMPLETE is a bit odd: + * - we can clear it at almost any time (see ceph_d_prune) + * - it is only meaningful if: + * - we hold dir inode i_ceph_lock + * - we hold dir FILE_SHARED caps + * - the dentry D_COMPLETE is set + */ +#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */ + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -235,6 +250,8 @@ struct ceph_inode_xattrs_info { struct ceph_inode_info { struct ceph_vino i_vino; /* ceph ino + snap */ + spinlock_t i_ceph_lock; + u64 i_version; u32 i_time_warp_seq; @@ -249,14 +266,14 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ + u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; - /* capabilities. protected _both_ by i_lock and cap->session's + /* capabilities. protected _both_ by i_ceph_lock and cap->session's * s_mutex. */ struct rb_root i_caps; /* cap list */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ @@ -344,9 +361,10 @@ static inline struct ceph_vino ceph_vino(struct inode *inode) * x86_64+ino32 64 32 * x86_64 64 64 */ -static inline u32 ceph_ino_to_ino32(ino_t ino) +static inline u32 ceph_ino_to_ino32(__u64 vino) { - ino ^= ino >> (sizeof(ino) * 8 - 32); + u32 ino = vino & 0xffffffff; + ino ^= vino >> 32; if (!ino) ino = 1; return ino; @@ -357,11 +375,11 @@ static inline u32 ceph_ino_to_ino32(ino_t ino) */ static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) { - ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ #if BITS_PER_LONG == 32 - ino = ceph_ino_to_ino32(ino); + return ceph_ino_to_ino32(vino.ino); +#else + return (ino_t)vino.ino; #endif - return ino; } /* @@ -413,7 +431,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Ceph inode. */ -#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ #define CEPH_I_NODELAY 4 /* do not delay cap release */ #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ @@ -422,18 +439,18 @@ static inline void ceph_i_clear(struct inode *inode, unsigned mask) { struct ceph_inode_info *ci = ceph_inode(inode); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_ceph_flags &= ~mask; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } static inline void ceph_i_set(struct inode *inode, unsigned mask) { struct ceph_inode_info *ci = ceph_inode(inode); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_ceph_flags |= mask; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } static inline bool ceph_i_test(struct inode *inode, unsigned mask) @@ -441,9 +458,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask) struct ceph_inode_info *ci = ceph_inode(inode); bool r; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); r = (ci->i_ceph_flags & mask) == mask; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return r; } @@ -471,6 +488,13 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) } /* + * set/clear directory D_COMPLETE flag + */ +void ceph_dir_set_complete(struct inode *inode); +void ceph_dir_clear_complete(struct inode *inode); +bool ceph_dir_test_complete(struct inode *inode); + +/* * caps helpers */ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) @@ -486,9 +510,9 @@ extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, static inline int ceph_caps_issued(struct ceph_inode_info *ci) { int issued; - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); return issued; } @@ -496,9 +520,9 @@ static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) { int r; - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); r = __ceph_caps_issued_mask(ci, mask, touch); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); return r; } @@ -721,10 +745,9 @@ extern int ceph_add_cap(struct inode *inode, extern void __ceph_remove_cap(struct ceph_cap *cap); static inline void ceph_remove_cap(struct ceph_cap *cap) { - struct inode *inode = &cap->ci->vfs_inode; - spin_lock(&inode->i_lock); + spin_lock(&cap->ci->i_ceph_lock); __ceph_remove_cap(cap); - spin_unlock(&inode->i_lock); + spin_unlock(&cap->ci->i_ceph_lock); } extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 96c6739..a5e36e4 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -343,8 +343,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci) } static int __build_xattrs(struct inode *inode) - __releases(inode->i_lock) - __acquires(inode->i_lock) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { u32 namelen; u32 numattr = 0; @@ -372,7 +372,7 @@ start: end = p + ci->i_xattrs.blob->vec.iov_len; ceph_decode_32_safe(&p, end, numattr, bad); xattr_version = ci->i_xattrs.version; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), GFP_NOFS); @@ -387,7 +387,7 @@ start: goto bad_lock; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.version != xattr_version) { /* lost a race, retry */ for (i = 0; i < numattr; i++) @@ -418,7 +418,7 @@ start: return err; bad_lock: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); bad: if (xattrs) { for (i = 0; i < numattr; i++) @@ -512,7 +512,7 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (vxattrs) vxattr = ceph_match_vxattr(vxattrs, name); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("getxattr %p ver=%lld index_ver=%lld\n", inode, ci->i_xattrs.version, ci->i_xattrs.index_version); @@ -520,14 +520,14 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto get_xattr; } else { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); /* get xattrs from mds (if we don't already have them) */ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); if (err) return err; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (vxattr && vxattr->readonly) { err = vxattr->getxattr_cb(ci, value, size); @@ -558,7 +558,7 @@ get_xattr: memcpy(value, xattr->val, xattr->val_len); out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return err; } @@ -573,7 +573,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) u32 len; int i; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("listxattr %p ver=%lld index_ver=%lld\n", inode, ci->i_xattrs.version, ci->i_xattrs.index_version); @@ -581,13 +581,13 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto list_xattr; } else { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); if (err) return err; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); err = __build_xattrs(inode); if (err < 0) @@ -619,7 +619,7 @@ list_xattr: } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return err; } @@ -739,7 +739,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name, if (!xattr) goto out; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); if (!(issued & CEPH_CAP_XATTR_EXCL)) @@ -752,12 +752,12 @@ retry: required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { struct ceph_buffer *blob = NULL; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout(" preaallocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) goto out; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); ci->i_xattrs.prealloc_blob = blob; @@ -770,13 +770,13 @@ retry: dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); return err; do_sync: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); err = ceph_sync_setxattr(dentry, name, value, size, flags); out: kfree(newname); @@ -833,7 +833,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) return -EOPNOTSUPP; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __build_xattrs(inode); issued = __ceph_caps_issued(ci, NULL); dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); @@ -846,12 +846,12 @@ int ceph_removexattr(struct dentry *dentry, const char *name) ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); return err; do_sync: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); err = ceph_send_removexattr(dentry, name); return err; } diff --git a/fs/char_dev.c b/fs/char_dev.c index dca9e5e..3f152b9 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -272,7 +272,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor, cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); - + cdev = cdev_alloc(); if (!cdev) goto out2; @@ -280,7 +280,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor, cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); - + err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; @@ -405,7 +405,7 @@ static int chrdev_open(struct inode *inode, struct file *filp) goto out_cdev_put; if (filp->f_op->open) { - ret = filp->f_op->open(inode,filp); + ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } diff --git a/fs/cifs/README b/fs/cifs/README index c5c2c5e..895da1d 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -745,4 +745,18 @@ installed and something like the following lines should be added to the create cifs.spnego * * /usr/local/sbin/cifs.upcall %k create dns_resolver * * /usr/local/sbin/cifs.upcall %k +CIFS kernel module parameters +============================= +These module parameters can be specified or modified either during the time of +module loading or during the runtime by using the interface + /proc/module/cifs/parameters/<param> + +i.e. echo "value" > /sys/module/cifs/parameters/<param> + +1. echo_retries - The number of echo attempts before giving up and + reconnecting to the server. The default is 5. The value 0 + means never reconnect. + +2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default. + [Y/y/1]. To disable use any of [N/n/0]. diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 6d40656..84e8c07 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -511,7 +511,7 @@ static const struct file_operations cifsFYI_proc_fops = { static int cifs_oplock_proc_show(struct seq_file *m, void *v) { - seq_printf(m, "%d\n", oplockEnabled); + seq_printf(m, "%d\n", enable_oplocks); return 0; } @@ -526,13 +526,16 @@ static ssize_t cifs_oplock_proc_write(struct file *file, char c; int rc; + printk(KERN_WARNING "CIFS: The /proc/fs/cifs/OplockEnabled interface " + "will be removed in kernel version 3.4. Please migrate to " + "using the 'enable_oplocks' module parameter in cifs.ko.\n"); rc = get_user(c, buffer); if (rc) return rc; if (c == '0' || c == 'n' || c == 'N') - oplockEnabled = 0; + enable_oplocks = false; else if (c == '1' || c == 'y' || c == 'Y') - oplockEnabled = 1; + enable_oplocks = true; return count; } diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 7260e11..c865bfd 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -43,6 +43,8 @@ #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ #define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ +#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ +#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ struct cifs_sb_info { struct rb_root tlink_tree; @@ -55,8 +57,10 @@ struct cifs_sb_info { atomic_t active; uid_t mnt_uid; gid_t mnt_gid; - mode_t mnt_file_mode; - mode_t mnt_dir_mode; + uid_t mnt_backupuid; + gid_t mnt_backupgid; + umode_t mnt_file_mode; + umode_t mnt_dir_mode; unsigned int mnt_cifs_flags; char *mountdata; /* options received at mount time or via DFS refs */ struct backing_dev_info bdi; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index d0f59fa..72ddf23 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -91,9 +91,76 @@ cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc) shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); spin_unlock(&sidgidlock); + root = &siduidtree; + spin_lock(&uidsidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&uidsidlock); + + root = &sidgidtree; + spin_lock(&gidsidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&gidsidlock); + return nr_rem; } +static void +sid_rb_insert(struct rb_root *root, unsigned long cid, + struct cifs_sid_id **psidid, char *typestr) +{ + char *strptr; + struct rb_node *node = root->rb_node; + struct rb_node *parent = NULL; + struct rb_node **linkto = &(root->rb_node); + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + parent = node; + if (cid > lsidid->id) { + linkto = &(node->rb_left); + node = node->rb_left; + } + if (cid < lsidid->id) { + linkto = &(node->rb_right); + node = node->rb_right; + } + } + + (*psidid)->id = cid; + (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); + (*psidid)->refcount = 0; + + sprintf((*psidid)->sidstr, "%s", typestr); + strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); + sprintf(strptr, "%ld", cid); + + clear_bit(SID_ID_PENDING, &(*psidid)->state); + clear_bit(SID_ID_MAPPED, &(*psidid)->state); + + rb_link_node(&(*psidid)->rbnode, parent, linkto); + rb_insert_color(&(*psidid)->rbnode, root); +} + +static struct cifs_sid_id * +sid_rb_search(struct rb_root *root, unsigned long cid) +{ + struct rb_node *node = root->rb_node; + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + if (cid > lsidid->id) + node = node->rb_left; + else if (cid < lsidid->id) + node = node->rb_right; + else /* node found */ + return lsidid; + } + + return NULL; +} + static struct shrinker cifs_shrinker = { .shrink = cifs_idmap_shrinker, .seeks = DEFAULT_SEEKS, @@ -110,6 +177,7 @@ cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen) memcpy(payload, data, datalen); key->payload.data = payload; + key->datalen = datalen; return 0; } @@ -224,6 +292,120 @@ sidid_pending_wait(void *unused) } static int +id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) +{ + int rc = 0; + struct key *sidkey; + const struct cred *saved_cred; + struct cifs_sid *lsid; + struct cifs_sid_id *psidid, *npsidid; + struct rb_root *cidtree; + spinlock_t *cidlock; + + if (sidtype == SIDOWNER) { + cidlock = &siduidlock; + cidtree = &uidtree; + } else if (sidtype == SIDGROUP) { + cidlock = &sidgidlock; + cidtree = &gidtree; + } else + return -EINVAL; + + spin_lock(cidlock); + psidid = sid_rb_search(cidtree, cid); + + if (!psidid) { /* node does not exist, allocate one & attempt adding */ + spin_unlock(cidlock); + npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); + if (!npsidid) + return -ENOMEM; + + npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); + if (!npsidid->sidstr) { + kfree(npsidid); + return -ENOMEM; + } + + spin_lock(cidlock); + psidid = sid_rb_search(cidtree, cid); + if (psidid) { /* node happened to get inserted meanwhile */ + ++psidid->refcount; + spin_unlock(cidlock); + kfree(npsidid->sidstr); + kfree(npsidid); + } else { + psidid = npsidid; + sid_rb_insert(cidtree, cid, &psidid, + sidtype == SIDOWNER ? "oi:" : "gi:"); + ++psidid->refcount; + spin_unlock(cidlock); + } + } else { + ++psidid->refcount; + spin_unlock(cidlock); + } + + /* + * If we are here, it is safe to access psidid and its fields + * since a reference was taken earlier while holding the spinlock. + * A reference on the node is put without holding the spinlock + * and it is OK to do so in this case, shrinker will not erase + * this node until all references are put and we do not access + * any fields of the node after a reference is put . + */ + if (test_bit(SID_ID_MAPPED, &psidid->state)) { + memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + psidid->time = jiffies; /* update ts for accessing */ + goto id_sid_out; + } + + if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { + rc = -EINVAL; + goto id_sid_out; + } + + if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { + saved_cred = override_creds(root_cred); + sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); + if (IS_ERR(sidkey)) { + rc = -EINVAL; + cFYI(1, "%s: Can't map and id to a SID", __func__); + } else { + lsid = (struct cifs_sid *)sidkey->payload.data; + memcpy(&psidid->sid, lsid, + sidkey->datalen < sizeof(struct cifs_sid) ? + sidkey->datalen : sizeof(struct cifs_sid)); + memcpy(ssid, &psidid->sid, + sidkey->datalen < sizeof(struct cifs_sid) ? + sidkey->datalen : sizeof(struct cifs_sid)); + set_bit(SID_ID_MAPPED, &psidid->state); + key_put(sidkey); + kfree(psidid->sidstr); + } + psidid->time = jiffies; /* update ts for accessing */ + revert_creds(saved_cred); + clear_bit(SID_ID_PENDING, &psidid->state); + wake_up_bit(&psidid->state, SID_ID_PENDING); + } else { + rc = wait_on_bit(&psidid->state, SID_ID_PENDING, + sidid_pending_wait, TASK_INTERRUPTIBLE); + if (rc) { + cFYI(1, "%s: sidid_pending_wait interrupted %d", + __func__, rc); + --psidid->refcount; + return rc; + } + if (test_bit(SID_ID_MAPPED, &psidid->state)) + memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); + else + rc = -EINVAL; + } +id_sid_out: + --psidid->refcount; + return rc; +} + +static int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct cifs_fattr *fattr, uint sidtype) { @@ -383,6 +565,10 @@ init_cifs_idmap(void) spin_lock_init(&sidgidlock); gidtree = RB_ROOT; + spin_lock_init(&uidsidlock); + siduidtree = RB_ROOT; + spin_lock_init(&gidsidlock); + sidgidtree = RB_ROOT; register_shrinker(&cifs_shrinker); cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring)); @@ -422,6 +608,18 @@ cifs_destroy_idmaptrees(void) while ((node = rb_first(root))) rb_erase(node, root); spin_unlock(&sidgidlock); + + root = &siduidtree; + spin_lock(&uidsidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&uidsidlock); + + root = &sidgidtree; + spin_lock(&gidsidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&gidsidlock); } /* if the two SIDs (roughly equivalent to a UUID for a user or group) are @@ -706,7 +904,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, acl_size = sizeof(struct cifs_acl); num_aces = le32_to_cpu(pdacl->num_aces); - if (num_aces > 0) { + if (num_aces > 0) { umode_t user_mask = S_IRWXU; umode_t group_mask = S_IRWXG; umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; @@ -868,52 +1066,82 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, else cFYI(1, "no ACL"); /* BB grant all or default perms? */ -/* cifscred->uid = owner_sid_ptr->rid; - cifscred->gid = group_sid_ptr->rid; - memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr, - sizeof(struct cifs_sid)); - memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, - sizeof(struct cifs_sid)); */ - return rc; } - /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, - struct inode *inode, __u64 nmode) + __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag) { int rc = 0; __u32 dacloffset; __u32 ndacloffset; __u32 sidsoffset; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ - if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) - return -EIO; - - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + if (nmode != NO_CHANGE_64) { /* chmod */ + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); - - dacloffset = le32_to_cpu(pntsd->dacloffset); - dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); - - ndacloffset = sizeof(struct cifs_ntsd); - ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); - ndacl_ptr->revision = dacl_ptr->revision; - ndacl_ptr->size = 0; - ndacl_ptr->num_aces = 0; - - rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode); - - sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); - - /* copy security descriptor control portion and owner and group sid */ - copy_sec_desc(pntsd, pnntsd, sidsoffset); + dacloffset = le32_to_cpu(pntsd->dacloffset); + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + ndacloffset = sizeof(struct cifs_ntsd); + ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacl_ptr->revision = dacl_ptr->revision; + ndacl_ptr->size = 0; + ndacl_ptr->num_aces = 0; + + rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, + nmode); + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); + /* copy sec desc control portion & owner and group sids */ + copy_sec_desc(pntsd, pnntsd, sidsoffset); + *aclflag = CIFS_ACL_DACL; + } else { + memcpy(pnntsd, pntsd, secdesclen); + if (uid != NO_CHANGE_32) { /* chown */ + owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + + le32_to_cpu(pnntsd->osidoffset)); + nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), + GFP_KERNEL); + if (!nowner_sid_ptr) + return -ENOMEM; + rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr); + if (rc) { + cFYI(1, "%s: Mapping error %d for owner id %d", + __func__, rc, uid); + kfree(nowner_sid_ptr); + return rc; + } + memcpy(owner_sid_ptr, nowner_sid_ptr, + sizeof(struct cifs_sid)); + kfree(nowner_sid_ptr); + *aclflag = CIFS_ACL_OWNER; + } + if (gid != NO_CHANGE_32) { /* chgrp */ + group_sid_ptr = (struct cifs_sid *)((char *)pnntsd + + le32_to_cpu(pnntsd->gsidoffset)); + ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), + GFP_KERNEL); + if (!ngroup_sid_ptr) + return -ENOMEM; + rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr); + if (rc) { + cFYI(1, "%s: Mapping error %d for group id %d", + __func__, rc, gid); + kfree(ngroup_sid_ptr); + return rc; + } + memcpy(group_sid_ptr, ngroup_sid_ptr, + sizeof(struct cifs_sid)); + kfree(ngroup_sid_ptr); + *aclflag = CIFS_ACL_GROUP; + } + } return rc; } @@ -945,7 +1173,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, { struct cifs_ntsd *pntsd = NULL; int oplock = 0; - int xid, rc; + int xid, rc, create_options = 0; __u16 fid; struct cifs_tcon *tcon; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -956,9 +1184,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, tcon = tlink_tcon(tlink); xid = GetXid(); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, - &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, + create_options, &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (!rc) { rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); CIFSSMBClose(xid, tcon, fid); @@ -991,13 +1222,15 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, return pntsd; } -static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, - struct cifs_ntsd *pnntsd, u32 acllen) + /* Set an ACL on the server */ +int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, + struct inode *inode, const char *path, int aclflag) { int oplock = 0; - int xid, rc; + int xid, rc, access_flags, create_options = 0; __u16 fid; struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1006,15 +1239,23 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, tcon = tlink_tcon(tlink); xid = GetXid(); - rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0, - &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP) + access_flags = WRITE_OWNER; + else + access_flags = WRITE_DAC; + + rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags, + create_options, &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { cERROR(1, "Unable to open file to set ACL"); goto out; } - rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen); + rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag); cFYI(DBG2, "SetCIFSACL rc = %d", rc); CIFSSMBClose(xid, tcon, fid); @@ -1024,17 +1265,6 @@ out: return rc; } -/* Set an ACL on the server */ -int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, - struct inode *inode, const char *path) -{ - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - - cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); - - return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); -} - /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, @@ -1066,9 +1296,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, } /* Convert mode bits to an ACL so we can update the ACL on the server */ -int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) +int +id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, + uid_t uid, gid_t gid) { int rc = 0; + int aclflag = CIFS_ACL_DACL; /* default flag to set */ __u32 secdesclen = 0; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ @@ -1098,13 +1331,15 @@ int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) return -ENOMEM; } - rc = build_sec_desc(pntsd, pnntsd, inode, nmode); + rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, + &aclflag); cFYI(DBG2, "build_sec_desc rc: %d", rc); if (!rc) { /* Set the security descriptor */ - rc = set_cifs_acl(pnntsd, secdesclen, inode, path); + rc = set_cifs_acl(pnntsd, secdesclen, inode, + path, aclflag); cFYI(DBG2, "set_cifs_acl rc: %d", rc); } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 30acd22..5d9b9ac 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -37,83 +37,8 @@ * the sequence number before this function is called. Also, this function * should be called with the server->srv_mutex held. */ -static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, - struct TCP_Server_Info *server, char *signature) -{ - int rc; - - if (cifs_pdu == NULL || signature == NULL || server == NULL) - return -EINVAL; - - if (!server->secmech.sdescmd5) { - cERROR(1, "%s: Can't generate signature\n", __func__); - return -1; - } - - rc = crypto_shash_init(&server->secmech.sdescmd5->shash); - if (rc) { - cERROR(1, "%s: Could not init md5\n", __func__); - return rc; - } - - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, - server->session_key.response, server->session_key.len); - if (rc) { - cERROR(1, "%s: Could not update with response\n", __func__); - return rc; - } - - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, - cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); - if (rc) { - cERROR(1, "%s: Could not update with payload\n", __func__); - return rc; - } - - rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); - if (rc) - cERROR(1, "%s: Could not generate md5 hash\n", __func__); - - return rc; -} - -/* must be called with server->srv_mutex held */ -int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, - __u32 *pexpected_response_sequence_number) -{ - int rc = 0; - char smb_signature[20]; - - if ((cifs_pdu == NULL) || (server == NULL)) - return -EINVAL; - - if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || - server->tcpStatus == CifsNeedNegotiate) - return rc; - - if (!server->session_estab) { - strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); - return rc; - } - - cifs_pdu->Signature.Sequence.SequenceNumber = - cpu_to_le32(server->sequence_number); - cifs_pdu->Signature.Sequence.Reserved = 0; - - *pexpected_response_sequence_number = server->sequence_number++; - server->sequence_number++; - - rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); - if (rc) - memset(cifs_pdu->Signature.SecuritySignature, 0, 8); - else - memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8); - - return rc; -} - -static int cifs_calc_signature2(const struct kvec *iov, int n_vec, - struct TCP_Server_Info *server, char *signature) +static int cifs_calc_signature(const struct kvec *iov, int n_vec, + struct TCP_Server_Info *server, char *signature) { int i; int rc; @@ -179,7 +104,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, { int rc = 0; char smb_signature[20]; - struct smb_hdr *cifs_pdu = iov[0].iov_base; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; if ((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; @@ -189,7 +114,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, return rc; if (!server->session_estab) { - strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); + memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); return rc; } @@ -200,7 +125,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, *pexpected_response_sequence_number = server->sequence_number++; server->sequence_number++; - rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); + rc = cifs_calc_signature(iov, n_vec, server, smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -209,13 +134,27 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, return rc; } -int cifs_verify_signature(struct smb_hdr *cifs_pdu, +/* must be called with server->srv_mutex held */ +int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, + __u32 *pexpected_response_sequence_number) +{ + struct kvec iov; + + iov.iov_base = cifs_pdu; + iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4; + + return cifs_sign_smb2(&iov, 1, server, + pexpected_response_sequence_number); +} + +int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number) { unsigned int rc; char server_response_sig[8]; char what_we_think_sig_should_be[20]; + struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; if (cifs_pdu == NULL || server == NULL) return -EINVAL; @@ -247,8 +186,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, cifs_pdu->Signature.Sequence.Reserved = 0; mutex_lock(&server->srv_mutex); - rc = cifs_calculate_signature(cifs_pdu, server, - what_we_think_sig_should_be); + rc = cifs_calc_signature(iov, nr_iov, server, + what_we_think_sig_should_be); mutex_unlock(&server->srv_mutex); if (rc) @@ -265,7 +204,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, } /* first calculate 24 bytes ntlm response and then 16 byte session key */ -int setup_ntlm_response(struct cifs_ses *ses) +int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp) { int rc = 0; unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; @@ -282,14 +221,14 @@ int setup_ntlm_response(struct cifs_ses *ses) ses->auth_key.len = temp_len; rc = SMBNTencrypt(ses->password, ses->server->cryptkey, - ses->auth_key.response + CIFS_SESS_KEY_SIZE); + ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp); if (rc) { cFYI(1, "%s Can't generate NTLM response, error: %d", __func__, rc); return rc; } - rc = E_md4hash(ses->password, temp_key); + rc = E_md4hash(ses->password, temp_key, nls_cp); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; @@ -465,7 +404,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, } /* calculate md4 hash of password */ - E_md4hash(ses->password, nt_hash); + E_md4hash(ses->password, nt_hash, nls_cp); rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 54b8f1e..b1fd382 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -53,7 +53,7 @@ int cifsFYI = 0; int cifsERROR = 1; int traceSMB = 0; -unsigned int oplockEnabled = 1; +bool enable_oplocks = true; unsigned int linuxExtEnabled = 1; unsigned int lookupCacheEnabled = 1; unsigned int multiuser_mount = 0; @@ -74,7 +74,7 @@ module_param(cifs_min_small, int, 0); MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; -module_param(cifs_max_pending, int, 0); +module_param(cifs_max_pending, int, 0444); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " "Default: 50 Range: 2 to 256"); unsigned short echo_retries = 5; @@ -82,6 +82,10 @@ module_param(echo_retries, ushort, 0644); MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " "reconnecting server. Default: 5. 0 means " "never reconnect."); +module_param(enable_oplocks, bool, 0644); +MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" + "y/Y/1"); + extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; @@ -132,12 +136,12 @@ cifs_read_super(struct super_block *sb) else sb->s_d_op = &cifs_dentry_ops; -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cFYI(1, "export ops supported"); sb->s_export_op = &cifs_export_ops; } -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ return 0; @@ -339,9 +343,9 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) * ones are. */ static int -cifs_show_options(struct seq_file *s, struct vfsmount *m) +cifs_show_options(struct seq_file *s, struct dentry *root) { - struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct sockaddr *srcaddr; srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; @@ -389,7 +393,7 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) cifs_show_address(s, tcon->ses->server); if (!tcon->unix_ext) - seq_printf(s, ",file_mode=0%o,dir_mode=0%o", + seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho", cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); if (tcon->seal) @@ -426,12 +430,18 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",cifsacl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) seq_printf(s, ",dynperm"); - if (m->mnt_sb->s_flags & MS_POSIXACL) + if (root->d_sb->s_flags & MS_POSIXACL) seq_printf(s, ",acl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) seq_printf(s, ",mfsymlinks"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) seq_printf(s, ",fsc"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) + seq_printf(s, ",nostrictsync"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) + seq_printf(s, ",noperm"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) + seq_printf(s, ",strictcache"); seq_printf(s, ",rsize=%d", cifs_sb->rsize); seq_printf(s, ",wsize=%d", cifs_sb->wsize); @@ -478,7 +488,7 @@ static void cifs_umount_begin(struct super_block *sb) } #ifdef CONFIG_CIFS_STATS2 -static int cifs_show_stats(struct seq_file *s, struct vfsmount *mnt) +static int cifs_show_stats(struct seq_file *s, struct dentry *root) { /* BB FIXME */ return 0; @@ -530,7 +540,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) char *full_path = NULL; char *s, *p; char sep; - int xid; full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); @@ -539,7 +548,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) cFYI(1, "Get root dentry for %s", full_path); - xid = GetXid(); sep = CIFS_DIR_SEP(cifs_sb); dentry = dget(sb->s_root); p = s = full_path; @@ -570,7 +578,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) dput(dentry); dentry = child; } while (!IS_ERR(dentry)); - _FreeXid(xid); kfree(full_path); return dentry; } @@ -723,7 +730,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) if (rc < 0) return (loff_t)rc; } - return generic_file_llseek_unlocked(file, offset, origin); + return generic_file_llseek(file, offset, origin); } static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) @@ -942,7 +949,8 @@ cifs_init_once(void *inode) struct cifsInodeInfo *cifsi = inode; inode_init_once(&cifsi->vfs_inode); - INIT_LIST_HEAD(&cifsi->lockList); + INIT_LIST_HEAD(&cifsi->llist); + mutex_init(&cifsi->lock_mutex); } static int diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 95da802..fe5ecf1 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -44,14 +44,14 @@ extern const struct address_space_operations cifs_addr_ops_smallbuf; /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); -extern int cifs_create(struct inode *, struct dentry *, int, +extern int cifs_create(struct inode *, struct dentry *, umode_t, struct nameidata *); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, struct nameidata *); extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); -extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t); -extern int cifs_mkdir(struct inode *, struct dentry *, int); +extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); +extern int cifs_mkdir(struct inode *, struct dentry *, umode_t); extern int cifs_rmdir(struct inode *, struct dentry *); extern int cifs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); @@ -121,9 +121,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT extern const struct export_operations cifs_export_ops; -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.75" +#define CIFS_VERSION "1.76" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 95dad9d..ba53c1c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -167,8 +167,10 @@ struct smb_vol { uid_t cred_uid; uid_t linux_uid; gid_t linux_gid; - mode_t file_mode; - mode_t dir_mode; + uid_t backupuid; + gid_t backupgid; + umode_t file_mode; + umode_t dir_mode; unsigned secFlg; bool retry:1; bool intr:1; @@ -179,6 +181,8 @@ struct smb_vol { bool noperm:1; bool no_psx_acl:1; /* set if posix acl support should be disabled */ bool cifs_acl:1; + bool backupuid_specified; /* mount option backupuid is specified */ + bool backupgid_specified; /* mount option backupgid is specified */ bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ bool server_ino:1; /* use inode numbers from server ie UniqueId */ bool direct_io:1; @@ -219,7 +223,8 @@ struct smb_vol { CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \ CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \ - CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO) + CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ + CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID) #define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ MS_NODEV | MS_SYNCHRONOUS) @@ -286,7 +291,13 @@ struct TCP_Server_Info { bool sec_kerberosu2u; /* supports U2U Kerberos */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ + bool large_buf; /* is current buffer large? */ struct delayed_work echo; /* echo ping workqueue job */ + struct kvec *iov; /* reusable kvec array for receives */ + unsigned int nr_iov; /* number of kvecs in array */ + char *smallbuf; /* pointer to current "small" buffer */ + char *bigbuf; /* pointer to current "big" buffer */ + unsigned int total_read; /* total amount of data read in this pass */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif @@ -485,9 +496,13 @@ extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb); */ struct cifsLockInfo { struct list_head llist; /* pointer to next cifsLockInfo */ + struct list_head blist; /* pointer to locks blocked on this */ + wait_queue_head_t block_q; __u64 offset; __u64 length; + __u32 pid; __u8 type; + __u16 netfid; }; /* @@ -520,8 +535,6 @@ struct cifsFileInfo { struct dentry *dentry; unsigned int f_flags; struct tcon_link *tlink; - struct mutex lock_mutex; - struct list_head llist; /* list of byte range locks we have. */ bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; int count; /* refcount protected by cifs_file_list_lock */ @@ -554,7 +567,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); */ struct cifsInodeInfo { - struct list_head lockList; + struct list_head llist; /* brlocks for this inode */ + bool can_cache_brlcks; + struct mutex lock_mutex; /* protect two fields above */ /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ @@ -643,8 +658,24 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon, struct mid_q_entry; /* - * This is the prototype for the mid callback function. When creating one, - * take special care to avoid deadlocks. Things to bear in mind: + * This is the prototype for the mid receive function. This function is for + * receiving the rest of the SMB frame, starting with the WordCount (which is + * just after the MID in struct smb_hdr). Note: + * + * - This will be called by cifsd, with no locks held. + * - The mid will still be on the pending_mid_q. + * - mid->resp_buf will point to the current buffer. + * + * Returns zero on a successful receive, or an error. The receive state in + * the TCP_Server_Info will also be updated. + */ +typedef int (mid_receive_t)(struct TCP_Server_Info *server, + struct mid_q_entry *mid); + +/* + * This is the prototype for the mid callback function. This is called once the + * mid has been received off of the socket. When creating one, take special + * care to avoid deadlocks. Things to bear in mind: * * - it will be called by cifsd, with no locks held * - the mid will be removed from any lists @@ -662,9 +693,10 @@ struct mid_q_entry { unsigned long when_sent; /* time when smb send finished */ unsigned long when_received; /* when demux complete (taken off wire) */ #endif + mid_receive_t *receive; /* call receive callback */ mid_callback_t *callback; /* call completion callback */ void *callback_data; /* general purpose pointer for callback */ - struct smb_hdr *resp_buf; /* response buffer */ + struct smb_hdr *resp_buf; /* pointer to received SMB header */ int midState; /* wish this were enum but can not pass to wait_event */ __u8 command; /* smb command code */ bool largeBuf:1; /* if valid response, is pointer to large buf */ @@ -964,7 +996,8 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions to be established on existing mount if we have the uid/password or Kerberos credential or equivalent for current user */ -GLOBAL_EXTERN unsigned int oplockEnabled; +/* enable or disable oplocks */ +GLOBAL_EXTERN bool enable_oplocks; GLOBAL_EXTERN unsigned int lookupCacheEnabled; GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ @@ -978,10 +1011,16 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ /* reconnect after this many failed echo attempts */ GLOBAL_EXTERN unsigned short echo_retries; +#ifdef CONFIG_CIFS_ACL GLOBAL_EXTERN struct rb_root uidtree; GLOBAL_EXTERN struct rb_root gidtree; GLOBAL_EXTERN spinlock_t siduidlock; GLOBAL_EXTERN spinlock_t sidgidlock; +GLOBAL_EXTERN struct rb_root siduidtree; +GLOBAL_EXTERN struct rb_root sidgidtree; +GLOBAL_EXTERN spinlock_t uidsidlock; +GLOBAL_EXTERN spinlock_t gidsidlock; +#endif /* CONFIG_CIFS_ACL */ void cifs_oplock_break(struct work_struct *work); diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index de3aa28..3fb03e2 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1089,9 +1089,7 @@ typedef struct smb_com_read_rsp { __le16 DataLengthHigh; __u64 Reserved2; __u16 ByteCount; - __u8 Pad; /* BB check for whether padded to DWORD - boundary and optimum performance here */ - char Data[1]; + /* read response data immediately follows */ } __attribute__((packed)) READ_RSP; typedef struct locking_andx_range { @@ -1913,6 +1911,10 @@ typedef struct whoami_rsp_data { /* Query level 0x202 */ /* SETFSInfo Levels */ #define SMB_SET_CIFS_UNIX_INFO 0x200 +/* level 0x203 is defined above in list of QFS info levels */ +/* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */ + +/* Level 0x200 request structure follows */ typedef struct smb_com_transaction2_setfsi_req { struct smb_hdr hdr; /* wct = 15 */ __le16 TotalParameterCount; @@ -1940,13 +1942,39 @@ typedef struct smb_com_transaction2_setfsi_req { __le64 ClientUnixCap; /* Data end */ } __attribute__((packed)) TRANSACTION2_SETFSI_REQ; +/* level 0x203 request structure follows */ +typedef struct smb_com_transaction2_setfs_enc_req { + struct smb_hdr hdr; /* wct = 15 */ + __le16 TotalParameterCount; + __le16 TotalDataCount; + __le16 MaxParameterCount; + __le16 MaxDataCount; + __u8 MaxSetupCount; + __u8 Reserved; + __le16 Flags; + __le32 Timeout; + __u16 Reserved2; + __le16 ParameterCount; /* 4 */ + __le16 ParameterOffset; + __le16 DataCount; /* 12 */ + __le16 DataOffset; + __u8 SetupCount; /* one */ + __u8 Reserved3; + __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */ + __le16 ByteCount; + __u8 Pad; + __u16 Reserved4; /* Parameters start. */ + __le16 InformationLevel;/* Parameters end. */ + /* NTLMSSP Blob, Data start. */ +} __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ; + +/* response for setfsinfo levels 0x200 and 0x203 */ typedef struct smb_com_transaction2_setfsi_rsp { struct smb_hdr hdr; /* wct = 10 */ struct trans2_resp t2; __u16 ByteCount; } __attribute__((packed)) TRANSACTION2_SETFSI_RSP; - typedef struct smb_com_transaction2_get_dfs_refer_req { struct smb_hdr hdr; /* wct = 15 */ __le16 TotalParameterCount; @@ -2098,13 +2126,13 @@ typedef struct { #define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */ #ifdef CONFIG_CIFS_POSIX -/* Can not set pathnames cap yet until we send new posix create SMB since - otherwise server can treat such handles opened with older ntcreatex - (by a new client which knows how to send posix path ops) - as non-posix handles (can affect write behavior with byte range locks. - We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */ +/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send + LockingX instead of posix locking call on unix sess (and we do not expect + LockingX to use different (ie Windows) semantics than posix locking on + the same session (if WINE needs to do this later, we can add this cap + back in later */ /* #define CIFS_UNIX_CAP_MASK 0x000000fb */ -#define CIFS_UNIX_CAP_MASK 0x000000db +#define CIFS_UNIX_CAP_MASK 0x000003db #else #define CIFS_UNIX_CAP_MASK 0x00000013 #endif /* CONFIG_CIFS_POSIX */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 8df28e9..6f4e243 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -69,8 +69,9 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server); extern void DeleteMidQEntry(struct mid_q_entry *midEntry); extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, mid_callback_t *callback, - void *cbdata, bool ignore_pend); + unsigned int nvec, mid_receive_t *receive, + mid_callback_t *callback, void *cbdata, + bool ignore_pend); extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, struct smb_hdr * /* input */ , struct smb_hdr * /* out */ , @@ -90,6 +91,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); extern bool is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); +extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); @@ -145,12 +147,19 @@ extern int cifs_get_inode_info_unix(struct inode **pinode, extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid); -extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); +extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64, + uid_t, gid_t); extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, - const char *); + const char *, int); +extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); +extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read); +extern int cifs_readv_from_socket(struct TCP_Server_Info *server, + struct kvec *iov_orig, unsigned int nr_segs, + unsigned int to_read); extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); @@ -359,14 +368,17 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage, int remap_special_chars); +extern int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid, + const __u8 lock_type, const __u32 num_unlock, + const __u32 num_lock, LOCKING_ANDX_RANGE *buf); extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, - const __u16 netfid, const __u64 len, + const __u16 netfid, const __u32 netpid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, const bool waitFlag, const __u8 oplock_level); extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const int get_flag, - const __u64 len, struct file_lock *, + const __u16 smb_file_id, const __u32 netpid, + const int get_flag, const __u64 len, struct file_lock *, const __u16 lock_type, const bool waitFlag); extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon); extern int CIFSSMBEcho(struct TCP_Server_Info *server); @@ -380,11 +392,12 @@ extern void tconInfoFree(struct cifs_tcon *); extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, __u32 *); -extern int cifs_verify_signature(struct smb_hdr *, +extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number); -extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); -extern int setup_ntlm_response(struct cifs_ses *); +extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *, + const struct nls_table *); +extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); @@ -419,7 +432,7 @@ extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon, extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16, - struct cifs_ntsd *, __u32); + struct cifs_ntsd *, __u32, int); extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char *acl_inf, const int buflen, const int acl_type, @@ -436,10 +449,29 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, const unsigned char *path, struct cifs_sb_info *cifs_sb, int xid); extern int mdfour(unsigned char *, unsigned char *, int); -extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); +extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, + const struct nls_table *codepage); extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); +/* asynchronous read support */ +struct cifs_readdata { + struct cifsFileInfo *cfile; + struct address_space *mapping; + __u64 offset; + unsigned int bytes; + pid_t pid; + int result; + struct list_head pages; + struct work_struct work; + unsigned int nr_iov; + struct kvec iov[1]; +}; + +struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages); +void cifs_readdata_free(struct cifs_readdata *rdata); +int cifs_async_readv(struct cifs_readdata *rdata); + /* asynchronous write support */ struct cifs_writedata { struct kref refcount; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index a80f7bd..6600aa2 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -33,6 +33,8 @@ #include <linux/slab.h> #include <linux/posix_acl_xattr.h> #include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/task_io_accounting_ops.h> #include <asm/uaccess.h> #include "cifspdu.h" #include "cifsglob.h" @@ -40,6 +42,7 @@ #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" +#include "fscache.h" #ifdef CONFIG_CIFS_POSIX static struct { @@ -83,6 +86,9 @@ static struct { #endif /* CONFIG_CIFS_WEAK_PW_HASH */ #endif /* CIFS_POSIX */ +/* Forward declarations */ +static void cifs_readv_complete(struct work_struct *work); + /* Mark as invalid, all open files on tree connections since they were closed when session to server was lost */ static void mark_open_files_invalid(struct cifs_tcon *pTcon) @@ -453,8 +459,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) } server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode); server->maxReq = le16_to_cpu(rsp->MaxMpxCount); - server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), - (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->maxBuf = le16_to_cpu(rsp->MaxBufSize); server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); /* even though we do not use raw we might as well set this accurately, in case we ever find a need for it */ @@ -561,8 +566,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) little endian */ server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); /* probably no need to store and check maxvcs */ - server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), - (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); server->capabilities = le32_to_cpu(pSMBr->Capabilities); @@ -739,7 +743,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server) iov.iov_base = smb; iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; - rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true); + rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback, + server, true); if (rc) cFYI(1, "Echo request failed: %d", rc); @@ -1376,6 +1381,359 @@ openRetry: return rc; } +struct cifs_readdata * +cifs_readdata_alloc(unsigned int nr_pages) +{ + struct cifs_readdata *rdata; + + /* readdata + 1 kvec for each page */ + rdata = kzalloc(sizeof(*rdata) + + sizeof(struct kvec) * nr_pages, GFP_KERNEL); + if (rdata != NULL) { + INIT_WORK(&rdata->work, cifs_readv_complete); + INIT_LIST_HEAD(&rdata->pages); + } + return rdata; +} + +void +cifs_readdata_free(struct cifs_readdata *rdata) +{ + cifsFileInfo_put(rdata->cfile); + kfree(rdata); +} + +/* + * Discard any remaining data in the current SMB. To do this, we borrow the + * current bigbuf. + */ +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + READ_RSP *rsp = (READ_RSP *)server->smallbuf; + unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length); + int remaining = rfclen + 4 - server->total_read; + struct cifs_readdata *rdata = mid->callback_data; + + while (remaining > 0) { + int length; + + length = cifs_read_from_socket(server, server->bigbuf, + min_t(unsigned int, remaining, + CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)); + if (length < 0) + return length; + server->total_read += length; + remaining -= length; + } + + dequeue_mid(mid, rdata->result); + return 0; +} + +static int +cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length, len; + unsigned int data_offset, remaining, data_len; + struct cifs_readdata *rdata = mid->callback_data; + READ_RSP *rsp = (READ_RSP *)server->smallbuf; + unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4; + u64 eof; + pgoff_t eof_index; + struct page *page, *tpage; + + cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__, + mid->mid, rdata->offset, rdata->bytes); + + /* + * read the rest of READ_RSP header (sans Data array), or whatever we + * can if there's not enough data. At this point, we've read down to + * the Mid. + */ + len = min_t(unsigned int, rfclen, sizeof(*rsp)) - + sizeof(struct smb_hdr) + 1; + + rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1; + rdata->iov[0].iov_len = len; + + length = cifs_readv_from_socket(server, rdata->iov, 1, len); + if (length < 0) + return length; + server->total_read += length; + + /* Was the SMB read successful? */ + rdata->result = map_smb_to_linux_error(&rsp->hdr, false); + if (rdata->result != 0) { + cFYI(1, "%s: server returned error %d", __func__, + rdata->result); + return cifs_readv_discard(server, mid); + } + + /* Is there enough to get to the rest of the READ_RSP header? */ + if (server->total_read < sizeof(READ_RSP)) { + cFYI(1, "%s: server returned short header. got=%u expected=%zu", + __func__, server->total_read, sizeof(READ_RSP)); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + data_offset = le16_to_cpu(rsp->DataOffset) + 4; + if (data_offset < server->total_read) { + /* + * win2k8 sometimes sends an offset of 0 when the read + * is beyond the EOF. Treat it as if the data starts just after + * the header. + */ + cFYI(1, "%s: data offset (%u) inside read response header", + __func__, data_offset); + data_offset = server->total_read; + } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) { + /* data_offset is beyond the end of smallbuf */ + cFYI(1, "%s: data offset (%u) beyond end of smallbuf", + __func__, data_offset); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + cFYI(1, "%s: total_read=%u data_offset=%u", __func__, + server->total_read, data_offset); + + len = data_offset - server->total_read; + if (len > 0) { + /* read any junk before data into the rest of smallbuf */ + rdata->iov[0].iov_base = server->smallbuf + server->total_read; + rdata->iov[0].iov_len = len; + length = cifs_readv_from_socket(server, rdata->iov, 1, len); + if (length < 0) + return length; + server->total_read += length; + } + + /* set up first iov for signature check */ + rdata->iov[0].iov_base = server->smallbuf; + rdata->iov[0].iov_len = server->total_read; + cFYI(1, "0: iov_base=%p iov_len=%zu", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + + /* how much data is in the response? */ + data_len = le16_to_cpu(rsp->DataLengthHigh) << 16; + data_len += le16_to_cpu(rsp->DataLength); + if (data_offset + data_len > rfclen) { + /* data_len is corrupt -- discard frame */ + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + /* marshal up the page array */ + len = 0; + remaining = data_len; + rdata->nr_iov = 1; + + /* determine the eof that the server (probably) has */ + eof = CIFS_I(rdata->mapping->host)->server_eof; + eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; + cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index); + + list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { + if (remaining >= PAGE_CACHE_SIZE) { + /* enough data to fill the page */ + rdata->iov[rdata->nr_iov].iov_base = kmap(page); + rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE; + cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", + rdata->nr_iov, page->index, + rdata->iov[rdata->nr_iov].iov_base, + rdata->iov[rdata->nr_iov].iov_len); + ++rdata->nr_iov; + len += PAGE_CACHE_SIZE; + remaining -= PAGE_CACHE_SIZE; + } else if (remaining > 0) { + /* enough for partial page, fill and zero the rest */ + rdata->iov[rdata->nr_iov].iov_base = kmap(page); + rdata->iov[rdata->nr_iov].iov_len = remaining; + cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", + rdata->nr_iov, page->index, + rdata->iov[rdata->nr_iov].iov_base, + rdata->iov[rdata->nr_iov].iov_len); + memset(rdata->iov[rdata->nr_iov].iov_base + remaining, + '\0', PAGE_CACHE_SIZE - remaining); + ++rdata->nr_iov; + len += remaining; + remaining = 0; + } else if (page->index > eof_index) { + /* + * The VFS will not try to do readahead past the + * i_size, but it's possible that we have outstanding + * writes with gaps in the middle and the i_size hasn't + * caught up yet. Populate those with zeroed out pages + * to prevent the VFS from repeatedly attempting to + * fill them until the writes are flushed. + */ + zero_user(page, 0, PAGE_CACHE_SIZE); + list_del(&page->lru); + lru_cache_add_file(page); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } else { + /* no need to hold page hostage */ + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + } + } + + /* issue the read if we have any iovecs left to fill */ + if (rdata->nr_iov > 1) { + length = cifs_readv_from_socket(server, &rdata->iov[1], + rdata->nr_iov - 1, len); + if (length < 0) + return length; + server->total_read += length; + } else { + length = 0; + } + + rdata->bytes = length; + + cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read, + rfclen, remaining); + + /* discard anything left over */ + if (server->total_read < rfclen) + return cifs_readv_discard(server, mid); + + dequeue_mid(mid, false); + return length; +} + +static void +cifs_readv_complete(struct work_struct *work) +{ + struct cifs_readdata *rdata = container_of(work, + struct cifs_readdata, work); + struct page *page, *tpage; + + list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { + list_del(&page->lru); + lru_cache_add_file(page); + + if (rdata->result == 0) { + kunmap(page); + flush_dcache_page(page); + SetPageUptodate(page); + } + + unlock_page(page); + + if (rdata->result == 0) + cifs_readpage_to_fscache(rdata->mapping->host, page); + + page_cache_release(page); + } + cifs_readdata_free(rdata); +} + +static void +cifs_readv_callback(struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; + + cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__, + mid->mid, mid->midState, rdata->result, rdata->bytes); + + switch (mid->midState) { + case MID_RESPONSE_RECEIVED: + /* result already set, check signature */ + if (server->sec_mode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (cifs_verify_signature(rdata->iov, rdata->nr_iov, + server, mid->sequence_number + 1)) + cERROR(1, "Unexpected SMB signature"); + } + /* FIXME: should this be counted toward the initiating task? */ + task_io_account_read(rdata->bytes); + cifs_stats_bytes_read(tcon, rdata->bytes); + break; + case MID_REQUEST_SUBMITTED: + case MID_RETRY_NEEDED: + rdata->result = -EAGAIN; + break; + default: + rdata->result = -EIO; + } + + queue_work(system_nrt_wq, &rdata->work); + DeleteMidQEntry(mid); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); +} + +/* cifs_async_readv - send an async write, and set up mid to handle result */ +int +cifs_async_readv(struct cifs_readdata *rdata) +{ + int rc; + READ_REQ *smb = NULL; + int wct; + struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); + + cFYI(1, "%s: offset=%llu bytes=%u", __func__, + rdata->offset, rdata->bytes); + + if (tcon->ses->capabilities & CAP_LARGE_FILES) + wct = 12; + else { + wct = 10; /* old style read */ + if ((rdata->offset >> 32) > 0) { + /* can not handle this big offset for old */ + return -EIO; + } + } + + rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb); + if (rc) + return rc; + + smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16)); + + smb->AndXCommand = 0xFF; /* none */ + smb->Fid = rdata->cfile->netfid; + smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF); + if (wct == 12) + smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32); + smb->Remaining = 0; + smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF); + smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16); + if (wct == 12) + smb->ByteCount = 0; + else { + /* old style read */ + struct smb_com_readx_req *smbr = + (struct smb_com_readx_req *)smb; + smbr->ByteCount = 0; + } + + /* 4 for RFC1001 length + 1 for BCC */ + rdata->iov[0].iov_base = smb; + rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; + + rc = cifs_call_async(tcon->ses->server, rdata->iov, 1, + cifs_readv_receive, cifs_readv_callback, + rdata, false); + + if (rc == 0) + cifs_stats_inc(&tcon->num_reads); + + cifs_small_buf_release(smb); + return rc; +} + int CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *pbuf_type) @@ -1836,7 +2194,7 @@ cifs_async_writev(struct cifs_writedata *wdata) kref_get(&wdata->refcount); rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, - cifs_writev_callback, wdata, false); + NULL, cifs_writev_callback, wdata, false); if (rc == 0) cifs_stats_inc(&tcon->num_writes); @@ -1962,10 +2320,50 @@ CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms, return rc; } +int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid, + const __u8 lock_type, const __u32 num_unlock, + const __u32 num_lock, LOCKING_ANDX_RANGE *buf) +{ + int rc = 0; + LOCK_REQ *pSMB = NULL; + struct kvec iov[2]; + int resp_buf_type; + __u16 count; + + cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock); + + rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); + if (rc) + return rc; + + pSMB->Timeout = 0; + pSMB->NumberOfLocks = cpu_to_le16(num_lock); + pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock); + pSMB->LockType = lock_type; + pSMB->AndXCommand = 0xFF; /* none */ + pSMB->Fid = netfid; /* netfid stays le */ + + count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + inc_rfc1001_len(pSMB, count); + pSMB->ByteCount = cpu_to_le16(count); + + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 - + (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + iov[1].iov_base = (char *)buf; + iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); + + cifs_stats_inc(&tcon->num_locks); + rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); + if (rc) + cFYI(1, "Send error in cifs_lockv = %d", rc); + + return rc; +} int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const __u64 len, + const __u16 smb_file_id, const __u32 netpid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, const bool waitFlag, const __u8 oplock_level) @@ -2001,7 +2399,7 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, pSMB->Fid = smb_file_id; /* netfid stays le */ if ((numLock != 0) || (numUnlock != 0)) { - pSMB->Locks[0].Pid = cpu_to_le16(current->tgid); + pSMB->Locks[0].Pid = cpu_to_le16(netpid); /* BB where to store pid high? */ pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len); pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32)); @@ -2035,9 +2433,9 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, - const __u16 smb_file_id, const int get_flag, const __u64 len, - struct file_lock *pLockData, const __u16 lock_type, - const bool waitFlag) + const __u16 smb_file_id, const __u32 netpid, const int get_flag, + const __u64 len, struct file_lock *pLockData, + const __u16 lock_type, const bool waitFlag) { struct smb_com_transaction2_sfi_req *pSMB = NULL; struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; @@ -2095,7 +2493,7 @@ CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, } else pSMB->Timeout = 0; - parm_data->pid = cpu_to_le32(current->tgid); + parm_data->pid = cpu_to_le32(netpid); parm_data->start = cpu_to_le64(pLockData->fl_start); parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ @@ -2812,8 +3210,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon, pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le32(2); /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; @@ -3306,8 +3703,7 @@ smb_init_nttransact(const __u16 sub_command, const int setup_count, pSMB->Reserved = 0; pSMB->TotalParameterCount = cpu_to_le32(parm_len); pSMB->TotalDataCount = 0; - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->ParameterCount = pSMB->TotalParameterCount; pSMB->DataCount = pSMB->TotalDataCount; temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + @@ -3467,7 +3863,7 @@ qsec_out: int CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, - struct cifs_ntsd *pntsd, __u32 acllen) + struct cifs_ntsd *pntsd, __u32 acllen, int aclflag) { __u16 byte_count, param_count, data_count, param_offset, data_offset; int rc = 0; @@ -3504,7 +3900,7 @@ setCifsAclRetry: pSMB->Fid = fid; /* file handle always le */ pSMB->Reserved2 = 0; - pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL); + pSMB->AclFlags = cpu_to_le32(aclflag); if (pntsd && acllen) { memcpy((char *) &pSMBr->hdr.Protocol + data_offset, @@ -3977,8 +4373,7 @@ findFirstRetry: params = 12 + name_len /* includes null */ ; pSMB->TotalDataCount = 0; /* no EAs */ pSMB->MaxParameterCount = cpu_to_le16(10); - pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4052,8 +4447,7 @@ findFirstRetry: psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); - if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < - lnoff) { + if (CIFSMaxBufSize < lnoff) { cERROR(1, "ignoring corrupt resume name"); psrch_inf->last_entry = NULL; return rc; @@ -4097,9 +4491,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, byte_count = 0; pSMB->TotalDataCount = 0; /* no EAs */ pSMB->MaxParameterCount = cpu_to_le16(8); - pSMB->MaxDataCount = - cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & - 0xFFFFFF00); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4181,8 +4573,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, psrch_inf->index_of_last_entry += psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); - if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < - lnoff) { + if (CIFSMaxBufSize < lnoff) { cERROR(1, "ignoring corrupt resume name"); psrch_inf->last_entry = NULL; return rc; @@ -5840,7 +6231,7 @@ QAllEAsRetry: if (ea_name) { if (ea_name_len == name_len && - strncmp(ea_name, temp_ptr, name_len) == 0) { + memcmp(ea_name, temp_ptr, name_len) == 0) { temp_ptr += name_len + 1; rc = value_len; if (buf_size == 0) @@ -6035,12 +6426,7 @@ int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon, pSMB->TotalParameterCount = 0 ; pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le32(2); - /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = 0; /* same in little endian or be */ -/* BB VERIFY verify which is correct for above BB */ - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); - + pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 71beb02..4666780 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -37,6 +37,7 @@ #include <asm/uaccess.h> #include <asm/processor.h> #include <linux/inet.h> +#include <linux/module.h> #include <net/ipv6.h> #include "cifspdu.h" #include "cifsglob.h" @@ -181,7 +182,7 @@ cifs_reconnect(struct TCP_Server_Info *server) -EINVAL = invalid transact2 */ -static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) +static int check2ndT2(struct smb_hdr *pSMB) { struct smb_t2_rsp *pSMBt; int remaining; @@ -214,9 +215,9 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) cFYI(1, "missing %d bytes from transact2, check next response", remaining); - if (total_data_size > maxBufSize) { + if (total_data_size > CIFSMaxBufSize) { cERROR(1, "TotalDataSize %d is over maximum buffer %d", - total_data_size, maxBufSize); + total_data_size, CIFSMaxBufSize); return -EINVAL; } return remaining; @@ -281,7 +282,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) byte_count = be32_to_cpu(pTargetSMB->smb_buf_length); byte_count += total_in_buf2; /* don't allow buffer to overflow */ - if (byte_count > CIFSMaxBufSize) + if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) return -ENOBUFS; pTargetSMB->smb_buf_length = cpu_to_be32(byte_count); @@ -320,27 +321,24 @@ requeue_echo: } static bool -allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, - bool is_large_buf) +allocate_buffers(struct TCP_Server_Info *server) { - char *bbuf = *bigbuf, *sbuf = *smallbuf; - - if (bbuf == NULL) { - bbuf = (char *)cifs_buf_get(); - if (!bbuf) { + if (!server->bigbuf) { + server->bigbuf = (char *)cifs_buf_get(); + if (!server->bigbuf) { cERROR(1, "No memory for large SMB response"); msleep(3000); /* retry will check if exiting */ return false; } - } else if (is_large_buf) { + } else if (server->large_buf) { /* we are reusing a dirty large buf, clear its start */ - memset(bbuf, 0, size); + memset(server->bigbuf, 0, sizeof(struct smb_hdr)); } - if (sbuf == NULL) { - sbuf = (char *)cifs_small_buf_get(); - if (!sbuf) { + if (!server->smallbuf) { + server->smallbuf = (char *)cifs_small_buf_get(); + if (!server->smallbuf) { cERROR(1, "No memory for SMB response"); msleep(1000); /* retry will check if exiting */ @@ -349,36 +347,118 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, /* beginning of smb buffer is cleared in our buf_get */ } else { /* if existing small buf clear beginning */ - memset(sbuf, 0, size); + memset(server->smallbuf, 0, sizeof(struct smb_hdr)); } - *bigbuf = bbuf; - *smallbuf = sbuf; - return true; } -static int -read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, - struct kvec *iov, unsigned int to_read, - unsigned int *ptotal_read, bool is_header_read) +static bool +server_unresponsive(struct TCP_Server_Info *server) +{ + if (echo_retries > 0 && server->tcpStatus == CifsGood && + time_after(jiffies, server->lstrp + + (echo_retries * SMB_ECHO_INTERVAL))) { + cERROR(1, "Server %s has not responded in %d seconds. " + "Reconnecting...", server->hostname, + (echo_retries * SMB_ECHO_INTERVAL / HZ)); + cifs_reconnect(server); + wake_up(&server->response_q); + return true; + } + + return false; +} + +/* + * kvec_array_init - clone a kvec array, and advance into it + * @new: pointer to memory for cloned array + * @iov: pointer to original array + * @nr_segs: number of members in original array + * @bytes: number of bytes to advance into the cloned array + * + * This function will copy the array provided in iov to a section of memory + * and advance the specified number of bytes into the new array. It returns + * the number of segments in the new array. "new" must be at least as big as + * the original iov array. + */ +static unsigned int +kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs, + size_t bytes) +{ + size_t base = 0; + + while (bytes || !iov->iov_len) { + int copy = min(bytes, iov->iov_len); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; + } + } + memcpy(new, iov, sizeof(*iov) * nr_segs); + new->iov_base += base; + new->iov_len -= base; + return nr_segs; +} + +static struct kvec * +get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs) { - int length, rc = 0; - unsigned int total_read; - char *buf = iov->iov_base; + struct kvec *new_iov; + + if (server->iov && nr_segs <= server->nr_iov) + return server->iov; + + /* not big enough -- allocate a new one and release the old */ + new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS); + if (new_iov) { + kfree(server->iov); + server->iov = new_iov; + server->nr_iov = nr_segs; + } + return new_iov; +} + +int +cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, + unsigned int nr_segs, unsigned int to_read) +{ + int length = 0; + int total_read; + unsigned int segs; + struct msghdr smb_msg; + struct kvec *iov; + + iov = get_server_iovec(server, nr_segs); + if (!iov) + return -ENOMEM; + + smb_msg.msg_control = NULL; + smb_msg.msg_controllen = 0; + + for (total_read = 0; to_read; total_read += length, to_read -= length) { + try_to_freeze(); + + if (server_unresponsive(server)) { + total_read = -EAGAIN; + break; + } + + segs = kvec_array_init(iov, iov_orig, nr_segs, total_read); + + length = kernel_recvmsg(server->ssocket, &smb_msg, + iov, segs, to_read, 0); - for (total_read = 0; total_read < to_read; total_read += length) { - length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1, - to_read - total_read, 0); if (server->tcpStatus == CifsExiting) { - /* then will exit */ - rc = 2; + total_read = -ESHUTDOWN; break; } else if (server->tcpStatus == CifsNeedReconnect) { cifs_reconnect(server); - /* Reconnect wakes up rspns q */ - /* Now we will reread sock */ - rc = 1; + total_read = -EAGAIN; break; } else if (length == -ERESTARTSYS || length == -EAGAIN || @@ -390,56 +470,54 @@ read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, */ usleep_range(1000, 2000); length = 0; - if (!is_header_read) - continue; - /* Special handling for header read */ - if (total_read) { - iov->iov_base = (to_read - total_read) + - buf; - iov->iov_len = to_read - total_read; - smb_msg->msg_control = NULL; - smb_msg->msg_controllen = 0; - rc = 3; - } else - rc = 1; - break; + continue; } else if (length <= 0) { - cERROR(1, "Received no data, expecting %d", - to_read - total_read); + cFYI(1, "Received no data or error: expecting %d " + "got %d", to_read, length); cifs_reconnect(server); - rc = 1; + total_read = -EAGAIN; break; } } + return total_read; +} - *ptotal_read = total_read; - return rc; +int +cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read) +{ + struct kvec iov; + + iov.iov_base = buf; + iov.iov_len = to_read; + + return cifs_readv_from_socket(server, &iov, 1, to_read); } static bool -check_rfc1002_header(struct TCP_Server_Info *server, char *buf) +is_smb_response(struct TCP_Server_Info *server, unsigned char type) { - char temp = *buf; - unsigned int pdu_length = be32_to_cpu( - ((struct smb_hdr *)buf)->smb_buf_length); - /* * The first byte big endian of the length field, * is actually not part of the length but the type * with the most common, zero, as regular data. */ - if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { - return false; - } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { - cFYI(1, "Good RFC 1002 session rsp"); - return false; - } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { + switch (type) { + case RFC1002_SESSION_MESSAGE: + /* Regular SMB response */ + return true; + case RFC1002_SESSION_KEEP_ALIVE: + cFYI(1, "RFC 1002 session keep alive"); + break; + case RFC1002_POSITIVE_SESSION_RESPONSE: + cFYI(1, "RFC 1002 positive session response"); + break; + case RFC1002_NEGATIVE_SESSION_RESPONSE: /* * We get this from Windows 98 instead of an error on * SMB negprot response. */ - cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", - pdu_length); + cFYI(1, "RFC 1002 negative session response"); /* give server a second to clean up */ msleep(1000); /* @@ -448,87 +526,89 @@ check_rfc1002_header(struct TCP_Server_Info *server, char *buf) * is since we do not begin with RFC1001 session * initialize frame). */ - cifs_set_port((struct sockaddr *) - &server->dstaddr, CIFS_PORT); + cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT); cifs_reconnect(server); wake_up(&server->response_q); - return false; - } else if (temp != (char) 0) { - cERROR(1, "Unknown RFC 1002 frame"); - cifs_dump_mem(" Received Data: ", buf, 4); - cifs_reconnect(server); - return false; - } - - /* else we have an SMB response */ - if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || - (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { - cERROR(1, "Invalid size SMB length %d pdu_length %d", - 4, pdu_length+4); + break; + default: + cERROR(1, "RFC 1002 unknown response type 0x%x", type); cifs_reconnect(server); - wake_up(&server->response_q); - return false; } - return true; + return false; } static struct mid_q_entry * -find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, - int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf) +find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf) { - struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL; + struct mid_q_entry *mid; spin_lock(&GlobalMid_Lock); - list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) { - if (mid->mid != buf->Mid || - mid->midState != MID_REQUEST_SUBMITTED || - mid->command != buf->Command) - continue; - - if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) { - /* We have a multipart transact2 resp */ - *is_multi_rsp = true; - if (mid->resp_buf) { - /* merge response - fix up 1st*/ - *length = coalesce_t2(buf, mid->resp_buf); - if (*length > 0) { - *length = 0; - mid->multiRsp = true; - break; - } - /* All parts received or packet is malformed. */ - mid->multiEnd = true; - goto multi_t2_fnd; - } - if (!is_large_buf) { - /*FIXME: switch to already allocated largebuf?*/ - cERROR(1, "1st trans2 resp needs bigbuf"); - } else { - /* Have first buffer */ - mid->resp_buf = buf; - mid->largeBuf = true; - *bigbuf = NULL; - } - break; + list_for_each_entry(mid, &server->pending_mid_q, qhead) { + if (mid->mid == buf->Mid && + mid->midState == MID_REQUEST_SUBMITTED && + mid->command == buf->Command) { + spin_unlock(&GlobalMid_Lock); + return mid; } - mid->resp_buf = buf; - mid->largeBuf = is_large_buf; -multi_t2_fnd: - if (*length == 0) - mid->midState = MID_RESPONSE_RECEIVED; - else - mid->midState = MID_RESPONSE_MALFORMED; + } + spin_unlock(&GlobalMid_Lock); + return NULL; +} + +void +dequeue_mid(struct mid_q_entry *mid, bool malformed) +{ #ifdef CONFIG_CIFS_STATS2 - mid->when_received = jiffies; + mid->when_received = jiffies; #endif - list_del_init(&mid->qhead); - ret = mid; - break; - } + spin_lock(&GlobalMid_Lock); + if (!malformed) + mid->midState = MID_RESPONSE_RECEIVED; + else + mid->midState = MID_RESPONSE_MALFORMED; + list_del_init(&mid->qhead); spin_unlock(&GlobalMid_Lock); +} - return ret; +static void +handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, + struct smb_hdr *buf, int malformed) +{ + if (malformed == 0 && check2ndT2(buf) > 0) { + mid->multiRsp = true; + if (mid->resp_buf) { + /* merge response - fix up 1st*/ + malformed = coalesce_t2(buf, mid->resp_buf); + if (malformed > 0) + return; + + /* All parts received or packet is malformed. */ + mid->multiEnd = true; + return dequeue_mid(mid, malformed); + } + if (!server->large_buf) { + /*FIXME: switch to already allocated largebuf?*/ + cERROR(1, "1st trans2 resp needs bigbuf"); + } else { + /* Have first buffer */ + mid->resp_buf = buf; + mid->largeBuf = true; + server->bigbuf = NULL; + } + return; + } + mid->resp_buf = buf; + mid->largeBuf = server->large_buf; + /* Was previous buf put in mpx struct for multi-rsp? */ + if (!mid->multiRsp) { + /* smb buffer will be freed by user thread */ + if (server->large_buf) + server->bigbuf = NULL; + else + server->smallbuf = NULL; + } + dequeue_mid(mid, malformed); } static void clean_demultiplex_info(struct TCP_Server_Info *server) @@ -618,6 +698,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) } kfree(server->hostname); + kfree(server->iov); kfree(server); length = atomic_dec_return(&tcpSesAllocCount); @@ -627,20 +708,70 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) } static int +standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length; + char *buf = server->smallbuf; + struct smb_hdr *smb_buffer = (struct smb_hdr *)buf; + unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); + + /* make sure this will fit in a large buffer */ + if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + cERROR(1, "SMB response too long (%u bytes)", + pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + return -EAGAIN; + } + + /* switch to large buffer if too big for a small one */ + if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { + server->large_buf = true; + memcpy(server->bigbuf, server->smallbuf, server->total_read); + buf = server->bigbuf; + smb_buffer = (struct smb_hdr *)buf; + } + + /* now read the rest */ + length = cifs_read_from_socket(server, + buf + sizeof(struct smb_hdr) - 1, + pdu_length - sizeof(struct smb_hdr) + 1 + 4); + if (length < 0) + return length; + server->total_read += length; + + dump_smb(smb_buffer, server->total_read); + + /* + * We know that we received enough to get to the MID as we + * checked the pdu_length earlier. Now check to see + * if the rest of the header is OK. We borrow the length + * var for the rest of the loop to avoid a new stack var. + * + * 48 bytes is enough to display the header and a little bit + * into the payload for debugging purposes. + */ + length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read); + if (length != 0) + cifs_dump_mem("Bad SMB: ", buf, + min_t(unsigned int, server->total_read, 48)); + + if (mid) + handle_mid(mid, server, smb_buffer, length); + + return length; +} + +static int cifs_demultiplex_thread(void *p) { int length; struct TCP_Server_Info *server = p; - unsigned int pdu_length, total_read; - char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; + unsigned int pdu_length; + char *buf = NULL; struct smb_hdr *smb_buffer = NULL; - struct msghdr smb_msg; - struct kvec iov; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; - bool isLargeBuf = false; - bool isMultiRsp = false; - int rc; current->flags |= PF_MEMALLOC; cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); @@ -655,111 +786,65 @@ cifs_demultiplex_thread(void *p) if (try_to_freeze()) continue; - if (!allocate_buffers(&bigbuf, &smallbuf, - sizeof(struct smb_hdr), isLargeBuf)) + if (!allocate_buffers(server)) continue; - isLargeBuf = false; - isMultiRsp = false; - smb_buffer = (struct smb_hdr *)smallbuf; - buf = smallbuf; - iov.iov_base = buf; - iov.iov_len = 4; - smb_msg.msg_control = NULL; - smb_msg.msg_controllen = 0; + server->large_buf = false; + smb_buffer = (struct smb_hdr *)server->smallbuf; + buf = server->smallbuf; pdu_length = 4; /* enough to get RFC1001 header */ -incomplete_rcv: - if (echo_retries > 0 && server->tcpStatus == CifsGood && - time_after(jiffies, server->lstrp + - (echo_retries * SMB_ECHO_INTERVAL))) { - cERROR(1, "Server %s has not responded in %d seconds. " - "Reconnecting...", server->hostname, - (echo_retries * SMB_ECHO_INTERVAL / HZ)); - cifs_reconnect(server); - wake_up(&server->response_q); - continue; - } - - rc = read_from_socket(server, &smb_msg, &iov, pdu_length, - &total_read, true /* header read */); - if (rc == 3) - goto incomplete_rcv; - else if (rc == 2) - break; - else if (rc == 1) + length = cifs_read_from_socket(server, buf, pdu_length); + if (length < 0) continue; + server->total_read = length; /* * The right amount was read from socket - 4 bytes, * so we can now interpret the length field. */ - - /* - * Note that RFC 1001 length is big endian on the wire, - * but we convert it here so it is always manipulated - * as host byte order. - */ pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); - cFYI(1, "rfc1002 length 0x%x", pdu_length+4); - if (!check_rfc1002_header(server, buf)) + cFYI(1, "RFC1002 header 0x%x", pdu_length); + if (!is_smb_response(server, buf[0])) continue; - /* else length ok */ - if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { - isLargeBuf = true; - memcpy(bigbuf, smallbuf, 4); - smb_buffer = (struct smb_hdr *)bigbuf; - buf = bigbuf; + /* make sure we have enough to get to the MID */ + if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) { + cERROR(1, "SMB response too short (%u bytes)", + pdu_length); + cifs_reconnect(server); + wake_up(&server->response_q); + continue; } - iov.iov_base = 4 + buf; - iov.iov_len = pdu_length; - rc = read_from_socket(server, &smb_msg, &iov, pdu_length, - &total_read, false); - if (rc == 2) - break; - else if (rc == 1) + /* read down to the MID */ + length = cifs_read_from_socket(server, buf + 4, + sizeof(struct smb_hdr) - 1 - 4); + if (length < 0) continue; + server->total_read += length; - total_read += 4; /* account for rfc1002 hdr */ + mid_entry = find_mid(server, smb_buffer); - dump_smb(smb_buffer, total_read); + if (!mid_entry || !mid_entry->receive) + length = standard_receive3(server, mid_entry); + else + length = mid_entry->receive(server, mid_entry); - /* - * We know that we received enough to get to the MID as we - * checked the pdu_length earlier. Now check to see - * if the rest of the header is OK. We borrow the length - * var for the rest of the loop to avoid a new stack var. - * - * 48 bytes is enough to display the header and a little bit - * into the payload for debugging purposes. - */ - length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); - if (length != 0) - cifs_dump_mem("Bad SMB: ", buf, - min_t(unsigned int, total_read, 48)); + if (length < 0) + continue; - server->lstrp = jiffies; + if (server->large_buf) { + buf = server->bigbuf; + smb_buffer = (struct smb_hdr *)buf; + } - mid_entry = find_cifs_mid(server, smb_buffer, &length, - isLargeBuf, &isMultiRsp, &bigbuf); + server->lstrp = jiffies; if (mid_entry != NULL) { - mid_entry->callback(mid_entry); - /* Was previous buf put in mpx struct for multi-rsp? */ - if (!isMultiRsp) { - /* smb buffer will be freed by user thread */ - if (isLargeBuf) - bigbuf = NULL; - else - smallbuf = NULL; - } - } else if (length != 0) { - /* response sanity checks failed */ - continue; - } else if (!is_valid_oplock_break(smb_buffer, server) && - !isMultiRsp) { + if (!mid_entry->multiRsp || mid_entry->multiEnd) + mid_entry->callback(mid_entry); + } else if (!is_valid_oplock_break(smb_buffer, server)) { cERROR(1, "No task to wake, unknown frame received! " "NumMids %d", atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", buf, @@ -773,9 +858,9 @@ incomplete_rcv: } /* end while !EXITING */ /* buffer usually freed in free_mid - need to free it here on exit */ - cifs_buf_release(bigbuf); - if (smallbuf) /* no sense logging a debug message if NULL */ - cifs_small_buf_release(smallbuf); + cifs_buf_release(server->bigbuf); + if (server->smallbuf) /* no sense logging a debug message if NULL */ + cifs_small_buf_release(server->smallbuf); task_to_wake = xchg(&server->tsk, NULL); clean_demultiplex_info(server); @@ -827,6 +912,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, { char *value, *data, *end; char *mountdata_copy = NULL, *options; + int err; unsigned int temp_len, i, j; char separator[2]; short int override_uid = -1; @@ -883,6 +969,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, cFYI(1, "Null separator not allowed"); } } + vol->backupuid_specified = false; /* no backup intent for a user */ + vol->backupgid_specified = false; /* no backup intent for a group */ while ((data = strsep(&options, separator)) != NULL) { if (!*data) @@ -1442,6 +1530,22 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->mfsymlinks = true; } else if (strnicmp(data, "multiuser", 8) == 0) { vol->multiuser = true; + } else if (!strnicmp(data, "backupuid", 9) && value && *value) { + err = kstrtouint(value, 0, &vol->backupuid); + if (err < 0) { + cERROR(1, "%s: Invalid backupuid value", + __func__); + goto cifs_parse_mount_err; + } + vol->backupuid_specified = true; + } else if (!strnicmp(data, "backupgid", 9) && value && *value) { + err = kstrtouint(value, 0, &vol->backupgid); + if (err < 0) { + cERROR(1, "%s: Invalid backupgid value", + __func__); + goto cifs_parse_mount_err; + } + vol->backupgid_specified = true; } else printk(KERN_WARNING "CIFS: Unknown mount option %s\n", data); @@ -2018,7 +2122,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) warned_on_ntlm = true; cERROR(1, "default security mechanism requested. The default " "security mechanism will be upgraded from ntlm to " - "ntlmv2 in kernel release 3.2"); + "ntlmv2 in kernel release 3.3"); } ses->overrideSecFlg = volume_info->secFlg; @@ -2209,16 +2313,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) return 0; - if (old->rsize != new->rsize) - return 0; - /* - * We want to share sb only if we don't specify wsize or specified wsize - * is greater or equal than existing one. + * We want to share sb only if we don't specify an r/wsize or + * specified r/wsize is greater than or equal to existing one. */ if (new->wsize && new->wsize < old->wsize) return 0; + if (new->rsize && new->rsize < old->rsize) + return 0; + if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) return 0; @@ -2656,14 +2760,6 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, CIFS_MOUNT_POSIX_PATHS; } - if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) { - if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { - cifs_sb->rsize = 127 * 1024; - cFYI(DBG2, "larger reads not supported by srv"); - } - } - - cFYI(1, "Negotiate caps 0x%x", (int)cap); #ifdef CONFIG_CIFS_DEBUG2 if (cap & CIFS_UNIX_FCNTL_CAP) @@ -2708,34 +2804,22 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, spin_lock_init(&cifs_sb->tlink_tree_lock); cifs_sb->tlink_tree = RB_ROOT; - if (pvolume_info->rsize > CIFSMaxBufSize) { - cERROR(1, "rsize %d too large, using MaxBufSize", - pvolume_info->rsize); - cifs_sb->rsize = CIFSMaxBufSize; - } else if ((pvolume_info->rsize) && - (pvolume_info->rsize <= CIFSMaxBufSize)) - cifs_sb->rsize = pvolume_info->rsize; - else /* default */ - cifs_sb->rsize = CIFSMaxBufSize; - - if (cifs_sb->rsize < 2048) { - cifs_sb->rsize = 2048; - /* Windows ME may prefer this */ - cFYI(1, "readsize set to minimum: 2048"); - } - /* - * Temporarily set wsize for matching superblock. If we end up using - * new sb then cifs_negotiate_wsize will later negotiate it downward - * if needed. + * Temporarily set r/wsize for matching superblock. If we end up using + * new sb then client will later negotiate it downward if needed. */ + cifs_sb->rsize = pvolume_info->rsize; cifs_sb->wsize = pvolume_info->wsize; cifs_sb->mnt_uid = pvolume_info->linux_uid; cifs_sb->mnt_gid = pvolume_info->linux_gid; + if (pvolume_info->backupuid_specified) + cifs_sb->mnt_backupuid = pvolume_info->backupuid; + if (pvolume_info->backupgid_specified) + cifs_sb->mnt_backupgid = pvolume_info->backupgid; cifs_sb->mnt_file_mode = pvolume_info->file_mode; cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; - cFYI(1, "file mode: 0x%x dir mode: 0x%x", + cFYI(1, "file mode: 0x%hx dir mode: 0x%hx", cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); cifs_sb->actimeo = pvolume_info->actimeo; @@ -2763,6 +2847,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; if (pvolume_info->cifs_acl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; + if (pvolume_info->backupuid_specified) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID; + if (pvolume_info->backupgid_specified) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID; if (pvolume_info->override_uid) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; if (pvolume_info->override_gid) @@ -2795,29 +2883,41 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, } /* - * When the server supports very large writes via POSIX extensions, we can - * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including - * the RFC1001 length. + * When the server supports very large reads and writes via POSIX extensions, + * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not + * including the RFC1001 length. * * Note that this might make for "interesting" allocation problems during * writeback however as we have to allocate an array of pointers for the * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + * + * For reads, there is a similar problem as we need to allocate an array + * of kvecs to handle the receive, though that should only need to be done + * once. */ #define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4) /* - * When the server doesn't allow large posix writes, only allow a wsize of - * 128k minus the size of the WRITE_AND_X header. That allows for a write up - * to the maximum size described by RFC1002. + * When the server doesn't allow large posix writes, only allow a rsize/wsize + * of 2^17-1 minus the size of the call header. That allows for a read or + * write up to the maximum size described by RFC1002. */ -#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) +#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) /* * The default wsize is 1M. find_get_pages seems to return a maximum of 256 * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill * a single wsize request with a single call. */ -#define CIFS_DEFAULT_WSIZE (1024 * 1024) +#define CIFS_DEFAULT_IOSIZE (1024 * 1024) + +/* + * Windows only supports a max of 60k reads. Default to that when posix + * extensions aren't in force. + */ +#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) static unsigned int cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) @@ -2825,7 +2925,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); struct TCP_Server_Info *server = tcon->ses->server; unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : - CIFS_DEFAULT_WSIZE; + CIFS_DEFAULT_IOSIZE; /* can server support 24-bit write sizes? (via UNIX extensions) */ if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) @@ -2848,6 +2948,50 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) return wsize; } +static unsigned int +cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) +{ + __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int rsize, defsize; + + /* + * Set default value... + * + * HACK alert! Ancient servers have very small buffers. Even though + * MS-CIFS indicates that servers are only limited by the client's + * bufsize for reads, testing against win98se shows that it throws + * INVALID_PARAMETER errors if you try to request too large a read. + * + * If the server advertises a MaxBufferSize of less than one page, + * assume that it also can't satisfy reads larger than that either. + * + * FIXME: Is there a better heuristic for this? + */ + if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP)) + defsize = CIFS_DEFAULT_IOSIZE; + else if (server->capabilities & CAP_LARGE_READ_X) + defsize = CIFS_DEFAULT_NON_POSIX_RSIZE; + else if (server->maxBuf >= PAGE_CACHE_SIZE) + defsize = CIFSMaxBufSize; + else + defsize = server->maxBuf - sizeof(READ_RSP); + + rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize; + + /* + * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to + * the client's MaxBufferSize. + */ + if (!(server->capabilities & CAP_LARGE_READ_X)) + rsize = min_t(unsigned int, CIFSMaxBufSize, rsize); + + /* hard limit of CIFS_MAX_RSIZE */ + rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE); + + return rsize; +} + static int is_path_accessible(int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path) @@ -2877,9 +3021,9 @@ cleanup_volume_info_contents(struct smb_vol *volume_info) { kfree(volume_info->username); kzfree(volume_info->password); - kfree(volume_info->UNC); if (volume_info->UNCip != volume_info->UNC + 2) kfree(volume_info->UNCip); + kfree(volume_info->UNC); kfree(volume_info->domainname); kfree(volume_info->iocharset); kfree(volume_info->prepath); @@ -3041,6 +3185,22 @@ cifs_get_volume_info(char *mount_data, const char *devname) return volume_info; } +/* make sure ra_pages is a multiple of rsize */ +static inline unsigned int +cifs_ra_pages(struct cifs_sb_info *cifs_sb) +{ + unsigned int reads; + unsigned int rsize_pages = cifs_sb->rsize / PAGE_CACHE_SIZE; + + if (rsize_pages >= default_backing_dev_info.ra_pages) + return default_backing_dev_info.ra_pages; + else if (rsize_pages == 0) + return rsize_pages; + + reads = default_backing_dev_info.ra_pages / rsize_pages; + return reads * rsize_pages; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { @@ -3059,8 +3219,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) if (rc) return rc; - cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; - #ifdef CONFIG_CIFS_DFS_UPCALL try_mount_again: /* cleanup activities if we're chasing a referral */ @@ -3125,15 +3283,11 @@ try_mount_again: CIFSSMBQFSAttributeInfo(xid, tcon); } - if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { - cifs_sb->rsize = 1024 * 127; - cFYI(DBG2, "no very large read support, rsize now 127K"); - } - if (!(tcon->ses->capabilities & CAP_LARGE_READ_X)) - cifs_sb->rsize = min(cifs_sb->rsize, - (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); - cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); + cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info); + + /* tune readahead according to rsize */ + cifs_sb->bdi.ra_pages = cifs_ra_pages(cifs_sb); remote_path_check: #ifdef CONFIG_CIFS_DFS_UPCALL @@ -3301,7 +3455,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, else #endif /* CIFS_WEAK_PW_HASH */ rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, - bcc_ptr); + bcc_ptr, nls_codepage); bcc_ptr += CIFS_AUTH_RESP_SIZE; if (ses->capabilities & CAP_UNICODE) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 72d448b..df8fecb 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -136,7 +136,7 @@ cifs_bp_rename_retry: /* Inode operations in similar order to how they appear in Linux file fs.h */ int -cifs_create(struct inode *inode, struct dentry *direntry, int mode, +cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, struct nameidata *nd) { int rc = -ENOENT; @@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, } tcon = tlink_tcon(tlink); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; if (nd) @@ -244,6 +244,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + if (tcon->ses->capabilities & CAP_NT_SMBS) rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, create_options, @@ -352,11 +355,12 @@ cifs_create_out: return rc; } -int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, +int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, dev_t device_number) { int rc = -EPERM; int xid; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *pTcon; @@ -431,9 +435,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, return rc; } - /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */ + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, - GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, + GENERIC_WRITE, create_options, &fileHandle, &oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) @@ -642,8 +648,16 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) if (direntry->d_inode) { if (cifs_revalidate_dentry(direntry)) return 0; - else + else { + /* + * Forcibly invalidate automounting directory inodes + * (remote DFS directories) so to have them + * instantiated again for automount + */ + if (IS_AUTOMOUNT(direntry->d_inode)) + return 0; return 1; + } } /* diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 55d87ac..9c7ecdc 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -45,7 +45,7 @@ #include "cifs_debug.h" #include "cifsfs.h" -#ifdef CIFS_NFSD_EXPORT +#ifdef CONFIG_CIFS_NFSD_EXPORT static struct dentry *cifs_get_parent(struct dentry *dentry) { /* BB need to add code here eventually to enable export via NFSD */ @@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = { .encode_fs = */ }; -#endif /* CIFS_NFSD_EXPORT */ +#endif /* CONFIG_CIFS_NFSD_EXPORT */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9f41a10..4dd9283 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -32,6 +32,7 @@ #include <linux/delay.h> #include <linux/mount.h> #include <linux/slab.h> +#include <linux/swap.h> #include <asm/div64.h> #include "cifsfs.h" #include "cifspdu.h" @@ -174,6 +175,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, int rc; int desiredAccess; int disposition; + int create_options = CREATE_NOT_DIR; FILE_ALL_INFO *buf; desiredAccess = cifs_convert_flags(f_flags); @@ -210,9 +212,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, if (!buf) return -ENOMEM; + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + if (tcon->ses->capabilities & CAP_NT_SMBS) rc = CIFSSMBOpen(xid, tcon, full_path, disposition, - desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, + desiredAccess, create_options, pnetfid, poplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); else @@ -258,8 +263,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, pCifsFile->invalidHandle = false; pCifsFile->tlink = cifs_get_tlink(tlink); mutex_init(&pCifsFile->fh_mutex); - mutex_init(&pCifsFile->lock_mutex); - INIT_LIST_HEAD(&pCifsFile->llist); INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); spin_lock(&cifs_file_list_lock); @@ -272,11 +275,14 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, spin_unlock(&cifs_file_list_lock); cifs_set_oplock_level(pCifsInode, oplock); + pCifsInode->can_cache_brlcks = pCifsInode->clientCanCacheAll; file->private_data = pCifsFile; return pCifsFile; } +static void cifs_del_lock_waiters(struct cifsLockInfo *lock); + /* * Release a reference on the file private data. This may involve closing * the filehandle out on the server. Must be called without holding @@ -327,12 +333,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) /* Delete any outstanding lock records. We'll lose them when the file * is closed anyway. */ - mutex_lock(&cifs_file->lock_mutex); - list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) { + mutex_lock(&cifsi->lock_mutex); + list_for_each_entry_safe(li, tmp, &cifsi->llist, llist) { + if (li->netfid != cifs_file->netfid) + continue; list_del(&li->llist); + cifs_del_lock_waiters(li); kfree(li); } - mutex_unlock(&cifs_file->lock_mutex); + mutex_unlock(&cifsi->lock_mutex); cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); @@ -371,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file) cFYI(1, "inode = 0x%p file flags are 0x%x for %s", inode, file->f_flags, full_path); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; else oplock = 0; @@ -465,6 +474,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) char *full_path = NULL; int desiredAccess; int disposition = FILE_OPEN; + int create_options = CREATE_NOT_DIR; __u16 netfid; xid = GetXid(); @@ -495,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) cFYI(1, "inode = 0x%p file flags 0x%x for %s", inode, pCifsFile->f_flags, full_path); - if (oplockEnabled) + if (enable_oplocks) oplock = REQ_OPLOCK; else oplock = 0; @@ -524,6 +534,9 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) desiredAccess = cifs_convert_flags(pCifsFile->f_flags); + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + /* Can not refresh inode by passing in file_info buf to be returned by SMBOpen and then calling get_inode_info with returned buf since file might have write behind data that needs to be flushed @@ -531,7 +544,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) that inode was not dirty locally we could do this */ rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, - CREATE_NOT_DIR, &netfid, &oplock, NULL, + create_options, &netfid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { @@ -631,219 +644,713 @@ int cifs_closedir(struct inode *inode, struct file *file) return rc; } -static int store_file_lock(struct cifsFileInfo *fid, __u64 len, - __u64 offset, __u8 lockType) +static struct cifsLockInfo * +cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid) { - struct cifsLockInfo *li = + struct cifsLockInfo *lock = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); - if (li == NULL) - return -ENOMEM; - li->offset = offset; - li->length = len; - li->type = lockType; - mutex_lock(&fid->lock_mutex); - list_add(&li->llist, &fid->llist); - mutex_unlock(&fid->lock_mutex); - return 0; + if (!lock) + return lock; + lock->offset = offset; + lock->length = length; + lock->type = type; + lock->netfid = netfid; + lock->pid = current->tgid; + INIT_LIST_HEAD(&lock->blist); + init_waitqueue_head(&lock->block_q); + return lock; } -int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) +static void +cifs_del_lock_waiters(struct cifsLockInfo *lock) { - int rc, xid; - __u32 numLock = 0; - __u32 numUnlock = 0; - __u64 length; - bool wait_flag = false; - struct cifs_sb_info *cifs_sb; + struct cifsLockInfo *li, *tmp; + list_for_each_entry_safe(li, tmp, &lock->blist, blist) { + list_del_init(&li->blist); + wake_up(&li->block_q); + } +} + +static bool +__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, + __u64 length, __u8 type, __u16 netfid, + struct cifsLockInfo **conf_lock) +{ + struct cifsLockInfo *li, *tmp; + + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (offset + length <= li->offset || + offset >= li->offset + li->length) + continue; + else if ((type & LOCKING_ANDX_SHARED_LOCK) && + ((netfid == li->netfid && current->tgid == li->pid) || + type == li->type)) + continue; + else { + *conf_lock = li; + return true; + } + } + return false; +} + +static bool +cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, + struct cifsLockInfo **conf_lock) +{ + return __cifs_find_lock_conflict(cinode, lock->offset, lock->length, + lock->type, lock->netfid, conf_lock); +} + +/* + * Check if there is another lock that prevents us to set the lock (mandatory + * style). If such a lock exists, update the flock structure with its + * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks + * or leave it the same if we can't. Returns 0 if we don't need to request to + * the server or 1 otherwise. + */ +static int +cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, + __u8 type, __u16 netfid, struct file_lock *flock) +{ + int rc = 0; + struct cifsLockInfo *conf_lock; + bool exist; + + mutex_lock(&cinode->lock_mutex); + + exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid, + &conf_lock); + if (exist) { + flock->fl_start = conf_lock->offset; + flock->fl_end = conf_lock->offset + conf_lock->length - 1; + flock->fl_pid = conf_lock->pid; + if (conf_lock->type & LOCKING_ANDX_SHARED_LOCK) + flock->fl_type = F_RDLCK; + else + flock->fl_type = F_WRLCK; + } else if (!cinode->can_cache_brlcks) + rc = 1; + else + flock->fl_type = F_UNLCK; + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static void +cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock) +{ + mutex_lock(&cinode->lock_mutex); + list_add_tail(&lock->llist, &cinode->llist); + mutex_unlock(&cinode->lock_mutex); +} + +/* + * Set the byte-range lock (mandatory style). Returns: + * 1) 0, if we set the lock and don't need to request to the server; + * 2) 1, if no locks prevent us but we need to request to the server; + * 3) -EACCESS, if there is a lock that prevents us and wait is false. + */ +static int +cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, + bool wait) +{ + struct cifsLockInfo *conf_lock; + bool exist; + int rc = 0; + +try_again: + exist = false; + mutex_lock(&cinode->lock_mutex); + + exist = cifs_find_lock_conflict(cinode, lock, &conf_lock); + if (!exist && cinode->can_cache_brlcks) { + list_add_tail(&lock->llist, &cinode->llist); + mutex_unlock(&cinode->lock_mutex); + return rc; + } + + if (!exist) + rc = 1; + else if (!wait) + rc = -EACCES; + else { + list_add_tail(&lock->blist, &conf_lock->blist); + mutex_unlock(&cinode->lock_mutex); + rc = wait_event_interruptible(lock->block_q, + (lock->blist.prev == &lock->blist) && + (lock->blist.next == &lock->blist)); + if (!rc) + goto try_again; + mutex_lock(&cinode->lock_mutex); + list_del_init(&lock->blist); + } + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +/* + * Check if there is another lock that prevents us to set the lock (posix + * style). If such a lock exists, update the flock structure with its + * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks + * or leave it the same if we can't. Returns 0 if we don't need to request to + * the server or 1 otherwise. + */ +static int +cifs_posix_lock_test(struct file *file, struct file_lock *flock) +{ + int rc = 0; + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + unsigned char saved_type = flock->fl_type; + + if ((flock->fl_flags & FL_POSIX) == 0) + return 1; + + mutex_lock(&cinode->lock_mutex); + posix_test_lock(file, flock); + + if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) { + flock->fl_type = saved_type; + rc = 1; + } + + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +/* + * Set the byte-range lock (posix style). Returns: + * 1) 0, if we set the lock and don't need to request to the server; + * 2) 1, if we need to request to the server; + * 3) <0, if the error occurs while setting the lock. + */ +static int +cifs_posix_lock_set(struct file *file, struct file_lock *flock) +{ + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + int rc = 1; + + if ((flock->fl_flags & FL_POSIX) == 0) + return rc; + + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + return rc; + } + rc = posix_lock_file_wait(file, flock); + mutex_unlock(&cinode->lock_mutex); + return rc; +} + +static int +cifs_push_mandatory_locks(struct cifsFileInfo *cfile) +{ + int xid, rc = 0, stored_rc; + struct cifsLockInfo *li, *tmp; struct cifs_tcon *tcon; - __u16 netfid; - __u8 lockType = LOCKING_ANDX_LARGE_FILES; - bool posix_locking = 0; + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + unsigned int num, max_num; + LOCKING_ANDX_RANGE *buf, *cur; + int types[] = {LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + int i; - length = 1 + pfLock->fl_end - pfLock->fl_start; - rc = -EACCES; xid = GetXid(); + tcon = tlink_tcon(cfile->tlink); - cFYI(1, "Lock parm: 0x%x flockflags: " - "0x%x flocktype: 0x%x start: %lld end: %lld", - cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, - pfLock->fl_end); + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } - if (pfLock->fl_flags & FL_POSIX) + max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); + buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + if (!buf) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + for (i = 0; i < 2; i++) { + cur = buf; + num = 0; + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (li->type != types[i]) + continue; + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32)); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + li->type, 0, num, buf); + if (stored_rc) + rc = stored_rc; + cur = buf; + num = 0; + } else + cur++; + } + + if (num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + types[i], 0, num, buf); + if (stored_rc) + rc = stored_rc; + } + } + + cinode->can_cache_brlcks = false; + mutex_unlock(&cinode->lock_mutex); + + kfree(buf); + FreeXid(xid); + return rc; +} + +/* copied from fs/locks.c with a name change */ +#define cifs_for_each_lock(inode, lockp) \ + for (lockp = &inode->i_flock; *lockp != NULL; \ + lockp = &(*lockp)->fl_next) + +static int +cifs_push_posix_locks(struct cifsFileInfo *cfile) +{ + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct file_lock *flock, **before; + struct cifsLockInfo *lck, *tmp; + int rc = 0, xid, type; + __u64 length; + struct list_head locks_to_send; + + xid = GetXid(); + + mutex_lock(&cinode->lock_mutex); + if (!cinode->can_cache_brlcks) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return rc; + } + + INIT_LIST_HEAD(&locks_to_send); + + lock_flocks(); + cifs_for_each_lock(cfile->dentry->d_inode, before) { + flock = *before; + length = 1 + flock->fl_end - flock->fl_start; + if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) + type = CIFS_RDLCK; + else + type = CIFS_WRLCK; + + lck = cifs_lock_init(flock->fl_start, length, type, + cfile->netfid); + if (!lck) { + rc = -ENOMEM; + goto send_locks; + } + lck->pid = flock->fl_pid; + + list_add_tail(&lck->llist, &locks_to_send); + } + +send_locks: + unlock_flocks(); + + list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { + struct file_lock tmp_lock; + int stored_rc; + + tmp_lock.fl_start = lck->offset; + stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid, + 0, lck->length, &tmp_lock, + lck->type, 0); + if (stored_rc) + rc = stored_rc; + list_del(&lck->llist); + kfree(lck); + } + + cinode->can_cache_brlcks = false; + mutex_unlock(&cinode->lock_mutex); + + FreeXid(xid); + return rc; +} + +static int +cifs_push_locks(struct cifsFileInfo *cfile) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + return cifs_push_posix_locks(cfile); + + return cifs_push_mandatory_locks(cfile); +} + +static void +cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock, + bool *wait_flag) +{ + if (flock->fl_flags & FL_POSIX) cFYI(1, "Posix"); - if (pfLock->fl_flags & FL_FLOCK) + if (flock->fl_flags & FL_FLOCK) cFYI(1, "Flock"); - if (pfLock->fl_flags & FL_SLEEP) { + if (flock->fl_flags & FL_SLEEP) { cFYI(1, "Blocking lock"); - wait_flag = true; + *wait_flag = true; } - if (pfLock->fl_flags & FL_ACCESS) + if (flock->fl_flags & FL_ACCESS) cFYI(1, "Process suspended by mandatory locking - " - "not implemented yet"); - if (pfLock->fl_flags & FL_LEASE) + "not implemented yet"); + if (flock->fl_flags & FL_LEASE) cFYI(1, "Lease on file - not implemented yet"); - if (pfLock->fl_flags & + if (flock->fl_flags & (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) - cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags); + cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags); - if (pfLock->fl_type == F_WRLCK) { + *type = LOCKING_ANDX_LARGE_FILES; + if (flock->fl_type == F_WRLCK) { cFYI(1, "F_WRLCK "); - numLock = 1; - } else if (pfLock->fl_type == F_UNLCK) { + *lock = 1; + } else if (flock->fl_type == F_UNLCK) { cFYI(1, "F_UNLCK"); - numUnlock = 1; - /* Check if unlock includes more than - one lock range */ - } else if (pfLock->fl_type == F_RDLCK) { + *unlock = 1; + /* Check if unlock includes more than one lock range */ + } else if (flock->fl_type == F_RDLCK) { cFYI(1, "F_RDLCK"); - lockType |= LOCKING_ANDX_SHARED_LOCK; - numLock = 1; - } else if (pfLock->fl_type == F_EXLCK) { + *type |= LOCKING_ANDX_SHARED_LOCK; + *lock = 1; + } else if (flock->fl_type == F_EXLCK) { cFYI(1, "F_EXLCK"); - numLock = 1; - } else if (pfLock->fl_type == F_SHLCK) { + *lock = 1; + } else if (flock->fl_type == F_SHLCK) { cFYI(1, "F_SHLCK"); - lockType |= LOCKING_ANDX_SHARED_LOCK; - numLock = 1; + *type |= LOCKING_ANDX_SHARED_LOCK; + *lock = 1; } else cFYI(1, "Unknown type of lock"); +} - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); - netfid = ((struct cifsFileInfo *)file->private_data)->netfid; +static int +cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, + bool wait_flag, bool posix_lck, int xid) +{ + int rc = 0; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + __u16 netfid = cfile->netfid; - if ((tcon->ses->capabilities & CAP_UNIX) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - posix_locking = 1; - /* BB add code here to normalize offset and length to - account for negative length which we can not accept over the - wire */ - if (IS_GETLK(cmd)) { - if (posix_locking) { - int posix_lock_type; - if (lockType & LOCKING_ANDX_SHARED_LOCK) - posix_lock_type = CIFS_RDLCK; - else - posix_lock_type = CIFS_WRLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */, - length, pfLock, posix_lock_type, - wait_flag); - FreeXid(xid); + if (posix_lck) { + int posix_lock_type; + + rc = cifs_posix_lock_test(file, flock); + if (!rc) return rc; - } - /* BB we could chain these into one lock request BB */ - rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, - 0, 1, lockType, 0 /* wait flag */, 0); - if (rc == 0) { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 1 /* numUnlock */ , - 0 /* numLock */ , lockType, - 0 /* wait flag */, 0); - pfLock->fl_type = F_UNLCK; - if (rc != 0) - cERROR(1, "Error unlocking previously locked " - "range %d during test of lock", rc); - rc = 0; + if (type & LOCKING_ANDX_SHARED_LOCK) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + 1 /* get */, length, flock, + posix_lock_type, wait_flag); + return rc; + } - } else { - /* if rc == ERR_SHARING_VIOLATION ? */ - rc = 0; + rc = cifs_lock_test(cinode, flock->fl_start, length, type, netfid, + flock); + if (!rc) + return rc; + + /* BB we could chain these into one lock request BB */ + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, type, 0, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, + length, flock->fl_start, 1, 0, + type, 0, 0); + flock->fl_type = F_UNLCK; + if (rc != 0) + cERROR(1, "Error unlocking previously locked " + "range %d during test of lock", rc); + return 0; + } + + if (type & LOCKING_ANDX_SHARED_LOCK) { + flock->fl_type = F_WRLCK; + return 0; + } + + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, + type | LOCKING_ANDX_SHARED_LOCK, 0, 0); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, + length, flock->fl_start, 1, 0, + type | LOCKING_ANDX_SHARED_LOCK, + 0, 0); + flock->fl_type = F_RDLCK; + if (rc != 0) + cERROR(1, "Error unlocking previously locked " + "range %d during test of lock", rc); + } else + flock->fl_type = F_WRLCK; + + return 0; +} - if (lockType & LOCKING_ANDX_SHARED_LOCK) { - pfLock->fl_type = F_WRLCK; +static void +cifs_move_llist(struct list_head *source, struct list_head *dest) +{ + struct list_head *li, *tmp; + list_for_each_safe(li, tmp, source) + list_move(li, dest); +} + +static void +cifs_free_llist(struct list_head *llist) +{ + struct cifsLockInfo *li, *tmp; + list_for_each_entry_safe(li, tmp, llist, llist) { + cifs_del_lock_waiters(li); + list_del(&li->llist); + kfree(li); + } +} + +static int +cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) +{ + int rc = 0, stored_rc; + int types[] = {LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + unsigned int i; + unsigned int max_num, num; + LOCKING_ANDX_RANGE *buf, *cur; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsLockInfo *li, *tmp; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct list_head tmp_llist; + + INIT_LIST_HEAD(&tmp_llist); + + max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); + buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + mutex_lock(&cinode->lock_mutex); + for (i = 0; i < 2; i++) { + cur = buf; + num = 0; + list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { + if (flock->fl_start > li->offset || + (flock->fl_start + length) < + (li->offset + li->length)) + continue; + if (current->tgid != li->pid) + continue; + if (cfile->netfid != li->netfid) + continue; + if (types[i] != li->type) + continue; + if (!cinode->can_cache_brlcks) { + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = + cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = + cpu_to_le32((u32)(li->offset>>32)); + /* + * We need to save a lock here to let us add + * it again to the inode list if the unlock + * range request fails on the server. + */ + list_move(&li->llist, &tmp_llist); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, + cfile->netfid, + li->type, num, + 0, buf); + if (stored_rc) { + /* + * We failed on the unlock range + * request - add all locks from + * the tmp list to the head of + * the inode list. + */ + cifs_move_llist(&tmp_llist, + &cinode->llist); + rc = stored_rc; + } else + /* + * The unlock range request + * succeed - free the tmp list. + */ + cifs_free_llist(&tmp_llist); + cur = buf; + num = 0; + } else + cur++; } else { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 0, 1, - lockType | LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */, 0); - if (rc == 0) { - rc = CIFSSMBLock(xid, tcon, netfid, - length, pfLock->fl_start, 1, 0, - lockType | - LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */, 0); - pfLock->fl_type = F_RDLCK; - if (rc != 0) - cERROR(1, "Error unlocking " - "previously locked range %d " - "during test of lock", rc); - rc = 0; - } else { - pfLock->fl_type = F_WRLCK; - rc = 0; - } + /* + * We can cache brlock requests - simply remove + * a lock from the inode list. + */ + list_del(&li->llist); + cifs_del_lock_waiters(li); + kfree(li); } } - - FreeXid(xid); - return rc; + if (num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + types[i], num, 0, buf); + if (stored_rc) { + cifs_move_llist(&tmp_llist, &cinode->llist); + rc = stored_rc; + } else + cifs_free_llist(&tmp_llist); + } } - if (!numLock && !numUnlock) { - /* if no lock or unlock then nothing - to do since we do not know what it is */ - FreeXid(xid); - return -EOPNOTSUPP; - } + mutex_unlock(&cinode->lock_mutex); + kfree(buf); + return rc; +} + +static int +cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, + bool wait_flag, bool posix_lck, int lock, int unlock, int xid) +{ + int rc = 0; + __u64 length = 1 + flock->fl_end - flock->fl_start; + struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; + struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); + __u16 netfid = cfile->netfid; - if (posix_locking) { + if (posix_lck) { int posix_lock_type; - if (lockType & LOCKING_ANDX_SHARED_LOCK) + + rc = cifs_posix_lock_set(file, flock); + if (!rc || rc < 0) + return rc; + + if (type & LOCKING_ANDX_SHARED_LOCK) posix_lock_type = CIFS_RDLCK; else posix_lock_type = CIFS_WRLCK; - if (numUnlock == 1) + if (unlock == 1) posix_lock_type = CIFS_UNLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, - length, pfLock, posix_lock_type, - wait_flag); - } else { - struct cifsFileInfo *fid = file->private_data; + rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + 0 /* set */, length, flock, + posix_lock_type, wait_flag); + goto out; + } - if (numLock) { - rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, 0, numLock, lockType, - wait_flag, 0); + if (lock) { + struct cifsLockInfo *lock; - if (rc == 0) { - /* For Windows locks we must store them. */ - rc = store_file_lock(fid, length, - pfLock->fl_start, lockType); - } - } else if (numUnlock) { - /* For each stored lock that this unlock overlaps - completely, unlock it. */ - int stored_rc = 0; - struct cifsLockInfo *li, *tmp; + lock = cifs_lock_init(flock->fl_start, length, type, netfid); + if (!lock) + return -ENOMEM; - rc = 0; - mutex_lock(&fid->lock_mutex); - list_for_each_entry_safe(li, tmp, &fid->llist, llist) { - if (pfLock->fl_start <= li->offset && - (pfLock->fl_start + length) >= - (li->offset + li->length)) { - stored_rc = CIFSSMBLock(xid, tcon, - netfid, li->length, - li->offset, 1, 0, - li->type, false, 0); - if (stored_rc) - rc = stored_rc; - else { - list_del(&li->llist); - kfree(li); - } - } - } - mutex_unlock(&fid->lock_mutex); + rc = cifs_lock_add_if(cinode, lock, wait_flag); + if (rc < 0) + kfree(lock); + if (rc <= 0) + goto out; + + rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, + flock->fl_start, 0, 1, type, wait_flag, 0); + if (rc) { + kfree(lock); + goto out; } + + cifs_lock_add(cinode, lock); + } else if (unlock) + rc = cifs_unlock_range(cfile, flock, xid); + +out: + if (flock->fl_flags & FL_POSIX) + posix_lock_file_wait(file, flock); + return rc; +} + +int cifs_lock(struct file *file, int cmd, struct file_lock *flock) +{ + int rc, xid; + int lock = 0, unlock = 0; + bool wait_flag = false; + bool posix_lck = false; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsInodeInfo *cinode; + struct cifsFileInfo *cfile; + __u16 netfid; + __u8 type; + + rc = -EACCES; + xid = GetXid(); + + cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld " + "end: %lld", cmd, flock->fl_flags, flock->fl_type, + flock->fl_start, flock->fl_end); + + cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag); + + cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + cfile = (struct cifsFileInfo *)file->private_data; + tcon = tlink_tcon(cfile->tlink); + netfid = cfile->netfid; + cinode = CIFS_I(file->f_path.dentry->d_inode); + + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + posix_lck = true; + /* + * BB add code here to normalize offset and length to account for + * negative length which we can not accept over the wire. + */ + if (IS_GETLK(cmd)) { + rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid); + FreeXid(xid); + return rc; } - if (pfLock->fl_flags & FL_POSIX) - posix_lock_file_wait(file, pfLock); + if (!lock && !unlock) { + /* + * if no lock or unlock then nothing to do since we do not + * know what it is + */ + FreeXid(xid); + return -EOPNOTSUPP; + } + + rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock, + xid); FreeXid(xid); return rc; } @@ -1714,6 +2221,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, struct smb_com_read_rsp *pSMBr; struct cifs_io_parms io_parms; char *read_data; + unsigned int rsize; __u32 pid; if (!nr_segs) @@ -1726,6 +2234,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, xid = GetXid(); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + /* FIXME: set up handlers for larger reads and/or convert to async */ + rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); + open_file = file->private_data; pTcon = tlink_tcon(open_file->tlink); @@ -1738,7 +2249,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, cFYI(1, "attempting read on write only file instance"); for (total_read = 0; total_read < len; total_read += bytes_read) { - cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); + cur_len = min_t(const size_t, len - total_read, rsize); rc = -EAGAIN; read_data = NULL; @@ -1830,6 +2341,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, unsigned int bytes_read = 0; unsigned int total_read; unsigned int current_read_size; + unsigned int rsize; struct cifs_sb_info *cifs_sb; struct cifs_tcon *pTcon; int xid; @@ -1842,6 +2354,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, xid = GetXid(); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + /* FIXME: set up handlers for larger reads and/or convert to async */ + rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); + if (file->private_data == NULL) { rc = -EBADF; FreeXid(xid); @@ -1861,14 +2376,14 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, for (total_read = 0, current_offset = read_data; read_size > total_read; total_read += bytes_read, current_offset += bytes_read) { - current_read_size = min_t(const int, read_size - total_read, - cifs_sb->rsize); + current_read_size = min_t(uint, read_size - total_read, rsize); + /* For windows me and 9x we do not want to request more than it negotiated since it will refuse the read then */ if ((pTcon->ses) && !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { - current_read_size = min_t(const int, current_read_size, - pTcon->ses->server->maxBuf - 128); + current_read_size = min_t(uint, current_read_size, + CIFSMaxBufSize); } rc = -EAGAIN; while (rc == -EAGAIN) { @@ -1957,82 +2472,24 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) return rc; } - -static void cifs_copy_cache_pages(struct address_space *mapping, - struct list_head *pages, int bytes_read, char *data) -{ - struct page *page; - char *target; - - while (bytes_read > 0) { - if (list_empty(pages)) - break; - - page = list_entry(pages->prev, struct page, lru); - list_del(&page->lru); - - if (add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL)) { - page_cache_release(page); - cFYI(1, "Add page cache failed"); - data += PAGE_CACHE_SIZE; - bytes_read -= PAGE_CACHE_SIZE; - continue; - } - page_cache_release(page); - - target = kmap_atomic(page, KM_USER0); - - if (PAGE_CACHE_SIZE > bytes_read) { - memcpy(target, data, bytes_read); - /* zero the tail end of this partial page */ - memset(target + bytes_read, 0, - PAGE_CACHE_SIZE - bytes_read); - bytes_read = 0; - } else { - memcpy(target, data, PAGE_CACHE_SIZE); - bytes_read -= PAGE_CACHE_SIZE; - } - kunmap_atomic(target, KM_USER0); - - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - data += PAGE_CACHE_SIZE; - - /* add page to FS-Cache */ - cifs_readpage_to_fscache(mapping->host, page); - } - return; -} - static int cifs_readpages(struct file *file, struct address_space *mapping, struct list_head *page_list, unsigned num_pages) { - int rc = -EACCES; - int xid; - loff_t offset; - struct page *page; - struct cifs_sb_info *cifs_sb; - struct cifs_tcon *pTcon; - unsigned int bytes_read = 0; - unsigned int read_size, i; - char *smb_read_data = NULL; - struct smb_com_read_rsp *pSMBr; - struct cifsFileInfo *open_file; - struct cifs_io_parms io_parms; - int buf_type = CIFS_NO_BUFFER; - __u32 pid; + int rc; + struct list_head tmplist; + struct cifsFileInfo *open_file = file->private_data; + struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + unsigned int rsize = cifs_sb->rsize; + pid_t pid; - xid = GetXid(); - if (file->private_data == NULL) { - rc = -EBADF; - FreeXid(xid); - return rc; - } - open_file = file->private_data; - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - pTcon = tlink_tcon(open_file->tlink); + /* + * Give up immediately if rsize is too small to read an entire page. + * The VFS will fall back to readpage. We should never reach this + * point however since we set ra_pages to 0 when the rsize is smaller + * than a cache page. + */ + if (unlikely(rsize < PAGE_CACHE_SIZE)) + return 0; /* * Reads as many pages as possible from fscache. Returns -ENOBUFS @@ -2041,125 +2498,127 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, &num_pages); if (rc == 0) - goto read_complete; + return rc; - cFYI(DBG2, "rpages: num pages %d", num_pages); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; - for (i = 0; i < num_pages; ) { - unsigned contig_pages; - struct page *tmp_page; - unsigned long expected_index; + rc = 0; + INIT_LIST_HEAD(&tmplist); - if (list_empty(page_list)) - break; + cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file, + mapping, num_pages); + + /* + * Start with the page at end of list and move it to private + * list. Do the same with any following pages until we hit + * the rsize limit, hit an index discontinuity, or run out of + * pages. Issue the async read and then start the loop again + * until the list is empty. + * + * Note that list order is important. The page_list is in + * the order of declining indexes. When we put the pages in + * the rdata->pages, then we want them in increasing order. + */ + while (!list_empty(page_list)) { + unsigned int bytes = PAGE_CACHE_SIZE; + unsigned int expected_index; + unsigned int nr_pages = 1; + loff_t offset; + struct page *page, *tpage; + struct cifs_readdata *rdata; page = list_entry(page_list->prev, struct page, lru); + + /* + * Lock the page and put it in the cache. Since no one else + * should have access to this page, we're safe to simply set + * PG_locked without checking it first. + */ + __set_page_locked(page); + rc = add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL); + + /* give up if we can't stick it in the cache */ + if (rc) { + __clear_page_locked(page); + break; + } + + /* move first page to the tmplist */ offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + list_move_tail(&page->lru, &tmplist); - /* count adjacent pages that we will read into */ - contig_pages = 0; - expected_index = - list_entry(page_list->prev, struct page, lru)->index; - list_for_each_entry_reverse(tmp_page, page_list, lru) { - if (tmp_page->index == expected_index) { - contig_pages++; - expected_index++; - } else + /* now try and add more pages onto the request */ + expected_index = page->index + 1; + list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { + /* discontinuity ? */ + if (page->index != expected_index) break; + + /* would this page push the read over the rsize? */ + if (bytes + PAGE_CACHE_SIZE > rsize) + break; + + __set_page_locked(page); + if (add_to_page_cache_locked(page, mapping, + page->index, GFP_KERNEL)) { + __clear_page_locked(page); + break; + } + list_move_tail(&page->lru, &tmplist); + bytes += PAGE_CACHE_SIZE; + expected_index++; + nr_pages++; } - if (contig_pages + i > num_pages) - contig_pages = num_pages - i; - - /* for reads over a certain size could initiate async - read ahead */ - - read_size = contig_pages * PAGE_CACHE_SIZE; - /* Read size needs to be in multiples of one page */ - read_size = min_t(const unsigned int, read_size, - cifs_sb->rsize & PAGE_CACHE_MASK); - cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d", - read_size, contig_pages); - rc = -EAGAIN; - while (rc == -EAGAIN) { + + rdata = cifs_readdata_alloc(nr_pages); + if (!rdata) { + /* best to give up if we're out of mem */ + list_for_each_entry_safe(page, tpage, &tmplist, lru) { + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); + } + rc = -ENOMEM; + break; + } + + spin_lock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&cifs_file_list_lock); + rdata->cfile = open_file; + rdata->mapping = mapping; + rdata->offset = offset; + rdata->bytes = bytes; + rdata->pid = pid; + list_splice_init(&tmplist, &rdata->pages); + + do { if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); if (rc != 0) - break; + continue; } - io_parms.netfid = open_file->netfid; - io_parms.pid = pid; - io_parms.tcon = pTcon; - io_parms.offset = offset; - io_parms.length = read_size; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, - &smb_read_data, &buf_type); - /* BB more RC checks ? */ - if (rc == -EAGAIN) { - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - } - } - if ((rc < 0) || (smb_read_data == NULL)) { - cFYI(1, "Read error in readpages: %d", rc); - break; - } else if (bytes_read > 0) { - task_io_account_read(bytes_read); - pSMBr = (struct smb_com_read_rsp *)smb_read_data; - cifs_copy_cache_pages(mapping, page_list, bytes_read, - smb_read_data + 4 /* RFC1001 hdr */ + - le16_to_cpu(pSMBr->DataOffset)); - - i += bytes_read >> PAGE_CACHE_SHIFT; - cifs_stats_bytes_read(pTcon, bytes_read); - if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) { - i++; /* account for partial page */ - - /* server copy of file can have smaller size - than client */ - /* BB do we need to verify this common case ? - this case is ok - if we are at server EOF - we will hit it on next read */ + rc = cifs_async_readv(rdata); + } while (rc == -EAGAIN); - /* break; */ + if (rc != 0) { + list_for_each_entry_safe(page, tpage, &rdata->pages, + lru) { + list_del(&page->lru); + lru_cache_add_file(page); + unlock_page(page); + page_cache_release(page); } - } else { - cFYI(1, "No bytes read (%d) at offset %lld . " - "Cleaning remaining pages from readahead list", - bytes_read, offset); - /* BB turn off caching and do new lookup on - file size at server? */ + cifs_readdata_free(rdata); break; } - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - bytes_read = 0; } -/* need to free smb_read_data buf before exit */ - if (smb_read_data) { - if (buf_type == CIFS_SMALL_BUFFER) - cifs_small_buf_release(smb_read_data); - else if (buf_type == CIFS_LARGE_BUFFER) - cifs_buf_release(smb_read_data); - smb_read_data = NULL; - } - -read_complete: - FreeXid(xid); return rc; } @@ -2408,6 +2867,10 @@ void cifs_oplock_break(struct work_struct *work) cFYI(1, "Oplock flush inode %p rc %d", inode, rc); } + rc = cifs_push_locks(cfile); + if (rc) + cERROR(1, "Push locks rc = %d", rc); + /* * releasing stale oplock after recent reconnect of smb session using * a now incorrect file handle is not a data integrity issue but do @@ -2415,8 +2878,9 @@ void cifs_oplock_break(struct work_struct *work) * disconnected since oplock already released by the server */ if (!cfile->oplock_break_cancelled) { - rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, - 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, + rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, + current->tgid, 0, 0, 0, 0, + LOCKING_ANDX_OPLOCK_RELEASE, false, cinode->clientCanCacheRead ? 1 : 0); cFYI(1, "Oplock release rc = %d", rc); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a7b2dcd..a5f54b7 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -132,7 +132,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_mtime = fattr->cf_mtime; inode->i_ctime = fattr->cf_ctime; inode->i_rdev = fattr->cf_rdev; - inode->i_nlink = fattr->cf_nlink; + set_nlink(inode, fattr->cf_nlink); inode->i_uid = fattr->cf_uid; inode->i_gid = fattr->cf_gid; @@ -562,7 +562,16 @@ int cifs_get_file_info(struct file *filp) xid = GetXid(); rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); - if (rc == -EOPNOTSUPP || rc == -EINVAL) { + switch (rc) { + case 0: + cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); + break; + case -EREMOTE: + cifs_create_dfs_fattr(&fattr, inode->i_sb); + rc = 0; + break; + case -EOPNOTSUPP: + case -EINVAL: /* * FIXME: legacy server -- fall back to path-based call? * for now, just skip revalidating and mark inode for @@ -570,18 +579,14 @@ int cifs_get_file_info(struct file *filp) */ rc = 0; CIFS_I(inode)->time = 0; + default: goto cgfi_exit; - } else if (rc == -EREMOTE) { - cifs_create_dfs_fattr(&fattr, inode->i_sb); - rc = 0; - } else if (rc) - goto cgfi_exit; + } /* * don't bother with SFU junk here -- just mark inode as needing * revalidation. */ - cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; cifs_fattr_to_inode(inode, &fattr); @@ -900,7 +905,7 @@ struct inode *cifs_root_iget(struct super_block *sb) if (rc && tcon->ipc) { cFYI(1, "ipc connection - fake read inode"); inode->i_mode |= S_IFDIR; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_op = &cifs_ipc_inode_ops; inode->i_fop = &simple_dir_operations; inode->i_uid = cifs_sb->mnt_uid; @@ -1259,7 +1264,7 @@ unlink_out: return rc; } -int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) +int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) { int rc = 0, tmprc; int xid; @@ -1270,7 +1275,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) struct inode *newinode = NULL; struct cifs_fattr fattr; - cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode); + cFYI(1, "In cifs_mkdir, mode = 0x%hx inode = 0x%p", mode, inode); cifs_sb = CIFS_SB(inode->i_sb); tlink = cifs_sb_tlink(cifs_sb); @@ -1362,7 +1367,7 @@ mkdir_get_info: /* setting nlink not necessary except in cases where we * failed to get it from the server or was set bogus */ if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) - direntry->d_inode->i_nlink = 2; + set_nlink(direntry->d_inode, 2); mode &= ~current_umask(); /* must turn on setgid bit if parent dir has it */ @@ -2096,6 +2101,8 @@ static int cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) { int xid; + uid_t uid = NO_CHANGE_32; + gid_t gid = NO_CHANGE_32; struct inode *inode = direntry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); @@ -2146,13 +2153,25 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) goto cifs_setattr_exit; } - /* - * Without unix extensions we can't send ownership changes to the - * server, so silently ignore them. This is consistent with how - * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With - * CIFSACL support + proper Windows to Unix idmapping, we may be - * able to support this in the future. - */ + if (attrs->ia_valid & ATTR_UID) + uid = attrs->ia_uid; + + if (attrs->ia_valid & ATTR_GID) + gid = attrs->ia_gid; + +#ifdef CONFIG_CIFS_ACL + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) { + rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, + uid, gid); + if (rc) { + cFYI(1, "%s: Setting id failed with error: %d", + __func__, rc); + goto cifs_setattr_exit; + } + } + } else +#endif /* CONFIG_CIFS_ACL */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); @@ -2161,15 +2180,12 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) attrs->ia_valid &= ~ATTR_MODE; if (attrs->ia_valid & ATTR_MODE) { - cFYI(1, "Mode changed to 0%o", attrs->ia_mode); mode = attrs->ia_mode; - } - - if (attrs->ia_valid & ATTR_MODE) { rc = 0; #ifdef CONFIG_CIFS_ACL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - rc = mode_to_cifs_acl(inode, full_path, mode); + rc = id_mode_to_cifs_acl(inode, full_path, mode, + NO_CHANGE_32, NO_CHANGE_32); if (rc) { cFYI(1, "%s: Setting ACL failed with error: %d", __func__, rc); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index db3f18c..6b0e064 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -183,14 +183,20 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) static int CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, - const struct nls_table *nls_codepage, int remap) + struct cifs_sb_info *cifs_sb) { int rc; int oplock = 0; + int remap; + int create_options = CREATE_NOT_DIR; __u16 netfid = 0; u8 *buf; unsigned int bytes_written = 0; struct cifs_io_parms io_parms; + struct nls_table *nls_codepage; + + nls_codepage = cifs_sb->local_nls; + remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); if (!buf) @@ -202,8 +208,11 @@ CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, return rc; } + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, - CREATE_NOT_DIR, &netfid, &oplock, NULL, + create_options, &netfid, &oplock, NULL, nls_codepage, remap); if (rc != 0) { kfree(buf); @@ -424,7 +433,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, if (old_file->d_inode) { cifsInode = CIFS_I(old_file->d_inode); if (rc == 0) { - old_file->d_inode->i_nlink++; + inc_nlink(old_file->d_inode); /* BB should we make this contingent on superblock flag NOATIME? */ /* old_file->d_inode->i_ctime = CURRENT_TIME;*/ /* parent dir timestamps will update from srv @@ -559,9 +568,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) /* BB what if DFS and this volume is on different share? BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb); else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, cifs_sb->local_nls); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 7c16933..703ef5c 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -420,19 +420,22 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) } int -checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) +checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read) { - __u32 len = be32_to_cpu(smb->smb_buf_length); + __u32 rfclen = be32_to_cpu(smb->smb_buf_length); __u32 clc_len; /* calculated length */ - cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len); + cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", + total_read, rfclen); - if (length < 2 + sizeof(struct smb_hdr)) { - if ((length >= sizeof(struct smb_hdr) - 1) + /* is this frame too small to even get to a BCC? */ + if (total_read < 2 + sizeof(struct smb_hdr)) { + if ((total_read >= sizeof(struct smb_hdr) - 1) && (smb->Status.CifsError != 0)) { + /* it's an error return */ smb->WordCount = 0; /* some error cases do not return wct and bcc */ return 0; - } else if ((length == sizeof(struct smb_hdr) + 1) && + } else if ((total_read == sizeof(struct smb_hdr) + 1) && (smb->WordCount == 0)) { char *tmp = (char *)smb; /* Need to work around a bug in two servers here */ @@ -452,39 +455,35 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) } else { cERROR(1, "Length less than smb header size"); } - return 1; - } - if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cERROR(1, "smb length greater than MaxBufSize, mid=%d", - smb->Mid); - return 1; + return -EIO; } + /* otherwise, there is enough to get to the BCC */ if (check_smb_hdr(smb, mid)) - return 1; + return -EIO; clc_len = smbCalcSize(smb); - if (4 + len != length) { + if (4 + rfclen != total_read) { cERROR(1, "Length read does not match RFC1001 length %d", - len); - return 1; + rfclen); + return -EIO; } - if (4 + len != clc_len) { + if (4 + rfclen != clc_len) { /* check if bcc wrapped around for large read responses */ - if ((len > 64 * 1024) && (len > clc_len)) { + if ((rfclen > 64 * 1024) && (rfclen > clc_len)) { /* check if lengths match mod 64K */ - if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) + if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF)) return 0; /* bcc wrapped */ } cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", - clc_len, 4 + len, smb->Mid); + clc_len, 4 + rfclen, smb->Mid); - if (4 + len < clc_len) { + if (4 + rfclen < clc_len) { cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", - len, smb->Mid); - return 1; - } else if (len > clc_len + 512) { + rfclen, smb->Mid); + return -EIO; + } else if (rfclen > clc_len + 512) { /* * Some servers (Windows XP in particular) send more * data than the lengths in the SMB packet would @@ -495,8 +494,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) * data to 512 bytes. */ cERROR(1, "RFC1001 size %u more than 512 bytes larger " - "than SMB for mid=%u", len, smb->Mid); - return 1; + "than SMB for mid=%u", rfclen, smb->Mid); + return -EIO; } } return 0; @@ -676,3 +675,18 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->clientCanCacheRead = false; } } + +bool +backup_cred(struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) { + if (cifs_sb->mnt_backupuid == current_fsuid()) + return true; + } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) { + if (in_group_p(cifs_sb->mnt_backupgid)) + return true; + } + + return false; +} diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 5de03ec..a090bbe 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -554,7 +554,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, rc); return rc; } - cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); + /* FindFirst/Next set last_entry to NULL on malformed reply */ + if (cifsFile->srch_inf.last_entry) + cifs_save_resume_key(cifsFile->srch_inf.last_entry, + cifsFile); } while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && @@ -562,7 +565,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, cFYI(1, "calling findnext2"); rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, &cifsFile->srch_inf); - cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); + /* FindFirst/Next set last_entry to NULL on malformed reply */ + if (cifsFile->srch_inf.last_entry) + cifs_save_resume_key(cifsFile->srch_inf.last_entry, + cifsFile); if (rc) return -ENOENT; } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d3e6196..4ec3ee9 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -124,7 +124,9 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) /* that we use in next few lines */ /* Note that header is initialized to zero in header_assemble */ pSMB->req.AndXCommand = 0xFF; - pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); + pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32, + CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, + USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); pSMB->req.VcNumber = get_next_vcnum(ses); @@ -681,7 +683,7 @@ ssetup_ntlmssp_authenticate: cpu_to_le16(CIFS_AUTH_RESP_SIZE); /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses); + rc = setup_ntlm_response(ses, nls_cp); if (rc) { cERROR(1, "Error %d during NTLM authentication", rc); goto ssetup_exit; diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 42b9fff..80d8508 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -199,160 +199,36 @@ SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) return rc; } -/* Routines for Windows NT MD4 Hash functions. */ -static int -_my_wcslen(__u16 *str) -{ - int len = 0; - while (*str++ != 0) - len++; - return len; -} - -/* - * Convert a string into an NT UNICODE string. - * Note that regardless of processor type - * this must be in intel (little-endian) - * format. - */ - -static int -_my_mbstowcs(__u16 *dst, const unsigned char *src, int len) -{ /* BB not a very good conversion routine - change/fix */ - int i; - __u16 val; - - for (i = 0; i < len; i++) { - val = *src; - SSVAL(dst, 0, val); - dst++; - src++; - if (val == 0) - break; - } - return i; -} - /* * Creates the MD4 Hash of the users password in NT UNICODE. */ int -E_md4hash(const unsigned char *passwd, unsigned char *p16) +E_md4hash(const unsigned char *passwd, unsigned char *p16, + const struct nls_table *codepage) { int rc; int len; - __u16 wpwd[129]; + __le16 wpwd[129]; /* Password cannot be longer than 128 characters */ - if (passwd) { - len = strlen((char *) passwd); - if (len > 128) - len = 128; - - /* Password must be converted to NT unicode */ - _my_mbstowcs(wpwd, passwd, len); - } else + if (passwd) /* Password must be converted to NT unicode */ + len = cifs_strtoUCS(wpwd, passwd, 128, codepage); + else { len = 0; + *wpwd = 0; /* Ensure string is null terminated */ + } - wpwd[len] = 0; /* Ensure string is null terminated */ - /* Calculate length in bytes */ - len = _my_wcslen(wpwd) * sizeof(__u16); - - rc = mdfour(p16, (unsigned char *) wpwd, len); - memset(wpwd, 0, 129 * 2); + rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16)); + memset(wpwd, 0, 129 * sizeof(__le16)); return rc; } -#if 0 /* currently unused */ -/* Does both the NT and LM owfs of a user's password */ -static void -nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16]) -{ - char passwd[514]; - - memset(passwd, '\0', 514); - if (strlen(pwd) < 513) - strcpy(passwd, pwd); - else - memcpy(passwd, pwd, 512); - /* Calculate the MD4 hash (NT compatible) of the password */ - memset(nt_p16, '\0', 16); - E_md4hash(passwd, nt_p16); - - /* Mangle the passwords into Lanman format */ - passwd[14] = '\0'; -/* strupper(passwd); */ - - /* Calculate the SMB (lanman) hash functions of the password */ - - memset(p16, '\0', 16); - E_P16((unsigned char *) passwd, (unsigned char *) p16); - - /* clear out local copy of user's password (just being paranoid). */ - memset(passwd, '\0', sizeof(passwd)); -} -#endif - -/* Does the NTLMv2 owfs of a user's password */ -#if 0 /* function not needed yet - but will be soon */ -static void -ntv2_owf_gen(const unsigned char owf[16], const char *user_n, - const char *domain_n, unsigned char kr_buf[16], - const struct nls_table *nls_codepage) -{ - wchar_t *user_u; - wchar_t *dom_u; - int user_l, domain_l; - struct HMACMD5Context ctx; - - /* might as well do one alloc to hold both (user_u and dom_u) */ - user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL); - if (user_u == NULL) - return; - dom_u = user_u + 1024; - - /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2, - STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); - push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2, - STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */ - - /* BB user and domain may need to be uppercased */ - user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage); - domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage); - - user_l++; /* trailing null */ - domain_l++; - - hmac_md5_init_limK_to_64(owf, 16, &ctx); - hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx); - hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx); - hmac_md5_final(kr_buf, &ctx); - - kfree(user_u); -} -#endif - -/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ -#if 0 /* currently unused */ -static void -NTLMSSPOWFencrypt(unsigned char passwd[8], - unsigned char *ntlmchalresp, unsigned char p24[24]) -{ - unsigned char p21[21]; - - memset(p21, '\0', 21); - memcpy(p21, passwd, 8); - memset(p21 + 8, 0xbd, 8); - - E_P24(p21, ntlmchalresp, p24); -} -#endif - /* Does the NT MD4 hash then des encryption. */ int -SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) +SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24, + const struct nls_table *codepage) { int rc; unsigned char p16[16], p21[21]; @@ -360,7 +236,7 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) memset(p16, '\0', 16); memset(p21, '\0', 21); - rc = E_md4hash(passwd, p16); + rc = E_md4hash(passwd, p16, codepage); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; @@ -369,39 +245,3 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) rc = E_P24(p21, c8, p24); return rc; } - - -/* Does the md5 encryption from the NT hash for NTLMv2. */ -/* These routines will be needed later */ -#if 0 -static void -SMBOWFencrypt_ntv2(const unsigned char kr[16], - const struct data_blob *srv_chal, - const struct data_blob *cli_chal, unsigned char resp_buf[16]) -{ - struct HMACMD5Context ctx; - - hmac_md5_init_limK_to_64(kr, 16, &ctx); - hmac_md5_update(srv_chal->data, srv_chal->length, &ctx); - hmac_md5_update(cli_chal->data, cli_chal->length, &ctx); - hmac_md5_final(resp_buf, &ctx); -} - -static void -SMBsesskeygen_ntv2(const unsigned char kr[16], - const unsigned char *nt_resp, __u8 sess_key[16]) -{ - struct HMACMD5Context ctx; - - hmac_md5_init_limK_to_64(kr, 16, &ctx); - hmac_md5_update(nt_resp, 16, &ctx); - hmac_md5_final((unsigned char *) sess_key, &ctx); -} - -static void -SMBsesskeygen_ntv1(const unsigned char kr[16], - const unsigned char *nt_resp, __u8 sess_key[16]) -{ - mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16); -} -#endif diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 10ca6b2..0cc9584 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -26,6 +26,7 @@ #include <linux/wait.h> #include <linux/net.h> #include <linux/delay.h> +#include <linux/freezer.h> #include <asm/uaccess.h> #include <asm/processor.h> #include <linux/mempool.h> @@ -324,7 +325,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { int error; - error = wait_event_killable(server->response_q, + error = wait_event_freezekillable(server->response_q, midQ->midState != MID_REQUEST_SUBMITTED); if (error < 0) return -ERESTARTSYS; @@ -339,8 +340,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) */ int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int nvec, mid_callback_t *callback, void *cbdata, - bool ignore_pend) + unsigned int nvec, mid_receive_t *receive, + mid_callback_t *callback, void *cbdata, bool ignore_pend) { int rc; struct mid_q_entry *mid; @@ -374,6 +375,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, goto out_err; } + mid->receive = receive; mid->callback = callback; mid->callback_data = cbdata; mid->midState = MID_REQUEST_SUBMITTED; @@ -496,13 +498,18 @@ int cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error) { - dump_smb(mid->resp_buf, - min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length))); + unsigned int len = be32_to_cpu(mid->resp_buf->smb_buf_length) + 4; + + dump_smb(mid->resp_buf, min_t(u32, 92, len)); /* convert the length into a more usable form */ if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + struct kvec iov; + + iov.iov_base = mid->resp_buf; + iov.iov_len = len; /* FIXME: add code to kill session */ - if (cifs_verify_signature(mid->resp_buf, server, + if (cifs_verify_signature(&iov, 1, server, mid->sequence_number + 1) != 0) cERROR(1, "Unexpected SMB signature"); } diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 2a22fb2..45f07c4 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/posix_acl_xattr.h> #include <linux/slab.h> +#include <linux/xattr.h> #include "cifsfs.h" #include "cifspdu.h" #include "cifsglob.h" @@ -31,16 +32,8 @@ #define MAX_EA_VALUE_SIZE 65535 #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" -#define CIFS_XATTR_USER_PREFIX "user." -#define CIFS_XATTR_SYSTEM_PREFIX "system." -#define CIFS_XATTR_OS2_PREFIX "os2." -#define CIFS_XATTR_SECURITY_PREFIX "security." -#define CIFS_XATTR_TRUSTED_PREFIX "trusted." -#define XATTR_TRUSTED_PREFIX_LEN 8 -#define XATTR_SECURITY_PREFIX_LEN 9 -/* BB need to add server (Samba e.g) support for security and trusted prefix */ - +/* BB need to add server (Samba e.g) support for security and trusted prefix */ int cifs_removexattr(struct dentry *direntry, const char *ea_name) { @@ -76,8 +69,8 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) } if (ea_name == NULL) { cFYI(1, "Null xattr names not supported"); - } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) - && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) { + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) { cFYI(1, "illegal xattr request %s (only user namespace supported)", ea_name); @@ -88,7 +81,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto remove_ea_exit; - ea_name += 5; /* skip past user. prefix */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, (__u16)0, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -149,21 +142,23 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, if (ea_name == NULL) { cFYI(1, "Null xattr names not supported"); - } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) cFYI(1, "attempt to set cifs inode metadata"); - ea_name += 5; /* skip past user. prefix */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, (__u16)value_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { + } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) + == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; - ea_name += 4; /* skip past os2. prefix */ + ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, (__u16)value_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -178,7 +173,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, #ifdef CONFIG_CIFS_ACL memcpy(pacl, ea_value, value_size); rc = set_cifs_acl(pacl, value_size, - direntry->d_inode, full_path); + direntry->d_inode, full_path, CIFS_ACL_DACL); if (rc == 0) /* force revalidate of the inode */ CIFS_I(direntry->d_inode)->time = 0; kfree(pacl); @@ -269,7 +264,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, /* return alt name if available as pseudo attr */ if (ea_name == NULL) { cFYI(1, "Null xattr names not supported"); - } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { + } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) + == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; @@ -277,15 +273,15 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cFYI(1, "attempt to query cifs inode metadata"); /* revalidate/getattr then populate from inode */ } /* BB add else when above is implemented */ - ea_name += 5; /* skip past user. prefix */ + ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { + } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; - ea_name += 4; /* skip past os2. prefix */ + ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -339,10 +335,10 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cFYI(1, "Query CIFS ACL not supported yet"); #endif /* CONFIG_CIFS_ACL */ } else if (strncmp(ea_name, - CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { + XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { cFYI(1, "Trusted xattr namespace not supported yet"); } else if (strncmp(ea_name, - CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { + XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { cFYI(1, "Security xattr namespace not supported yet"); } else cFYI(1, diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 2bdbcc1..854ace7 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -104,7 +104,7 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_gid != -1) inode->i_gid = (gid_t) attr->va_gid; if (attr->va_nlink != -1) - inode->i_nlink = attr->va_nlink; + set_nlink(inode, attr->va_nlink); if (attr->va_size != -1) inode->i_size = attr->va_size; if (attr->va_size != -1) diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index 44e17e9..cc0ea9f 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -59,12 +59,11 @@ void coda_sysctl_clean(void); #define CODA_ALLOC(ptr, cast, size) do { \ if (size < PAGE_SIZE) \ - ptr = kmalloc((unsigned long) size, GFP_KERNEL); \ + ptr = kzalloc((unsigned long) size, GFP_KERNEL); \ else \ - ptr = (cast)vmalloc((unsigned long) size); \ + ptr = (cast)vzalloc((unsigned long) size); \ if (!ptr) \ printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ - else memset( ptr, 0, size ); \ } while (0) diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 0239433..83d2fd8 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -30,14 +30,14 @@ #include "coda_int.h" /* dir inode-ops */ -static int coda_create(struct inode *dir, struct dentry *new, int mode, struct nameidata *nd); +static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, struct nameidata *nd); static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, struct nameidata *nd); static int coda_link(struct dentry *old_dentry, struct inode *dir_inode, struct dentry *entry); static int coda_unlink(struct inode *dir_inode, struct dentry *entry); static int coda_symlink(struct inode *dir_inode, struct dentry *entry, const char *symname); -static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, int mode); +static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, umode_t mode); static int coda_rmdir(struct inode *dir_inode, struct dentry *entry); static int coda_rename(struct inode *old_inode, struct dentry *old_dentry, struct inode *new_inode, struct dentry *new_dentry); @@ -191,7 +191,7 @@ static inline void coda_dir_drop_nlink(struct inode *dir) } /* creation routines: create, mknod, mkdir, link, symlink */ -static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd) +static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, struct nameidata *nd) { int error; const char *name=de->d_name.name; @@ -223,7 +223,7 @@ err_out: return error; } -static int coda_mkdir(struct inode *dir, struct dentry *de, int mode) +static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode) { struct inode *inode; struct coda_vattr attrs; @@ -340,7 +340,7 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) if (!error) { /* VFS may delete the child */ if (de->d_inode) - de->d_inode->i_nlink = 0; + clear_nlink(de->d_inode); /* fix the link count of the parent */ coda_dir_drop_nlink(dir); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 871b277..1c08a8c 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -58,7 +58,6 @@ static struct inode *coda_alloc_inode(struct super_block *sb) static void coda_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(coda_inode_cachep, ITOC(inode)); } diff --git a/fs/compat.c b/fs/compat.c index 58b1da4..fa9d721 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -37,7 +37,6 @@ #include <linux/dirent.h> #include <linux/fsnotify.h> #include <linux/highuid.h> -#include <linux/nfsd/syscall.h> #include <linux/personality.h> #include <linux/rwsem.h> #include <linux/tsacct_kern.h> @@ -247,11 +246,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(0, &ubuf->f_spare[0]) || - __put_user(0, &ubuf->f_spare[1]) || - __put_user(0, &ubuf->f_spare[2]) || - __put_user(0, &ubuf->f_spare[3]) || - __put_user(0, &ubuf->f_spare[4])) + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) return -EFAULT; return 0; } @@ -346,16 +342,9 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c */ asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u) { - struct super_block *sb; struct compat_ustat tmp; struct kstatfs sbuf; - int err; - - sb = user_get_super(new_decode_dev(dev)); - if (!sb) - return -EINVAL; - err = statfs_by_dentry(sb->s_root, &sbuf); - drop_super(sb); + int err = vfs_ustat(new_decode_dev(dev), &sbuf); if (err) return err; @@ -550,7 +539,7 @@ out: ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) + struct iovec **ret_pointer, int check_access) { compat_ssize_t tot_len; struct iovec *iov = *ret_pointer = fast_pointer; @@ -597,7 +586,8 @@ ssize_t compat_rw_copy_check_uvector(int type, } if (len < 0) /* size_t not fitting in compat_ssize_t .. */ goto out; - if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + if (check_access && + !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { ret = -EFAULT; goto out; } @@ -1111,7 +1101,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, goto out; tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, - UIO_FASTIOV, iovstack, &iov); + UIO_FASTIOV, iovstack, &iov, 1); if (tot_len == 0) { ret = 0; goto out; @@ -1291,7 +1281,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, * O_LARGEFILE flag. */ asmlinkage long -compat_sys_open(const char __user *filename, int flags, int mode) +compat_sys_open(const char __user *filename, int flags, umode_t mode) { return do_sys_open(AT_FDCWD, filename, flags, mode); } @@ -1301,7 +1291,7 @@ compat_sys_open(const char __user *filename, int flags, int mode) * O_LARGEFILE flag. */ asmlinkage long -compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode) +compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, umode_t mode) { return do_sys_open(dfd, filename, flags, mode); } diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 51352de..a10e428 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1506,35 +1506,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd, return -ENOIOCTLCMD; } -static void compat_ioctl_error(struct file *filp, unsigned int fd, - unsigned int cmd, unsigned long arg) -{ - char buf[10]; - char *fn = "?"; - char *path; - - /* find the name of the device. */ - path = (char *)__get_free_page(GFP_KERNEL); - if (path) { - fn = d_path(&filp->f_path, path, PAGE_SIZE); - if (IS_ERR(fn)) - fn = "?"; - } - - sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK); - if (!isprint(buf[1])) - sprintf(buf, "%02x", buf[1]); - compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) " - "cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n", - current->comm, current->pid, - (int)fd, (unsigned int)cmd, buf, - (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK, - (unsigned int)arg, fn); - - if (path) - free_page((unsigned long)path); -} - static int compat_ioctl_check_table(unsigned int xcmd) { int i; @@ -1621,13 +1592,8 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, goto found_handler; error = do_ioctl_trans(fd, cmd, arg, filp); - if (error == -ENOIOCTLCMD) { - static int count; - - if (++count <= 50) - compat_ioctl_error(filp, fd, cmd, arg); - error = -EINVAL; - } + if (error == -ENOIOCTLCMD) + error = -ENOTTY; goto out_fput; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 82bda8f..ede857d 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -63,8 +63,8 @@ extern struct kmem_cache *configfs_dir_cachep; extern int configfs_is_root(struct config_item *item); -extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *); -extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); +extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *); +extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *)); extern int configfs_inode_init(void); extern void configfs_inode_exit(void); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 9a37a9b..5ddd7eb 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -311,8 +311,8 @@ static int configfs_create_dir(struct config_item * item, struct dentry *dentry) if (item->ci_parent) parent = item->ci_parent->ci_dentry; - else if (configfs_mount && configfs_mount->mnt_sb) - parent = configfs_mount->mnt_sb->s_root; + else if (configfs_mount) + parent = configfs_mount->mnt_root; else return -EFAULT; @@ -1170,7 +1170,7 @@ void configfs_undepend_item(struct configfs_subsystem *subsys, } EXPORT_SYMBOL(configfs_undepend_item); -static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int ret = 0; int module_got = 0; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index c83f476..3ee36d4 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -23,7 +23,8 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see Documentation/filesystems/configfs.txt for more information. + * Please see Documentation/filesystems/configfs/configfs.txt for more + * information. */ #undef DEBUG @@ -115,7 +116,7 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) return error; } -static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +static inline void set_default_inode_attr(struct inode * inode, umode_t mode) { inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -131,7 +132,7 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_ctime = iattr->ia_ctime; } -struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) +struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent * sd) { struct inode * inode = new_inode(configfs_sb); if (inode) { @@ -184,7 +185,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, #endif /* CONFIG_LOCKDEP */ -int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) +int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct inode *)) { int error = 0; struct inode * inode = NULL; @@ -291,7 +292,7 @@ int __init configfs_inode_init(void) return bdi_init(&configfs_backing_dev_info); } -void __exit configfs_inode_exit(void) +void configfs_inode_exit(void) { bdi_destroy(&configfs_backing_dev_info); } diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 76dc4c3..50cee7f 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -23,7 +23,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see the file Documentation/filesystems/configfs.txt for + * Please see the file Documentation/filesystems/configfs/configfs.txt for * critical information about using the config_item interface. */ diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index ecc6217..276e15c 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -143,28 +143,26 @@ static int __init configfs_init(void) goto out; config_kobj = kobject_create_and_add("config", kernel_kobj); - if (!config_kobj) { - kmem_cache_destroy(configfs_dir_cachep); - configfs_dir_cachep = NULL; - goto out; - } + if (!config_kobj) + goto out2; + + err = configfs_inode_init(); + if (err) + goto out3; err = register_filesystem(&configfs_fs_type); - if (err) { - printk(KERN_ERR "configfs: Unable to register filesystem!\n"); - kobject_put(config_kobj); - kmem_cache_destroy(configfs_dir_cachep); - configfs_dir_cachep = NULL; - goto out; - } + if (err) + goto out4; - err = configfs_inode_init(); - if (err) { - unregister_filesystem(&configfs_fs_type); - kobject_put(config_kobj); - kmem_cache_destroy(configfs_dir_cachep); - configfs_dir_cachep = NULL; - } + return 0; +out4: + printk(KERN_ERR "configfs: Unable to register filesystem!\n"); + configfs_inode_exit(); +out3: + kobject_put(config_kobj); +out2: + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; out: return err; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 739fb59..a2ee8f9 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -20,7 +20,6 @@ #include <linux/cramfs_fs.h> #include <linux/slab.h> #include <linux/cramfs_fs_sb.h> -#include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/mutex.h> @@ -378,7 +377,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) unsigned long nextoffset; char *name; ino_t ino; - mode_t mode; + umode_t mode; int namelen, error; mutex_lock(&read_mutex); diff --git a/fs/dcache.c b/fs/dcache.c index a88948b..3c6d311 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -36,7 +36,9 @@ #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/prefetch.h> +#include <linux/ratelimit.h> #include "internal.h" +#include "mount.h" /* * Usage: @@ -225,7 +227,7 @@ static void dentry_unlink_inode(struct dentry * dentry) } /* - * dentry_lru_(add|del|move_tail) must be called with d_lock held. + * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held. */ static void dentry_lru_add(struct dentry *dentry) { @@ -245,6 +247,9 @@ static void __dentry_lru_del(struct dentry *dentry) dentry_stat.nr_unused--; } +/* + * Remove a dentry with references from the LRU. + */ static void dentry_lru_del(struct dentry *dentry) { if (!list_empty(&dentry->d_lru)) { @@ -254,15 +259,32 @@ static void dentry_lru_del(struct dentry *dentry) } } -static void dentry_lru_move_tail(struct dentry *dentry) +/* + * Remove a dentry that is unreferenced and about to be pruned + * (unhashed and destroyed) from the LRU, and inform the file system. + * This wrapper should be called _prior_ to unhashing a victim dentry. + */ +static void dentry_lru_prune(struct dentry *dentry) +{ + if (!list_empty(&dentry->d_lru)) { + if (dentry->d_flags & DCACHE_OP_PRUNE) + dentry->d_op->d_prune(dentry); + + spin_lock(&dcache_lru_lock); + __dentry_lru_del(dentry); + spin_unlock(&dcache_lru_lock); + } +} + +static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) { spin_lock(&dcache_lru_lock); if (list_empty(&dentry->d_lru)) { - list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + list_add_tail(&dentry->d_lru, list); dentry->d_sb->s_nr_dentry_unused++; dentry_stat.nr_unused++; } else { - list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + list_move_tail(&dentry->d_lru, list); } spin_unlock(&dcache_lru_lock); } @@ -403,8 +425,12 @@ relock: if (ref) dentry->d_count--; - /* if dentry was on the d_lru list delete it from there */ - dentry_lru_del(dentry); + /* + * if dentry was on the d_lru list delete it from there. + * inform the fs via d_prune that this dentry is about to be + * unhashed and destroyed. + */ + dentry_lru_prune(dentry); /* if it was on the hash then remove it */ __d_drop(dentry); return d_kill(dentry, parent); @@ -522,9 +548,11 @@ int d_invalidate(struct dentry * dentry) * would make it unreachable from the root, * we might still populate it if it was a * working directory or similar). + * We also need to leave mountpoints alone, + * directory or not. */ - if (dentry->d_count > 1) { - if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { + if (dentry->d_count > 1 && dentry->d_inode) { + if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { spin_unlock(&dentry->d_lock); return -EBUSY; } @@ -742,14 +770,18 @@ static void shrink_dentry_list(struct list_head *list) } /** - * __shrink_dcache_sb - shrink the dentry LRU on a given superblock - * @sb: superblock to shrink dentry LRU. - * @count: number of entries to prune - * @flags: flags to control the dentry processing + * prune_dcache_sb - shrink the dcache + * @sb: superblock + * @count: number of entries to try to free * - * If flags contains DCACHE_REFERENCED reference dentries will not be pruned. + * Attempt to shrink the superblock dcache LRU by @count entries. This is + * done when we need more memory an called from the superblock shrinker + * function. + * + * This function may fail to free any resources if all the dentries are in + * use. */ -static void __shrink_dcache_sb(struct super_block *sb, int count, int flags) +void prune_dcache_sb(struct super_block *sb, int count) { struct dentry *dentry; LIST_HEAD(referenced); @@ -768,13 +800,7 @@ relock: goto relock; } - /* - * If we are honouring the DCACHE_REFERENCED flag and the - * dentry has this flag set, don't free it. Clear the flag - * and put it back on the LRU. - */ - if (flags & DCACHE_REFERENCED && - dentry->d_flags & DCACHE_REFERENCED) { + if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; list_move(&dentry->d_lru, &referenced); spin_unlock(&dentry->d_lock); @@ -794,23 +820,6 @@ relock: } /** - * prune_dcache_sb - shrink the dcache - * @sb: superblock - * @nr_to_scan: number of entries to try to free - * - * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is - * done when we need more memory an called from the superblock shrinker - * function. - * - * This function may fail to free any resources if all the dentries are in - * use. - */ -void prune_dcache_sb(struct super_block *sb, int nr_to_scan) -{ - __shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED); -} - -/** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * @@ -854,8 +863,12 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) do { struct inode *inode; - /* detach from the system */ - dentry_lru_del(dentry); + /* + * remove the dentry from the lru, and inform + * the fs that this dentry is about to be + * unhashed and destroyed. + */ + dentry_lru_prune(dentry); __d_shrink(dentry); if (dentry->d_count != 0) { @@ -1060,7 +1073,7 @@ EXPORT_SYMBOL(have_submounts); * drop the lock and return early due to latency * constraints. */ -static int select_parent(struct dentry * parent) +static int select_parent(struct dentry *parent, struct list_head *dispose) { struct dentry *this_parent; struct list_head *next; @@ -1082,12 +1095,11 @@ resume: spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - /* - * move only zero ref count dentries to the end - * of the unused list for prune_dcache + /* + * move only zero ref count dentries to the dispose list. */ if (!dentry->d_count) { - dentry_lru_move_tail(dentry); + dentry_lru_move_list(dentry, dispose); found++; } else { dentry_lru_del(dentry); @@ -1149,14 +1161,13 @@ rename_retry: * * Prune the dcache to remove unused children of the parent dentry. */ - void shrink_dcache_parent(struct dentry * parent) { - struct super_block *sb = parent->d_sb; + LIST_HEAD(dispose); int found; - while ((found = select_parent(parent)) != 0) - __shrink_dcache_sb(sb, found, 0); + while ((found = select_parent(parent, &dispose)) != 0) + shrink_dentry_list(&dispose); } EXPORT_SYMBOL(shrink_dcache_parent); @@ -1283,6 +1294,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; + if (op->d_prune) + dentry->d_flags |= DCACHE_OP_PRUNE; } EXPORT_SYMBOL(d_set_d_op); @@ -1427,6 +1440,23 @@ struct dentry * d_alloc_root(struct inode * root_inode) } EXPORT_SYMBOL(d_alloc_root); +struct dentry *d_make_root(struct inode *root_inode) +{ + struct dentry *res = NULL; + + if (root_inode) { + static const struct qstr name = { .name = "/", .len = 1 }; + + res = __d_alloc(root_inode->i_sb, &name); + if (res) + d_instantiate(res, root_inode); + else + iput(root_inode); + } + return res; +} +EXPORT_SYMBOL(d_make_root); + static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; @@ -2351,8 +2381,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) actual = __d_unalias(inode, dentry, alias); } write_sequnlock(&rename_lock); - if (IS_ERR(actual)) + if (IS_ERR(actual)) { + if (PTR_ERR(actual) == -ELOOP) + pr_warn_ratelimited( + "VFS: Lookup of '%s' in %s %s" + " would have caused loop\n", + dentry->d_name.name, + inode->i_sb->s_type->name, + inode->i_sb->s_id); dput(alias); + } goto out_nolock; } } @@ -2398,20 +2436,19 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) /** * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry (may be modified by this function) + * @root: root vfsmnt/dentry * @buffer: pointer to the end of the buffer * @buflen: pointer to buffer length * * Caller holds the rename_lock. - * - * If path is not reachable from the supplied root, then the value of - * root is changed (without modifying refcounts). */ -static int prepend_path(const struct path *path, struct path *root, +static int prepend_path(const struct path *path, + const struct path *root, char **buffer, int *buflen) { struct dentry *dentry = path->dentry; struct vfsmount *vfsmnt = path->mnt; + struct mount *mnt = real_mount(vfsmnt); bool slash = false; int error = 0; @@ -2421,11 +2458,11 @@ static int prepend_path(const struct path *path, struct path *root, if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { /* Global root? */ - if (vfsmnt->mnt_parent == vfsmnt) { + if (!mnt_has_parent(mnt)) goto global_root; - } - dentry = vfsmnt->mnt_mountpoint; - vfsmnt = vfsmnt->mnt_parent; + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + vfsmnt = &mnt->mnt; continue; } parent = dentry->d_parent; @@ -2442,10 +2479,10 @@ static int prepend_path(const struct path *path, struct path *root, dentry = parent; } -out: if (!error && !slash) error = prepend(buffer, buflen, "/", 1); +out: br_read_unlock(vfsmount_lock); return error; @@ -2459,15 +2496,17 @@ global_root: WARN(1, "Root dentry has weird name <%.*s>\n", (int) dentry->d_name.len, dentry->d_name.name); } - root->mnt = vfsmnt; - root->dentry = dentry; + if (!slash) + error = prepend(buffer, buflen, "/", 1); + if (!error) + error = real_mount(vfsmnt)->mnt_ns ? 1 : 2; goto out; } /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry (may be modified by this function) + * @root: root vfsmnt/dentry * @buf: buffer to return value in * @buflen: buffer length * @@ -2478,10 +2517,10 @@ global_root: * * "buflen" should be positive. * - * If path is not reachable from the supplied root, then the value of - * root is changed (without modifying refcounts). + * If the path is not reachable from the supplied root, return %NULL. */ -char *__d_path(const struct path *path, struct path *root, +char *__d_path(const struct path *path, + const struct path *root, char *buf, int buflen) { char *res = buf + buflen; @@ -2492,7 +2531,28 @@ char *__d_path(const struct path *path, struct path *root, error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); - if (error) + if (error < 0) + return ERR_PTR(error); + if (error > 0) + return NULL; + return res; +} + +char *d_absolute_path(const struct path *path, + char *buf, int buflen) +{ + struct path root = {}; + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + write_seqlock(&rename_lock); + error = prepend_path(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + + if (error > 1) + error = -EINVAL; + if (error < 0) return ERR_PTR(error); return res; } @@ -2500,8 +2560,9 @@ char *__d_path(const struct path *path, struct path *root, /* * same as __d_path but appends "(deleted)" for unlinked files. */ -static int path_with_deleted(const struct path *path, struct path *root, - char **buf, int *buflen) +static int path_with_deleted(const struct path *path, + const struct path *root, + char **buf, int *buflen) { prepend(buf, buflen, "\0", 1); if (d_unlinked(path->dentry)) { @@ -2538,7 +2599,6 @@ char *d_path(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; - struct path tmp; int error; /* @@ -2553,9 +2613,8 @@ char *d_path(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); write_seqlock(&rename_lock); - tmp = root; - error = path_with_deleted(path, &tmp, &res, &buflen); - if (error) + error = path_with_deleted(path, &root, &res, &buflen); + if (error < 0) res = ERR_PTR(error); write_sequnlock(&rename_lock); path_put(&root); @@ -2576,7 +2635,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; - struct path tmp; int error; if (path->dentry->d_op && path->dentry->d_op->d_dname) @@ -2584,9 +2642,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); write_seqlock(&rename_lock); - tmp = root; - error = path_with_deleted(path, &tmp, &res, &buflen); - if (!error && !path_equal(&tmp, &root)) + error = path_with_deleted(path, &root, &res, &buflen); + if (error > 0) error = prepend_unreachable(&res, &buflen); write_sequnlock(&rename_lock); path_put(&root); @@ -2717,19 +2774,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; - struct path tmp = root; char *cwd = page + PAGE_SIZE; int buflen = PAGE_SIZE; prepend(&cwd, &buflen, "\0", 1); - error = prepend_path(&pwd, &tmp, &cwd, &buflen); + error = prepend_path(&pwd, &root, &cwd, &buflen); write_sequnlock(&rename_lock); - if (error) + if (error < 0) goto out; /* Unreachable from current root */ - if (!path_equal(&tmp, &root)) { + if (error > 0) { error = prepend_unreachable(&cwd, &buflen); if (error) goto out; @@ -2795,31 +2851,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) return result; } -int path_is_under(struct path *path1, struct path *path2) -{ - struct vfsmount *mnt = path1->mnt; - struct dentry *dentry = path1->dentry; - int res; - - br_read_lock(vfsmount_lock); - if (mnt != path2->mnt) { - for (;;) { - if (mnt->mnt_parent == mnt) { - br_read_unlock(vfsmount_lock); - return 0; - } - if (mnt->mnt_parent == path2->mnt) - break; - mnt = mnt->mnt_parent; - } - dentry = mnt->mnt_mountpoint; - } - res = is_subdir(dentry, path2->dentry); - br_read_unlock(vfsmount_lock); - return res; -} -EXPORT_SYMBOL(path_is_under); - void d_genocide(struct dentry *root) { struct dentry *this_parent; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 90f7657..f65d445 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -15,9 +15,11 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/debugfs.h> +#include <linux/io.h> static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -95,7 +97,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ -struct dentry *debugfs_create_u8(const char *name, mode_t mode, +struct dentry *debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { /* if there are no write bits set, make read only */ @@ -147,7 +149,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ -struct dentry *debugfs_create_u16(const char *name, mode_t mode, +struct dentry *debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { /* if there are no write bits set, make read only */ @@ -199,7 +201,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ -struct dentry *debugfs_create_u32(const char *name, mode_t mode, +struct dentry *debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { /* if there are no write bits set, make read only */ @@ -252,7 +254,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ -struct dentry *debugfs_create_u64(const char *name, mode_t mode, +struct dentry *debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { /* if there are no write bits set, make read only */ @@ -298,7 +300,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x8(const char *name, mode_t mode, +struct dentry *debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { /* if there are no write bits set, make read only */ @@ -322,7 +324,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x16(const char *name, mode_t mode, +struct dentry *debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { /* if there are no write bits set, make read only */ @@ -346,7 +348,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x32(const char *name, mode_t mode, +struct dentry *debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { /* if there are no write bits set, make read only */ @@ -370,7 +372,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x64(const char *name, mode_t mode, +struct dentry *debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { return debugfs_create_file(name, mode, parent, value, &fops_x64); @@ -401,7 +403,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_size_t(const char *name, mode_t mode, +struct dentry *debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value) { return debugfs_create_file(name, mode, parent, value, &fops_size_t); @@ -473,7 +475,7 @@ static const struct file_operations fops_bool = { * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ -struct dentry *debugfs_create_bool(const char *name, mode_t mode, +struct dentry *debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, u32 *value) { return debugfs_create_file(name, mode, parent, value, &fops_bool); @@ -518,10 +520,103 @@ static const struct file_operations fops_blob = { * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ -struct dentry *debugfs_create_blob(const char *name, mode_t mode, +struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { return debugfs_create_file(name, mode, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); + +#ifdef CONFIG_HAS_IOMEM + +/* + * The regset32 stuff is used to print 32-bit registers using the + * seq_file utilities. We offer printing a register set in an already-opened + * sequential file or create a debugfs file that only prints a regset32. + */ + +/** + * debugfs_print_regs32 - use seq_print to describe a set of registers + * @s: the seq_file structure being used to generate output + * @regs: an array if struct debugfs_reg32 structures + * @mregs: the length of the above array + * @base: the base address to be used in reading the registers + * @prefix: a string to be prefixed to every output line + * + * This function outputs a text block describing the current values of + * some 32-bit hardware registers. It is meant to be used within debugfs + * files based on seq_file that need to show registers, intermixed with other + * information. The prefix argument may be used to specify a leading string, + * because some peripherals have several blocks of identical registers, + * for example configuration of dma channels + */ +int debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, + int nregs, void __iomem *base, char *prefix) +{ + int i, ret = 0; + + for (i = 0; i < nregs; i++, regs++) { + if (prefix) + ret += seq_printf(s, "%s", prefix); + ret += seq_printf(s, "%s = 0x%08x\n", regs->name, + readl(base + regs->offset)); + } + return ret; +} +EXPORT_SYMBOL_GPL(debugfs_print_regs32); + +static int debugfs_show_regset32(struct seq_file *s, void *data) +{ + struct debugfs_regset32 *regset = s->private; + + debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, ""); + return 0; +} + +static int debugfs_open_regset32(struct inode *inode, struct file *file) +{ + return single_open(file, debugfs_show_regset32, inode->i_private); +} + +static const struct file_operations fops_regset32 = { + .open = debugfs_open_regset32, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * debugfs_create_regset32 - create a debugfs file that returns register values + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @regset: a pointer to a struct debugfs_regset32, which contains a pointer + * to an array of register definitions, the array size and the base + * address where the register bank is to be found. + * + * This function creates a file in debugfs with the given name that reports + * the names and values of a set of 32-bit registers. If the @mode variable + * is so set it can be read from. Writing is not supported. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_regset32(const char *name, mode_t mode, + struct dentry *parent, + struct debugfs_regset32 *regset) +{ + return debugfs_create_file(name, mode, parent, regset, &fops_regset32); +} +EXPORT_SYMBOL_GPL(debugfs_create_regset32); + +#endif /* CONFIG_HAS_IOMEM */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e7a7a2f..956d5dd 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -1,5 +1,5 @@ /* - * file.c - part of debugfs, a tiny little debug file system + * inode.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. @@ -30,7 +30,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; -static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev, +static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev, void *data, const struct file_operations *fops) { @@ -69,7 +69,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d /* SMP-safe */ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t dev, void *data, + umode_t mode, dev_t dev, void *data, const struct file_operations *fops) { struct inode *inode; @@ -87,7 +87,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, return error; } -static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode, +static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, void *data, const struct file_operations *fops) { int res; @@ -101,14 +101,14 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode, return res; } -static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode, +static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode, void *data, const struct file_operations *fops) { mode = (mode & S_IALLUGO) | S_IFLNK; return debugfs_mknod(dir, dentry, mode, 0, data, fops); } -static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, void *data, const struct file_operations *fops) { int res; @@ -146,7 +146,7 @@ static struct file_system_type debug_fs_type = { .kill_sb = kill_litter_super, }; -static int debugfs_create_by_name(const char *name, mode_t mode, +static int debugfs_create_by_name(const char *name, umode_t mode, struct dentry *parent, struct dentry **dentry, void *data, @@ -160,7 +160,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, * have around. */ if (!parent) - parent = debugfs_mount->mnt_sb->s_root; + parent = debugfs_mount->mnt_root; *dentry = NULL; mutex_lock(&parent->d_inode->i_mutex); @@ -214,7 +214,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ -struct dentry *debugfs_create_file(const char *name, mode_t mode, +struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 2f27e57..c4e2a58 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -246,9 +246,9 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) return err; } -static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int devpts_show_options(struct seq_file *seq, struct dentry *root) { - struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb); + struct pts_fs_info *fsi = DEVPTS_SB(root->d_sb); struct pts_mount_opts *opts = &fsi->mount_opts; if (opts->setuid) @@ -301,13 +301,13 @@ devpts_fill_super(struct super_block *s, void *data, int silent) inode = new_inode(s); if (!inode) - goto free_fsi; + goto fail; inode->i_ino = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); s->s_root = d_alloc_root(inode); if (s->s_root) @@ -316,8 +316,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent) printk(KERN_ERR "devpts: get root dentry failed\n"); iput(inode); -free_fsi: - kfree(s->s_fs_info); fail: return -ENOMEM; } @@ -549,7 +547,7 @@ void devpts_pty_kill(struct tty_struct *tty) dentry = d_find_alias(inode); - inode->i_nlink--; + drop_nlink(inode); d_delete(dentry); dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ diff --git a/fs/direct-io.c b/fs/direct-io.c index 44a360c..d740ab6 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -39,7 +39,7 @@ /* * How many user pages to map in one call to get_user_pages(). This determines - * the size of a structure on the stack. + * the size of a structure in the slab cache */ #define DIO_PAGES 64 @@ -55,13 +55,10 @@ * blocksize. */ -struct dio { - /* BIO submission state */ +/* dio_state only used in the submission path */ + +struct dio_submit { struct bio *bio; /* bio under assembly */ - struct inode *inode; - int rw; - loff_t i_size; /* i_size when submitted */ - int flags; /* doesn't change */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -76,18 +73,17 @@ struct dio { sector_t block_in_file; /* Current offset into the underlying file in dio_block units. */ unsigned blocks_available; /* At block_in_file. changes */ + int reap_counter; /* rate limit reaping */ sector_t final_block_in_request;/* doesn't change */ unsigned first_block_in_page; /* doesn't change, Used only once */ int boundary; /* prev block is at a boundary */ - int reap_counter; /* rate limit reaping */ get_block_t *get_block; /* block mapping function */ - dio_iodone_t *end_io; /* IO completion function */ dio_submit_t *submit_io; /* IO submition function */ + loff_t logical_offset_in_bio; /* current first logical block in bio */ sector_t final_block_in_bio; /* current final block in bio + 1 */ sector_t next_block_for_io; /* next block to be put under IO, in dio_blocks units */ - struct buffer_head map_bh; /* last get_block() result */ /* * Deferred addition of a page to the dio. These variables are @@ -100,18 +96,6 @@ struct dio { sector_t cur_page_block; /* Where it starts */ loff_t cur_page_fs_offset; /* Offset in file */ - /* BIO completion state */ - spinlock_t bio_lock; /* protects BIO fields below */ - unsigned long refcount; /* direct_io_worker() and bios */ - struct bio *bio_list; /* singly linked via bi_private */ - struct task_struct *waiter; /* waiting task (NULL if none) */ - - /* AIO related stuff */ - struct kiocb *iocb; /* kiocb */ - int is_async; /* is IO async ? */ - int io_error; /* IO error in completion path */ - ssize_t result; /* IO result */ - /* * Page fetching state. These variables belong to dio_refill_pages(). */ @@ -125,7 +109,30 @@ struct dio { */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ +}; + +/* dio_state communicated between submission path and end_io */ +struct dio { + int flags; /* doesn't change */ + int rw; + struct inode *inode; + loff_t i_size; /* i_size when submitted */ + dio_iodone_t *end_io; /* IO completion function */ + + void *private; /* copy from map_bh.b_private */ + + /* BIO completion state */ + spinlock_t bio_lock; /* protects BIO fields below */ int page_errors; /* errno from get_user_pages() */ + int is_async; /* is IO async ? */ + int io_error; /* IO error in completion path */ + unsigned long refcount; /* direct_io_worker() and bios */ + struct bio *bio_list; /* singly linked via bi_private */ + struct task_struct *waiter; /* waiting task (NULL if none) */ + + /* AIO related stuff */ + struct kiocb *iocb; /* kiocb */ + ssize_t result; /* IO result */ /* * pages[] (and any fields placed after it) are not zeroed out at @@ -133,7 +140,9 @@ struct dio { * wish that they not be zeroed. */ struct page *pages[DIO_PAGES]; /* page buffer */ -}; +} ____cacheline_aligned_in_smp; + +static struct kmem_cache *dio_cache __read_mostly; static void __inode_dio_wait(struct inode *inode) { @@ -182,27 +191,27 @@ EXPORT_SYMBOL_GPL(inode_dio_done); /* * How many pages are in the queue? */ -static inline unsigned dio_pages_present(struct dio *dio) +static inline unsigned dio_pages_present(struct dio_submit *sdio) { - return dio->tail - dio->head; + return sdio->tail - sdio->head; } /* * Go grab and pin some userspace pages. Typically we'll get 64 at a time. */ -static int dio_refill_pages(struct dio *dio) +static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { int ret; int nr_pages; - nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); + nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); ret = get_user_pages_fast( - dio->curr_user_address, /* Where from? */ + sdio->curr_user_address, /* Where from? */ nr_pages, /* How many pages? */ dio->rw == READ, /* Write to memory? */ &dio->pages[0]); /* Put results here */ - if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { + if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { struct page *page = ZERO_PAGE(0); /* * A memory fault, but the filesystem has some outstanding @@ -213,17 +222,17 @@ static int dio_refill_pages(struct dio *dio) dio->page_errors = ret; page_cache_get(page); dio->pages[0] = page; - dio->head = 0; - dio->tail = 1; + sdio->head = 0; + sdio->tail = 1; ret = 0; goto out; } if (ret >= 0) { - dio->curr_user_address += ret * PAGE_SIZE; - dio->curr_page += ret; - dio->head = 0; - dio->tail = ret; + sdio->curr_user_address += ret * PAGE_SIZE; + sdio->curr_page += ret; + sdio->head = 0; + sdio->tail = ret; ret = 0; } out: @@ -236,17 +245,18 @@ out: * decent number of pages, less frequently. To provide nicer use of the * L1 cache. */ -static struct page *dio_get_page(struct dio *dio) +static inline struct page *dio_get_page(struct dio *dio, + struct dio_submit *sdio) { - if (dio_pages_present(dio) == 0) { + if (dio_pages_present(sdio) == 0) { int ret; - ret = dio_refill_pages(dio); + ret = dio_refill_pages(dio, sdio); if (ret) return ERR_PTR(ret); - BUG_ON(dio_pages_present(dio) == 0); + BUG_ON(dio_pages_present(sdio) == 0); } - return dio->pages[dio->head++]; + return dio->pages[sdio->head++]; } /** @@ -292,7 +302,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is if (dio->end_io && dio->result) { dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private, ret, is_async); + dio->private, ret, is_async); } else { if (is_async) aio_complete(dio->iocb, ret, 0); @@ -323,7 +333,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) if (remaining == 0) { dio_complete(dio, dio->iocb->ki_pos, 0, true); - kfree(dio); + kmem_cache_free(dio_cache, dio); } } @@ -367,9 +377,10 @@ void dio_end_io(struct bio *bio, int error) } EXPORT_SYMBOL_GPL(dio_end_io); -static void -dio_bio_alloc(struct dio *dio, struct block_device *bdev, - sector_t first_sector, int nr_vecs) +static inline void +dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, + struct block_device *bdev, + sector_t first_sector, int nr_vecs) { struct bio *bio; @@ -386,8 +397,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, else bio->bi_end_io = dio_bio_end_io; - dio->bio = bio; - dio->logical_offset_in_bio = dio->cur_page_fs_offset; + sdio->bio = bio; + sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } /* @@ -397,9 +408,9 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, * * bios hold a dio reference between submit_bio and ->end_io. */ -static void dio_bio_submit(struct dio *dio) +static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) { - struct bio *bio = dio->bio; + struct bio *bio = sdio->bio; unsigned long flags; bio->bi_private = dio; @@ -411,24 +422,24 @@ static void dio_bio_submit(struct dio *dio) if (dio->is_async && dio->rw == READ) bio_set_pages_dirty(bio); - if (dio->submit_io) - dio->submit_io(dio->rw, bio, dio->inode, - dio->logical_offset_in_bio); + if (sdio->submit_io) + sdio->submit_io(dio->rw, bio, dio->inode, + sdio->logical_offset_in_bio); else submit_bio(dio->rw, bio); - dio->bio = NULL; - dio->boundary = 0; - dio->logical_offset_in_bio = 0; + sdio->bio = NULL; + sdio->boundary = 0; + sdio->logical_offset_in_bio = 0; } /* * Release any resources in case of a failure */ -static void dio_cleanup(struct dio *dio) +static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { - while (dio_pages_present(dio)) - page_cache_release(dio_get_page(dio)); + while (dio_pages_present(sdio)) + page_cache_release(dio_get_page(dio, sdio)); } /* @@ -518,11 +529,11 @@ static void dio_await_completion(struct dio *dio) * * This also helps to limit the peak amount of pinned userspace memory. */ -static int dio_bio_reap(struct dio *dio) +static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) { int ret = 0; - if (dio->reap_counter++ >= 64) { + if (sdio->reap_counter++ >= 64) { while (dio->bio_list) { unsigned long flags; struct bio *bio; @@ -536,14 +547,14 @@ static int dio_bio_reap(struct dio *dio) if (ret == 0) ret = ret2; } - dio->reap_counter = 0; + sdio->reap_counter = 0; } return ret; } /* * Call into the fs to map some more disk blocks. We record the current number - * of available blocks at dio->blocks_available. These are in units of the + * of available blocks at sdio->blocks_available. These are in units of the * fs blocksize, (1 << inode->i_blkbits). * * The fs is allowed to map lots of blocks at once. If it wants to do that, @@ -564,10 +575,10 @@ static int dio_bio_reap(struct dio *dio) * buffer_mapped(). However the direct-io code will only process holes one * block at a time - it will repeatedly call get_block() as it walks the hole. */ -static int get_more_blocks(struct dio *dio) +static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { int ret; - struct buffer_head *map_bh = &dio->map_bh; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ @@ -580,11 +591,11 @@ static int get_more_blocks(struct dio *dio) */ ret = dio->page_errors; if (ret == 0) { - BUG_ON(dio->block_in_file >= dio->final_block_in_request); - fs_startblk = dio->block_in_file >> dio->blkfactor; - dio_count = dio->final_block_in_request - dio->block_in_file; - fs_count = dio_count >> dio->blkfactor; - blkmask = (1 << dio->blkfactor) - 1; + BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); + fs_startblk = sdio->block_in_file >> sdio->blkfactor; + dio_count = sdio->final_block_in_request - sdio->block_in_file; + fs_count = dio_count >> sdio->blkfactor; + blkmask = (1 << sdio->blkfactor) - 1; if (dio_count & blkmask) fs_count++; @@ -604,13 +615,16 @@ static int get_more_blocks(struct dio *dio) */ create = dio->rw & WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (dio->block_in_file < (i_size_read(dio->inode) >> - dio->blkbits)) + if (sdio->block_in_file < (i_size_read(dio->inode) >> + sdio->blkbits)) create = 0; } - ret = (*dio->get_block)(dio->inode, fs_startblk, + ret = (*sdio->get_block)(dio->inode, fs_startblk, map_bh, create); + + /* Store for completion */ + dio->private = map_bh->b_private; } return ret; } @@ -618,20 +632,21 @@ static int get_more_blocks(struct dio *dio) /* * There is no bio. Make one now. */ -static int dio_new_bio(struct dio *dio, sector_t start_sector) +static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, + sector_t start_sector, struct buffer_head *map_bh) { sector_t sector; int ret, nr_pages; - ret = dio_bio_reap(dio); + ret = dio_bio_reap(dio, sdio); if (ret) goto out; - sector = start_sector << (dio->blkbits - 9); - nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); + sector = start_sector << (sdio->blkbits - 9); + nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); nr_pages = min(nr_pages, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); - dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); - dio->boundary = 0; + dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); + sdio->boundary = 0; out: return ret; } @@ -643,21 +658,21 @@ out: * * Return zero on success. Non-zero means the caller needs to start a new BIO. */ -static int dio_bio_add_page(struct dio *dio) +static inline int dio_bio_add_page(struct dio_submit *sdio) { int ret; - ret = bio_add_page(dio->bio, dio->cur_page, - dio->cur_page_len, dio->cur_page_offset); - if (ret == dio->cur_page_len) { + ret = bio_add_page(sdio->bio, sdio->cur_page, + sdio->cur_page_len, sdio->cur_page_offset); + if (ret == sdio->cur_page_len) { /* * Decrement count only, if we are done with this page */ - if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE) - dio->pages_in_io--; - page_cache_get(dio->cur_page); - dio->final_block_in_bio = dio->cur_page_block + - (dio->cur_page_len >> dio->blkbits); + if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) + sdio->pages_in_io--; + page_cache_get(sdio->cur_page); + sdio->final_block_in_bio = sdio->cur_page_block + + (sdio->cur_page_len >> sdio->blkbits); ret = 0; } else { ret = 1; @@ -675,14 +690,15 @@ static int dio_bio_add_page(struct dio *dio) * The caller of this function is responsible for removing cur_page from the * dio, and for dropping the refcount which came from that presence. */ -static int dio_send_cur_page(struct dio *dio) +static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { int ret = 0; - if (dio->bio) { - loff_t cur_offset = dio->cur_page_fs_offset; - loff_t bio_next_offset = dio->logical_offset_in_bio + - dio->bio->bi_size; + if (sdio->bio) { + loff_t cur_offset = sdio->cur_page_fs_offset; + loff_t bio_next_offset = sdio->logical_offset_in_bio + + sdio->bio->bi_size; /* * See whether this new request is contiguous with the old. @@ -698,28 +714,28 @@ static int dio_send_cur_page(struct dio *dio) * be the next logical offset in the bio, submit the bio we * have. */ - if (dio->final_block_in_bio != dio->cur_page_block || + if (sdio->final_block_in_bio != sdio->cur_page_block || cur_offset != bio_next_offset) - dio_bio_submit(dio); + dio_bio_submit(dio, sdio); /* * Submit now if the underlying fs is about to perform a * metadata read */ - else if (dio->boundary) - dio_bio_submit(dio); + else if (sdio->boundary) + dio_bio_submit(dio, sdio); } - if (dio->bio == NULL) { - ret = dio_new_bio(dio, dio->cur_page_block); + if (sdio->bio == NULL) { + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret) goto out; } - if (dio_bio_add_page(dio) != 0) { - dio_bio_submit(dio); - ret = dio_new_bio(dio, dio->cur_page_block); + if (dio_bio_add_page(sdio) != 0) { + dio_bio_submit(dio, sdio); + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret == 0) { - ret = dio_bio_add_page(dio); + ret = dio_bio_add_page(sdio); BUG_ON(ret != 0); } } @@ -744,9 +760,10 @@ out: * If that doesn't work out then we put the old page into the bio and add this * page to the dio instead. */ -static int -submit_page_section(struct dio *dio, struct page *page, - unsigned offset, unsigned len, sector_t blocknr) +static inline int +submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, + unsigned offset, unsigned len, sector_t blocknr, + struct buffer_head *map_bh) { int ret = 0; @@ -760,20 +777,20 @@ submit_page_section(struct dio *dio, struct page *page, /* * Can we just grow the current page's presence in the dio? */ - if ( (dio->cur_page == page) && - (dio->cur_page_offset + dio->cur_page_len == offset) && - (dio->cur_page_block + - (dio->cur_page_len >> dio->blkbits) == blocknr)) { - dio->cur_page_len += len; + if (sdio->cur_page == page && + sdio->cur_page_offset + sdio->cur_page_len == offset && + sdio->cur_page_block + + (sdio->cur_page_len >> sdio->blkbits) == blocknr) { + sdio->cur_page_len += len; /* - * If dio->boundary then we want to schedule the IO now to + * If sdio->boundary then we want to schedule the IO now to * avoid metadata seeks. */ - if (dio->boundary) { - ret = dio_send_cur_page(dio); - page_cache_release(dio->cur_page); - dio->cur_page = NULL; + if (sdio->boundary) { + ret = dio_send_cur_page(dio, sdio, map_bh); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; } goto out; } @@ -781,20 +798,20 @@ submit_page_section(struct dio *dio, struct page *page, /* * If there's a deferred page already there then send it. */ - if (dio->cur_page) { - ret = dio_send_cur_page(dio); - page_cache_release(dio->cur_page); - dio->cur_page = NULL; + if (sdio->cur_page) { + ret = dio_send_cur_page(dio, sdio, map_bh); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; if (ret) goto out; } page_cache_get(page); /* It is in dio */ - dio->cur_page = page; - dio->cur_page_offset = offset; - dio->cur_page_len = len; - dio->cur_page_block = blocknr; - dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; + sdio->cur_page = page; + sdio->cur_page_offset = offset; + sdio->cur_page_len = len; + sdio->cur_page_block = blocknr; + sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: return ret; } @@ -804,16 +821,16 @@ out: * file blocks. Only called for S_ISREG files - blockdevs do not set * buffer_new */ -static void clean_blockdev_aliases(struct dio *dio) +static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) { unsigned i; unsigned nblocks; - nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits; + nblocks = map_bh->b_size >> dio->inode->i_blkbits; for (i = 0; i < nblocks; i++) { - unmap_underlying_metadata(dio->map_bh.b_bdev, - dio->map_bh.b_blocknr + i); + unmap_underlying_metadata(map_bh->b_bdev, + map_bh->b_blocknr + i); } } @@ -826,19 +843,20 @@ static void clean_blockdev_aliases(struct dio *dio) * `end' is zero if we're doing the start of the IO, 1 at the end of the * IO. */ -static void dio_zero_block(struct dio *dio, int end) +static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, + int end, struct buffer_head *map_bh) { unsigned dio_blocks_per_fs_block; unsigned this_chunk_blocks; /* In dio_blocks */ unsigned this_chunk_bytes; struct page *page; - dio->start_zero_done = 1; - if (!dio->blkfactor || !buffer_new(&dio->map_bh)) + sdio->start_zero_done = 1; + if (!sdio->blkfactor || !buffer_new(map_bh)) return; - dio_blocks_per_fs_block = 1 << dio->blkfactor; - this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1); + dio_blocks_per_fs_block = 1 << sdio->blkfactor; + this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1); if (!this_chunk_blocks) return; @@ -850,14 +868,14 @@ static void dio_zero_block(struct dio *dio, int end) if (end) this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; - this_chunk_bytes = this_chunk_blocks << dio->blkbits; + this_chunk_bytes = this_chunk_blocks << sdio->blkbits; page = ZERO_PAGE(0); - if (submit_page_section(dio, page, 0, this_chunk_bytes, - dio->next_block_for_io)) + if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, + sdio->next_block_for_io, map_bh)) return; - dio->next_block_for_io += this_chunk_blocks; + sdio->next_block_for_io += this_chunk_blocks; } /* @@ -876,20 +894,20 @@ static void dio_zero_block(struct dio *dio, int end) * it should set b_size to PAGE_SIZE or more inside get_block(). This gives * fine alignment but still allows this function to work in PAGE_SIZE units. */ -static int do_direct_IO(struct dio *dio) +static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, + struct buffer_head *map_bh) { - const unsigned blkbits = dio->blkbits; + const unsigned blkbits = sdio->blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; struct page *page; unsigned block_in_page; - struct buffer_head *map_bh = &dio->map_bh; int ret = 0; /* The I/O can start at any block offset within the first page */ - block_in_page = dio->first_block_in_page; + block_in_page = sdio->first_block_in_page; - while (dio->block_in_file < dio->final_block_in_request) { - page = dio_get_page(dio); + while (sdio->block_in_file < sdio->final_block_in_request) { + page = dio_get_page(dio, sdio); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; @@ -901,14 +919,14 @@ static int do_direct_IO(struct dio *dio) unsigned this_chunk_blocks; /* # of blocks */ unsigned u; - if (dio->blocks_available == 0) { + if (sdio->blocks_available == 0) { /* * Need to go and map some more disk */ unsigned long blkmask; unsigned long dio_remainder; - ret = get_more_blocks(dio); + ret = get_more_blocks(dio, sdio, map_bh); if (ret) { page_cache_release(page); goto out; @@ -916,18 +934,18 @@ static int do_direct_IO(struct dio *dio) if (!buffer_mapped(map_bh)) goto do_holes; - dio->blocks_available = - map_bh->b_size >> dio->blkbits; - dio->next_block_for_io = - map_bh->b_blocknr << dio->blkfactor; + sdio->blocks_available = + map_bh->b_size >> sdio->blkbits; + sdio->next_block_for_io = + map_bh->b_blocknr << sdio->blkfactor; if (buffer_new(map_bh)) - clean_blockdev_aliases(dio); + clean_blockdev_aliases(dio, map_bh); - if (!dio->blkfactor) + if (!sdio->blkfactor) goto do_holes; - blkmask = (1 << dio->blkfactor) - 1; - dio_remainder = (dio->block_in_file & blkmask); + blkmask = (1 << sdio->blkfactor) - 1; + dio_remainder = (sdio->block_in_file & blkmask); /* * If we are at the start of IO and that IO @@ -941,8 +959,8 @@ static int do_direct_IO(struct dio *dio) * on-disk */ if (!buffer_new(map_bh)) - dio->next_block_for_io += dio_remainder; - dio->blocks_available -= dio_remainder; + sdio->next_block_for_io += dio_remainder; + sdio->blocks_available -= dio_remainder; } do_holes: /* Handle holes */ @@ -961,7 +979,7 @@ do_holes: */ i_size_aligned = ALIGN(i_size_read(dio->inode), 1 << blkbits); - if (dio->block_in_file >= + if (sdio->block_in_file >= i_size_aligned >> blkbits) { /* We hit eof */ page_cache_release(page); @@ -969,7 +987,7 @@ do_holes: } zero_user(page, block_in_page << blkbits, 1 << blkbits); - dio->block_in_file++; + sdio->block_in_file++; block_in_page++; goto next_block; } @@ -979,38 +997,41 @@ do_holes: * is finer than the underlying fs, go check to see if * we must zero out the start of this block. */ - if (unlikely(dio->blkfactor && !dio->start_zero_done)) - dio_zero_block(dio, 0); + if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) + dio_zero_block(dio, sdio, 0, map_bh); /* * Work out, in this_chunk_blocks, how much disk we * can add to this page */ - this_chunk_blocks = dio->blocks_available; + this_chunk_blocks = sdio->blocks_available; u = (PAGE_SIZE - offset_in_page) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; - u = dio->final_block_in_request - dio->block_in_file; + u = sdio->final_block_in_request - sdio->block_in_file; if (this_chunk_blocks > u) this_chunk_blocks = u; this_chunk_bytes = this_chunk_blocks << blkbits; BUG_ON(this_chunk_bytes == 0); - dio->boundary = buffer_boundary(map_bh); - ret = submit_page_section(dio, page, offset_in_page, - this_chunk_bytes, dio->next_block_for_io); + sdio->boundary = buffer_boundary(map_bh); + ret = submit_page_section(dio, sdio, page, + offset_in_page, + this_chunk_bytes, + sdio->next_block_for_io, + map_bh); if (ret) { page_cache_release(page); goto out; } - dio->next_block_for_io += this_chunk_blocks; + sdio->next_block_for_io += this_chunk_blocks; - dio->block_in_file += this_chunk_blocks; + sdio->block_in_file += this_chunk_blocks; block_in_page += this_chunk_blocks; - dio->blocks_available -= this_chunk_blocks; + sdio->blocks_available -= this_chunk_blocks; next_block: - BUG_ON(dio->block_in_file > dio->final_block_in_request); - if (dio->block_in_file == dio->final_block_in_request) + BUG_ON(sdio->block_in_file > sdio->final_block_in_request); + if (sdio->block_in_file == sdio->final_block_in_request) break; } @@ -1022,135 +1043,10 @@ out: return ret; } -static ssize_t -direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, - const struct iovec *iov, loff_t offset, unsigned long nr_segs, - unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, struct dio *dio) +static inline int drop_refcount(struct dio *dio) { - unsigned long user_addr; + int ret2; unsigned long flags; - int seg; - ssize_t ret = 0; - ssize_t ret2; - size_t bytes; - - dio->inode = inode; - dio->rw = rw; - dio->blkbits = blkbits; - dio->blkfactor = inode->i_blkbits - blkbits; - dio->block_in_file = offset >> blkbits; - - dio->get_block = get_block; - dio->end_io = end_io; - dio->submit_io = submit_io; - dio->final_block_in_bio = -1; - dio->next_block_for_io = -1; - - dio->iocb = iocb; - dio->i_size = i_size_read(inode); - - spin_lock_init(&dio->bio_lock); - dio->refcount = 1; - - /* - * In case of non-aligned buffers, we may need 2 more - * pages since we need to zero out first and last block. - */ - if (unlikely(dio->blkfactor)) - dio->pages_in_io = 2; - - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - dio->pages_in_io += - ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE - - user_addr/PAGE_SIZE); - } - - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - dio->size += bytes = iov[seg].iov_len; - - /* Index into the first page of the first block */ - dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; - dio->final_block_in_request = dio->block_in_file + - (bytes >> blkbits); - /* Page fetching state */ - dio->head = 0; - dio->tail = 0; - dio->curr_page = 0; - - dio->total_pages = 0; - if (user_addr & (PAGE_SIZE-1)) { - dio->total_pages++; - bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); - } - dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; - dio->curr_user_address = user_addr; - - ret = do_direct_IO(dio); - - dio->result += iov[seg].iov_len - - ((dio->final_block_in_request - dio->block_in_file) << - blkbits); - - if (ret) { - dio_cleanup(dio); - break; - } - } /* end iovec loop */ - - if (ret == -ENOTBLK) { - /* - * The remaining part of the request will be - * be handled by buffered I/O when we return - */ - ret = 0; - } - /* - * There may be some unwritten disk at the end of a part-written - * fs-block-sized block. Go zero that now. - */ - dio_zero_block(dio, 1); - - if (dio->cur_page) { - ret2 = dio_send_cur_page(dio); - if (ret == 0) - ret = ret2; - page_cache_release(dio->cur_page); - dio->cur_page = NULL; - } - if (dio->bio) - dio_bio_submit(dio); - - /* - * It is possible that, we return short IO due to end of file. - * In that case, we need to release all the pages we got hold on. - */ - dio_cleanup(dio); - - /* - * All block lookups have been performed. For READ requests - * we can let i_mutex go now that its achieved its purpose - * of protecting us from looking up uninitialized blocks. - */ - if (rw == READ && (dio->flags & DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); - - /* - * The only time we want to leave bios in flight is when a successful - * partial aio read or full aio write have been setup. In that case - * bio completion will call aio_complete. The only time it's safe to - * call aio_complete is when we return -EIOCBQUEUED, so we key on that. - * This had *better* be the only place that raises -EIOCBQUEUED. - */ - BUG_ON(ret == -EIOCBQUEUED); - if (dio->is_async && ret == 0 && dio->result && - ((rw & READ) || (dio->result == dio->size))) - ret = -EIOCBQUEUED; - - if (ret != -EIOCBQUEUED) - dio_await_completion(dio); /* * Sync will always be dropping the final ref and completing the @@ -1166,14 +1062,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_lock_irqsave(&dio->bio_lock, flags); ret2 = --dio->refcount; spin_unlock_irqrestore(&dio->bio_lock, flags); - - if (ret2 == 0) { - ret = dio_complete(dio, offset, ret, false); - kfree(dio); - } else - BUG_ON(ret != -EIOCBQUEUED); - - return ret; + return ret2; } /* @@ -1195,6 +1084,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * expected that filesystem provide exclusion between new direct I/O * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, * but other filesystems need to take care of this on their own. + * + * NOTE: if you pass "sdio" to anything by pointer make sure that function + * is always inlined. Otherwise gcc is unable to split the structure into + * individual fields and will generate much worse code. This is important + * for the whole file. */ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, @@ -1211,6 +1105,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; + struct dio_submit sdio = { 0, }; + unsigned long user_addr; + size_t bytes; + struct buffer_head map_bh = { 0, }; if (rw & WRITE) rw = WRITE_ODIRECT; @@ -1244,7 +1142,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (rw == READ && end == offset) return 0; - dio = kmalloc(sizeof(*dio), GFP_KERNEL); + dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); retval = -ENOMEM; if (!dio) goto out; @@ -1268,7 +1166,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, end - 1); if (retval) { mutex_unlock(&inode->i_mutex); - kfree(dio); + kmem_cache_free(dio_cache, dio); goto out; } } @@ -1288,11 +1186,141 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && (end > i_size_read(inode))); - retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_block, end_io, - submit_io, dio); + retval = 0; + + dio->inode = inode; + dio->rw = rw; + sdio.blkbits = blkbits; + sdio.blkfactor = inode->i_blkbits - blkbits; + sdio.block_in_file = offset >> blkbits; + + sdio.get_block = get_block; + dio->end_io = end_io; + sdio.submit_io = submit_io; + sdio.final_block_in_bio = -1; + sdio.next_block_for_io = -1; + + dio->iocb = iocb; + dio->i_size = i_size_read(inode); + + spin_lock_init(&dio->bio_lock); + dio->refcount = 1; + + /* + * In case of non-aligned buffers, we may need 2 more + * pages since we need to zero out first and last block. + */ + if (unlikely(sdio.blkfactor)) + sdio.pages_in_io = 2; + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + sdio.pages_in_io += + ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / + PAGE_SIZE - user_addr / PAGE_SIZE); + } + + for (seg = 0; seg < nr_segs; seg++) { + user_addr = (unsigned long)iov[seg].iov_base; + sdio.size += bytes = iov[seg].iov_len; + + /* Index into the first page of the first block */ + sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; + sdio.final_block_in_request = sdio.block_in_file + + (bytes >> blkbits); + /* Page fetching state */ + sdio.head = 0; + sdio.tail = 0; + sdio.curr_page = 0; + + sdio.total_pages = 0; + if (user_addr & (PAGE_SIZE-1)) { + sdio.total_pages++; + bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); + } + sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + sdio.curr_user_address = user_addr; + + retval = do_direct_IO(dio, &sdio, &map_bh); + + dio->result += iov[seg].iov_len - + ((sdio.final_block_in_request - sdio.block_in_file) << + blkbits); + + if (retval) { + dio_cleanup(dio, &sdio); + break; + } + } /* end iovec loop */ + + if (retval == -ENOTBLK) { + /* + * The remaining part of the request will be + * be handled by buffered I/O when we return + */ + retval = 0; + } + /* + * There may be some unwritten disk at the end of a part-written + * fs-block-sized block. Go zero that now. + */ + dio_zero_block(dio, &sdio, 1, &map_bh); + + if (sdio.cur_page) { + ssize_t ret2; + + ret2 = dio_send_cur_page(dio, &sdio, &map_bh); + if (retval == 0) + retval = ret2; + page_cache_release(sdio.cur_page); + sdio.cur_page = NULL; + } + if (sdio.bio) + dio_bio_submit(dio, &sdio); + + /* + * It is possible that, we return short IO due to end of file. + * In that case, we need to release all the pages we got hold on. + */ + dio_cleanup(dio, &sdio); + + /* + * All block lookups have been performed. For READ requests + * we can let i_mutex go now that its achieved its purpose + * of protecting us from looking up uninitialized blocks. + */ + if (rw == READ && (dio->flags & DIO_LOCKING)) + mutex_unlock(&dio->inode->i_mutex); + + /* + * The only time we want to leave bios in flight is when a successful + * partial aio read or full aio write have been setup. In that case + * bio completion will call aio_complete. The only time it's safe to + * call aio_complete is when we return -EIOCBQUEUED, so we key on that. + * This had *better* be the only place that raises -EIOCBQUEUED. + */ + BUG_ON(retval == -EIOCBQUEUED); + if (dio->is_async && retval == 0 && dio->result && + ((rw & READ) || (dio->result == sdio.size))) + retval = -EIOCBQUEUED; + + if (retval != -EIOCBQUEUED) + dio_await_completion(dio); + + if (drop_refcount(dio) == 0) { + retval = dio_complete(dio, offset, retval, false); + kmem_cache_free(dio_cache, dio); + } else + BUG_ON(retval != -EIOCBQUEUED); out: return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); + +static __init int dio_init(void) +{ + dio_cache = KMEM_CACHE(dio, SLAB_PANIC); + return 0; +} +module_init(dio_init) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 990626e..0b3109e 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -281,7 +281,7 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) } else { struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr; - ipv6_addr_copy(&ret6->sin6_addr, &in6->sin6_addr); + ret6->sin6_addr = in6->sin6_addr; } return 0; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 58609bd..2a83425 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -967,7 +967,7 @@ static void ecryptfs_set_default_crypt_stat_vals( /** * ecryptfs_new_file_context - * @ecryptfs_dentry: The eCryptfs dentry + * @ecryptfs_inode: The eCryptfs inode * * If the crypto context for the file has not yet been established, * this is where we do that. Establishing a new crypto context @@ -984,13 +984,13 @@ static void ecryptfs_set_default_crypt_stat_vals( * * Returns zero on success; non-zero otherwise */ -int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) +int ecryptfs_new_file_context(struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; + ecryptfs_inode->i_sb)->mount_crypt_stat; int cipher_name_len; int rc = 0; @@ -1299,12 +1299,12 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, } static int -ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, +ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode, char *virt, size_t virt_len) { int rc; - rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, + rc = ecryptfs_write_lower(ecryptfs_inode, virt, 0, virt_len); if (rc < 0) printk(KERN_ERR "%s: Error attempting to write header " @@ -1338,7 +1338,8 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, /** * ecryptfs_write_metadata - * @ecryptfs_dentry: The eCryptfs dentry + * @ecryptfs_dentry: The eCryptfs dentry, which should be negative + * @ecryptfs_inode: The newly created eCryptfs inode * * Write the file headers out. This will likely involve a userspace * callout, in which the session key is encrypted with one or more @@ -1348,10 +1349,11 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, * * Returns zero on success; non-zero on error */ -int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; unsigned int order; char *virt; size_t virt_len; @@ -1391,7 +1393,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, size); else - rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, + rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt, virt_len); if (rc) { printk(KERN_ERR "%s: Error writing metadata out to lower file; " @@ -1943,7 +1945,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD" /* We could either offset on every reverse map or just pad some 0x00's * at the front here */ -static const unsigned char filename_rev_map[] = { +static const unsigned char filename_rev_map[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ @@ -1959,7 +1961,7 @@ static const unsigned char filename_rev_map[] = { 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ - 0x3D, 0x3E, 0x3F + 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */ }; /** diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index b36c557..a9f29b1 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -514,7 +514,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) #define ecryptfs_printk(type, fmt, arg...) \ __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); -__attribute__ ((format(printf, 1, 2))) +__printf(1, 2) void __ecryptfs_printk(const char *fmt, ...); extern const struct file_operations ecryptfs_main_fops; @@ -584,9 +584,10 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); int ecryptfs_encrypt_page(struct page *page); int ecryptfs_decrypt_page(struct page *page); -int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode); int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); -int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); +int ecryptfs_new_file_context(struct inode *ecryptfs_inode); void ecryptfs_write_crypt_stat_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, size_t *written); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index c6ac98c..d3f95f9 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -139,6 +139,27 @@ out: return rc; } +static void ecryptfs_vma_close(struct vm_area_struct *vma) +{ + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +static const struct vm_operations_struct ecryptfs_file_vm_ops = { + .close = ecryptfs_vma_close, + .fault = filemap_fault, +}; + +static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc; + + rc = generic_file_mmap(file, vma); + if (!rc) + vma->vm_ops = &ecryptfs_file_vm_ops; + + return rc; +} + struct kmem_cache *ecryptfs_file_info_cache; /** @@ -349,7 +370,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_file_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 11f8582..19a8ca4 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -144,24 +144,6 @@ static int ecryptfs_interpose(struct dentry *lower_dentry, } /** - * ecryptfs_create_underlying_file - * @lower_dir_inode: inode of the parent in the lower fs of the new file - * @dentry: New file's dentry - * @mode: The mode of the new file - * - * Creates the file in the lower file system. - * - * Returns zero on success; non-zero on error condition - */ -static int -ecryptfs_create_underlying_file(struct inode *lower_dir_inode, - struct dentry *dentry, int mode) -{ - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - return vfs_create(lower_dir_inode, lower_dentry, mode, NULL); -} - -/** * ecryptfs_do_create * @directory_inode: inode of the new file's dentry's parent in ecryptfs * @ecryptfs_dentry: New file's dentry in ecryptfs @@ -172,43 +154,42 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, * it. It will also update the eCryptfs directory inode to mimic the * stat of the lower directory inode. * - * Returns zero on success; non-zero on error condition + * Returns the new eCryptfs inode on success; an ERR_PTR on error condition */ -static int +static struct inode * ecryptfs_do_create(struct inode *directory_inode, - struct dentry *ecryptfs_dentry, int mode) + struct dentry *ecryptfs_dentry, umode_t mode) { int rc; struct dentry *lower_dentry; struct dentry *lower_dir_dentry; + struct inode *inode; lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); if (IS_ERR(lower_dir_dentry)) { ecryptfs_printk(KERN_ERR, "Error locking directory of " "dentry\n"); - rc = PTR_ERR(lower_dir_dentry); + inode = ERR_CAST(lower_dir_dentry); goto out; } - rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, - ecryptfs_dentry, mode); + rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, NULL); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); + inode = ERR_PTR(rc); goto out_lock; } - rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, - directory_inode->i_sb); - if (rc) { - ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); + inode = __ecryptfs_get_inode(lower_dentry->d_inode, + directory_inode->i_sb); + if (IS_ERR(inode)) goto out_lock; - } fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); out_lock: unlock_dir(lower_dir_dentry); out: - return rc; + return inode; } /** @@ -219,26 +200,26 @@ out: * * Returns zero on success */ -static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) +static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; int rc = 0; - if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + if (S_ISDIR(ecryptfs_inode->i_mode)) { ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); goto out; } ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); - rc = ecryptfs_new_file_context(ecryptfs_dentry); + rc = ecryptfs_new_file_context(ecryptfs_inode); if (rc) { ecryptfs_printk(KERN_ERR, "Error creating new file " "context; rc = [%d]\n", rc); goto out; } - rc = ecryptfs_get_lower_file(ecryptfs_dentry, - ecryptfs_dentry->d_inode); + rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " @@ -246,10 +227,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) ecryptfs_dentry->d_name.name, rc); goto out; } - rc = ecryptfs_write_metadata(ecryptfs_dentry); + rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); if (rc) printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); - ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); + ecryptfs_put_lower_file(ecryptfs_inode); out: return rc; } @@ -267,20 +248,30 @@ out: */ static int ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, - int mode, struct nameidata *nd) + umode_t mode, struct nameidata *nd) { + struct inode *ecryptfs_inode; int rc; - /* ecryptfs_do_create() calls ecryptfs_interpose() */ - rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); - if (unlikely(rc)) { + ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry, + mode); + if (unlikely(IS_ERR(ecryptfs_inode))) { ecryptfs_printk(KERN_WARNING, "Failed to create file in" "lower filesystem\n"); + rc = PTR_ERR(ecryptfs_inode); goto out; } /* At this point, a file exists on "disk"; we need to make sure * that this on disk file is prepared to be an ecryptfs file */ - rc = ecryptfs_initialize_file(ecryptfs_dentry); + rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode); + if (rc) { + drop_nlink(ecryptfs_inode); + unlock_new_inode(ecryptfs_inode); + iput(ecryptfs_inode); + goto out; + } + d_instantiate(ecryptfs_dentry, ecryptfs_inode); + unlock_new_inode(ecryptfs_inode); out: return rc; } @@ -474,8 +465,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, goto out_lock; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - old_dentry->d_inode->i_nlink = - ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; + set_nlink(old_dentry->d_inode, + ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink); i_size_write(new_dentry->d_inode, file_size_save); out_lock: unlock_dir(lower_dir_dentry); @@ -499,8 +490,8 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) goto out_unlock; } fsstack_copy_attr_times(dir, lower_dir_inode); - dentry->d_inode->i_nlink = - ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink; + set_nlink(dentry->d_inode, + ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink); dentry->d_inode->i_ctime = dir->i_ctime; d_drop(dentry); out_unlock: @@ -549,7 +540,7 @@ out_lock: return rc; } -static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int rc; struct dentry *lower_dentry; @@ -565,7 +556,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); out: unlock_dir(lower_dir_dentry); if (!dentry->d_inode) @@ -588,7 +579,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) if (!rc && dentry->d_inode) clear_nlink(dentry->d_inode); fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); unlock_dir(lower_dir_dentry); if (!rc) d_drop(dentry); @@ -597,7 +588,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) } static int -ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { int rc; struct dentry *lower_dentry; diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index dbd52d40..9df7fd6 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -69,7 +69,6 @@ static void ecryptfs_i_callback(struct rcu_head *head) struct ecryptfs_inode_info *inode_info; inode_info = ecryptfs_inode_to_private(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ecryptfs_inode_info_cache, inode_info); } @@ -132,9 +131,9 @@ static void ecryptfs_evict_inode(struct inode *inode) * Prints the mount options for a given superblock. * Returns zero; does not fail. */ -static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) +static int ecryptfs_show_options(struct seq_file *m, struct dentry *root) { - struct super_block *sb = mnt->mnt_sb; + struct super_block *sb = root->d_sb; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; struct ecryptfs_global_auth_tok *walker; diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 9c13412..bc84f36 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -96,7 +96,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) efs_inode = (struct efs_dinode *) (bh->b_data + offset); inode->i_mode = be16_to_cpu(efs_inode->di_mode); - inode->i_nlink = be16_to_cpu(efs_inode->di_nlink); + set_nlink(inode, be16_to_cpu(efs_inode->di_nlink)); inode->i_uid = (uid_t)be16_to_cpu(efs_inode->di_uid); inode->i_gid = (gid_t)be16_to_cpu(efs_inode->di_gid); inode->i_size = be32_to_cpu(efs_inode->di_size); diff --git a/fs/efs/super.c b/fs/efs/super.c index 0f31acb..9811064 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -68,7 +68,6 @@ static struct inode *efs_alloc_inode(struct super_block *sb) static void efs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index fe047d96..828e750 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -70,6 +70,15 @@ * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. + * It is necessary to acquire multiple "ep->mtx"es at once in the + * case when one epoll fd is added to another. In this case, we + * always acquire the locks in the order of nesting (i.e. after + * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired + * before e2->mtx). Since we disallow cycles of epoll file + * descriptors, this ensures that the mutexes are well-ordered. In + * order to communicate this nesting to lockdep, when walking a tree + * of epoll file descriptors, we use the current recursion depth as + * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. @@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) * @ep: Pointer to the epoll private data structure. * @sproc: Pointer to the scan callback. * @priv: Private opaque data passed to the @sproc callback. + * @depth: The current depth of recursive f_op->poll calls. * * Returns: The same integer error code returned by the @sproc callback. */ static int ep_scan_ready_list(struct eventpoll *ep, int (*sproc)(struct eventpoll *, struct list_head *, void *), - void *priv) + void *priv, + int depth) { int error, pwake = 0; unsigned long flags; @@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, * We need to lock this because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, depth); /* * Steal the ready list, and re-init the original one to the @@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) { - return ep_scan_ready_list(priv, ep_read_events_proc, NULL); + return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1); } static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) @@ -700,7 +711,7 @@ static const struct file_operations eventpoll_fops = { .llseek = noop_llseek, }; -/* Fast test to see if the file is an evenpoll file */ +/* Fast test to see if the file is an eventpoll file */ static inline int is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; @@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file) ep = epi->ep; list_del_init(&epi->fllink); - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); mutex_unlock(&ep->mtx); } @@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep, esed.maxevents = maxevents; esed.events = events; - return ep_scan_ready_list(ep, ep_send_events_proc, &esed); + return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0); } static inline struct timespec ep_set_mstimeout(long ms) @@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) struct rb_node *rbp; struct epitem *epi; - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, call_nests + 1); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { @@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } - mutex_lock(&ep->mtx); + mutex_lock_nested(&ep->mtx, 0); /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" @@ -841,10 +841,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); - if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&old_mm->oom_disable_count); - atomic_inc(&tsk->mm->oom_disable_count); - } task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { @@ -1229,7 +1225,7 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH */ -int check_unsafe_exec(struct linux_binprm *bprm) +static int check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index c5a5855..352ba14 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -13,7 +13,8 @@ # # ore module library -obj-$(CONFIG_ORE) += ore.o +libore-y := ore.o ore_raid.o +obj-$(CONFIG_ORE) += libore.o exofs-y := inode.o file.o symlink.o namei.o dir.o super.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index 70bae41..86194b2 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -1,10 +1,6 @@ -config ORE - tristate - config EXOFS_FS tristate "exofs: OSD based file system support" depends on SCSI_OSD_ULD - select ORE help EXOFS is a file system that uses an OSD storage device, as its backing storage. diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore new file mode 100644 index 0000000..1ca7fb7 --- /dev/null +++ b/fs/exofs/Kconfig.ore @@ -0,0 +1,12 @@ +# ORE - Objects Raid Engine (libore.ko) +# +# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects +# for every ORE user we do it like this. Any user should add itself here +# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are +# selected here, and we default to "ON". So in effect it is like been +# selected by any of the users. +config ORE + tristate + depends on EXOFS_FS || PNFS_OBJLAYOUT + select ASYNC_XOR + default SCSI_OSD_ULD diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index d0941c6..8040583 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -234,7 +234,7 @@ static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = { static inline void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index f4e442e..ca9d496 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -53,6 +53,10 @@ /* u64 has problems with printk this will cast it to unsigned long long */ #define _LLU(x) (unsigned long long)(x) +struct exofs_dev { + struct ore_dev ored; + unsigned did; +}; /* * our extension to the in-memory superblock */ @@ -66,13 +70,9 @@ struct exofs_sb_info { u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ - struct pnfs_osd_data_map data_map; /* Default raid to use - * FIXME: Needed ? - */ struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ - struct ore_components comps; /* comps for the partition */ - struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ + struct ore_components oc; /* comps for the partition */ }; /* @@ -86,7 +86,7 @@ struct exofs_i_info { uint32_t i_dir_start_lookup; /* which page to start lookup */ uint64_t i_commit_size; /* the object's written length */ struct ore_comp one_comp; /* same component for all devices */ - struct ore_components comps; /* inode view of the device table */ + struct ore_components oc; /* inode view of the device table */ }; static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) @@ -154,7 +154,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); extern struct inode *exofs_iget(struct super_block *, unsigned long); -struct inode *exofs_new_inode(struct inode *, int); +struct inode *exofs_new_inode(struct inode *, umode_t); extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); extern void exofs_evict_inode(struct inode *); @@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations; * bigger and that the device table repeats twice. * See: exofs_read_lookup_dev_table() */ -static inline void exofs_init_comps(struct ore_components *comps, +static inline void exofs_init_comps(struct ore_components *oc, struct ore_comp *one_comp, struct exofs_sb_info *sbi, osd_id oid) { @@ -217,13 +217,15 @@ static inline void exofs_init_comps(struct ore_components *comps, one_comp->obj.id = oid; exofs_make_credential(one_comp->cred, &one_comp->obj); - comps->numdevs = sbi->comps.numdevs; - comps->single_comp = EC_SINGLE_COMP; - comps->comps = one_comp; + oc->first_dev = 0; + oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * + sbi->layout.group_count; + oc->single_comp = EC_SINGLE_COMP; + oc->comps = one_comp; /* Round robin device view of the table */ - first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs; - comps->ods = sbi->comps.ods + first_dev; + first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; + oc->ods = &sbi->oc.ods[first_dev]; } #endif diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f39a38f..ea5e1f9 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -37,11 +37,7 @@ #define EXOFS_DBGMSG2(M...) do {} while (0) -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), - MAX_PAGES_KMALLOC = - PAGE_SIZE / sizeof(struct page *), -}; +enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned expected_pages) @@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); /* TODO: easily support bio chaining */ - pages = min_t(unsigned, pages, - layout->group_width * BIO_MAX_PAGES_KMALLOC); + pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE); return pages; } @@ -68,6 +63,7 @@ struct page_collect { bool read_4_write; /* This means two things: that the read is sync * And the pages should not be unlocked. */ + struct page *that_locked_page; }; static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, @@ -86,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, pcol->length = 0; pcol->pg_first = -1; pcol->read_4_write = false; + pcol->that_locked_page = NULL; } static void _pcol_reset(struct page_collect *pcol) @@ -98,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol) pcol->length = 0; pcol->pg_first = -1; pcol->ios = NULL; + pcol->that_locked_page = NULL; /* this is probably the end of the loop but in writes * it might not end here. don't be left with nothing @@ -149,14 +147,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page, return 0; } +enum {PAGE_WAS_NOT_IN_IO = 17}; static int update_read_page(struct page *page, int ret) { - if (ret == 0) { + switch (ret) { + case 0: /* Everything is OK */ SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - } else if (ret == -EFAULT) { + break; + case -EFAULT: /* In this case we were trying to read something that wasn't on * disk yet - return a page full of zeroes. This should be OK, * because the object should be empty (if there was a write @@ -167,16 +168,22 @@ static int update_read_page(struct page *page, int ret) SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - ret = 0; /* recovered error */ EXOFS_DBGMSG("recovered read error\n"); - } else /* Error */ + /* fall through */ + case PAGE_WAS_NOT_IN_IO: + ret = 0; /* recovered error */ + break; + default: SetPageError(page); - + } return ret; } static void update_write_page(struct page *page, int ret) { + if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) + return; /* don't pass start don't collect $200 */ + if (ret) { mapping_set_error(page->mapping, ret); SetPageError(page); @@ -190,15 +197,16 @@ static void update_write_page(struct page *page, int ret) static int __readpages_done(struct page_collect *pcol) { int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = ore_check_io(pcol->ios, &resid); + int ret = ore_check_io(pcol->ios, NULL); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else - good_bytes = pcol->length - resid; + ret = PAGE_WAS_NOT_IN_IO; + } else { + good_bytes = 0; + } EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -259,6 +267,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) } } +static int _maybe_not_all_in_one_io(struct ore_io_state *ios, + struct page_collect *pcol_src, struct page_collect *pcol) +{ + /* length was wrong or offset was not page aligned */ + BUG_ON(pcol_src->nr_pages < ios->nr_pages); + + if (pcol_src->nr_pages > ios->nr_pages) { + struct page **src_page; + unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; + unsigned long len_less = pcol_src->length - ios->length; + unsigned i; + int ret; + + /* This IO was trimmed */ + pcol_src->nr_pages = ios->nr_pages; + pcol_src->length = ios->length; + + /* Left over pages are passed to the next io */ + pcol->expected_pages += pages_less; + pcol->nr_pages = pages_less; + pcol->length = len_less; + src_page = pcol_src->pages + pcol_src->nr_pages; + pcol->pg_first = (*src_page)->index; + + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + return ret; + + for (i = 0; i < pages_less; ++i) + pcol->pages[i] = *src_page++; + + EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " + "pages_less=0x%x expected_pages=0x%x " + "next_offset=0x%llx next_len=0x%lx\n", + pcol_src->nr_pages, pages_less, pcol->expected_pages, + pcol->pg_first * PAGE_SIZE, pcol->length); + } + return 0; +} + static int read_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); @@ -270,7 +318,7 @@ static int read_exec(struct page_collect *pcol) return 0; if (!pcol->ios) { - int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true, + int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); @@ -280,7 +328,6 @@ static int read_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol->pages; - ios->nr_pages = pcol->nr_pages; if (pcol->read_4_write) { ore_read(pcol->ios); @@ -296,17 +343,23 @@ static int read_exec(struct page_collect *pcol) *pcol_copy = *pcol; ios->done = readpages_done; ios->private = pcol_copy; + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + ret = ore_read(ios); if (unlikely(ret)) goto err; atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", - oi->one_comp.obj.id, _LLU(ios->offset), pcol->length); - - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -341,6 +394,8 @@ static int readpage_strip(void *data, struct page *page) EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, page->index); + pcol->that_locked_page = page; + if (page->index < end_index) len = PAGE_CACHE_SIZE; else if (page->index == end_index) @@ -429,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping, return ret; } + ret = read_exec(&pcol); + if (unlikely(ret)) + return ret; + return read_exec(&pcol); } @@ -462,17 +521,18 @@ static void writepages_done(struct ore_io_state *ios, void *p) { struct page_collect *pcol = p; int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = ore_check_io(ios, &resid); + int ret = ore_check_io(ios, NULL); atomic_dec(&pcol->sbi->s_curr_pending); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else - good_bytes = pcol->length - resid; + ret = PAGE_WAS_NOT_IN_IO; + } else { + good_bytes = 0; + } EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -505,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p) EXOFS_DBGMSG2("writepages_done END\n"); } +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct page_collect *pcol = priv; + pgoff_t index = offset / PAGE_SIZE; + + if (!pcol->that_locked_page || + (pcol->that_locked_page->index != index)) { + struct page *page = find_get_page(pcol->inode->i_mapping, index); + + if (!page) { + page = find_or_create_page(pcol->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + EXOFS_DBGMSG("grab_cache_page Failed " + "index=0x%llx\n", _LLU(index)); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); + return page; + } else { + EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", + pcol->that_locked_page->index); + *uptodate = true; + return pcol->that_locked_page; + } +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + struct page_collect *pcol = priv; + + if (pcol->that_locked_page != page) { + EXOFS_DBGMSG("index=0x%lx\n", page->index); + page_cache_release(page); + return; + } + EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index); +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + static int write_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); @@ -516,10 +626,9 @@ static int write_exec(struct page_collect *pcol) return 0; BUG_ON(pcol->ios); - ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false, + ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, pcol->pg_first << PAGE_CACHE_SHIFT, pcol->length, &pcol->ios); - if (unlikely(ret)) goto err; @@ -534,10 +643,20 @@ static int write_exec(struct page_collect *pcol) ios = pcol->ios; ios->pages = pcol_copy->pages; - ios->nr_pages = pcol_copy->nr_pages; ios->done = writepages_done; + ios->r4w = &_r4w_op; ios->private = pcol_copy; + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + ret = ore_write(ios); if (unlikely(ret)) { EXOFS_ERR("write_exec: ore_write() Failed\n"); @@ -545,11 +664,6 @@ static int write_exec(struct page_collect *pcol) } atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", - pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), - pcol->length); - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: @@ -689,14 +803,33 @@ static int exofs_writepages(struct address_space *mapping, _pcol_init(&pcol, expected_pages, mapping->host); ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); - if (ret) { + if (unlikely(ret)) { EXOFS_ERR("write_cache_pages => %d\n", ret); return ret; } - return write_exec(&pcol); + ret = write_exec(&pcol); + if (unlikely(ret)) + return ret; + + if (wbc->sync_mode == WB_SYNC_ALL) { + return write_exec(&pcol); /* pump the last reminder */ + } else if (pcol.nr_pages) { + /* not SYNC let the reminder join the next writeout */ + unsigned i; + + for (i = 0; i < pcol.nr_pages; i++) { + struct page *page = pcol.pages[i]; + + end_page_writeback(page); + set_page_dirty(page); + unlock_page(page); + } + } + return 0; } +/* static int exofs_writepage(struct page *page, struct writeback_control *wbc) { struct page_collect pcol; @@ -712,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) return write_exec(&pcol); } - +*/ /* i_mutex held using inode->i_size directly */ static void _write_failed(struct inode *inode, loff_t to) { @@ -818,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset) const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, - .writepage = exofs_writepage, + .writepage = NULL, .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, .write_end = exofs_write_end, @@ -860,7 +993,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize); + ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); if (likely(!ret)) truncate_setsize(inode, newsize); @@ -927,14 +1060,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, struct exofs_on_disk_inode_layout *layout; int ret; - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } - attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); - attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); @@ -1018,7 +1151,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) return inode; oi = exofs_i(inode); __oi_init(oi); - exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, exofs_oi_objno(oi)); /* read the inode from the osd */ @@ -1032,7 +1165,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(fcb.i_mode); inode->i_uid = le32_to_cpu(fcb.i_uid); inode->i_gid = le32_to_cpu(fcb.i_gid); - inode->i_nlink = le16_to_cpu(fcb.i_links_count); + set_nlink(inode, le16_to_cpu(fcb.i_links_count)); inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); @@ -1143,7 +1276,7 @@ static void create_done(struct ore_io_state *ios, void *p) /* * Set up a new inode and create an object for it on the OSD */ -struct inode *exofs_new_inode(struct inode *dir, int mode) +struct inode *exofs_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; @@ -1172,13 +1305,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) spin_unlock(&sbi->s_next_gen_lock); insert_inode_hash(inode); - exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, exofs_oi_objno(oi)); exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ mark_inode_dirty(inode); - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); return ERR_PTR(ret); @@ -1267,7 +1400,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); goto free_args; @@ -1350,7 +1483,7 @@ void exofs_evict_inode(struct inode *inode) /* ignore the error, attempt a remove anyway */ /* Now Remove the OSD objects */ - ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); return; diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index b54c437..9dbf0c3 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -59,7 +59,7 @@ static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, return d_splice_alias(inode, dentry); } -static int exofs_create(struct inode *dir, struct dentry *dentry, int mode, +static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode = exofs_new_inode(dir, mode); @@ -74,7 +74,7 @@ static int exofs_create(struct inode *dir, struct dentry *dentry, int mode, return err; } -static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir, return exofs_add_nondir(dentry, inode); } -static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; int err = -EMLINK; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 25305af..49cf230 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -23,77 +23,289 @@ */ #include <linux/slab.h> +#include <linux/module.h> #include <asm/div64.h> +#include <linux/lcm.h> -#include <scsi/osd_ore.h> +#include "ore_raid.h" -#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) +MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); +MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); +MODULE_LICENSE("GPL"); + +/* ore_verify_layout does a couple of things: + * 1. Given a minimum number of needed parameters fixes up the rest of the + * members to be operatonals for the ore. The needed parameters are those + * that are defined by the pnfs-objects layout STD. + * 2. Check to see if the current ore code actually supports these parameters + * for example stripe_unit must be a multple of the system PAGE_SIZE, + * and etc... + * 3. Cache some havily used calculations that will be needed by users. + */ + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; -#ifdef CONFIG_EXOFS_DEBUG -#define ORE_DBGMSG(fmt, a...) \ - printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) -#else -#define ORE_DBGMSG(fmt, a...) \ - do { if (0) printk(fmt, ##a); } while (0) -#endif +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) +{ + u64 stripe_length; + + switch (layout->raid_algorithm) { + case PNFS_OSD_RAID_0: + layout->parity = 0; + break; + case PNFS_OSD_RAID_5: + layout->parity = 1; + break; + case PNFS_OSD_RAID_PQ: + case PNFS_OSD_RAID_4: + default: + ORE_ERR("Only RAID_0/5 for now\n"); + return -EINVAL; + } + if (0 != (layout->stripe_unit & ~PAGE_MASK)) { + ORE_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(layout->stripe_unit), PAGE_SIZE); + return -EINVAL; + } + if (layout->group_width) { + if (!layout->group_depth) { + ORE_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + if (total_comps < (layout->group_width * layout->mirrors_p1)) { + ORE_ERR("Data Map wrong, " + "numdevs=%d < group_width=%d * mirrors=%d\n", + total_comps, layout->group_width, + layout->mirrors_p1); + return -EINVAL; + } + layout->group_count = total_comps / layout->mirrors_p1 / + layout->group_width; + } else { + if (layout->group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %lld\n", + _LLU(layout->group_depth)); + } + layout->group_width = total_comps / layout->mirrors_p1; + layout->group_depth = -1; + layout->group_count = 1; + } -/* u64 has problems with printk this will cast it to unsigned long long */ -#define _LLU(x) (unsigned long long)(x) + stripe_length = (u64)layout->group_width * layout->stripe_unit; + if (stripe_length >= (1ULL << 32)) { + ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", + _LLU(stripe_length)); + return -EINVAL; + } -#define ORE_DBGMSG2(M...) do {} while (0) -/* #define ORE_DBGMSG2 ORE_DBGMSG */ + layout->max_io_length = + (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * + layout->group_width; + if (layout->parity) { + unsigned stripe_length = + (layout->group_width - layout->parity) * + layout->stripe_unit; -MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); -MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); -MODULE_LICENSE("GPL"); + layout->max_io_length /= stripe_length; + layout->max_io_length *= stripe_length; + } + return 0; +} +EXPORT_SYMBOL(ore_verify_layout); static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) { - return ios->comps->comps[index & ios->comps->single_comp].cred; + return ios->oc->comps[index & ios->oc->single_comp].cred; } static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) { - return &ios->comps->comps[index & ios->comps->single_comp].obj; + return &ios->oc->comps[index & ios->oc->single_comp].obj; } static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { - return ios->comps->ods[index]; + ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", + ios->oc->first_dev, ios->oc->numdevs, index, + ios->oc->ods); + + return ore_comp_dev(ios->oc, index); } -int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; + struct page **pages; + struct osd_sg_entry *sgilist; + struct __alloc_all_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + union { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + }; + } *_aios; + + if (likely(sizeof(*_aios) <= PAGE_SIZE)) { + _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); + if (unlikely(!_aios)) { + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", + sizeof(*_aios)); + *pios = NULL; + return -ENOMEM; + } + pages = num_par_pages ? _aios->pages : NULL; + sgilist = sgs_per_dev ? _aios->sglist : NULL; + ios = &_aios->ios; + } else { + struct __alloc_small_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + } *_aio_small; + union __extra_part { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + } *extra_part; + + _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); + if (unlikely(!_aio_small)) { + ORE_DBGMSG("Failed alloc first part bytes=%zd\n", + sizeof(*_aio_small)); + *pios = NULL; + return -ENOMEM; + } + extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); + if (unlikely(!extra_part)) { + ORE_DBGMSG("Failed alloc second part bytes=%zd\n", + sizeof(*extra_part)); + kfree(_aio_small); + *pios = NULL; + return -ENOMEM; + } + + pages = num_par_pages ? extra_part->pages : NULL; + sgilist = sgs_per_dev ? extra_part->sglist : NULL; + /* In this case the per_dev[0].sgilist holds the pointer to + * be freed + */ + ios = &_aio_small->ios; + ios->extra_part_alloc = true; + } + + if (pages) { + ios->parity_pages = pages; + ios->max_par_pages = num_par_pages; + } + if (sgilist) { + unsigned d; + + for (d = 0; d < numdevs; ++d) { + ios->per_dev[d].sglist = sgilist; + sgilist += sgs_per_dev; + } + ios->sgs_per_dev = sgs_per_dev; + } + + ios->layout = layout; + ios->oc = oc; + *pios = ios; + return 0; +} + +/* Allocate an io_state for only a single group of devices + * + * If a user needs to call ore_read/write() this version must be used becase it + * allocates extra stuff for striping and raid. + * The ore might decide to only IO less then @length bytes do to alignmets + * and constrains as follows: + * - The IO cannot cross group boundary. + * - In raid5/6 The end of the IO must align at end of a stripe eg. + * (@offset + @length) % strip_size == 0. Or the complete range is within a + * single stripe. + * - Memory condition only permitted a shorter IO. (A user can use @length=~0 + * And check the returned ios->length for max_io_size.) + * + * The caller must check returned ios->length (and/or ios->nr_pages) and + * re-issue these pages that fall outside of ios->length + */ +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, bool is_reading, u64 offset, u64 length, struct ore_io_state **pios) { struct ore_io_state *ios; + unsigned numdevs = layout->group_width * layout->mirrors_p1; + unsigned sgs_per_dev = 0, max_par_pages = 0; + int ret; - /*TODO: Maybe use kmem_cach per sbi of size - * exofs_io_state_size(layout->s_numdevs) - */ - ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL); - if (unlikely(!ios)) { - ORE_DBGMSG("Failed kzalloc bytes=%d\n", - ore_io_state_size(comps->numdevs)); - *pios = NULL; - return -ENOMEM; + if (layout->parity && length) { + unsigned data_devs = layout->group_width - layout->parity; + unsigned stripe_size = layout->stripe_unit * data_devs; + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + u32 remainder; + u64 num_stripes; + u64 num_raid_units; + + num_stripes = div_u64_rem(length, stripe_size, &remainder); + if (remainder) + ++num_stripes; + + num_raid_units = num_stripes * layout->parity; + + if (is_reading) { + /* For reads add per_dev sglist array */ + /* TODO: Raid 6 we need twice more. Actually: + * num_stripes / LCMdP(W,P); + * if (W%P != 0) num_stripes *= parity; + */ + + /* first/last seg is split */ + num_raid_units += layout->group_width; + sgs_per_dev = div_u64(num_raid_units, data_devs) + 2; + } else { + /* For Writes add parity pages array. */ + max_par_pages = num_raid_units * pages_in_unit * + sizeof(struct page *); + } } - ios->layout = layout; - ios->comps = comps; - ios->offset = offset; - ios->length = length; + ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, + pios); + if (unlikely(ret)) + return ret; + + ios = *pios; ios->reading = is_reading; + ios->offset = offset; + + if (length) { + ore_calc_stripe_info(layout, offset, length, &ios->si); + ios->length = ios->si.length; + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + if (layout->parity) + _ore_post_alloc_raid_stuff(ios); + } - *pios = ios; return 0; } EXPORT_SYMBOL(ore_get_rw_state); -int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, - struct ore_io_state **ios) +/* Allocate an io_state for all the devices in the comps array + * + * This version of io_state allocation is used mostly by create/remove + * and trunc where we currently need all the devices. The only wastful + * bit is the read/write_attributes with no IO. Those sites should + * be converted to use ore_get_rw_state() with length=0 + */ +int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, + struct ore_io_state **pios) { - return ore_get_rw_state(layout, comps, true, 0, 0, ios); + return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); } EXPORT_SYMBOL(ore_get_io_state); @@ -111,6 +323,7 @@ void ore_put_io_state(struct ore_io_state *ios) bio_put(per_dev->bio); } + _ore_free_raid_stuff(ios); kfree(ios); } } @@ -138,7 +351,7 @@ static void _done_io(struct osd_request *or, void *p) kref_put(&ios->kref, _last_io); } -static int ore_io_execute(struct ore_io_state *ios) +int ore_io_execute(struct ore_io_state *ios) { DECLARE_COMPLETION_ONSTACK(wait); bool sync = (ios->done == NULL); @@ -198,7 +411,7 @@ static void _clear_bio(struct bio *bio) } } -int ore_check_io(struct ore_io_state *ios, u64 *resid) +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) { enum osd_err_priority acumulated_osd_err = 0; int acumulated_lin_err = 0; @@ -206,7 +419,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) for (i = 0; i < ios->numdevs; i++) { struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; + struct osd_request *or = per_dev->or; int ret; if (unlikely(!or)) @@ -218,29 +432,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { /* start read offset passed endof file */ - _clear_bio(ios->per_dev[i].bio); + _clear_bio(per_dev->bio); ORE_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", - _LLU(ios->per_dev[i].offset), - _LLU(ios->per_dev[i].length)); + _LLU(per_dev->offset), + _LLU(per_dev->length)); continue; /* we recovered */ } + if (on_dev_error) { + u64 residual = ios->reading ? + or->in.residual : or->out.residual; + u64 offset = (ios->offset + ios->length) - residual; + unsigned dev = per_dev->dev - ios->oc->first_dev; + struct ore_dev *od = ios->oc->ods[dev]; + + on_dev_error(ios, od, dev, osi.osd_err_pri, + offset, residual); + } if (osi.osd_err_pri >= acumulated_osd_err) { acumulated_osd_err = osi.osd_err_pri; acumulated_lin_err = ret; } } - /* TODO: raid specific residual calculations */ - if (resid) { - if (likely(!acumulated_lin_err)) - *resid = 0; - else - *resid = ios->length; - } - return acumulated_lin_err; } EXPORT_SYMBOL(ore_check_io); @@ -248,61 +464,65 @@ EXPORT_SYMBOL(ore_check_io); /* * L - logical offset into the file * - * U - The number of bytes in a stripe within a group + * D - number of Data devices + * D = group_width - parity * - * U = stripe_unit * group_width + * U - The number of bytes in a stripe within a group + * U = stripe_unit * D * * T - The number of bytes striped within a group of component objects * (before advancing to the next group) - * - * T = stripe_unit * group_width * group_depth + * T = U * group_depth * * S - The number of bytes striped across all component objects * before the pattern repeats + * S = T * group_count * - * S = stripe_unit * group_width * group_depth * group_count - * - * M - The "major" (i.e., across all components) stripe number - * + * M - The "major" (i.e., across all components) cycle number * M = L / S * - * G - Counts the groups from the beginning of the major stripe - * + * G - Counts the groups from the beginning of the major cycle * G = (L - (M * S)) / T [or (L % S) / T] * * H - The byte offset within the group - * * H = (L - (M * S)) % T [or (L % S) % T] * * N - The "minor" (i.e., across the group) stripe number - * * N = H / U * * C - The component index coresponding to L * - * C = (H - (N * U)) / stripe_unit + G * group_width - * [or (L % U) / stripe_unit + G * group_width] + * C = (H - (N * U)) / stripe_unit + G * D + * [or (L % U) / stripe_unit + G * D] * * O - The component offset coresponding to L - * * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit + * + * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity + * divide by parity + * LCMdP = lcm(group_width, parity) / parity + * + * R - The parity Rotation stripe + * (Note parity cycle always starts at a group's boundary) + * R = N % LCMdP + * + * I = the first parity device index + * I = (group_width + group_width - R*parity - parity) % group_width + * + * Craid - The component index Rotated + * Craid = (group_width + C - R*parity) % group_width + * (We add the group_width to avoid negative numbers modulo math) */ -struct _striping_info { - u64 obj_offset; - u64 group_length; - u64 M; /* for truncate */ - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct _striping_info *si) +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + u64 length, struct ore_striping_info *si) { u32 stripe_unit = layout->stripe_unit; u32 group_width = layout->group_width; u64 group_depth = layout->group_depth; + u32 parity = layout->parity; - u32 U = stripe_unit * group_width; + u32 D = group_width - parity; + u32 U = D * stripe_unit; u64 T = U * group_depth; u64 S = T * layout->group_count; u64 M = div64_u64(file_offset, S); @@ -318,39 +538,65 @@ static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, u32 N = div_u64(H, U); /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= layout->mirrors_p1; + u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; div_u64_rem(file_offset, stripe_unit, &si->unit_off); si->obj_offset = si->unit_off + (N * stripe_unit) + (M * group_depth * stripe_unit); - si->group_length = T - H; + if (parity) { + u32 LCMdP = lcm(group_width, parity) / parity; + /* R = N % LCMdP; */ + u32 RxP = (N % LCMdP) * parity; + u32 first_dev = C - C % group_width; + + si->par_dev = (group_width + group_width - parity - RxP) % + group_width + first_dev; + si->dev = (group_width + C - RxP) % group_width + first_dev; + si->bytes_in_stripe = U; + si->first_stripe_start = M * S + G * T + N * U; + } else { + /* Make the math correct see _prepare_one_group */ + si->par_dev = group_width; + si->dev = C; + } + + si->dev *= layout->mirrors_p1; + si->par_dev *= layout->mirrors_p1; + si->offset = file_offset; + si->length = T - H; + if (si->length > length) + si->length = length; si->M = M; } +EXPORT_SYMBOL(ore_calc_stripe_info); -static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct ore_per_dev_state *per_dev, - int cur_len) +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len) { unsigned pg = *cur_pg; struct request_queue *q = osd_request_queue(_ios_od(ios, per_dev->dev)); - - per_dev->length += cur_len; + unsigned len = cur_len; + int ret; if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->nr_pages + pages_in_stripe) / - ios->layout->group_width; + unsigned nr_pages = ios->nr_pages * ios->layout->group_width / + (ios->layout->group_width - + ios->layout->parity); + unsigned bio_size = (nr_pages + pages_in_stripe) / + ios->layout->group_width; per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { ORE_DBGMSG("Failed to allocate BIO size=%u\n", bio_size); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } @@ -358,64 +604,90 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); unsigned added_len; - BUG_ON(ios->nr_pages <= pg); cur_len -= pglen; - added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], + added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; + if (unlikely(pglen != added_len)) { + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", + per_dev->bio->bi_vcnt); + ret = -ENOMEM; + goto out; + } + _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); + pgbase = 0; ++pg; } BUG_ON(cur_len); + per_dev->length += len; *cur_pg = pg; - return 0; + ret = 0; +out: /* we fail the complete unit on an error eg don't advance + * per_dev->length and cur_pg. This means that we might have a bigger + * bio than the CDB requested length (per_dev->length). That's fine + * only the oposite is fatal. + */ + return ret; } -static int _prepare_one_group(struct ore_io_state *ios, u64 length, - struct _striping_info *si) +static int _prepare_for_striping(struct ore_io_state *ios) { + struct ore_striping_info *si = &ios->si; unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; + unsigned group_width = ios->layout->group_width; + unsigned devs_in_group = group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; + unsigned dev_order; unsigned cur_pg = ios->pages_consumed; + u64 length = ios->length; int ret = 0; + if (!ios->pages) { + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + BUG_ON(length > si->length); + + dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); + si->cur_comp = dev_order; + si->cur_pg = si->unit_off / PAGE_SIZE; + while (length) { - struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; + unsigned comp = dev - first_dev; + struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; unsigned cur_len, page_off = 0; if (!per_dev->length) { per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { + if (dev == si->dev) { + WARN_ON(dev == si->par_dev); per_dev->offset = si->obj_offset; cur_len = stripe_unit - si->unit_off; page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && (page_off != ios->pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; + } else { + if (si->cur_comp > dev_order) + per_dev->offset = + si->obj_offset - si->unit_off; + else /* si->cur_comp < dev_order */ + per_dev->offset = + si->obj_offset + stripe_unit - + si->unit_off; cur_len = stripe_unit; } - - if (max_comp < dev) - max_comp = dev; } else { cur_len = stripe_unit; } if (cur_len >= length) cur_len = length; - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len); + ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, + per_dev, cur_len); if (unlikely(ret)) goto out; @@ -423,60 +695,60 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, dev = (dev % devs_in_group) + first_dev; length -= cur_len; - } -out: - ios->numdevs = max_comp + mirrors_p1; - ios->pages_consumed = cur_pg; - return ret; -} - -static int _prepare_for_striping(struct ore_io_state *ios) -{ - u64 length = ios->length; - u64 offset = ios->offset; - struct _striping_info si; - int ret = 0; - if (!ios->pages) { - if (ios->kern_buff) { - struct ore_per_dev_state *per_dev = &ios->per_dev[0]; + si->cur_comp = (si->cur_comp + 1) % group_width; + if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { + if (!length && ios->sp2d) { + /* If we are writing and this is the very last + * stripe. then operate on parity dev. + */ + dev = si->par_dev; + } + if (ios->sp2d) + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + cur_len = length; + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } - _calc_stripe_info(ios->layout, ios->offset, &si); - per_dev->offset = si.obj_offset; - per_dev->dev = si.dev; + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); + if (unlikely(ret)) + goto out; - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (si.unit_off + ios->length > - ios->layout->stripe_unit)); + /* Rotate next par_dev backwards with wraping */ + si->par_dev = (devs_in_group + si->par_dev - + ios->layout->parity * mirrors_p1) % + devs_in_group + first_dev; + /* Next stripe, start fresh */ + si->cur_comp = 0; + si->cur_pg = 0; } - ios->numdevs = ios->layout->mirrors_p1; - return 0; - } - - while (length) { - _calc_stripe_info(ios->layout, offset, &si); - - if (length < si.group_length) - si.group_length = length; - - ret = _prepare_one_group(ios, si.group_length, &si); - if (unlikely(ret)) - goto out; - - offset += si.group_length; - length -= si.group_length; } - out: - return ret; + ios->numdevs = devs_in_group; + ios->pages_consumed = cur_pg; + if (unlikely(ret)) { + if (length == ios->length) + return ret; + else + ios->length -= length; + } + return 0; } int ore_create(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->comps->numdevs; i++) { + for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); @@ -501,7 +773,7 @@ int ore_remove(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->comps->numdevs; i++) { + for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); @@ -543,7 +815,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } per_dev->or = or; - per_dev->offset = master_dev->offset; if (ios->pages) { struct bio *bio; @@ -562,6 +833,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; + per_dev->offset = master_dev->offset; per_dev->length = master_dev->length; per_dev->bio = bio; per_dev->dev = dev; @@ -579,7 +851,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) _LLU(per_dev->offset), _LLU(per_dev->length), dev); } else if (ios->kern_buff) { - ret = osd_req_write_kern(or, _ios_obj(ios, dev), + per_dev->offset = ios->si.obj_offset; + per_dev->dev = ios->si.dev + dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (ios->si.unit_off + ios->length > + ios->layout->stripe_unit)); + + ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), per_dev->offset, ios->kern_buff, ios->length); if (unlikely(ret)) @@ -588,7 +868,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) "length=0x%llx dev=%d\n", _LLU(_ios_obj(ios, dev)->id), _LLU(per_dev->offset), - _LLU(ios->length), dev); + _LLU(ios->length), per_dev->dev); } else { osd_req_set_attributes(or, _ios_obj(ios, dev)); ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", @@ -614,6 +894,14 @@ int ore_write(struct ore_io_state *ios) int i; int ret; + if (unlikely(ios->sp2d && !ios->r4w)) { + /* A library is attempting a RAID-write without providing + * a pages lock interface. + */ + WARN_ON_ONCE(1); + return -ENOTSUPP; + } + ret = _prepare_for_striping(ios); if (unlikely(ret)) return ret; @@ -629,7 +917,7 @@ int ore_write(struct ore_io_state *ios) } EXPORT_SYMBOL(ore_write); -static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) { struct osd_request *or; struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; @@ -648,22 +936,27 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) per_dev->or = or; if (ios->pages) { - osd_req_read(or, obj, per_dev->offset, - per_dev->bio, per_dev->length); + if (per_dev->cur_sg) { + /* finalize the last sg_entry */ + _ore_add_sg_seg(per_dev, 0, false); + if (unlikely(!per_dev->cur_sg)) + return 0; /* Skip parity only device */ + + osd_req_read_sg(or, obj, per_dev->bio, + per_dev->sglist, per_dev->cur_sg); + } else { + /* The no raid case */ + osd_req_read(or, obj, per_dev->offset, + per_dev->bio, per_dev->length); + } + ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d\n", _LLU(obj->id), + " dev=%d sg_len=%d\n", _LLU(obj->id), _LLU(per_dev->offset), _LLU(per_dev->length), - first_dev); - } else if (ios->kern_buff) { - int ret = osd_req_read_kern(or, obj, per_dev->offset, - ios->kern_buff, ios->length); - ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d ret=>%d\n", - _LLU(obj->id), _LLU(per_dev->offset), - _LLU(ios->length), first_dev, ret); - if (unlikely(ret)) - return ret; + first_dev, per_dev->cur_sg); } else { + BUG_ON(ios->kern_buff); + osd_req_get_attributes(or, obj); ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", _LLU(obj->id), @@ -688,7 +981,7 @@ int ore_read(struct ore_io_state *ios) return ret; for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _read_mirror(ios, i); + ret = _ore_read_mirror(ios, i); if (unlikely(ret)) return ret; } @@ -744,31 +1037,29 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, } struct _trunc_info { - struct _striping_info si; + struct ore_striping_info si; u64 prev_group_obj_off; u64 next_group_obj_off; unsigned first_group_dev; unsigned nex_group_dev; - unsigned max_devs; }; -void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, - struct _trunc_info *ti) +static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, + struct _trunc_info *ti) { unsigned stripe_unit = layout->stripe_unit; - _calc_stripe_info(layout, file_offset, &ti->si); + ore_calc_stripe_info(layout, file_offset, 0, &ti->si); ti->prev_group_obj_off = ti->si.M * stripe_unit; ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); ti->nex_group_dev = ti->first_group_dev + layout->group_width; - ti->max_devs = layout->group_width * layout->group_count; } -int ore_truncate(struct ore_layout *layout, struct ore_components *comps, +int ore_truncate(struct ore_layout *layout, struct ore_components *oc, u64 size) { struct ore_io_state *ios; @@ -779,22 +1070,22 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, struct _trunc_info ti; int i, ret; - ret = ore_get_io_state(layout, comps, &ios); + ret = ore_get_io_state(layout, oc, &ios); if (unlikely(ret)) return ret; _calc_trunk_info(ios->layout, size, &ti); - size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), + size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), GFP_KERNEL); if (unlikely(!size_attrs)) { ret = -ENOMEM; goto out; } - ios->numdevs = ios->comps->numdevs; + ios->numdevs = ios->oc->numdevs; - for (i = 0; i < ti.max_devs; ++i) { + for (i = 0; i < ios->numdevs; ++i) { struct exofs_trunc_attr *size_attr = &size_attrs[i]; u64 obj_size; @@ -815,7 +1106,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, size_attr->attr.val_ptr = &size_attr->newsize; ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", - _LLU(comps->comps->obj.id), _LLU(obj_size), i); + _LLU(oc->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); if (unlikely(ret)) diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c new file mode 100644 index 0000000..d222c77 --- /dev/null +++ b/fs/exofs/ore_raid.c @@ -0,0 +1,712 @@ +/* + * Copyright (C) 2011 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation <info@fsf.org>" + */ + +#include <linux/gfp.h> +#include <linux/async_tx.h> + +#include "ore_raid.h" + +#undef ORE_DBGMSG2 +#define ORE_DBGMSG2 ORE_DBGMSG + +struct page *_raid_page_alloc(void) +{ + return alloc_page(GFP_KERNEL); +} + +void _raid_page_free(struct page *p) +{ + __free_page(p); +} + +/* This struct is forward declare in ore_io_state, but is private to here. + * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. + * + * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. + * Ascending page index access is sp2d(p-minor, c-major). But storage is + * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor + * API. + */ +struct __stripe_pages_2d { + /* Cache some hot path repeated calculations */ + unsigned parity; + unsigned data_devs; + unsigned pages_in_unit; + + bool needed ; + + /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ + struct __1_page_stripe { + bool alloc; + unsigned write_count; + struct async_submit_ctl submit; + struct dma_async_tx_descriptor *tx; + + /* The size of this array is data_devs + parity */ + struct page **pages; + struct page **scribble; + /* bool array, size of this array is data_devs */ + char *page_is_read; + } _1p_stripes[]; +}; + +/* This can get bigger then a page. So support multiple page allocations + * _sp2d_free should be called even if _sp2d_alloc fails (by returning + * none-zero). + */ +static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, + unsigned parity, struct __stripe_pages_2d **psp2d) +{ + struct __stripe_pages_2d *sp2d; + unsigned data_devs = group_width - parity; + struct _alloc_all_bytes { + struct __alloc_stripe_pages_2d { + struct __stripe_pages_2d sp2d; + struct __1_page_stripe _1p_stripes[pages_in_unit]; + } __asp2d; + struct __alloc_1p_arrays { + struct page *pages[group_width]; + struct page *scribble[group_width]; + char page_is_read[data_devs]; + } __a1pa[pages_in_unit]; + } *_aab; + struct __alloc_1p_arrays *__a1pa; + struct __alloc_1p_arrays *__a1pa_end; + const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); + unsigned num_a1pa, alloc_size, i; + + /* FIXME: check these numbers in ore_verify_layout */ + BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); + BUG_ON(sizeof__a1pa > PAGE_SIZE); + + if (sizeof(*_aab) > PAGE_SIZE) { + num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; + alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; + } else { + num_a1pa = pages_in_unit; + alloc_size = sizeof(*_aab); + } + + _aab = kzalloc(alloc_size, GFP_KERNEL); + if (unlikely(!_aab)) { + ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); + return -ENOMEM; + } + + sp2d = &_aab->__asp2d.sp2d; + *psp2d = sp2d; /* From here Just call _sp2d_free */ + + __a1pa = _aab->__a1pa; + __a1pa_end = __a1pa + num_a1pa; + + for (i = 0; i < pages_in_unit; ++i) { + if (unlikely(__a1pa >= __a1pa_end)) { + num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, + pages_in_unit - i); + + __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); + if (unlikely(!__a1pa)) { + ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", + num_a1pa); + return -ENOMEM; + } + __a1pa_end = __a1pa + num_a1pa; + /* First *pages is marked for kfree of the buffer */ + sp2d->_1p_stripes[i].alloc = true; + } + + sp2d->_1p_stripes[i].pages = __a1pa->pages; + sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; + sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; + ++__a1pa; + } + + sp2d->parity = parity; + sp2d->data_devs = data_devs; + sp2d->pages_in_unit = pages_in_unit; + return 0; +} + +static void _sp2d_reset(struct __stripe_pages_2d *sp2d, + const struct _ore_r4w_op *r4w, void *priv) +{ + unsigned data_devs = sp2d->data_devs; + unsigned group_width = data_devs + sp2d->parity; + unsigned p; + + if (!sp2d->needed) + return; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count < group_width) { + unsigned c; + + for (c = 0; c < data_devs; c++) + if (_1ps->page_is_read[c]) { + struct page *page = _1ps->pages[c]; + + r4w->put_page(priv, page); + _1ps->page_is_read[c] = false; + } + } + + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); + _1ps->write_count = 0; + _1ps->tx = NULL; + } + + sp2d->needed = false; +} + +static void _sp2d_free(struct __stripe_pages_2d *sp2d) +{ + unsigned i; + + if (!sp2d) + return; + + for (i = 0; i < sp2d->pages_in_unit; ++i) { + if (sp2d->_1p_stripes[i].alloc) + kfree(sp2d->_1p_stripes[i].pages); + } + + kfree(sp2d); +} + +static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (!_1ps->write_count) + continue; + + init_async_submit(&_1ps->submit, + ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, + NULL, + NULL, NULL, + (addr_conv_t *)_1ps->scribble); + + /* TODO: raid6 */ + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, + 0, sp2d->data_devs, PAGE_SIZE, + &_1ps->submit); + } + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + /* NOTE: We wait for HW synchronously (I don't have such HW + * to test with.) Is parallelism needed with today's multi + * cores? + */ + async_tx_issue_pending(_1ps->tx); + } +} + +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + struct __1_page_stripe *_1ps; + + sp2d->needed = true; + + _1ps = &sp2d->_1p_stripes[si->cur_pg]; + _1ps->pages[si->cur_comp] = page; + ++_1ps->write_count; + + si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; + /* si->cur_comp is advanced outside at main loop */ +} + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last) +{ + struct osd_sg_entry *sge; + + ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " + "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", + per_dev->dev, cur_len, not_last, per_dev->cur_sg, + _LLU(per_dev->offset), per_dev->length, + per_dev->last_sgs_total); + + if (!per_dev->cur_sg) { + sge = per_dev->sglist; + + /* First time we prepare two entries */ + if (per_dev->length) { + ++per_dev->cur_sg; + sge->offset = per_dev->offset; + sge->len = per_dev->length; + } else { + /* Here the parity is the first unit of this object. + * This happens every time we reach a parity device on + * the same stripe as the per_dev->offset. We need to + * just skip this unit. + */ + per_dev->offset += cur_len; + return; + } + } else { + /* finalize the last one */ + sge = &per_dev->sglist[per_dev->cur_sg - 1]; + sge->len = per_dev->length - per_dev->last_sgs_total; + } + + if (not_last) { + /* Partly prepare the next one */ + struct osd_sg_entry *next_sge = sge + 1; + + ++per_dev->cur_sg; + next_sge->offset = sge->offset + sge->len + cur_len; + /* Save cur len so we know how mutch was added next time */ + per_dev->last_sgs_total = per_dev->length; + next_sge->len = 0; + } else if (!sge->len) { + /* Optimize for when the last unit is a parity */ + --per_dev->cur_sg; + } +} + +static int _alloc_read_4_write(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + int ret; + /* We want to only read those pages not in cache so worst case + * is a stripe populated with every other page + */ + unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; + + ret = _ore_get_io_state(layout, ios->oc, + layout->group_width * layout->mirrors_p1, + sgs_per_dev, 0, &ios->ios_read_4_write); + return ret; +} + +/* @si contains info of the to-be-inserted page. Update of @si should be + * maintained by caller. Specificaly si->dev, si->obj_offset, ... + */ +static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, + struct page *page, unsigned pg_len) +{ + struct request_queue *q; + struct ore_per_dev_state *per_dev; + struct ore_io_state *read_ios; + unsigned first_dev = si->dev - (si->dev % + (ios->layout->group_width * ios->layout->mirrors_p1)); + unsigned comp = si->dev - first_dev; + unsigned added_len; + + if (!ios->ios_read_4_write) { + int ret = _alloc_read_4_write(ios); + + if (unlikely(ret)) + return ret; + } + + read_ios = ios->ios_read_4_write; + read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; + + per_dev = &read_ios->per_dev[comp]; + if (!per_dev->length) { + per_dev->bio = bio_kmalloc(GFP_KERNEL, + ios->sp2d->pages_in_unit); + if (unlikely(!per_dev->bio)) { + ORE_DBGMSG("Failed to allocate BIO size=%u\n", + ios->sp2d->pages_in_unit); + return -ENOMEM; + } + per_dev->offset = si->obj_offset; + per_dev->dev = si->dev; + } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { + u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); + + _ore_add_sg_seg(per_dev, gap, true); + } + q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); + added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, + si->obj_offset % PAGE_SIZE); + if (unlikely(added_len != pg_len)) { + ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", + per_dev->bio->bi_vcnt); + return -ENOMEM; + } + + per_dev->length += pg_len; + return 0; +} + +/* read the beginning of an unaligned first page */ +static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) +{ + struct ore_striping_info si; + unsigned pg_len; + + ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); + + pg_len = si.obj_offset % PAGE_SIZE; + si.obj_offset -= pg_len; + + ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", + _LLU(si.obj_offset), pg_len, page->index, si.dev); + + return _add_to_r4w(ios, &si, page, pg_len); +} + +/* read the end of an incomplete last page */ +static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) +{ + struct ore_striping_info si; + struct page *page; + unsigned pg_len, p, c; + + ore_calc_stripe_info(ios->layout, *offset, 0, &si); + + p = si.unit_off / PAGE_SIZE; + c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, + ios->layout->mirrors_p1, si.par_dev, si.dev); + page = ios->sp2d->_1p_stripes[p].pages[c]; + + pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); + *offset += pg_len; + + ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", + p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); + + BUG_ON(!page); + + return _add_to_r4w(ios, &si, page, pg_len); +} + +static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) +{ + struct bio_vec *bv; + unsigned i, d; + + /* loop on all devices all pages */ + for (d = 0; d < ios->numdevs; d++) { + struct bio *bio = ios->per_dev[d].bio; + + if (!bio) + continue; + + __bio_for_each_segment(bv, bio, i, 0) { + struct page *page = bv->bv_page; + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } + } +} + +/* read_4_write is hacked to read the start of the first stripe and/or + * the end of the last stripe. If needed, with an sg-gap at each device/page. + * It is assumed to be called after the to_be_written pages of the first stripe + * are populating ios->sp2d[][] + * + * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations + * These pages are held at sp2d[p].pages[c] but with + * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are + * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is + * @uptodate=true, so we don't need to read it, only unlock, after IO. + * + * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then + * to-be-written count, we should consider the xor-in-place mode. + * need_to_read_pages_count is the actual number of pages not present in cache. + * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough + * approximation? In this mode the read pages are put in the empty places of + * ios->sp2d[p][*], xor is calculated the same way. These pages are + * allocated/freed and don't go through cache + */ +static int _read_4_write(struct ore_io_state *ios) +{ + struct ore_io_state *ios_read; + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset = ios->si.first_stripe_start; + u64 last_stripe_end; + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; + unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; + int ret; + + if (offset == ios->offset) /* Go to start collect $200 */ + goto read_last_stripe; + + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + + for (c = 0; ; c++) { + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + read_si.obj_offset += min_p * PAGE_SIZE; + offset += min_p * PAGE_SIZE; + for (p = min_p; p <= max_p; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + struct page **pp = &_1ps->pages[c]; + bool uptodate; + + if (*pp) { + if (ios->offset % PAGE_SIZE) + /* Read the remainder of the page */ + _add_to_r4w_first_page(ios, *pp); + /* to-be-written pages start here */ + goto read_last_stripe; + } + + *pp = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!*pp)) + return -ENOMEM; + + if (!uptodate) + _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); + + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + read_si.obj_offset += PAGE_SIZE; + offset += PAGE_SIZE; + } + offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; + } + +read_last_stripe: + offset = ios->offset + ios->length; + if (offset % PAGE_SIZE) + _add_to_r4w_last_page(ios, &offset); + /* offset will be aligned to next page */ + + last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) + * bytes_in_stripe; + if (offset == last_stripe_end) /* Optimize for the aligned case */ + goto read_it; + + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + p = read_si.unit_off / PAGE_SIZE; + c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, + ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); + + BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); + /* unaligned IO must be within a single stripe */ + + if (min_p == sp2d->pages_in_unit) { + /* Didn't do it yet */ + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + } + + while (offset < last_stripe_end) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if ((min_p <= p) && (p <= max_p)) { + struct page *page; + bool uptodate; + + BUG_ON(_1ps->pages[c]); + page = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!page)) + return -ENOMEM; + + _1ps->pages[c] = page; + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + if (!uptodate) + _add_to_r4w(ios, &read_si, page, PAGE_SIZE); + } + + offset += PAGE_SIZE; + if (p == (sp2d->pages_in_unit - 1)) { + ++c; + p = 0; + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + } else { + read_si.obj_offset += PAGE_SIZE; + ++p; + } + } + +read_it: + ios_read = ios->ios_read_4_write; + if (!ios_read) + return 0; + + /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change + * to check for per_dev->bio + */ + ios_read->pages = ios->pages; + + /* Now read these devices */ + for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { + ret = _ore_read_mirror(ios_read, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios_read); /* Synchronus execution */ + if (unlikely(ret)) { + ORE_DBGMSG("!! ore_io_execute => %d\n", ret); + return ret; + } + + _mark_read4write_pages_uptodate(ios_read, ret); + return 0; +} + +/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ +int _ore_add_parity_unit(struct ore_io_state *ios, + struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, + unsigned cur_len) +{ + if (ios->reading) { + if (per_dev->cur_sg >= ios->sgs_per_dev) { + ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" , + per_dev->cur_sg, ios->sgs_per_dev); + return -ENOMEM; + } + _ore_add_sg_seg(per_dev, cur_len, true); + } else { + struct __stripe_pages_2d *sp2d = ios->sp2d; + struct page **pages = ios->parity_pages + ios->cur_par_page; + unsigned num_pages; + unsigned array_start = 0; + unsigned i; + int ret; + + si->cur_pg = _sp2d_min_pg(sp2d); + num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; + + if (!cur_len) /* If last stripe operate on parity comp */ + si->cur_comp = sp2d->data_devs; + + if (!per_dev->length) { + per_dev->offset += si->cur_pg * PAGE_SIZE; + /* If first stripe, Read in all read4write pages + * (if needed) before we calculate the first parity. + */ + _read_4_write(ios); + } + + for (i = 0; i < num_pages; i++) { + pages[i] = _raid_page_alloc(); + if (unlikely(!pages[i])) + return -ENOMEM; + + ++(ios->cur_par_page); + } + + BUG_ON(si->cur_comp != sp2d->data_devs); + BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); + + ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, + per_dev, num_pages * PAGE_SIZE); + if (unlikely(ret)) + return ret; + + /* TODO: raid6 if (last_parity_dev) */ + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); + } + return 0; +} + +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + + if (ios->parity_pages) { + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + unsigned stripe_size = ios->si.bytes_in_stripe; + u64 last_stripe, first_stripe; + + if (_sp2d_alloc(pages_in_unit, layout->group_width, + layout->parity, &ios->sp2d)) { + return -ENOMEM; + } + + /* Round io down to last full strip */ + first_stripe = div_u64(ios->offset, stripe_size); + last_stripe = div_u64(ios->offset + ios->length, stripe_size); + + /* If an IO spans more then a single stripe it must end at + * a stripe boundary. The reminder at the end is pushed into the + * next IO. + */ + if (last_stripe != first_stripe) { + ios->length = last_stripe * stripe_size - ios->offset; + + BUG_ON(!ios->length); + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / + PAGE_SIZE; + ios->si.length = ios->length; /*make it consistent */ + } + } + return 0; +} + +void _ore_free_raid_stuff(struct ore_io_state *ios) +{ + if (ios->sp2d) { /* writing and raid */ + unsigned i; + + for (i = 0; i < ios->cur_par_page; i++) { + struct page *page = ios->parity_pages[i]; + + if (page) + _raid_page_free(page); + } + if (ios->extra_part_alloc) + kfree(ios->parity_pages); + /* If IO returned an error pages might need unlocking */ + _sp2d_reset(ios->sp2d, ios->r4w, ios->private); + _sp2d_free(ios->sp2d); + } else { + /* Will only be set if raid reading && sglist is big */ + if (ios->extra_part_alloc) + kfree(ios->per_dev[0].sglist); + } + if (ios->ios_read_4_write) + ore_put_io_state(ios->ios_read_4_write); +} diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h new file mode 100644 index 0000000..2ffd2c3 --- /dev/null +++ b/fs/exofs/ore_raid.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) from 2011 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation <info@fsf.org>" + */ + +#include <scsi/osd_ore.h> + +#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define ORE_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define ORE_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +#define ORE_DBGMSG2(M...) do {} while (0) +/* #define ORE_DBGMSG2 ORE_DBGMSG */ + +/* Calculate the component order in a stripe. eg the logical data unit + * address within the stripe of @dev given the @par_dev of this stripe. + */ +static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, + unsigned par_dev, unsigned dev) +{ + unsigned first_dev = dev - dev % devs_in_group; + + dev -= first_dev; + par_dev -= first_dev; + + if (devs_in_group == par_dev) /* The raid 0 case */ + return dev / mirrors_p1; + /* raid4/5/6 case */ + return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / + mirrors_p1; +} + +/* ios_raid.c stuff needed by ios.c */ +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); +void _ore_free_raid_stuff(struct ore_io_state *ios); + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last); +int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, unsigned cur_len); +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page); +static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + if (!sp2d) /* Inline the fast path */ + return; /* Hay no raid stuff */ + _ore_add_stripe_page(sp2d, si, page); +} + +/* ios.c stuff needed by ios_raid.c */ +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios); +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len); +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); +int ore_io_execute(struct ore_io_state *ios); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 2748940..d22cd168 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -35,6 +35,7 @@ #include <linux/parser.h> #include <linux/vfs.h> #include <linux/random.h> +#include <linux/module.h> #include <linux/exportfs.h> #include <linux/slab.h> @@ -165,7 +166,6 @@ static struct inode *exofs_alloc_inode(struct super_block *sb) static void exofs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); } @@ -266,7 +266,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) struct ore_io_state *ios; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; @@ -321,7 +321,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) struct ore_io_state *ios; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; @@ -355,12 +355,12 @@ static const struct export_operations exofs_export_ops; /* * Write the superblock to the OSD */ -int exofs_sync_fs(struct super_block *sb, int wait) +static int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; struct ore_comp one_comp; - struct ore_components comps; + struct ore_components oc; struct ore_io_state *ios; int ret = -ENOMEM; @@ -378,9 +378,9 @@ int exofs_sync_fs(struct super_block *sb, int wait) * the writeable info is set in exofs_sbi_write_stats() above. */ - exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); + exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); - ret = ore_get_io_state(&sbi->layout, &comps, &ios); + ret = ore_get_io_state(&sbi->layout, &oc, &ios); if (unlikely(ret)) goto out; @@ -429,19 +429,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path, msg, dev_path ?: "", odi->osdname, _LLU(pid)); } -void exofs_free_sbi(struct exofs_sb_info *sbi) +static void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->comps.numdevs) { - int i = --sbi->comps.numdevs; - struct osd_dev *od = sbi->comps.ods[i]; + unsigned numdevs = sbi->oc.numdevs; + + while (numdevs) { + unsigned i = --numdevs; + struct osd_dev *od = ore_comp_dev(&sbi->oc, i); if (od) { - sbi->comps.ods[i] = NULL; + ore_comp_set_dev(&sbi->oc, i, NULL); osduld_put_device(od); } } - if (sbi->comps.ods != sbi->_min_one_dev) - kfree(sbi->comps.ods); + kfree(sbi->oc.ods); kfree(sbi); } @@ -468,7 +469,7 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], + _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); bdi_destroy(&sbi->bdi); @@ -479,76 +480,20 @@ static void exofs_put_super(struct super_block *sb) static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_device_table *dt) { - u64 stripe_length; + int ret; - sbi->data_map.odm_num_comps = - le32_to_cpu(dt->dt_data_map.cb_num_comps); - sbi->data_map.odm_stripe_unit = + sbi->layout.stripe_unit = le64_to_cpu(dt->dt_data_map.cb_stripe_unit); - sbi->data_map.odm_group_width = + sbi->layout.group_width = le32_to_cpu(dt->dt_data_map.cb_group_width); - sbi->data_map.odm_group_depth = + sbi->layout.group_depth = le32_to_cpu(dt->dt_data_map.cb_group_depth); - sbi->data_map.odm_mirror_cnt = - le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); - sbi->data_map.odm_raid_algorithm = + sbi->layout.mirrors_p1 = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; + sbi->layout.raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Only raid0 for now. if not so, do not mount */ - if (sbi->data_map.odm_num_comps != numdevs) { - EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", - sbi->data_map.odm_num_comps, numdevs); - return -EINVAL; - } - if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { - EXOFS_ERR("Only RAID_0 for now\n"); - return -EINVAL; - } - if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { - EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", - numdevs, sbi->data_map.odm_mirror_cnt); - return -EINVAL; - } - - if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { - EXOFS_ERR("Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); - return -EINVAL; - } - - sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; - sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; - - if (sbi->data_map.odm_group_width) { - sbi->layout.group_width = sbi->data_map.odm_group_width; - sbi->layout.group_depth = sbi->data_map.odm_group_depth; - if (!sbi->layout.group_depth) { - EXOFS_ERR("group_depth == 0 && group_width != 0\n"); - return -EINVAL; - } - sbi->layout.group_count = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1 / - sbi->data_map.odm_group_width; - } else { - if (sbi->data_map.odm_group_depth) { - printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %d\n", - sbi->data_map.odm_group_depth); - sbi->data_map.odm_group_depth = 0; - } - sbi->layout.group_width = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1; - sbi->layout.group_depth = -1; - sbi->layout.group_count = 1; - } - - stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; - if (stripe_length >= (1ULL << 32)) { - EXOFS_ERR("Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -EINVAL; - } + ret = ore_verify_layout(numdevs, &sbi->layout); EXOFS_DBGMSG("exofs: layout: " "num_comps=%u stripe_unit=0x%x group_width=%u " @@ -558,8 +503,8 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->layout.group_width, _LLU(sbi->layout.group_depth), sbi->layout.mirrors_p1, - sbi->data_map.odm_raid_algorithm); - return 0; + sbi->layout.raid_algorithm); + return ret; } static unsigned __ra_pages(struct ore_layout *layout) @@ -605,12 +550,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } +int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_dev **peds) +{ + struct __alloc_ore_devs_and_exofs_devs { + /* Twice bigger table: See exofs_init_comps() and comment at + * exofs_read_lookup_dev_table() + */ + struct ore_dev *oreds[numdevs * 2 - 1]; + struct exofs_dev eds[numdevs]; + } *aoded; + struct exofs_dev *eds; + unsigned i; + + aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); + if (unlikely(!aoded)) { + EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", + numdevs); + return -ENOMEM; + } + + sbi->oc.ods = aoded->oreds; + *peds = eds = aoded->eds; + for (i = 0; i < numdevs; ++i) + aoded->oreds[i] = &eds[i].ored; + return 0; +} + static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, struct osd_dev *fscb_od, unsigned table_count) { struct ore_comp comp; struct exofs_device_table *dt; + struct exofs_dev *eds; unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + sizeof(*dt); unsigned numdevs, i; @@ -623,7 +596,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, return -ENOMEM; } - sbi->comps.numdevs = 0; + sbi->oc.numdevs = 0; comp.obj.partition = sbi->one_comp.obj.partition; comp.obj.id = EXOFS_DEVTABLE_ID; @@ -647,20 +620,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, if (unlikely(ret)) goto out; - if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->comps.ods[0]); - - /* Twice bigger table: See exofs_init_comps() and below - * comment - */ - sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); - if (unlikely(!sbi->comps.ods)) { - EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", - numdevs); - ret = -ENOMEM; - goto out; - } - } + ret = __alloc_dev_table(sbi, numdevs, &eds); + if (unlikely(ret)) + goto out; + /* exofs round-robins the device table view according to inode + * number. We hold a: twice bigger table hence inodes can point + * to any device and have a sequential view of the table + * starting at this device. See exofs_init_comps() + */ + memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], + (numdevs - 1) * sizeof(sbi->oc.ods[0])); for (i = 0; i < numdevs; i++) { struct exofs_fscb fscb; @@ -676,13 +645,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", i, odi.osdname); + /* the exofs id is currently the table index */ + eds[i].did = i; + /* On all devices the device table is identical. The user can * specify any one of the participating devices on the command * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->comps.ods[i] = fscb_od; - ++sbi->comps.numdevs; + eds[i].ored.od = fscb_od; + ++sbi->oc.numdevs; fscb_od = NULL; continue; } @@ -695,8 +667,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, goto out; } - sbi->comps.ods[i] = od; - ++sbi->comps.numdevs; + eds[i].ored.od = od; + ++sbi->oc.numdevs; /* Read the fscb of the other devices to make sure the FS * partition is there. @@ -718,21 +690,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, out: kfree(dt); - if (likely(!ret)) { - unsigned numdevs = sbi->comps.numdevs; - - if (unlikely(fscb_od)) { + if (unlikely(fscb_od && !ret)) { EXOFS_ERR("ERROR: Bad device-table container device not present\n"); osduld_put_device(fscb_od); return -EINVAL; - } - /* exofs round-robins the device table view according to inode - * number. We hold a: twice bigger table hence inodes can point - * to any device and have a sequential view of the table - * starting at this device. See exofs_init_comps() - */ - for (i = 0; i < numdevs - 1; ++i) - sbi->comps.ods[i + numdevs] = sbi->comps.ods[i]; } return ret; } @@ -783,10 +744,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->one_comp.obj.partition = opts->pid; sbi->one_comp.obj.id = 0; exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); - sbi->comps.numdevs = 1; - sbi->comps.single_comp = EC_SINGLE_COMP; - sbi->comps.comps = &sbi->one_comp; - sbi->comps.ods = sbi->_min_one_dev; + sbi->oc.numdevs = 1; + sbi->oc.single_comp = EC_SINGLE_COMP; + sbi->oc.comps = &sbi->one_comp; /* fill in some other data by hand */ memset(sb->s_id, 0, sizeof(sb->s_id)); @@ -835,7 +795,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (unlikely(ret)) goto free_sbi; } else { - sbi->comps.ods[0] = od; + struct exofs_dev *eds; + + ret = __alloc_dev_table(sbi, 1, &eds); + if (unlikely(ret)) + goto free_sbi; + + ore_comp_set_dev(&sbi->oc, 0, od); } __sbi_read_stats(sbi); @@ -872,10 +838,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); if (ret) { EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); + dput(sb->s_root); + sb->s_root = NULL; goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], + _exofs_print_device("Mounting", opts->dev_name, + ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); return 0; @@ -924,7 +893,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) uint64_t used = ULLONG_MAX; int ret; - ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (ret) { EXOFS_DBGMSG("ore_get_io_state failed.\n"); return ret; @@ -981,7 +950,7 @@ static const struct super_operations exofs_sops = { * EXPORT OPERATIONS *****************************************************************************/ -struct dentry *exofs_get_parent(struct dentry *child) +static struct dentry *exofs_get_parent(struct dentry *child) { unsigned long ino = exofs_parent_ino(child); diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 8f44cef..a8cbe1b 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -421,7 +421,7 @@ static inline int rsv_is_empty(struct ext2_reserve_window *rsv) void ext2_init_block_alloc_info(struct inode *inode) { struct ext2_inode_info *ei = EXT2_I(inode); - struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext2_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 47cda41..d37df35 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -279,7 +279,7 @@ static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = { static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; else diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index af9fc89..75ad433 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -110,7 +110,7 @@ extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); /* ialloc.c */ -extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *); +extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct qstr *); extern void ext2_free_inode (struct inode *); extern unsigned long ext2_count_free_inodes (struct super_block *); extern void ext2_check_inodes_bitmap (struct super_block *); @@ -135,10 +135,10 @@ extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); struct dentry *ext2_get_parent(struct dentry *child); /* super.c */ -extern void ext2_error (struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void ext2_msg(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext2_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext2_msg(struct super_block *, const char *, const char *, ...); extern void ext2_update_dynamic_rev (struct super_block *sb); extern void ext2_write_super (struct super_block *); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index ee9ed31..8b15cf8 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -429,7 +429,7 @@ found: return group; } -struct inode *ext2_new_inode(struct inode *dir, int mode, +struct inode *ext2_new_inode(struct inode *dir, umode_t mode, const struct qstr *qstr) { struct super_block *sb; @@ -573,8 +573,11 @@ got: inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + ext2_error(sb, "ext2_new_inode", + "inode number already in use - inode=%lu", + (unsigned long) ino); + err = -EIO; + goto fail; } dquot_initialize(inode); @@ -601,7 +604,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); return ERR_PTR(err); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index a8a58f6..740cad8 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -26,7 +26,6 @@ #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/quotaops.h> -#include <linux/module.h> #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/mpage.h> @@ -36,10 +35,6 @@ #include "acl.h" #include "xip.h" -MODULE_AUTHOR("Remy Card and others"); -MODULE_DESCRIPTION("Second Extended Filesystem"); -MODULE_LICENSE("GPL"); - static int __ext2_write_inode(struct inode *inode, int do_sync); /* @@ -1321,7 +1316,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index f81e250..1089f76 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -35,7 +35,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case EXT2_IOC_SETFLAGS: { unsigned int oldflags; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -83,7 +83,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setflags_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } case EXT2_IOC_GETVERSION: @@ -91,7 +91,7 @@ setflags_out: case EXT2_IOC_SETVERSION: if (!inode_owner_or_capable(inode)) return -EPERM; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; if (get_user(inode->i_generation, (int __user *) arg)) { @@ -100,7 +100,7 @@ setflags_out: inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); } - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; case EXT2_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) @@ -121,7 +121,7 @@ setflags_out: if (get_user(rsv_window_size, (int __user *)arg)) return -EFAULT; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -145,7 +145,7 @@ setflags_out: rsv->rsv_goal_size = rsv_window_size; } mutex_unlock(&ei->truncate_mutex); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return 0; } default: diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 761fde8..0804198 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -94,7 +94,7 @@ struct dentry *ext2_get_parent(struct dentry *child) * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) +static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -119,7 +119,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st return ext2_add_nondir(dentry, inode); } -static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; int err; @@ -214,7 +214,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, return err; } -static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) { struct inode * inode; int err = -EMLINK; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 1dd62ed..0090595 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -173,7 +173,6 @@ static struct inode *ext2_alloc_inode(struct super_block *sb) static void ext2_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); } @@ -211,9 +210,9 @@ static void destroy_inodecache(void) kmem_cache_destroy(ext2_inode_cachep); } -static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int ext2_show_options(struct seq_file *seq, struct dentry *root) { - struct super_block *sb = vfs->mnt_sb; + struct super_block *sb = root->d_sb; struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = sbi->s_es; unsigned long def_mount_opts; @@ -327,10 +326,10 @@ static struct inode *ext2_nfs_get_inode(struct super_block *sb, if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) return ERR_PTR(-ESTALE); - /* iget isn't really right if the inode is currently unallocated!! - * ext2_read_inode currently does appropriate checks, but - * it might be "neater" to call ext2_get_inode first and check - * if the inode is valid..... + /* + * ext2_iget isn't quite right if the inode is currently unallocated! + * However ext2_iget currently does appropriate checks to handle stale + * inodes so everything is OK. */ inode = ext2_iget(sb, ino); if (IS_ERR(inode)) @@ -1521,5 +1520,8 @@ static void __exit exit_ext2_fs(void) exit_ext2_xattr(); } +MODULE_AUTHOR("Remy Card and others"); +MODULE_DESCRIPTION("Second Extended Filesystem"); +MODULE_LICENSE("GPL"); module_init(init_ext2_fs) module_exit(exit_ext2_fs) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index d27b71f..6dcafc7 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -54,7 +54,6 @@ */ #include <linux/buffer_head.h> -#include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/mbcache.h> diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 5d979b4..be7a8d0 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -3,7 +3,6 @@ * Handler for storing security labels as extended attributes. */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/fs.h> @@ -46,28 +45,30 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, value, size, flags); } -int -ext2_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext2_initxattrs, NULL); +} + const struct xattr_handler ext2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext2_xattr_security_list, diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 667e46a..2989467 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -5,7 +5,6 @@ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> #include <linux/string.h> #include <linux/capability.h> #include <linux/fs.h> diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 099d20f..f470e44 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -6,7 +6,6 @@ */ #include <linux/init.h> -#include <linux/module.h> #include <linux/string.h> #include "ext2.h" #include "xattr.h" diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 6386d76..a203892 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -427,7 +427,7 @@ static inline int rsv_is_empty(struct ext3_reserve_window *rsv) void ext3_init_block_alloc_info(struct inode *inode) { struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); @@ -1440,14 +1440,14 @@ out: * * Check if filesystem has at least 1 free block available for allocation. */ -static int ext3_has_free_blocks(struct ext3_sb_info *sbi) +static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation) { ext3_fsblk_t free_blocks, root_blocks; free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current_fsuid() && + !use_reservation && sbi->s_resuid != current_fsuid() && (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { return 0; } @@ -1468,7 +1468,7 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi) */ int ext3_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) + if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); @@ -1546,7 +1546,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) my_rsv = &block_i->rsv_window_node; - if (!ext3_has_free_blocks(sbi)) { + if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) { *errp = -ENOSPC; goto out; } @@ -1924,9 +1924,10 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) * reaches any used block. Then issue a TRIM command on this extent and free * the extent in the block bitmap. This is done until whole group is scanned. */ -ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, - ext3_grpblk_t start, ext3_grpblk_t max, - ext3_grpblk_t minblocks) +static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, + unsigned int group, + ext3_grpblk_t start, ext3_grpblk_t max, + ext3_grpblk_t minblocks) { handle_t *handle; ext3_grpblk_t next, free_blocks, bit, freed, count = 0; diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index d494c55..1860ed3 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -61,13 +61,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - /* - * Taking the mutex here just to keep consistent with how fsync was - * called previously, however it looks like we don't need to take - * i_mutex at all. - */ - mutex_lock(&inode->i_mutex); - J_ASSERT(ext3_journal_current_handle() == NULL); /* @@ -85,7 +78,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * safe in-journal, which is all fsync() needs to ensure. */ if (ext3_should_journal_data(inode)) { - mutex_unlock(&inode->i_mutex); ret = ext3_force_commit(inode->i_sb); goto out; } @@ -108,8 +100,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - - mutex_unlock(&inode->i_mutex); out: trace_ext3_sync_file_exit(inode, ret); return ret; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bf09cbf..1cde284 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -178,42 +178,6 @@ error_return: } /* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block - * group to find a free inode. - */ -static int find_group_dir(struct super_block *sb, struct inode *parent) -{ - int ngroups = EXT3_SB(sb)->s_groups_count; - unsigned int freei, avefreei; - struct ext3_group_desc *desc, *best_desc = NULL; - int group, best_group = -1; - - freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) - continue; - if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { - best_group = group; - best_desc = desc; - } - } - return best_group; -} - -/* * Orlov's allocator for directories. * * We always try to spread first-level directories. @@ -407,7 +371,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent) * group to find a free inode. */ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, - const struct qstr *qstr, int mode) + const struct qstr *qstr, umode_t mode) { struct super_block *sb; struct buffer_head *bitmap_bh = NULL; @@ -436,12 +400,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, sbi = EXT3_SB(sb); es = sbi->s_es; - if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) - group = find_group_dir(sb, dir); - else - group = find_group_orlov(sb, dir); - } else + if (S_ISDIR(mode)) + group = find_group_orlov(sb, dir); + else group = find_group_other(sb, dir); err = -ENOSPC; @@ -564,8 +525,12 @@ got: if (IS_DIRSYNC(inode)) handle->h_sync = 1; if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -621,7 +586,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); brelse(bitmap_bh); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 12661e1..2d0afec 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -22,7 +22,6 @@ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 */ -#include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/ext3_jbd.h> @@ -223,8 +222,12 @@ void ext3_evict_inode (struct inode *inode) * * Note that directories do not have this problem because they don't * use page cache. + * + * The s_journal check handles the case when ext3_get_journal() fails + * and puts the journal inode. */ if (inode->i_nlink && ext3_should_journal_data(inode) && + EXT3_SB(inode->i_sb)->s_journal && (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { tid_t commit_tid = atomic_read(&ei->i_datasync_tid); journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; @@ -1132,9 +1135,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, bh = ext3_getblk(handle, inode, block, create, err); if (!bh) return bh; - if (buffer_uptodate(bh)) + if (bh_uptodate_or_lock(bh)) return bh; - ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1617,7 +1622,13 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); /* * We give up here if we're reentered, because it might be for a @@ -1692,7 +1703,13 @@ static int ext3_writeback_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto out_fail; @@ -1735,7 +1752,13 @@ static int ext3_journalled_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto no_write; @@ -2064,12 +2087,10 @@ static int ext3_block_truncate_page(struct inode *inode, loff_t from) if (PageUptodate(page)) set_buffer_uptodate(bh); - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); + if (!bh_uptodate_or_lock(bh)) { + err = bh_submit_read(bh); /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) + if (err) goto unlock; } @@ -2490,7 +2511,7 @@ int ext3_can_truncate(struct inode *inode) * transaction, and VFS/VM ensures that ext3_truncate() cannot run * simultaneously on behalf of the same inode. * - * As we work through the truncate and commmit bits of it to the journal there + * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * @@ -2899,7 +2920,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index c7f4394..4af574c 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -44,7 +44,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -110,7 +110,7 @@ flags_err: err = ext3_change_inode_journal_flag(inode, jflag); flags_out: mutex_unlock(&inode->i_mutex); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT3_IOC_GETVERSION: @@ -126,7 +126,7 @@ flags_out: if (!inode_owner_or_capable(inode)) return -EPERM; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; if (get_user(generation, (int __user *) arg)) { @@ -134,10 +134,11 @@ flags_out: goto setversion_out; } + mutex_lock(&inode->i_mutex); handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto setversion_out; + goto unlock_out; } err = ext3_reserve_inode_write(handle, inode, &iloc); if (err == 0) { @@ -146,34 +147,13 @@ flags_out: err = ext3_mark_iloc_dirty(handle, inode, &iloc); } ext3_journal_stop(handle); + +unlock_out: + mutex_unlock(&inode->i_mutex); setversion_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } -#ifdef CONFIG_JBD_DEBUG - case EXT3_IOC_WAIT_FOR_READONLY: - /* - * This is racy - by the time we're woken up and running, - * the superblock could be released. And the module could - * have been unloaded. So sue me. - * - * Returns 1 if it slept, else zero. - */ - { - struct super_block *sb = inode->i_sb; - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) { - schedule(); - ret = 1; - } - remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); - return ret; - } -#endif case EXT3_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode) @@ -188,7 +168,7 @@ setversion_out: if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -219,7 +199,7 @@ setversion_out: } mutex_unlock(&ei->truncate_mutex); setrsvsz_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT3_IOC_GROUP_EXTEND: { @@ -230,7 +210,7 @@ setrsvsz_out: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -245,7 +225,7 @@ setrsvsz_out: if (err == 0) err = err2; group_extend_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT3_IOC_GROUP_ADD: { @@ -256,7 +236,7 @@ group_extend_out: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -273,7 +253,7 @@ group_extend_out: if (err == 0) err = err2; group_add_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case FITRIM: { diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 0629e09..e8e2117 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -921,9 +921,12 @@ restart: num++; bh = ext3_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; - if (bh) - ll_rw_block(READ | REQ_META | REQ_PRIO, - 1, &bh); + if (bh && !bh_uptodate_or_lock(bh)) { + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, + bh); + } } } if ((bh = bh_use[ra_ptr++]) == NULL) @@ -1698,7 +1701,7 @@ static int ext3_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, +static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd) { handle_t *handle; @@ -1732,7 +1735,7 @@ retry: } static int ext3_mknod (struct inode * dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; @@ -1768,7 +1771,7 @@ retry: return err; } -static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) { handle_t *handle; struct inode * inode; @@ -1821,7 +1824,7 @@ retry: de->name_len = 2; strcpy (de->name, ".."); ext3_set_de_type(dir->i_sb, de, S_IFDIR); - inode->i_nlink = 2; + set_nlink(inode, 2); BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, dir_block); if (err) @@ -1833,7 +1836,7 @@ retry: if (err) { out_clear_inode: - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); @@ -2170,7 +2173,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) ext3_warning (inode->i_sb, "ext3_unlink", "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = ext3_delete_entry(handle, dir, de, bh); if (retval) @@ -2272,7 +2275,7 @@ retry: err = PTR_ERR(handle); goto err_drop_inode; } - inc_nlink(inode); + set_nlink(inode, 1); err = ext3_orphan_del(handle, inode); if (err) { ext3_journal_stop(handle); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7beb69a..726c7ef 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -511,7 +511,6 @@ static int ext3_drop_inode(struct inode *inode) static void ext3_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); } @@ -611,9 +610,9 @@ static char *data_mode_string(unsigned long mode) * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ -static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int ext3_show_options(struct seq_file *seq, struct dentry *root) { - struct super_block *sb = vfs->mnt_sb; + struct super_block *sb = root->d_sb; struct ext3_sb_info *sbi = EXT3_SB(sb); struct ext3_super_block *es = sbi->s_es; unsigned long def_mount_opts; @@ -652,8 +651,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouid32"); if (test_opt(sb, DEBUG)) seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT3_FS_XATTR if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -1049,10 +1046,12 @@ static int parse_options (char *options, struct super_block *sb, set_opt (sbi->s_mount_opt, DEBUG); break; case Opt_oldalloc: - set_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); break; case Opt_orlov: - clear_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated orlov option"); break; #ifdef CONFIG_EXT3_FS_XATTR case Opt_user_xattr: @@ -2060,9 +2059,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; ext3_orphan_cleanup(sb, es); EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; - if (needs_recovery) + if (needs_recovery) { + ext3_mark_recovery_complete(sb, es); ext3_msg(sb, KERN_INFO, "recovery complete"); - ext3_mark_recovery_complete(sb, es); + } ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": @@ -2230,11 +2230,11 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ, 1, &journal->j_sb_buffer); - wait_on_buffer(journal->j_sb_buffer); - if (!buffer_uptodate(journal->j_sb_buffer)) { - ext3_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; + if (!bh_uptodate_or_lock(journal->j_sb_buffer)) { + if (bh_submit_read(journal->j_sb_buffer)) { + ext3_msg(sb, KERN_ERR, "I/O error on journal device"); + goto out_journal; + } } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext3_msg(sb, KERN_ERR, @@ -2669,13 +2669,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, - * require a full umount/remount for now. + * require a full umount & mount for now. */ if (es->s_last_orphan) { ext3_msg(sb, KERN_WARNING, "warning: couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " - "umount/remount instead."); + "umount & mount instead."); err = -EINVAL; goto restore_opts; } @@ -2910,7 +2910,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, return -EINVAL; /* Quotafile not on the same filesystem? */ - if (path->mnt->mnt_sb != sb) + if (path->dentry->d_sb != sb) return -EXDEV; /* Journaling quota? */ if (EXT3_SB(sb)->s_qf_names[type]) { diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index b8d9f83..ea26f2a 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -3,7 +3,6 @@ * Handler for storing security labels as extended attributes. */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/fs.h> @@ -48,28 +47,32 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int -ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext3_xattr_set_handle(handle, inode, + EXT3_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext3_initxattrs, handle); +} + const struct xattr_handler ext3_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext3_xattr_security_list, diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index dc8edda..2526a88 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -5,7 +5,6 @@ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> #include <linux/string.h> #include <linux/capability.h> #include <linux/fs.h> diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 7a32197..b32e473 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -5,7 +5,6 @@ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/ext3_jbd.h> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f8224ad..12ccacd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -28,7 +28,8 @@ */ /* - * Calculate the block group number and offset, given a block number + * Calculate the block group number and offset into the block/cluster + * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) @@ -37,7 +38,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); - offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)); + offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> + EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) @@ -55,130 +57,169 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, return 0; } -static int ext4_group_used_meta_blocks(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +/* Return the number of clusters used for file system metadata; this + * represents the overhead needed by the file system. + */ +unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { - ext4_fsblk_t tmp; + unsigned num_clusters; + int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; + ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); + ext4_fsblk_t itbl_blk; struct ext4_sb_info *sbi = EXT4_SB(sb); - /* block bitmap, inode bitmap, and inode table blocks */ - int used_blocks = sbi->s_itb_per_group + 2; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { - if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), - block_group)) - used_blocks--; - - if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), - block_group)) - used_blocks--; - - tmp = ext4_inode_table(sb, gdp); - for (; tmp < ext4_inode_table(sb, gdp) + - sbi->s_itb_per_group; tmp++) { - if (!ext4_block_in_group(sb, tmp, block_group)) - used_blocks -= 1; + /* This is the number of clusters used by the superblock, + * block group descriptors, and reserved block group + * descriptor blocks */ + num_clusters = ext4_num_base_meta_clusters(sb, block_group); + + /* + * For the allocation bitmaps and inode table, we first need + * to check to see if the block is in the block group. If it + * is, then check to see if the cluster is already accounted + * for in the clusters used for the base metadata cluster, or + * if we can increment the base metadata cluster to include + * that block. Otherwise, we will have to track the cluster + * used for the allocation bitmap or inode table explicitly. + * Normally all of these blocks are contiguous, so the special + * case handling shouldn't be necessary except for *very* + * unusual file system layouts. + */ + if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { + block_cluster = EXT4_B2C(sbi, (start - + ext4_block_bitmap(sb, gdp))); + if (block_cluster < num_clusters) + block_cluster = -1; + else if (block_cluster == num_clusters) { + num_clusters++; + block_cluster = -1; } } - return used_blocks; -} -/* Initializes an uninitialized block bitmap if given, and returns the - * number of blocks free in the group. */ -unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, struct ext4_group_desc *gdp) -{ - int bit, bit_max; - ext4_group_t ngroups = ext4_get_groups_count(sb); - unsigned free_blocks, group_blocks; - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (bh) { - J_ASSERT_BH(bh, buffer_locked(bh)); - - /* If checksum is bad mark all blocks used to prevent allocation - * essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", - block_group); - ext4_free_blks_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - return 0; + if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { + inode_cluster = EXT4_B2C(sbi, + start - ext4_inode_bitmap(sb, gdp)); + if (inode_cluster < num_clusters) + inode_cluster = -1; + else if (inode_cluster == num_clusters) { + num_clusters++; + inode_cluster = -1; } - memset(bh->b_data, 0, sb->s_blocksize); } - /* Check for superblock and gdt backups in this group */ - bit_max = ext4_bg_has_super(sb, block_group); - - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || - block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * - sbi->s_desc_per_block) { - if (bit_max) { - bit_max += ext4_bg_num_gdb(sb, block_group); - bit_max += - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + itbl_blk = ext4_inode_table(sb, gdp); + for (i = 0; i < sbi->s_itb_per_group; i++) { + if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { + c = EXT4_B2C(sbi, start - itbl_blk + i); + if ((c < num_clusters) || (c == inode_cluster) || + (c == block_cluster) || (c == itbl_cluster)) + continue; + if (c == num_clusters) { + num_clusters++; + continue; + } + num_clusters++; + itbl_cluster = c; } - } else { /* For META_BG_BLOCK_GROUPS */ - bit_max += ext4_bg_num_gdb(sb, block_group); } - if (block_group == ngroups - 1) { + if (block_cluster != -1) + num_clusters++; + if (inode_cluster != -1) + num_clusters++; + + return num_clusters; +} + +static unsigned int num_clusters_in_group(struct super_block *sb, + ext4_group_t block_group) +{ + unsigned int blocks; + + if (block_group == ext4_get_groups_count(sb) - 1) { /* - * Even though mke2fs always initialize first and last group - * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need - * to make sure we calculate the right free blocks + * Even though mke2fs always initializes the first and + * last group, just in case some other tool was used, + * we need to make sure we calculate the right free + * blocks. */ - group_blocks = ext4_blocks_count(sbi->s_es) - - ext4_group_first_block_no(sb, ngroups - 1); - } else { - group_blocks = EXT4_BLOCKS_PER_GROUP(sb); - } + blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, block_group); + } else + blocks = EXT4_BLOCKS_PER_GROUP(sb); + return EXT4_NUM_B2C(EXT4_SB(sb), blocks); +} - free_blocks = group_blocks - bit_max; +/* Initializes an uninitialized block bitmap */ +void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + unsigned int bit, bit_max; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start, tmp; + int flex_bg = 0; + + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks used to prevent allocation + * essentially implementing a per-group read-only flag. */ + if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", block_group); + ext4_free_group_clusters_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); + memset(bh->b_data, 0xff, sb->s_blocksize); + return; + } + memset(bh->b_data, 0, sb->s_blocksize); - if (bh) { - ext4_fsblk_t start, tmp; - int flex_bg = 0; + bit_max = ext4_num_base_meta_clusters(sb, block_group); + for (bit = 0; bit < bit_max; bit++) + ext4_set_bit(bit, bh->b_data); - for (bit = 0; bit < bit_max; bit++) - ext4_set_bit(bit, bh->b_data); + start = ext4_group_first_block_no(sb, block_group); - start = ext4_group_first_block_no(sb, block_group); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flex_bg = 1; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_FLEX_BG)) - flex_bg = 1; + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); - /* Set bits for block and inode bitmaps, and inode table */ - tmp = ext4_block_bitmap(sb, gdp); - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); + tmp = ext4_inode_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); - tmp = ext4_inode_bitmap(sb, gdp); + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); - - tmp = ext4_inode_table(sb, gdp); - for (; tmp < ext4_inode_table(sb, gdp) + - sbi->s_itb_per_group; tmp++) { - if (!flex_bg || - ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(tmp - start, bh->b_data); - } - /* - * Also if the number of blocks within the group is - * less than the blocksize * 8 ( which is the size - * of bitmap ), set rest of the block bitmap to 1 - */ - ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8, - bh->b_data); + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } - return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); + + /* + * Also if the number of blocks within the group is less than + * the blocksize * 8 ( which is the size of bitmap ), set rest + * of the block bitmap to 1 + */ + ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), + sb->s_blocksize * 8, bh->b_data); } +/* Return the number of free blocks in a block group. It is used when + * the block bitmap is uninitialized, so we can't just count the bits + * in the bitmap. */ +unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + return num_clusters_in_group(sb, block_group) - + ext4_num_overhead_clusters(sb, block_group, gdp); +} /* * The free blocks are managed by bitmaps. A file system contains several @@ -362,53 +403,54 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) } /** - * ext4_has_free_blocks() + * ext4_has_free_clusters() * @sbi: in-core super block structure. - * @nblocks: number of needed blocks + * @nclusters: number of needed blocks + * @flags: flags from ext4_mb_new_blocks() * - * Check if filesystem has nblocks free & available for allocation. + * Check if filesystem has nclusters free & available for allocation. * On success return 1, return 0 on failure. */ -static int ext4_has_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags) +static int ext4_has_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) { - s64 free_blocks, dirty_blocks, root_blocks; - struct percpu_counter *fbc = &sbi->s_freeblocks_counter; - struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; - - free_blocks = percpu_counter_read_positive(fbc); - dirty_blocks = percpu_counter_read_positive(dbc); - root_blocks = ext4_r_blocks_count(sbi->s_es); - - if (free_blocks - (nblocks + root_blocks + dirty_blocks) < - EXT4_FREEBLOCKS_WATERMARK) { - free_blocks = percpu_counter_sum_positive(fbc); - dirty_blocks = percpu_counter_sum_positive(dbc); + s64 free_clusters, dirty_clusters, root_clusters; + struct percpu_counter *fcc = &sbi->s_freeclusters_counter; + struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; + + free_clusters = percpu_counter_read_positive(fcc); + dirty_clusters = percpu_counter_read_positive(dcc); + root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es)); + + if (free_clusters - (nclusters + root_clusters + dirty_clusters) < + EXT4_FREECLUSTERS_WATERMARK) { + free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); + dirty_clusters = percpu_counter_sum_positive(dcc); } - /* Check whether we have space after - * accounting for current dirty blocks & root reserved blocks. + /* Check whether we have space after accounting for current + * dirty clusters & root reserved clusters. */ - if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks)) + if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters)) return 1; - /* Hm, nope. Are (enough) root reserved blocks available? */ + /* Hm, nope. Are (enough) root reserved clusters available? */ if (sbi->s_resuid == current_fsuid() || ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || (flags & EXT4_MB_USE_ROOT_BLOCKS)) { - if (free_blocks >= (nblocks + dirty_blocks)) + if (free_clusters >= (nclusters + dirty_clusters)) return 1; } return 0; } -int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags) +int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) { - if (ext4_has_free_blocks(sbi, nblocks, flags)) { - percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); + if (ext4_has_free_clusters(sbi, nclusters, flags)) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else return -ENOSPC; @@ -428,7 +470,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi, */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) || + if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || (*retries)++ > 3 || !EXT4_SB(sb)->s_journal) return 0; @@ -444,7 +486,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) - * @count: pointer to total number of blocks needed + * @count: pointer to total number of clusters needed * @errp: error code * * Return 1st allocated block number on success, *count stores total account @@ -476,18 +518,19 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - dquot_alloc_block_nofail(inode, ar.len); + dquot_alloc_block_nofail(inode, + EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } return ret; } /** - * ext4_count_free_blocks() -- count filesystem free blocks + * ext4_count_free_clusters() -- count filesystem free clusters * @sb: superblock * - * Adds up the number of free blocks from each block group. + * Adds up the number of free clusters from each block group. */ -ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) +ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) { ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; @@ -508,7 +551,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += ext4_free_blks_count(sb, gdp); + desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (bitmap_bh == NULL) @@ -516,12 +559,13 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) x = ext4_count_free(bitmap_bh, sb->s_blocksize); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", - i, ext4_free_blks_count(sb, gdp), x); + i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); - printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu" - ", computed = %llu, %llu\n", ext4_free_blocks_count(es), + printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" + ", computed = %llu, %llu\n", + EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else @@ -530,7 +574,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += ext4_free_blks_count(sb, gdp); + desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; @@ -620,6 +664,31 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } +/* + * This function returns the number of file system metadata clusters at + * the beginning of a block group, including the reserved gdt blocks. + */ +unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned num; + + /* Check for superblock and gdt backups in this group */ + num = ext4_bg_has_super(sb, block_group); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * + sbi->s_desc_per_block) { + if (num) { + num += ext4_bg_num_gdb(sb, block_group); + num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + } + } else { /* For META_BG_BLOCK_GROUPS */ + num += ext4_bg_num_gdb(sb, block_group); + } + return EXT4_NUM_B2C(sbi, num); +} /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 8efb2f0..3f11656 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -13,7 +13,6 @@ #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> -#include <linux/module.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/blkdev.h> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7d7bd0..1554b15 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -144,9 +144,17 @@ struct ext4_allocation_request { #define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) #define EXT4_MAP_BOUNDARY (1 << BH_Boundary) #define EXT4_MAP_UNINIT (1 << BH_Uninit) +/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of + * ext4_map_blocks wants to know whether or not the underlying cluster has + * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that + * the requested mapping was from previously mapped (or delayed allocated) + * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster + * should never appear on buffer_head's state flags. + */ +#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_UNINIT) + EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -239,8 +247,11 @@ struct ext4_io_submit { # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) #endif #define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) #else # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) #endif @@ -258,6 +269,14 @@ struct ext4_io_submit { #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) + /* * Structure of a blocks group descriptor */ @@ -289,7 +308,7 @@ struct ext4_group_desc struct flex_groups { atomic_t free_inodes; - atomic_t free_blocks; + atomic_t free_clusters; atomic_t used_dirs; }; @@ -306,6 +325,7 @@ struct flex_groups { #define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) #ifdef __KERNEL__ # define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) # define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) # define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) @@ -358,8 +378,7 @@ struct flex_groups { /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ - EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ - EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) @@ -520,6 +539,8 @@ struct ext4_new_group_data { #define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Request will not result in inode size update (user for fallocate) */ +#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* * Flags used by ext4_free_blocks @@ -528,6 +549,13 @@ struct ext4_new_group_data { #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 + +/* + * Flags used by ext4_discard_partial_page_buffers + */ +#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 /* * ioctl commands @@ -538,9 +566,6 @@ struct ext4_new_group_data { #define EXT4_IOC_SETVERSION _IOW('f', 4, long) #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION #define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION -#ifdef CONFIG_JBD2_DEBUG -#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) -#endif #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) @@ -563,9 +588,6 @@ struct ext4_new_group_data { #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) #define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) -#ifdef CONFIG_JBD2_DEBUG -#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) -#endif #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif @@ -837,6 +859,7 @@ struct ext4_inode_info { ext4_group_t i_last_alloc_group; /* allocation reservation info for delalloc */ + /* In case of bigalloc, these refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; unsigned int i_allocated_meta_blocks; @@ -886,7 +909,6 @@ struct ext4_inode_info { /* * Mount flags */ -#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ @@ -918,6 +940,9 @@ struct ext4_inode_info { #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ @@ -968,9 +993,9 @@ struct ext4_super_block { /*10*/ __le32 s_free_inodes_count; /* Free inodes count */ __le32 s_first_data_block; /* First Data Block */ __le32 s_log_block_size; /* Block size */ - __le32 s_obso_log_frag_size; /* Obsoleted fragment size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ /*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ - __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ __le32 s_inodes_per_group; /* # Inodes per group */ __le32 s_mtime; /* Mount time */ /*30*/ __le32 s_wtime; /* Write time */ @@ -1066,7 +1091,10 @@ struct ext4_super_block { __u8 s_last_error_func[32]; /* function where the error happened */ #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) __u8 s_mount_opts[64]; - __le32 s_reserved[112]; /* Padding to the end of the block */ + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_reserved[109]; /* Padding to the end of the block */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) @@ -1086,6 +1114,7 @@ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ unsigned long s_inodes_per_block;/* Number of inodes per block */ unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ unsigned long s_inodes_per_group;/* Number of inodes in a group */ unsigned long s_itb_per_group; /* Number of inode table blocks per group */ unsigned long s_gdb_count; /* Number of group descriptor blocks */ @@ -1094,6 +1123,8 @@ struct ext4_sb_info { ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ unsigned long s_overhead_last; /* Last calculated overhead */ unsigned long s_blocks_last; /* Last seen block count */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ @@ -1117,10 +1148,10 @@ struct ext4_sb_info { u32 s_hash_seed[4]; int s_def_hash_version; int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ - struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeclusters_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; - struct percpu_counter s_dirtyblocks_counter; + struct percpu_counter s_dirtyclusters_counter; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; struct kobject s_kobj; @@ -1136,10 +1167,6 @@ struct ext4_sb_info { u32 s_max_batch_time; u32 s_min_batch_time; struct block_device *journal_bdev; -#ifdef CONFIG_JBD2_DEBUG - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ -#endif #ifdef CONFIG_QUOTA char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ @@ -1248,6 +1275,15 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } +} + /* * Inode dynamic state flags */ @@ -1360,6 +1396,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1402,7 +1439,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ - EXT4_FEATURE_RO_COMPAT_HUGE_FILE) + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC) /* * Default values for user and/or group using reserved blocks @@ -1735,9 +1773,9 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, unsigned int flags, unsigned long *count, int *errp); -extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks, unsigned int flags); -extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, @@ -1745,12 +1783,18 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); -extern unsigned ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -#define ext4_free_blocks_after_init(sb, group, desc) \ - ext4_init_block_bitmap(sb, NULL, group, desc) +extern void ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +extern unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ @@ -1775,8 +1819,9 @@ extern int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ -extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, - const struct qstr *qstr, __u32 goal); +extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner); extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); @@ -1839,6 +1884,12 @@ extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length); +extern int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags); +extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, @@ -1878,40 +1929,40 @@ extern int ext4_group_extend(struct super_block *sb, extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern void ext4_kvfree(void *ptr); -extern void __ext4_error(struct super_block *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...); #define ext4_error(sb, message...) __ext4_error(sb, __func__, \ __LINE__, ## message) -extern void ext4_error_inode(struct inode *, const char *, unsigned int, - ext4_fsblk_t, const char *, ...) - __attribute__ ((format (printf, 5, 6))); -extern void ext4_error_file(struct file *, const char *, unsigned int, - ext4_fsblk_t, const char *, ...) - __attribute__ ((format (printf, 5, 6))); +extern __printf(5, 6) +void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); -extern void __ext4_abort(struct super_block *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); #define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ __LINE__, ## message) -extern void __ext4_warning(struct super_block *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); #define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ __LINE__, ## message) -extern void ext4_msg(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, const char *, unsigned int, const char *); #define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ __LINE__, msg) -extern void __ext4_grp_locked_error(const char *, unsigned int, \ - struct super_block *, ext4_group_t, \ - unsigned long, ext4_fsblk_t, \ - const char *, ...) - __attribute__ ((format (printf, 7, 8))); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); #define ext4_grp_locked_error(sb, grp, message...) \ __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) extern void ext4_update_dynamic_rev(struct super_block *sb); @@ -1927,8 +1978,8 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg); -extern __u32 ext4_free_blks_count(struct super_block *sb, - struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); extern __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_used_dirs_count(struct super_block *sb, @@ -1941,8 +1992,9 @@ extern void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); -extern void ext4_free_blks_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); extern void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_used_dirs_set(struct super_block *sb, @@ -2051,13 +2103,13 @@ do { \ } while (0) #ifdef CONFIG_SMP -/* Each CPU can accumulate percpu_counter_batch blocks in their local - * counters. So we need to make sure we have free blocks more +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. */ -#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) #else -#define EXT4_FREEBLOCKS_WATERMARK 0 +#define EXT4_FREECLUSTERS_WATERMARK 0 #endif static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) @@ -2243,10 +2295,19 @@ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); enum ext4_state_bits { BH_Uninit /* blocks are allocated but uninitialized on disk */ = BH_JBDPrivateStart, + BH_AllocFromCluster, /* allocated blocks were part of already + * allocated cluster. Note that this flag will + * never, ever appear in a buffer_head's state + * flag. See EXT4_MAP_FROM_CLUSTER to see where + * this is used. */ + BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This + * flag is set when ext4_map_blocks is called on a + * delayed allocated block to get its real mapping. */ }; BUFFER_FNS(Uninit, uninit) TAS_BUFFER_FNS(Uninit, uninit) +BUFFER_FNS(Da_Mapped, da_mapped) /* * Add new method to test wether block and inode bitmaps are properly @@ -2282,4 +2343,6 @@ extern void ext4_resize_end(struct super_block *sb); #endif /* __KERNEL__ */ +#include "ext4_extents.h" + #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 095c36f..a52db3a 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -290,5 +290,7 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index f5240aa..aca1790 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -109,9 +109,11 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); + if (err) { + /* Errors can only happen if there is a bug */ + handle->h_err = err; + __ext4_journal_stop(where, line, handle); + } } else { if (inode) mark_buffer_dirty_inode(bh, inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 57cf568..841faf5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -29,7 +29,6 @@ * - smart tree reduction */ -#include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/jbd2.h> @@ -42,7 +41,6 @@ #include <asm/uaccess.h> #include <linux/fiemap.h> #include "ext4_jbd2.h" -#include "ext4_extents.h" #include <trace/events/ext4.h> @@ -96,13 +94,17 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -static int ext4_ext_dirty(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +static int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) { int err; if (path->p_bh) { /* path points to block */ - err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); + err = __ext4_handle_dirty_metadata(where, line, handle, + inode, path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); @@ -114,11 +116,9 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { - int depth; - if (path) { + int depth = path->p_depth; struct ext4_extent *ex; - depth = path->p_depth; /* * Try to predict block placement assuming that we are @@ -180,12 +180,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check) size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 6) - size = 6; + if (!check && size > 6) + size = 6; #endif - } return size; } @@ -195,12 +193,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check) size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 5) - size = 5; + if (!check && size > 5) + size = 5; #endif - } return size; } @@ -211,12 +207,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check) size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 3) - size = 3; + if (!check && size > 3) + size = 3; #endif - } return size; } @@ -227,12 +221,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); - if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 4) - size = 4; + if (!check && size > 4) + size = 4; #endif - } return size; } @@ -244,7 +236,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { struct ext4_inode_info *ei = EXT4_I(inode); - int idxs, num = 0; + int idxs; idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx)); @@ -259,6 +251,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) */ if (ei->i_da_metadata_calc_len && ei->i_da_metadata_calc_last_lblock+1 == lblock) { + int num = 0; + if ((ei->i_da_metadata_calc_len % idxs) == 0) num++; if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) @@ -321,8 +315,6 @@ static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, int depth) { - struct ext4_extent *ext; - struct ext4_extent_idx *ext_idx; unsigned short entries; if (eh->eh_entries == 0) return 1; @@ -331,7 +323,7 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ - ext = EXT_FIRST_EXTENT(eh); + struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; @@ -339,7 +331,7 @@ static int ext4_valid_extent_entries(struct inode *inode, entries--; } } else { - ext_idx = EXT_FIRST_INDEX(eh); + struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; @@ -751,31 +743,30 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, return -EIO; } - len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ - if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { - len = (len - 1) * sizeof(struct ext4_extent_idx); - len = len < 0 ? 0 : len; - ext_debug("insert new index %d after: %llu. " - "move %d from 0x%p to 0x%p\n", - logical, ptr, len, - (curp->p_idx + 1), (curp->p_idx + 2)); - memmove(curp->p_idx + 2, curp->p_idx + 1, len); - } + ext_debug("insert new index %d after: %llu\n", logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ - len = len * sizeof(struct ext4_extent_idx); - len = len < 0 ? 0 : len; - ext_debug("insert new index %d before: %llu. " - "move %d from 0x%p to 0x%p\n", - logical, ptr, len, - curp->p_idx, (curp->p_idx + 1)); - memmove(curp->p_idx + 1, curp->p_idx, len); + ext_debug("insert new index %d before: %llu\n", logical, ptr); ix = curp->p_idx; } + len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; + BUG_ON(len < 0); + if (len > 0) { + ext_debug("insert new index %d: " + "move %d indices from 0x%p to 0x%p\n", + logical, len, ix, ix + 1); + memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); + } + + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); + return -EIO; + } + ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); @@ -1042,16 +1033,14 @@ cleanup: */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, unsigned int flags, - struct ext4_ext_path *path, struct ext4_extent *newext) { - struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; struct buffer_head *bh; ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, path, + newblock = ext4_ext_new_meta_block(handle, inode, NULL, newext, &err, flags); if (newblock == 0) return err; @@ -1071,7 +1060,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, } /* move top-level index/leaf into new block */ - memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); + memmove(bh->b_data, EXT4_I(inode)->i_data, + sizeof(EXT4_I(inode)->i_data)); /* set size of new block */ neh = ext_block_hdr(bh); @@ -1089,32 +1079,23 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, if (err) goto out; - /* create index in new top-level index: num,max,pointer */ - err = ext4_ext_get_access(handle, inode, curp); - if (err) - goto out; - - curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; - curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); - curp->p_hdr->eh_entries = cpu_to_le16(1); - curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); - - if (path[0].p_hdr->eh_depth) - curp->p_idx->ei_block = - EXT_FIRST_INDEX(path[0].p_hdr)->ei_block; - else - curp->p_idx->ei_block = - EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; - ext4_idx_store_pblock(curp->p_idx, newblock); - + /* Update top-level index: num,max,pointer */ neh = ext_inode_hdr(inode); + neh->eh_entries = cpu_to_le16(1); + ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); + if (neh->eh_depth == 0) { + /* Root extent block becomes index block */ + neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); + EXT_FIRST_INDEX(neh)->ei_block = + EXT_FIRST_EXTENT(neh)->ee_block; + } ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); - neh->eh_depth = cpu_to_le16(path->p_depth + 1); - err = ext4_ext_dirty(handle, inode, curp); + neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1); + ext4_mark_inode_dirty(handle, inode); out: brelse(bh); @@ -1162,8 +1143,7 @@ repeat: err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, flags, - path, newext); + err = ext4_ext_grow_indepth(handle, inode, flags, newext); if (err) goto out; @@ -1235,9 +1215,9 @@ static int ext4_ext_search_left(struct inode *inode, if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", - ix != NULL ? ix->ei_block : 0, + ix != NULL ? le32_to_cpu(ix->ei_block) : 0, EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? - EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, + le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, depth); return -EIO; } @@ -1260,13 +1240,14 @@ static int ext4_ext_search_left(struct inode *inode, /* * search the closest allocated block to the right for *logical * and returns it at @logical + it's physical address at @phys - * if *logical is the smallest allocated block, the function + * if *logical is the largest allocated block, the function * returns 0 at @phys * return value contains 0 (success) or error code */ static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t *logical, ext4_fsblk_t *phys) + ext4_lblk_t *logical, ext4_fsblk_t *phys, + struct ext4_extent **ret_ex) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1308,9 +1289,7 @@ static int ext4_ext_search_right(struct inode *inode, return -EIO; } } - *logical = le32_to_cpu(ex->ee_block); - *phys = ext4_ext_pblock(ex); - return 0; + goto found_extent; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { @@ -1323,9 +1302,7 @@ static int ext4_ext_search_right(struct inode *inode, if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ ex++; - *logical = le32_to_cpu(ex->ee_block); - *phys = ext4_ext_pblock(ex); - return 0; + goto found_extent; } /* go up and search for index to the right */ @@ -1368,9 +1345,12 @@ got_index: return -EIO; } ex = EXT_FIRST_EXTENT(eh); +found_extent: *logical = le32_to_cpu(ex->ee_block); *phys = ext4_ext_pblock(ex); - put_bh(bh); + *ret_ex = ex; + if (bh) + put_bh(bh); return 0; } @@ -1395,7 +1375,8 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) while (depth >= 0) { if (depth == path->p_depth) { /* leaf */ - if (path[depth].p_ext != + if (path[depth].p_ext && + path[depth].p_ext != EXT_LAST_EXTENT(path[depth].p_hdr)) return le32_to_cpu(path[depth].p_ext[1].ee_block); } else { @@ -1623,7 +1604,8 @@ static int ext4_ext_try_to_merge(struct inode *inode, * such that there will be no overlap, and then returns 1. * If there is no overlap found, it returns 0. */ -static unsigned int ext4_ext_check_overlap(struct inode *inode, +static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, + struct inode *inode, struct ext4_extent *newext, struct ext4_ext_path *path) { @@ -1637,6 +1619,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode, if (!path[depth].p_ext) goto out; b2 = le32_to_cpu(path[depth].p_ext->ee_block); + b2 &= ~(sbi->s_cluster_ratio - 1); /* * get the next allocated block if the extent in the path @@ -1646,6 +1629,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode, b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; + b2 &= ~(sbi->s_cluster_ratio - 1); } /* check for wrap through zero on extent logical start block*/ @@ -1697,7 +1681,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* try to insert block into found extent and return */ if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", + ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n", ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), @@ -1735,7 +1719,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { - ext_debug("next leaf block - %d\n", next); + ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); npath = ext4_ext_find_extent(inode, next, NULL); if (IS_ERR(npath)) @@ -1773,46 +1757,51 @@ has_space: if (!nearex) { /* there is no extent in this leaf, create first one */ - ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", + ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext)); - path[depth].p_ext = EXT_FIRST_EXTENT(eh); - } else if (le32_to_cpu(newext->ee_block) + nearex = EXT_FIRST_EXTENT(eh); + } else { + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { -/* BUG_ON(newext->ee_block == nearex->ee_block); */ - if (nearex != EXT_LAST_EXTENT(eh)) { - len = EXT_MAX_EXTENT(eh) - nearex; - len = (len - 1) * sizeof(struct ext4_extent); - len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " - "move %d from 0x%p to 0x%p\n", + /* Insert after */ + ext_debug("insert %u:%llu:[%d]%d before: " + "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), - nearex, len, nearex + 1, nearex + 2); - memmove(nearex + 2, nearex + 1, len); + nearex); + nearex++; + } else { + /* Insert before */ + BUG_ON(newext->ee_block == nearex->ee_block); + ext_debug("insert %u:%llu:[%d]%d after: " + "nearest %p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + nearex); + } + len = EXT_LAST_EXTENT(eh) - nearex + 1; + if (len > 0) { + ext_debug("insert %u:%llu:[%d]%d: " + "move %d extents from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + len, nearex, nearex + 1); + memmove(nearex + 1, nearex, + len * sizeof(struct ext4_extent)); } - path[depth].p_ext = nearex + 1; - } else { - BUG_ON(newext->ee_block == nearex->ee_block); - len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); - len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " - "move %d from 0x%p to 0x%p\n", - le32_to_cpu(newext->ee_block), - ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - nearex, len, nearex, nearex + 1); - memmove(nearex + 1, nearex, len); - path[depth].p_ext = nearex; } le16_add_cpu(&eh->eh_entries, 1); - nearex = path[depth].p_ext; + path[depth].p_ext = nearex; nearex->ee_block = newext->ee_block; ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); nearex->ee_len = newext->ee_len; @@ -1962,6 +1951,7 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_ext_cache *cex; BUG_ON(len == 0); spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + trace_ext4_ext_put_in_cache(inode, block, len, start); cex = &EXT4_I(inode)->i_cached_extent; cex->ec_block = block; cex->ec_len = len; @@ -2063,6 +2053,7 @@ errout: sbi->extent_cache_misses++; else sbi->extent_cache_hits++; + trace_ext4_ext_in_cache(inode, block, ret); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return ret; } @@ -2130,6 +2121,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, if (err) return err; ext_debug("index is empty, remove it, free block %llu\n", leaf); + trace_ext4_ext_rm_idx(inode, leaf); + ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return err; @@ -2158,7 +2151,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, * need to account for leaf block credit * * bitmaps and block group descriptor blocks - * and other metadat blocks still need to be + * and other metadata blocks still need to be * accounted. */ /* 1 bitmap, 1 block group descriptor */ @@ -2195,14 +2188,40 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, - struct ext4_extent *ex, - ext4_lblk_t from, ext4_lblk_t to) + struct ext4_extent *ex, + ext4_fsblk_t *partial_cluster, + ext4_lblk_t from, ext4_lblk_t to) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); + ext4_fsblk_t pblk; int flags = EXT4_FREE_BLOCKS_FORGET; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) flags |= EXT4_FREE_BLOCKS_METADATA; + /* + * For bigalloc file systems, we never free a partial cluster + * at the beginning of the extent. Instead, we make a note + * that we tried freeing the cluster, and check to see if we + * need to free it on a subsequent call to ext4_remove_blocks, + * or at the end of the ext4_truncate() operation. + */ + flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + + trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); + /* + * If we have a partial cluster, and it's different from the + * cluster of the last block, we need to explicitly free the + * partial cluster here. + */ + pblk = ext4_ext_pblock(ex) + ee_len - 1; + if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } + #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2222,12 +2241,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; - ext4_fsblk_t start; num = le32_to_cpu(ex->ee_block) + ee_len - from; - start = ext4_ext_pblock(ex) + ee_len - num; - ext_debug("free last %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, NULL, start, num, flags); + pblk = ext4_ext_pblock(ex) + ee_len - num; + ext_debug("free last %u blocks starting %llu\n", num, pblk); + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + /* + * If the block range to be freed didn't start at the + * beginning of a cluster, and we removed the entire + * extent, save the partial cluster here, since we + * might need to delete if we determine that the + * truncate operation has removed all of the blocks in + * the cluster. + */ + if (pblk & (sbi->s_cluster_ratio - 1) && + (ee_len == num)) + *partial_cluster = EXT4_B2C(sbi, pblk); + else + *partial_cluster = 0; } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { /* head removal */ @@ -2238,7 +2269,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, start = ext4_ext_pblock(ex); ext_debug("free first %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, 0, start, num, flags); + ext4_free_blocks(handle, inode, NULL, start, num, flags); } else { printk(KERN_INFO "strange request: removal(2) " @@ -2262,19 +2293,19 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_lblk_t start, - ext4_lblk_t end) + struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, + ext4_lblk_t start, ext4_lblk_t end) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; int depth = ext_depth(inode), credits; struct ext4_extent_header *eh; - ext4_lblk_t a, b, block; + ext4_lblk_t a, b; unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; unsigned uninitialized = 0; struct ext4_extent *ex; - struct ext4_map_blocks map; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug("truncate since %u in leaf\n", start); @@ -2291,6 +2322,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); + trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); + while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -2315,86 +2348,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; - } else if (a != ex_ee_block && - b != ex_ee_block + ex_ee_len - 1) { - /* - * If this is a truncate, then this condition should - * never happen because at least one of the end points - * needs to be on the edge of the extent. - */ - if (end == EXT_MAX_BLOCKS - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - block = 0; - num = 0; - err = -EIO; - goto out; - } - /* - * else this is a hole punch, so the extent needs to - * be split since neither edge of the hole is on the - * extent edge - */ - else{ - map.m_pblk = ext4_ext_pblock(ex); - map.m_lblk = ex_ee_block; - map.m_len = b - ex_ee_block; - - err = ext4_split_extent(handle, - inode, path, &map, 0, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT | - EXT4_GET_BLOCKS_PRE_IO); - - if (err < 0) - goto out; - - ex_ee_len = ext4_ext_get_actual_len(ex); - - b = ex_ee_block+ex_ee_len - 1 < end ? - ex_ee_block+ex_ee_len - 1 : end; - - /* Then remove tail of this extent */ - block = ex_ee_block; - num = a - block; - } + } else if (b != ex_ee_block + ex_ee_len - 1) { + EXT4_ERROR_INODE(inode," bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ - block = ex_ee_block; - num = a - block; - } else if (b != ex_ee_block + ex_ee_len - 1) { - /* remove head of the extent */ - block = b; - num = ex_ee_block + ex_ee_len - b; - - /* - * If this is a truncate, this condition - * should never happen - */ - if (end == EXT_MAX_BLOCKS - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } + num = a - ex_ee_block; } else { /* remove whole extent: excellent! */ - block = ex_ee_block; num = 0; - if (a != ex_ee_block) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } - - if (b != ex_ee_block + ex_ee_len - 1) { - ext_debug(" bad truncate %u:%u\n", - start, end); - err = -EIO; - goto out; - } } - /* * 3 for leaf, sb, and inode plus 2 (bmap and group * descriptor) for each block group; assume two block @@ -2416,23 +2381,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - err = ext4_remove_blocks(handle, inode, ex, a, b); + err = ext4_remove_blocks(handle, inode, ex, partial_cluster, + a, b); if (err) goto out; - if (num == 0) { + if (num == 0) /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); - } else if (block != ex_ee_block) { - /* - * If this was a head removal, then we need to update - * the physical block since it is now at a different - * location - */ - ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a)); - } - ex->ee_block = cpu_to_le32(block); ex->ee_len = cpu_to_le16(num); /* * Do not mark uninitialized if all the blocks in the @@ -2440,11 +2397,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, */ if (uninitialized && num) ext4_ext_mark_uninitialized(ex); - - err = ext4_ext_dirty(handle, inode, path + depth); - if (err) - goto out; - /* * If the extent was completely released, * we need to remove it from the leaf @@ -2464,9 +2416,14 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); - } + } else + *partial_cluster = 0; - ext_debug("new extent: %u:%u:%llu\n", block, num, + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); @@ -2476,6 +2433,25 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (correct_index && eh->eh_entries) err = ext4_ext_correct_indexes(handle, inode, path); + /* + * If there is still a entry in the leaf node, check to see if + * it references the partial cluster. This is the only place + * where it could; if it doesn't, we can free the cluster. + */ + if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && + (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != + *partial_cluster)) { + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } + /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) @@ -2511,6 +2487,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); struct ext4_ext_path *path; + ext4_fsblk_t partial_cluster = 0; handle_t *handle; int i, err; @@ -2524,6 +2501,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) again: ext4_ext_invalidate_cache(inode); + trace_ext4_ext_remove_space(inode, start, depth); + /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. @@ -2546,7 +2525,8 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - start, EXT_MAX_BLOCKS - 1); + &partial_cluster, start, + EXT_MAX_BLOCKS - 1); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -2618,6 +2598,24 @@ again: } } + trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, + path->p_hdr->eh_entries); + + /* If we still have something in the partial cluster and we have removed + * even the first extent, then we should free the blocks in the partial + * cluster as well. */ + if (partial_cluster && path->p_hdr->eh_entries == 0) { + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(EXT4_SB(sb), partial_cluster), + EXT4_SB(sb)->s_cluster_ratio, flags); + partial_cluster = 0; + } + /* TODO: flexible tree reduction should be here */ if (path->p_hdr->eh_entries == 0) { /* @@ -2909,17 +2907,29 @@ out: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent + * + * Pre-conditions: + * - The extent pointed to by 'path' is uninitialized. + * - The extent pointed to by 'path' contains a superset + * of the logical span [map->m_lblk, map->m_lblk + map->m_len). + * + * Post-conditions on success: + * - the returned value is the number of blocks beyond map->l_lblk + * that are allocated and initialized. + * It is guaranteed to be >= map->m_len. */ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path) { + struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; struct ext4_extent *ex; ext4_lblk_t ee_block, eof_block; - unsigned int allocated, ee_len, depth; + unsigned int ee_len, depth; + int allocated; int err = 0; int split_flag = 0; @@ -2933,11 +2943,92 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); + eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); + + /* Pre-conditions */ + BUG_ON(!ext4_ext_is_uninitialized(ex)); + BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); + + /* + * Attempt to transfer newly initialized blocks from the currently + * uninitialized extent to its left neighbor. This is much cheaper + * than an insertion followed by a merge as those involve costly + * memmove() calls. This is the common case in steady state for + * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append + * writes. + * + * Limitations of the current logic: + * - L1: we only deal with writes at the start of the extent. + * The approach could be extended to writes at the end + * of the extent but this scenario was deemed less common. + * - L2: we do not deal with writes covering the whole extent. + * This would require removing the extent if the transfer + * is possible. + * - L3: we only attempt to merge with an extent stored in the + * same extent tree node. + */ + if ((map->m_lblk == ee_block) && /*L1*/ + (map->m_len < ee_len) && /*L2*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ + struct ext4_extent *prev_ex; + ext4_lblk_t prev_lblk; + ext4_fsblk_t prev_pblk, ee_pblk; + unsigned int prev_len, write_len; + + prev_ex = ex - 1; + prev_lblk = le32_to_cpu(prev_ex->ee_block); + prev_len = ext4_ext_get_actual_len(prev_ex); + prev_pblk = ext4_ext_pblock(prev_ex); + ee_pblk = ext4_ext_pblock(ex); + write_len = map->m_len; + + /* + * A transfer of blocks from 'ex' to 'prev_ex' is allowed + * upon those conditions: + * - C1: prev_ex is initialized, + * - C2: prev_ex is logically abutting ex, + * - C3: prev_ex is physically abutting ex, + * - C4: prev_ex can receive the additional blocks without + * overflowing the (initialized) length limit. + */ + if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ + ((prev_lblk + prev_len) == ee_block) && /*C2*/ + ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ + (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, prev_ex); + + /* Shift the start of ex by 'write_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + write_len); + ext4_ext_store_pblock(ex, ee_pblk + write_len); + ex->ee_len = cpu_to_le16(ee_len - write_len); + ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + + /* Extend prev_ex by 'write_len' blocks */ + prev_ex->ee_len = cpu_to_le16(prev_len + write_len); + + /* Mark the block containing both extents as dirty */ + ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = prev_ex; + + /* Result: number of initialized blocks past m_lblk */ + allocated = write_len; + goto out; + } + } + WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit @@ -3165,6 +3256,192 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, return ext4_mark_inode_dirty(handle, inode); } +/** + * ext4_find_delalloc_range: find delayed allocated block in the given range. + * + * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns + * whether there are any buffers marked for delayed allocation. It returns '1' + * on the first delalloc'ed buffer head found. If no buffer head in the given + * range is marked for delalloc, it returns 0. + * lblk_start should always be <= lblk_end. + * search_hint_reverse is to indicate that searching in reverse from lblk_end to + * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed + * block sooner). This is useful when blocks are truncated sequentially from + * lblk_start towards lblk_end. + */ +static int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end, + int search_hint_reverse) +{ + struct address_space *mapping = inode->i_mapping; + struct buffer_head *head, *bh = NULL; + struct page *page; + ext4_lblk_t i, pg_lblk; + pgoff_t index; + + /* reverse search wont work if fs block size is less than page size */ + if (inode->i_blkbits < PAGE_CACHE_SHIFT) + search_hint_reverse = 0; + + if (search_hint_reverse) + i = lblk_end; + else + i = lblk_start; + + index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + + while ((i >= lblk_start) && (i <= lblk_end)) { + page = find_get_page(mapping, index); + if (!page) + goto nextpage; + + if (!page_has_buffers(page)) + goto nextpage; + + head = page_buffers(page); + if (!head) + goto nextpage; + + bh = head; + pg_lblk = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + do { + if (unlikely(pg_lblk < lblk_start)) { + /* + * This is possible when fs block size is less + * than page size and our cluster starts/ends in + * middle of the page. So we need to skip the + * initial few blocks till we reach the 'lblk' + */ + pg_lblk++; + continue; + } + + /* Check if the buffer is delayed allocated and that it + * is not yet mapped. (when da-buffers are mapped during + * their writeout, their da_mapped bit is set.) + */ + if (buffer_delay(bh) && !buffer_da_mapped(bh)) { + page_cache_release(page); + trace_ext4_find_delalloc_range(inode, + lblk_start, lblk_end, + search_hint_reverse, + 1, i); + return 1; + } + if (search_hint_reverse) + i--; + else + i++; + } while ((i >= lblk_start) && (i <= lblk_end) && + ((bh = bh->b_this_page) != head)); +nextpage: + if (page) + page_cache_release(page); + /* + * Move to next page. 'i' will be the first lblk in the next + * page. + */ + if (search_hint_reverse) + index--; + else + index++; + i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + } + + trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse, 0, 0); + return 0; +} + +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse); +} + +/** + * Determines how many complete clusters (out of those specified by the 'map') + * are under delalloc and were reserved quota for. + * This function is called when we are writing out the blocks that were + * originally written with their allocation delayed, but then the space was + * allocated using fallocate() before the delayed allocation could be resolved. + * The cases to look for are: + * ('=' indicated delayed allocated blocks + * '-' indicates non-delayed allocated blocks) + * (a) partial clusters towards beginning and/or end outside of allocated range + * are not delalloc'ed. + * Ex: + * |----c---=|====c====|====c====|===-c----| + * |++++++ allocated ++++++| + * ==> 4 complete clusters in above example + * + * (b) partial cluster (outside of allocated range) towards either end is + * marked for delayed allocation. In this case, we will exclude that + * cluster. + * Ex: + * |----====c========|========c========| + * |++++++ allocated ++++++| + * ==> 1 complete clusters in above example + * + * Ex: + * |================c================| + * |++++++ allocated ++++++| + * ==> 0 complete clusters in above example + * + * The ext4_da_update_reserve_space will be called only if we + * determine here that there were some "entire" clusters that span + * this 'allocated' range. + * In the non-bigalloc case, this function will just end up returning num_blks + * without ever calling ext4_find_delalloc_range. + */ +static unsigned int +get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, + unsigned int num_blks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t alloc_cluster_start, alloc_cluster_end; + ext4_lblk_t lblk_from, lblk_to, c_offset; + unsigned int allocated_clusters = 0; + + alloc_cluster_start = EXT4_B2C(sbi, lblk_start); + alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); + + /* max possible clusters for this allocation */ + allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; + + trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); + + /* Check towards left side */ + c_offset = lblk_start & (sbi->s_cluster_ratio - 1); + if (c_offset) { + lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); + lblk_to = lblk_from + c_offset - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + allocated_clusters--; + } + + /* Now check towards right. */ + c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); + if (allocated_clusters && c_offset) { + lblk_from = lblk_start + num_blks; + lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + allocated_clusters--; + } + + return allocated_clusters; +} + static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3181,6 +3458,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); + trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, + newblock); + /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { ret = ext4_split_unwritten_extents(handle, inode, map, @@ -3190,10 +3470,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * that this IO needs to conversion to written when IO is * completed */ - if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } else + if (io) + ext4_set_io_unwritten_flag(inode, io); + else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; @@ -3234,14 +3513,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, /* buffered write, writepage time, convert*/ ret = ext4_ext_convert_to_initialized(handle, inode, map, path); - if (ret >= 0) { + if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, - map->m_len); - if (err < 0) - goto out2; - } - out: if (ret <= 0) { err = ret; @@ -3270,11 +3543,24 @@ out: * But fallocate would have already updated quota and block * count for this offset. So cancel these reservation */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_da_update_reserve_space(inode, allocated, 0); + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, map->m_len); + if (reserved_clusters) + ext4_da_update_reserve_space(inode, + reserved_clusters, + 0); + } map_out: map->m_flags |= EXT4_MAP_MAPPED; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); + if (err < 0) + goto out2; + } out1: if (allocated > map->m_len) allocated = map->m_len; @@ -3290,6 +3576,111 @@ out2: } /* + * get_implied_cluster_alloc - check to see if the requested + * allocation (in the map structure) overlaps with a cluster already + * allocated in an extent. + * @sb The filesystem superblock structure + * @map The requested lblk->pblk mapping + * @ex The extent structure which might contain an implied + * cluster allocation + * + * This function is called by ext4_ext_map_blocks() after we failed to + * find blocks that were already in the inode's extent tree. Hence, + * we know that the beginning of the requested region cannot overlap + * the extent from the inode's extent tree. There are three cases we + * want to catch. The first is this case: + * + * |--- cluster # N--| + * |--- extent ---| |---- requested region ---| + * |==========| + * + * The second case that we need to test for is this one: + * + * |--------- cluster # N ----------------| + * |--- requested region --| |------- extent ----| + * |=======================| + * + * The third case is when the requested region lies between two extents + * within the same cluster: + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + * + * In each of the above cases, we need to set the map->m_pblk and + * map->m_len so it corresponds to the return the extent labelled as + * "|====|" from cluster #N, since it is already in use for data in + * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to + * signal to ext4_ext_map_blocks() that map->m_pblk should be treated + * as a new "allocated" block region. Otherwise, we will return 0 and + * ext4_ext_map_blocks() will then allocate one or more new clusters + * by calling ext4_mb_new_blocks(). + */ +static int get_implied_cluster_alloc(struct super_block *sb, + struct ext4_map_blocks *map, + struct ext4_extent *ex, + struct ext4_ext_path *path) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + ext4_lblk_t ex_cluster_start, ex_cluster_end; + ext4_lblk_t rr_cluster_start, rr_cluster_end; + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len = ext4_ext_get_actual_len(ex); + + /* The extent passed in that we are trying to match */ + ex_cluster_start = EXT4_B2C(sbi, ee_block); + ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); + + /* The requested region passed into ext4_map_blocks() */ + rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); + rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1); + + if ((rr_cluster_start == ex_cluster_end) || + (rr_cluster_start == ex_cluster_start)) { + if (rr_cluster_start == ex_cluster_end) + ee_start += ee_len - 1; + map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + + c_offset; + map->m_len = min(map->m_len, + (unsigned) sbi->s_cluster_ratio - c_offset); + /* + * Check for and handle this case: + * + * |--------- cluster # N-------------| + * |------- extent ----| + * |--- requested region ---| + * |===========| + */ + + if (map->m_lblk < ee_block) + map->m_len = min(map->m_len, ee_block - map->m_lblk); + + /* + * Check for the case where there is already another allocated + * block to the right of 'ex' but before the end of the cluster. + * + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + */ + if (map->m_lblk > ee_block) { + ext4_lblk_t next = ext4_ext_next_allocated_block(path); + map->m_len = min(map->m_len, next - map->m_lblk); + } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); + return 1; + } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); + return 0; +} + + +/* * Block allocation/map/preallocation routine for extents based files * * @@ -3311,15 +3702,17 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; - struct ext4_extent newex, *ex; + struct ext4_extent newex, *ex, *ex2; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int err = 0, depth, ret; - unsigned int allocated = 0; + int free_on_err = 0, err = 0, depth, ret; + unsigned int allocated = 0, offset = 0; + unsigned int allocated_clusters = 0; unsigned int punched_out = 0; unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - struct ext4_map_blocks punch_map; + ext4_lblk_t cluster_offset; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); @@ -3329,6 +3722,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { + if ((sbi->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and @@ -3339,6 +3736,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* we should allocate requested block */ } else { /* block is already allocated */ + if (sbi->s_cluster_ratio > 1) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; newblock = map->m_lblk - le32_to_cpu(newex.ee_block) + ext4_ext_pblock(&newex); @@ -3384,8 +3783,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); + + trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); + /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { + struct ext4_map_blocks punch_map; + ext4_fsblk_t partial_cluster = 0; + newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); @@ -3469,7 +3874,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_ext_invalidate_cache(inode); err = ext4_ext_rm_leaf(handle, inode, path, - map->m_lblk, map->m_lblk + punched_out); + &partial_cluster, map->m_lblk, + map->m_lblk + punched_out); if (!err && path->p_hdr->eh_entries == 0) { /* @@ -3492,6 +3898,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } } + if ((sbi->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + /* * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero @@ -3504,9 +3914,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } + /* * Okay, we need to do block allocation. */ + map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; + newex.ee_block = cpu_to_le32(map->m_lblk); + cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + + /* + * If we are doing bigalloc, check to see if the extent returned + * by ext4_ext_find_extent() implies a cluster we can use. + */ + if (cluster_offset && ex && + get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + goto got_allocated_blocks; + } /* find neighbour allocated blocks */ ar.lleft = map->m_lblk; @@ -3514,10 +3940,21 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (err) goto out2; ar.lright = map->m_lblk; - err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); + ex2 = NULL; + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); if (err) goto out2; + /* Check if the extent after searching to the right implies a + * cluster we can use. */ + if ((sbi->s_cluster_ratio > 1) && ex2 && + get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + goto got_allocated_blocks; + } + /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is @@ -3532,9 +3969,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_len = EXT_UNINIT_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ - newex.ee_block = cpu_to_le32(map->m_lblk); newex.ee_len = cpu_to_le16(map->m_len); - err = ext4_ext_check_overlap(inode, &newex, path); + err = ext4_ext_check_overlap(sbi, inode, &newex, path); if (err) allocated = ext4_ext_get_actual_len(&newex); else @@ -3544,7 +3980,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.inode = inode; ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); ar.logical = map->m_lblk; - ar.len = allocated; + /* + * We calculate the offset from the beginning of the cluster + * for the logical block number, since when we allocate a + * physical cluster, the physical block should start at the + * same offset from the beginning of the cluster. This is + * needed so that future calls to get_implied_cluster_alloc() + * work correctly. + */ + offset = map->m_lblk & (sbi->s_cluster_ratio - 1); + ar.len = EXT4_NUM_B2C(sbi, offset+allocated); + ar.goal -= offset; + ar.logical -= offset; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; else @@ -3557,9 +4004,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out2; ext_debug("allocate new block: goal %llu, found %llu/%u\n", ar.goal, newblock, allocated); + free_on_err = 1; + allocated_clusters = ar.len; + ar.len = EXT4_C2B(sbi, ar.len) - offset; + if (ar.len > allocated) + ar.len = allocated; +got_allocated_blocks: /* try to insert new extent into found leaf and return */ - ext4_ext_store_pblock(&newex, newblock); + ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); /* Mark uninitialized */ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ @@ -3572,10 +4025,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * that we need to perform conversion when IO is done. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { - io->flag = EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } else + if (io) + ext4_set_io_unwritten_flag(inode, io); + else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } @@ -3583,11 +4035,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_flags |= EXT4_MAP_UNINIT; } - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); + err = 0; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, ar.len); if (!err) err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); - if (err) { + if (err && free_on_err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; /* free data blocks we just allocated */ @@ -3610,8 +4065,82 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * Update reserved blocks/metadata blocks after successful * block allocation which had been deferred till now. */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_da_update_reserve_space(inode, allocated, 1); + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; + /* + * Check how many clusters we had reserved this allocated range + */ + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, allocated); + if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { + if (reserved_clusters) { + /* + * We have clusters reserved for this range. + * But since we are not doing actual allocation + * and are simply using blocks from previously + * allocated cluster, we should release the + * reservation and not claim quota. + */ + ext4_da_update_reserve_space(inode, + reserved_clusters, 0); + } + } else { + BUG_ON(allocated_clusters < reserved_clusters); + /* We will claim quota for all newly allocated blocks.*/ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); + if (reserved_clusters < allocated_clusters) { + struct ext4_inode_info *ei = EXT4_I(inode); + int reservation = allocated_clusters - + reserved_clusters; + /* + * It seems we claimed few clusters outside of + * the range of this allocation. We should give + * it back to the reservation pool. This can + * happen in the following case: + * + * * Suppose s_cluster_ratio is 4 (i.e., each + * cluster has 4 blocks. Thus, the clusters + * are [0-3],[4-7],[8-11]... + * * First comes delayed allocation write for + * logical blocks 10 & 11. Since there were no + * previous delayed allocated blocks in the + * range [8-11], we would reserve 1 cluster + * for this write. + * * Next comes write for logical blocks 3 to 8. + * In this case, we will reserve 2 clusters + * (for [0-3] and [4-7]; and not for [8-11] as + * that range has a delayed allocated blocks. + * Thus total reserved clusters now becomes 3. + * * Now, during the delayed allocation writeout + * time, we will first write blocks [3-8] and + * allocate 3 clusters for writing these + * blocks. Also, we would claim all these + * three clusters above. + * * Now when we come here to writeout the + * blocks [10-11], we would expect to claim + * the reservation of 1 cluster we had made + * (and we would claim it since there are no + * more delayed allocated blocks in the range + * [8-11]. But our reserved cluster count had + * already gone to 0. + * + * Thus, at the step 4 above when we determine + * that there are still some unwritten delayed + * allocated blocks outside of our current + * block range, we should increment the + * reserved clusters count so that when the + * remaining blocks finally gets written, we + * could claim them. + */ + dquot_reserve_block(inode, + EXT4_C2B(sbi, reservation)); + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks += reservation; + spin_unlock(&ei->i_block_reservation_lock); + } + } + } /* * Cache the extent and update transaction to commit on fdatasync only @@ -3634,12 +4163,12 @@ out2: ext4_ext_drop_refs(path); kfree(path); } - trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, - newblock, map->m_len, err ? err : allocated); - result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? punched_out : allocated; + trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, + newblock, map->m_len, err ? err : result); + return err ? err : result; } @@ -3649,6 +4178,7 @@ void ext4_ext_truncate(struct inode *inode) struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; handle_t *handle; + loff_t page_len; int err = 0; /* @@ -3665,8 +4195,16 @@ void ext4_ext_truncate(struct inode *inode) if (IS_ERR(handle)) return; - if (inode->i_size & (sb->s_blocksize - 1)) - ext4_block_truncate_page(handle, mapping, inode->i_size); + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out_stop; + } if (ext4_orphan_add(handle, inode)) goto out_stop; @@ -3760,6 +4298,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) int ret = 0; int ret2 = 0; int retries = 0; + int flags; struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; @@ -3796,6 +4335,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. + */ + if (len <= EXT_UNINIT_MAX_LEN << blkbits) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; @@ -3805,9 +4354,7 @@ retry: ret = PTR_ERR(handle); break; } - ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | - EXT4_GET_BLOCKS_NO_NORMALIZE); + ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); @@ -4102,7 +4649,6 @@ found_delayed_extent: return EXT_BREAK; return EXT_CONTINUE; } - /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -4162,17 +4708,28 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) struct address_space *mapping = inode->i_mapping; struct ext4_map_blocks map; handle_t *handle; - loff_t first_block_offset, last_block_offset, block_len; - loff_t first_page, last_page, first_page_offset, last_page_offset; + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; int ret, credits, blocks_released, err = 0; + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + return 0; + + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb); - last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb); - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; last_page = (offset + length) >> PAGE_CACHE_SHIFT; @@ -4185,11 +4742,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) */ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { err = filemap_write_and_wait_range(mapping, - first_page_offset == 0 ? 0 : first_page_offset-1, - last_page_offset); + offset, offset + length - 1); - if (err) - return err; + if (err) + return err; } /* Now release the pages */ @@ -4211,24 +4767,64 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) goto out; /* - * Now we need to zero out the un block aligned data. - * If the file is smaller than a block, just - * zero out the middle + * Now we need to zero out the non-page-aligned data in the + * pages at the start and tail of the hole, and unmap the buffer + * heads for the block aligned regions of the page that were + * completely zeroed. */ - if (first_block > last_block) - ext4_block_zero_page_range(handle, mapping, offset, length); - else { - /* zero out the head of the hole before the first block */ - block_len = first_block_offset - offset; - if (block_len > 0) - ext4_block_zero_page_range(handle, mapping, - offset, block_len); - - /* zero out the tail of the hole after the last block */ - block_len = offset + length - last_block_offset; - if (block_len > 0) { - ext4_block_zero_page_range(handle, mapping, - last_block_offset, block_len); + if (first_page > last_page) { + /* + * If the file space being truncated is contained within a page + * just zero out and unmap the middle of that page + */ + err = ext4_discard_partial_page_buffers(handle, + mapping, offset, length, 0); + + if (err) + goto out; + } else { + /* + * zero out and unmap the partial page that contains + * the start of the hole + */ + page_len = first_page_offset - offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + offset, page_len, 0); + if (err) + goto out; + } + + /* + * zero out and unmap the partial page that contains + * the end of the hole + */ + page_len = offset + length - last_page_offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + last_page_offset, page_len, 0); + if (err) + goto out; + } + } + + + /* + * If i_size is contained in the last page, we need to + * unmap and zero the partial page after i_size + */ + if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && + inode->i_size % PAGE_CACHE_SIZE != 0) { + + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out; } } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index e4095e9..cb70f18 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -181,8 +181,8 @@ static int ext4_file_open(struct inode * inode, struct file * filp) path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); if (!IS_ERR(cp)) { - memcpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); + strlcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); ext4_mark_super_dirty(sb); } } @@ -224,53 +224,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; else maxbytes = inode->i_sb->s_maxbytes; - mutex_lock(&inode->i_mutex); - switch (origin) { - case SEEK_END: - offset += inode->i_size; - break; - case SEEK_CUR: - if (offset == 0) { - mutex_unlock(&inode->i_mutex); - return file->f_pos; - } - offset += file->f_pos; - break; - case SEEK_DATA: - /* - * In the generic case the entire file is data, so as long as - * offset isn't at the end of the file then the offset is data. - */ - if (offset >= inode->i_size) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - break; - case SEEK_HOLE: - /* - * There is a virtual hole at the end of the file, so as long as - * offset isn't i_size or larger, return i_size. - */ - if (offset >= inode->i_size) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - offset = inode->i_size; - break; - } - - if (offset < 0 || offset > maxbytes) { - mutex_unlock(&inode->i_mutex); - return -EINVAL; - } - - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - mutex_unlock(&inode->i_mutex); - return offset; + return generic_file_llseek_size(file, offset, origin, maxbytes); } const struct file_operations ext4_file_operations = { diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 036f78f..00a2cb7 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode) * to written. * The function return the number of pending IOs on success. */ -extern int ext4_flush_completed_IO(struct inode *inode) +int ext4_flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; struct ext4_inode_info *ei = EXT4_I(inode); @@ -83,14 +83,12 @@ extern int ext4_flush_completed_IO(struct inode *inode) int ret = 0; int ret2 = 0; - if (list_empty(&ei->i_completed_io_list)) - return ret; - dump_completed_IO(inode); spin_lock_irqsave(&ei->i_completed_io_lock, flags); while (!list_empty(&ei->i_completed_io_list)){ io = list_entry(ei->i_completed_io_list.next, ext4_io_end_t, list); + list_del_init(&io->list); /* * Calling ext4_end_io_nolock() to convert completed * IO to written. @@ -107,11 +105,9 @@ extern int ext4_flush_completed_IO(struct inode *inode) */ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); ret = ext4_end_io_nolock(io); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (ret < 0) ret2 = ret; - else - list_del_init(&io->list); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); } spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); return (ret2 < 0) ? ret2 : 0; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9c63f27..4637af0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -78,7 +78,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_blks_set(sb, gdp, 0); + ext4_free_group_clusters_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); @@ -293,121 +293,9 @@ error_return: ext4_std_error(sb, fatal); } -/* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block - * group to find a free inode. - */ -static int find_group_dir(struct super_block *sb, struct inode *parent, - ext4_group_t *best_group) -{ - ext4_group_t ngroups = ext4_get_groups_count(sb); - unsigned int freei, avefreei; - struct ext4_group_desc *desc, *best_desc = NULL; - ext4_group_t group; - int ret = -1; - - freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext4_get_group_desc(sb, group, NULL); - if (!desc || !ext4_free_inodes_count(sb, desc)) - continue; - if (ext4_free_inodes_count(sb, desc) < avefreei) - continue; - if (!best_desc || - (ext4_free_blks_count(sb, desc) > - ext4_free_blks_count(sb, best_desc))) { - *best_group = group; - best_desc = desc; - ret = 0; - } - } - return ret; -} - -#define free_block_ratio 10 - -static int find_group_flex(struct super_block *sb, struct inode *parent, - ext4_group_t *best_group) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_desc *desc; - struct flex_groups *flex_group = sbi->s_flex_groups; - ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); - ext4_group_t ngroups = ext4_get_groups_count(sb); - int flex_size = ext4_flex_bg_size(sbi); - ext4_group_t best_flex = parent_fbg_group; - int blocks_per_flex = sbi->s_blocks_per_group * flex_size; - int flexbg_free_blocks; - int flex_freeb_ratio; - ext4_group_t n_fbg_groups; - ext4_group_t i; - - n_fbg_groups = (ngroups + flex_size - 1) >> - sbi->s_log_groups_per_flex; - -find_close_to_parent: - flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks); - flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; - if (atomic_read(&flex_group[best_flex].free_inodes) && - flex_freeb_ratio > free_block_ratio) - goto found_flexbg; - - if (best_flex && best_flex == parent_fbg_group) { - best_flex--; - goto find_close_to_parent; - } - - for (i = 0; i < n_fbg_groups; i++) { - if (i == parent_fbg_group || i == parent_fbg_group - 1) - continue; - - flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks); - flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; - - if (flex_freeb_ratio > free_block_ratio && - (atomic_read(&flex_group[i].free_inodes))) { - best_flex = i; - goto found_flexbg; - } - - if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || - ((atomic_read(&flex_group[i].free_blocks) > - atomic_read(&flex_group[best_flex].free_blocks)) && - atomic_read(&flex_group[i].free_inodes))) - best_flex = i; - } - - if (!atomic_read(&flex_group[best_flex].free_inodes) || - !atomic_read(&flex_group[best_flex].free_blocks)) - return -1; - -found_flexbg: - for (i = best_flex * flex_size; i < ngroups && - i < (best_flex + 1) * flex_size; i++) { - desc = ext4_get_group_desc(sb, i, NULL); - if (ext4_free_inodes_count(sb, desc)) { - *best_group = i; - goto out; - } - } - - return -1; -out: - return 0; -} - struct orlov_stats { __u32 free_inodes; - __u32 free_blocks; + __u32 free_clusters; __u32 used_dirs; }; @@ -424,7 +312,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_blocks = atomic_read(&flex_group[g].free_blocks); + stats->free_clusters = atomic_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } @@ -432,11 +320,11 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, desc = ext4_get_group_desc(sb, g, NULL); if (desc) { stats->free_inodes = ext4_free_inodes_count(sb, desc); - stats->free_blocks = ext4_free_blks_count(sb, desc); + stats->free_clusters = ext4_free_group_clusters(sb, desc); stats->used_dirs = ext4_used_dirs_count(sb, desc); } else { stats->free_inodes = 0; - stats->free_blocks = 0; + stats->free_clusters = 0; stats->used_dirs = 0; } } @@ -463,7 +351,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, */ static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode, + ext4_group_t *group, umode_t mode, const struct qstr *qstr) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; @@ -471,10 +359,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; - ext4_fsblk_t freeb, avefreeb; + ext4_fsblk_t freeb, avefreec; unsigned int ndirs; int max_dirs, min_inodes; - ext4_grpblk_t min_blocks; + ext4_grpblk_t min_clusters; ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; @@ -490,9 +378,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; - freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - avefreeb = freeb; - do_div(avefreeb, ngroups); + freeb = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); + avefreec = freeb; + do_div(avefreec, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if (S_ISDIR(mode) && @@ -518,7 +407,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, continue; if (stats.free_inodes < avefreei) continue; - if (stats.free_blocks < avefreeb) + if (stats.free_clusters < avefreec) continue; grp = g; ret = 0; @@ -556,7 +445,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, min_inodes = avefreei - inodes_per_group*flex_size / 4; if (min_inodes < 1) min_inodes = 1; - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; + min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; /* * Start looking in the flex group where we last allocated an @@ -575,7 +464,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, continue; if (stats.free_inodes < min_inodes) continue; - if (stats.free_blocks < min_blocks) + if (stats.free_clusters < min_clusters) continue; goto found_flex_bg; } @@ -608,7 +497,7 @@ fallback_retry: } static int find_group_other(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode) + ext4_group_t *group, umode_t mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); @@ -659,7 +548,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_blks_count(sb, desc)) + ext4_free_group_clusters(sb, desc)) return 0; /* @@ -683,7 +572,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_blks_count(sb, desc)) + ext4_free_group_clusters(sb, desc)) return 0; } @@ -713,7 +602,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, */ static int ext4_claim_inode(struct super_block *sb, struct buffer_head *inode_bitmap_bh, - unsigned long ino, ext4_group_t group, int mode) + unsigned long ino, ext4_group_t group, umode_t mode) { int free = 0, retval = 0, count; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -801,8 +690,8 @@ err_ret: * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, - const struct qstr *qstr, __u32 goal) +struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, + const struct qstr *qstr, __u32 goal, uid_t *owner) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -816,8 +705,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, int ret2, err = 0; struct inode *ret; ext4_group_t i; - int free = 0; - static int once = 1; ext4_group_t flex_group; /* Cannot create files in a deleted directory */ @@ -843,26 +730,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, goto got_group; } - if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { - ret2 = find_group_flex(sb, dir, &group); - if (ret2 == -1) { - ret2 = find_group_other(sb, dir, &group, mode); - if (ret2 == 0 && once) { - once = 0; - printk(KERN_NOTICE "ext4: find_group_flex " - "failed, fallback succeeded dir %lu\n", - dir->i_ino); - } - } - goto got_group; - } - - if (S_ISDIR(mode)) { - if (test_opt(sb, OLDALLOC)) - ret2 = find_group_dir(sb, dir, &group); - else - ret2 = find_group_orlov(sb, dir, &group, mode, qstr); - } else + if (S_ISDIR(mode)) + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + else ret2 = find_group_other(sb, dir, &group, mode); got_group: @@ -950,26 +820,21 @@ got: goto fail; } - free = 0; - ext4_lock_group(sb, group); + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); + brelse(block_bitmap_bh); + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - free = ext4_free_blocks_after_init(sb, group, gdp); gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_blks_set(sb, gdp, free); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } ext4_unlock_group(sb, group); - /* Don't need to dirty bitmap block if we didn't change it */ - if (free) { - BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); - err = ext4_handle_dirty_metadata(handle, - NULL, block_bitmap_bh); - } - - brelse(block_bitmap_bh); if (err) goto fail; } @@ -987,8 +852,11 @@ got: flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - - if (test_opt(sb, GRPID)) { + if (owner) { + inode->i_mode = mode; + inode->i_uid = owner[0]; + inode->i_gid = owner[1]; + } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; @@ -1005,11 +873,7 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - /* - * Don't inherit extent flag from directory, amongst others. We set - * extent flag on newly created directory and file only if -o extent - * mount option is specified - */ + /* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_file_acl = 0; @@ -1084,7 +948,7 @@ fail_free_drop: fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); iput(inode); brelse(inode_bitmap_bh); @@ -1235,7 +1099,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) * inode allocation from the current group, so we take alloc_sem lock, to * block ext4_claim_inode until we are finished. */ -extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, +int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 0962642..830e1b2 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,7 +20,6 @@ * (sct@redhat.com), 1993, 1998 */ -#include <linux/module.h> #include "ext4_jbd2.h" #include "truncate.h" @@ -699,6 +698,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, /* * Okay, we need to do block allocation. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + EXT4_ERROR_INODE(inode, "Can't allocate blocks for " + "non-extent mapped inodes with bigalloc"); + return -ENOSPC; + } + goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ @@ -1343,7 +1349,9 @@ void ext4_ind_truncate(struct inode *inode) __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; + loff_t page_len; unsigned blocksize = inode->i_sb->s_blocksize; + int err; handle = start_transaction(inode); if (IS_ERR(handle)) @@ -1354,9 +1362,16 @@ void ext4_ind_truncate(struct inode *inode) max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (inode->i_size & (blocksize - 1)) - if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) goto out_stop; + } if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 986e238..aa8efa6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -18,7 +18,6 @@ * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 */ -#include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/jbd2.h> @@ -42,7 +41,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "ext4_extents.h" #include "truncate.h" #include <trace/events/ext4.h> @@ -268,7 +266,7 @@ void ext4_da_update_reserve_space(struct inode *inode, struct ext4_inode_info *ei = EXT4_I(inode); spin_lock(&ei->i_block_reservation_lock); - trace_ext4_da_update_reserve_space(inode, used); + trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " "with only %d reserved data blocks\n", @@ -281,7 +279,7 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, used + ei->i_allocated_meta_blocks); ei->i_allocated_meta_blocks = 0; @@ -291,7 +289,7 @@ void ext4_da_update_reserve_space(struct inode *inode, * only when we have written all of the delayed * allocation blocks. */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, ei->i_reserved_meta_blocks); ei->i_reserved_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; @@ -300,14 +298,14 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update quota subsystem for data blocks */ if (quota_claim) - dquot_claim_block(inode, used); + dquot_claim_block(inode, EXT4_C2B(sbi, used)); else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should * not re-claim the quota for fallocated blocks. */ - dquot_release_reservation_block(inode, used); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } /* @@ -399,6 +397,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, } /* + * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. + */ +static void set_buffers_da_mapped(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + int i, nr_pages; + pgoff_t index, end; + + index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (map->m_lblk + map->m_len - 1) >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, + min(end - index + 1, + (pgoff_t)PAGEVEC_SIZE)); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + if (unlikely(page->mapping != mapping) || + !PageDirty(page)) + break; + + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + set_buffer_da_mapped(bh); + bh = bh->b_this_page; + } while (bh != head); + } + index++; + } + pagevec_release(&pvec); + } +} + +/* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * @@ -416,7 +457,7 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, * the buffer head is mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in - * that casem, buffer head is unmapped + * that case, buffer head is unmapped * * It returns the error in case of allocation failure. */ @@ -435,9 +476,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, */ down_read((&EXT4_I(inode)->i_data_sem)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, 0); + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); } else { - retval = ext4_ind_map_blocks(handle, inode, map, 0); + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); } up_read((&EXT4_I(inode)->i_data_sem)); @@ -455,7 +498,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * Returns if the blocks have already allocated * * Note that if blocks have been preallocated - * ext4_ext_get_block() returns th create = 0 + * ext4_ext_get_block() returns the create = 0 * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) @@ -517,9 +560,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) ext4_da_update_reserve_space(inode, retval, 1); } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); + /* If we have successfully mapped the delayed allocated blocks, + * set the BH_Da_Mapped bit on them. Its important to do this + * under the protection of i_data_sem. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + set_buffers_da_mapped(inode, map); + } + up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret = check_block_validity(inode, map); @@ -909,7 +960,11 @@ static int ext4_ordered_write_end(struct file *file, ext4_orphan_add(handle, inode); if (ret2 < 0) ret = ret2; + } else { + unlock_page(page); + page_cache_release(page); } + ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -1037,14 +1092,14 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve a single block located at lblock + * Reserve a single cluster located at lblock */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long md_needed; + unsigned int md_needed; int ret; /* @@ -1054,7 +1109,8 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) */ repeat: spin_lock(&ei->i_block_reservation_lock); - md_needed = ext4_calc_metadata_amount(inode, lblock); + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); trace_ext4_da_reserve_space(inode, md_needed); spin_unlock(&ei->i_block_reservation_lock); @@ -1063,15 +1119,15 @@ repeat: * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, 1); + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); if (ret) return ret; /* * We do still charge estimated metadata to the sb though; * we cannot afford to run out of free blocks. */ - if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { - dquot_release_reservation_block(inode, 1); + if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { + dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; @@ -1118,19 +1174,21 @@ static void ext4_da_release_space(struct inode *inode, int to_free) * We can release all of the reserved metadata blocks * only when we have written all of the delayed * allocation blocks. + * Note that in case of bigalloc, i_reserved_meta_blocks, + * i_reserved_data_blocks, etc. refer to number of clusters. */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, + percpu_counter_sub(&sbi->s_dirtyclusters_counter, ei->i_reserved_meta_blocks); ei->i_reserved_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; } /* update fs dirty data blocks counter */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); + percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - dquot_release_reservation_block(inode, to_free); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } static void ext4_da_page_release_reservation(struct page *page, @@ -1139,6 +1197,9 @@ static void ext4_da_page_release_reservation(struct page *page, int to_release = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; + struct inode *inode = page->mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int num_clusters; head = page_buffers(page); bh = head; @@ -1148,10 +1209,24 @@ static void ext4_da_page_release_reservation(struct page *page, if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); + clear_buffer_da_mapped(bh); } curr_off = next_off; } while ((bh = bh->b_this_page) != head); - ext4_da_release_space(page->mapping->host, to_release); + + /* If we have released all the blocks belonging to a cluster, then we + * need to release the reserved space for that cluster. */ + num_clusters = EXT4_NUM_B2C(sbi, to_release); + while (num_clusters > 0) { + ext4_fsblk_t lblk; + lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + + ((num_clusters - 1) << sbi->s_cluster_bits); + if (sbi->s_cluster_ratio == 1 || + !ext4_find_delalloc_cluster(inode, lblk, 1)) + ext4_da_release_space(inode, 1); + + num_clusters--; + } } /* @@ -1253,6 +1328,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_delay(bh); bh->b_blocknr = pblock; } + if (buffer_da_mapped(bh)) + clear_buffer_da_mapped(bh); if (buffer_unwritten(bh) || buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -1261,8 +1338,11 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_unwritten(bh); } - /* skip page if block allocation undone */ - if (buffer_delay(bh) || buffer_unwritten(bh)) + /* + * skip page if block allocation undone and + * block is dirty + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) skip_page = 1; bh = bh->b_this_page; block_start += bh->b_size; @@ -1346,12 +1426,15 @@ static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); printk(KERN_CRIT "Total free blocks count %lld\n", - ext4_count_free_blocks(inode->i_sb)); + EXT4_C2B(EXT4_SB(inode->i_sb), + ext4_count_free_clusters(inode->i_sb))); printk(KERN_CRIT "Free/Dirty block details\n"); printk(KERN_CRIT "free_blocks=%lld\n", - (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); + (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + percpu_counter_sum(&sbi->s_freeclusters_counter))); printk(KERN_CRIT "dirty_blocks=%lld\n", - (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); printk(KERN_CRIT "Block reservation details\n"); printk(KERN_CRIT "i_reserved_data_blocks=%u\n", EXT4_I(inode)->i_reserved_data_blocks); @@ -1430,8 +1513,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) if (err == -EAGAIN) goto submit_io; - if (err == -ENOSPC && - ext4_count_free_blocks(sb)) { + if (err == -ENOSPC && ext4_count_free_clusters(sb)) { mpd->retval = err; goto submit_io; } @@ -1471,13 +1553,15 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) for (i = 0; i < map.m_len; i++) unmap_underlying_metadata(bdev, map.m_pblk + i); - } - if (ext4_should_order_data(mpd->inode)) { - err = ext4_jbd2_file_inode(handle, mpd->inode); - if (err) - /* This only happens if the journal is aborted */ - return; + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) { + /* Only if the journal is aborted */ + mpd->retval = err; + goto submit_io; + } + } } /* @@ -1584,6 +1668,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) } /* + * This function is grabs code from the very beginning of + * ext4_map_blocks, but assumes that the caller is from delayed write + * time. This function looks up the requested blocks and sets the + * buffer delay bit under the protection of i_data_sem. + */ +static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, + struct ext4_map_blocks *map, + struct buffer_head *bh) +{ + int retval; + sector_t invalid_block = ~((sector_t) 0xffff); + + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + + map->m_flags = 0; + ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," + "logical block %lu\n", inode->i_ino, map->m_len, + (unsigned long) map->m_lblk); + /* + * Try to see if we can get the block without requesting a new + * file system block. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(NULL, inode, map, 0); + else + retval = ext4_ind_map_blocks(NULL, inode, map, 0); + + if (retval == 0) { + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ + /* If the block was allocated from previously allocated cluster, + * then we dont need to reserve it again. */ + if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { + retval = ext4_da_reserve_space(inode, iblock); + if (retval) + /* not enough space to reserve */ + goto out_unlock; + } + + /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served + * and it should not appear on the bh->b_state. + */ + map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; + + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + } + +out_unlock: + up_read((&EXT4_I(inode)->i_data_sem)); + + return retval; +} + +/* * This is a special get_blocks_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. @@ -1600,10 +1744,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, { struct ext4_map_blocks map; int ret = 0; - sector_t invalid_block = ~((sector_t) 0xffff); - - if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) - invalid_block = ~0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); @@ -1616,25 +1756,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) + ret = ext4_da_map_blocks(inode, iblock, &map, bh); + if (ret <= 0) return ret; - if (ret == 0) { - if (buffer_delay(bh)) - return 0; /* Not sure this could or should happen */ - /* - * XXX: __block_write_begin() unmaps passed block, is it OK? - */ - ret = ext4_da_reserve_space(inode, iblock); - if (ret) - /* not enough space to reserve */ - return ret; - - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); - return 0; - } map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; @@ -1756,7 +1880,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); * a[0] = 'a'; * truncate(f, 4096); * we have in the page first buffer_head mapped via page_mkwrite call back - * but other bufer_heads would be unmapped but dirty(dirty done via the + * but other buffer_heads would be unmapped but dirty (dirty done via the * do_wp_page). So writepage should write the first block. If we modify * the mmap area beyond 1024 we will again get a page_fault and the * page_mkwrite callback will do the block allocation and mark the @@ -1811,8 +1935,12 @@ static int ext4_writepage(struct page *page, * We don't want to do block allocation, so redirty * the page and return. We may reach here when we do * a journal commit via journal_submit_inode_data_buffers. - * We can also reach here via shrink_page_list + * We can also reach here via shrink_page_list but it + * should never be for direct reclaim so warn if that + * happens */ + WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == + PF_MEMALLOC); goto redirty_page; } if (commit_write) @@ -2046,6 +2174,7 @@ static int ext4_da_writepages(struct address_space *mapping, struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); pgoff_t done_index = 0; pgoff_t end; + struct blk_plug plug; trace_ext4_da_writepages(inode, wbc); @@ -2124,6 +2253,7 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); + blk_start_plug(&plug); while (!ret && wbc->nr_to_write > 0) { /* @@ -2142,6 +2272,7 @@ retry: ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); + blk_finish_plug(&plug); goto out_writepages; } @@ -2174,11 +2305,12 @@ retry: ret = 0; } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* - * got one extent now try with - * rest of the pages + * Got one extent now try with rest of the pages. + * If mpd.retval is set -EIO, journal is aborted. + * So we don't need to write any more. */ pages_written += mpd.pages_written; - ret = 0; + ret = mpd.retval; io_done = 1; } else if (wbc->nr_to_write) /* @@ -2188,6 +2320,7 @@ retry: */ break; } + blk_finish_plug(&plug); if (!io_done && !cycled) { cycled = 1; index = 0; @@ -2226,10 +2359,11 @@ static int ext4_nonda_switch(struct super_block *sb) * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); + free_blocks = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); if (2 * free_blocks < 3 * dirty_blocks || - free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { + free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark @@ -2241,7 +2375,7 @@ static int ext4_nonda_switch(struct super_block *sb) * start pushing delalloc when 1/2 of free blocks are dirty. */ if (free_blocks < 2 * dirty_blocks) - writeback_inodes_sb_if_idle(sb); + writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); return 0; } @@ -2367,7 +2501,7 @@ static int ext4_da_write_end(struct file *file, */ new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) { + if (copied && new_i_size > EXT4_I(inode)->i_disksize) { if (ext4_da_should_update_i_disksize(page, end)) { down_write(&EXT4_I(inode)->i_data_sem); if (new_i_size > EXT4_I(inode)->i_disksize) { @@ -2630,10 +2764,11 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, iocb->private, io_end->inode->i_ino, iocb, offset, size); + iocb->private = NULL; + /* if not aio dio with unwritten extents, just free io and return */ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { ext4_free_io_end(io_end); - iocb->private = NULL; out: if (is_async) aio_complete(iocb, ret, 0); @@ -2657,7 +2792,6 @@ out: /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); - iocb->private = NULL; /* XXX: probably should move into the real I/O completion handler */ inode_dio_done(inode); @@ -2685,10 +2819,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) * but being more careful is always safe for the future change. */ inode = io_end->inode; - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } + ext4_set_io_unwritten_flag(inode, io_end); /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); @@ -2854,6 +2985,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_mapping->host; ssize_t ret; + /* + * If we are doing data journalling we don't support O_DIRECT + */ + if (ext4_should_journal_data(inode)) + return 0; + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); @@ -2923,6 +3060,7 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -2959,6 +3097,209 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_journalled_aops; } + +/* + * ext4_discard_partial_page_buffers() + * Wrapper function for ext4_discard_partial_page_buffers_no_lock. + * This function finds and locks the page containing the offset + * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. + * Calling functions that already have the page locked should call + * ext4_discard_partial_page_buffers_no_lock directly. + */ +int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags) +{ + struct inode *inode = mapping->host; + struct page *page; + int err = 0; + + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return -ENOMEM; + + err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, + from, length, flags); + + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * ext4_discard_partial_page_buffers_no_lock() + * Zeros a page range of length 'length' starting from offset 'from'. + * Buffer heads that correspond to the block aligned regions of the + * zeroed range will be unmapped. Unblock aligned regions + * will have the corresponding buffer head mapped if needed so that + * that region of the page can be updated with the partial zero out. + * + * This function assumes that the page has already been locked. The + * The range to be discarded must be contained with in the given page. + * If the specified range exceeds the end of the page it will be shortened + * to the end of the page that corresponds to 'from'. This function is + * appropriate for updating a page and it buffer heads to be unmapped and + * zeroed for blocks that have been either released, or are going to be + * released. + * + * handle: The journal handle + * inode: The files inode + * page: A locked page that contains the offset "from" + * from: The starting byte offset (from the begining of the file) + * to begin discarding + * len: The length of bytes to discard + * flags: Optional flags that may be used: + * + * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED + * Only zero the regions of the page whose buffer heads + * have already been unmapped. This flag is appropriate + * for updateing the contents of a page whose blocks may + * have already been released, and we only want to zero + * out the regions that correspond to those released blocks. + * + * Returns zero on sucess or negative on failure. + */ +int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags) +{ + ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned int offset = from & (PAGE_CACHE_SIZE-1); + unsigned int blocksize, max, pos; + ext4_lblk_t iblock; + struct buffer_head *bh; + int err = 0; + + blocksize = inode->i_sb->s_blocksize; + max = PAGE_CACHE_SIZE - offset; + + if (index != page->index) + return -EINVAL; + + /* + * correct length if it does not fall between + * 'from' and the end of the page + */ + if (length > max || length < 0) + length = max; + + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + pos = offset; + while (pos < offset + length) { + unsigned int end_of_block, range_to_discard; + + err = 0; + + /* The length of space left to zero and unmap */ + range_to_discard = offset + length - pos; + + /* The length of space until the end of the block */ + end_of_block = blocksize - (pos & (blocksize-1)); + + /* + * Do not unmap or zero past end of block + * for this buffer head + */ + if (range_to_discard > end_of_block) + range_to_discard = end_of_block; + + + /* + * Skip this buffer head if we are only zeroing unampped + * regions of the page + */ + if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && + buffer_mapped(bh)) + goto next; + + /* If the range is block aligned, unmap */ + if (range_to_discard == blocksize) { + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); + clear_buffer_uptodate(bh); + zero_user(page, pos, range_to_discard); + BUFFER_TRACE(bh, "Buffer discarded"); + goto next; + } + + /* + * If this block is not completely contained in the range + * to be discarded, then it is not going to be released. Because + * we need to keep this block, we need to make sure this part + * of the page is uptodate before we modify it by writeing + * partial zeros on it. + */ + if (!buffer_mapped(bh)) { + /* + * Buffer head must be mapped before we can read + * from the block + */ + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto next; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt.*/ + if (!buffer_uptodate(bh)) + goto next; + } + + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto next; + } + + zero_user(page, pos, range_to_discard); + + err = 0; + if (ext4_should_journal_data(inode)) { + err = ext4_handle_dirty_metadata(handle, inode, bh); + } else + mark_buffer_dirty(bh); + + BUFFER_TRACE(bh, "Partial buffer zeroed"); +next: + bh = bh->b_this_page; + iblock++; + pos += range_to_discard; + } + + return err; +} + /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. @@ -3001,7 +3342,7 @@ int ext4_block_zero_page_range(handle_t *handle, page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, mapping_gfp_mask(mapping) & ~__GFP_FS); if (!page) - return -EINVAL; + return -ENOMEM; blocksize = inode->i_sb->s_blocksize; max = blocksize - (offset & (blocksize - 1)); @@ -3070,11 +3411,8 @@ int ext4_block_zero_page_range(handle_t *handle, err = 0; if (ext4_should_journal_data(inode)) { err = ext4_handle_dirty_metadata(handle, inode, bh); - } else { - if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) - err = ext4_jbd2_file_inode(handle, inode); + } else mark_buffer_dirty(bh); - } unlock: unlock_page(page); @@ -3115,6 +3453,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) return -ENOTSUPP; } + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { + /* TODO: Add support for bigalloc file systems */ + return -ENOTSUPP; + } + return ext4_ext_punch_hole(file, offset, length); } @@ -3125,7 +3468,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) * transaction, and VFS/VM ensures that ext4_truncate() cannot run * simultaneously on behalf of the same inode. * - * As we work through the truncate and commmit bits of it to the journal there + * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * @@ -3414,7 +3757,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_dir_start_lookup = 0; @@ -4416,6 +4759,7 @@ retry_alloc: PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); ret = VM_FAULT_SIGBUS; + ext4_journal_stop(handle); goto out; } ext4_set_inode_state(inode, EXT4_STATE_JDATA); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f18bfe3..e87a932 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -21,6 +21,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; @@ -44,7 +45,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -133,7 +134,7 @@ flags_err: err = ext4_ext_migrate(inode); flags_out: mutex_unlock(&inode->i_mutex); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT4_IOC_GETVERSION: @@ -149,7 +150,7 @@ flags_out: if (!inode_owner_or_capable(inode)) return -EPERM; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; if (get_user(generation, (int __user *) arg)) { @@ -157,10 +158,11 @@ flags_out: goto setversion_out; } + mutex_lock(&inode->i_mutex); handle = ext4_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto setversion_out; + goto unlock_out; } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { @@ -169,37 +171,15 @@ flags_out: err = ext4_mark_iloc_dirty(handle, inode, &iloc); } ext4_journal_stop(handle); + +unlock_out: + mutex_unlock(&inode->i_mutex); setversion_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } -#ifdef CONFIG_JBD2_DEBUG - case EXT4_IOC_WAIT_FOR_READONLY: - /* - * This is racy - by the time we're woken up and running, - * the superblock could be released. And the module could - * have been unloaded. So sue me. - * - * Returns 1 if it slept, else zero. - */ - { - struct super_block *sb = inode->i_sb; - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); - if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) { - schedule(); - ret = 1; - } - remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); - return ret; - } -#endif case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; - struct super_block *sb = inode->i_sb; int err, err2=0; err = ext4_resize_begin(sb); @@ -209,7 +189,14 @@ setversion_out: if (get_user(n_blocks_count, (__u32 __user *)arg)) return -EFAULT; - err = mnt_want_write(filp->f_path.mnt); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + return -EOPNOTSUPP; + } + + err = mnt_want_write_file(filp); if (err) return err; @@ -221,7 +208,7 @@ setversion_out: } if (err == 0) err = err2; - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); ext4_resize_end(sb); return err; @@ -250,13 +237,20 @@ setversion_out: goto mext_out; } - err = mnt_want_write(filp->f_path.mnt); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with bigalloc"); + return -EOPNOTSUPP; + } + + err = mnt_want_write_file(filp); if (err) goto mext_out; err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); if (me.moved_len > 0) file_remove_suid(donor_filp); @@ -270,7 +264,6 @@ mext_out: case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; - struct super_block *sb = inode->i_sb; int err, err2=0; err = ext4_resize_begin(sb); @@ -281,7 +274,14 @@ mext_out: sizeof(input))) return -EFAULT; - err = mnt_want_write(filp->f_path.mnt); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + return -EOPNOTSUPP; + } + + err = mnt_want_write_file(filp); if (err) return err; @@ -293,7 +293,7 @@ mext_out: } if (err == 0) err = err2; - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); ext4_resize_end(sb); return err; @@ -305,7 +305,7 @@ mext_out: if (!inode_owner_or_capable(inode)) return -EACCES; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; /* @@ -317,7 +317,7 @@ mext_out: mutex_lock(&(inode->i_mutex)); err = ext4_ext_migrate(inode); mutex_unlock(&(inode->i_mutex)); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } @@ -327,17 +327,16 @@ mext_out: if (!inode_owner_or_capable(inode)) return -EACCES; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; err = ext4_alloc_da_blocks(inode); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case FITRIM: { - struct super_block *sb = inode->i_sb; struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; @@ -348,7 +347,14 @@ mext_out: if (!blk_queue_discard(q)) return -EOPNOTSUPP; - if (copy_from_user(&range, (struct fstrim_range *)arg, + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "FITRIM not supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; @@ -358,7 +364,7 @@ mext_out: if (ret < 0) return ret; - if (copy_to_user((struct fstrim_range *)arg, &range, + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; @@ -396,11 +402,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC32_SETVERSION_OLD: cmd = EXT4_IOC_SETVERSION_OLD; break; -#ifdef CONFIG_JBD2_DEBUG - case EXT4_IOC32_WAIT_FOR_READONLY: - cmd = EXT4_IOC_WAIT_FOR_READONLY; - break; -#endif case EXT4_IOC32_GETRSVSZ: cmd = EXT4_IOC_GETRSVSZ; break; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 17a5a57..e2d8be8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -70,8 +70,8 @@ * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space - * pa_len -> length for this prealloc space - * pa_free -> free space available in this prealloc space + * pa_len -> length for this prealloc space (in clusters) + * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc @@ -126,7 +126,8 @@ * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is + * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is + * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O @@ -459,7 +460,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += first + i; + blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, @@ -580,7 +581,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, continue; } - /* both bits in buddy2 must be 0 */ + /* both bits in buddy2 must be 1 */ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); @@ -653,7 +654,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, ext4_grpblk_t chunk; unsigned short border; - BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); + BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; @@ -705,7 +706,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); - ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; @@ -734,7 +735,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u blocks in bitmap, %u in gd", + "%u clusters in bitmap, %u in gd", free, grp->bb_free); /* * If we intent to continue, we consider group descritor @@ -1339,7 +1340,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += block; + blocknr += EXT4_C2B(EXT4_SB(sb), block); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, @@ -1390,7 +1391,6 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, { int next = block; int max; - int ord; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); @@ -1432,9 +1432,8 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) break; - ord = mb_find_order_for_block(e4b, next); + order = mb_find_order_for_block(e4b, next); - order = ord; block = next >> order; ex->fe_len += 1 << order; } @@ -1624,8 +1623,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); - BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; @@ -1823,15 +1822,15 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, - EXT4_BLOCKS_PER_GROUP(sb), i); - if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { + EXT4_CLUSTERS_PER_GROUP(sb), i); + if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * we have free blocks */ ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, - "%d free blocks as per " + "%d free clusters as per " "group info. But bitmap says 0", free); break; @@ -1841,7 +1840,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, - "%d free blocks as per " + "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); /* @@ -1887,7 +1886,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; - while (i < EXT4_BLOCKS_PER_GROUP(sb)) { + while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { @@ -2252,10 +2251,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, */ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { meta_group_info[i]->bb_free = - ext4_free_blocks_after_init(sb, group, desc); + ext4_free_clusters_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = - ext4_free_blks_count(sb, desc); + ext4_free_group_clusters(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); @@ -2473,7 +2472,20 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + /* + * The default group preallocation is 512, which for 4k block + * sizes translates to 2 megabytes. However for bigalloc file + * systems, this is probably too big (i.e, if the cluster size + * is 1 megabyte, then group preallocation size becomes half a + * gigabyte!). As a default, we will keep a two megabyte + * group pralloc size for cluster sizes up to 64k, and after + * that, we will force a minimum group preallocation size of + * 32 clusters. This translates to 8 megs when the cluster + * size is 256k, and 32 megs when the cluster size is 1 meg, + * which seems reasonable as a default. + */ + sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> + sbi->s_cluster_bits, 32); /* * If there is a s_stripe > 1, then we set the s_mb_group_prealloc * to the lowest multiple of s_stripe which is bigger than @@ -2490,7 +2502,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; - goto out; + goto out_free_groupinfo_slab; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -2503,9 +2515,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) /* init file for buddy data */ ret = ext4_mb_init_backend(sb); - if (ret != 0) { - goto out; - } + if (ret != 0) + goto out_free_locality_groups; if (sbi->s_proc) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, @@ -2513,11 +2524,19 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) if (sbi->s_journal) sbi->s_journal->j_commit_callback = release_blocks_on_commit; + + return 0; + +out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; +out_free_groupinfo_slab: + ext4_groupinfo_destroy_slabs(); out: - if (ret) { - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - } + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); + sbi->s_mb_maxs = NULL; return ret; } @@ -2602,11 +2621,13 @@ int ext4_mb_release(struct super_block *sb) } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t block, int count) + ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; - discard_block = block + ext4_group_first_block_no(sb, block_group); + discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + + ext4_group_first_block_no(sb, block_group)); + count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); @@ -2633,7 +2654,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) if (test_opt(sb, DISCARD)) ext4_issue_discard(sb, entry->group, - entry->start_blk, entry->count); + entry->start_cluster, entry->count); err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -2646,7 +2667,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) ext4_lock_group(sb, entry->group); /* Take it out of per group rb tree */ rb_erase(&entry->node, &(db->bb_free_root)); - mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); /* * Clear the trimmed flag for the group so that the next @@ -2752,7 +2773,7 @@ void ext4_exit_mballoc(void) */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle, unsigned int reserv_blks) + handle_t *handle, unsigned int reserv_clstrs) { struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; @@ -2783,7 +2804,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, goto out_err; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, - ext4_free_blks_count(sb, gdp)); + ext4_free_group_clusters(sb, gdp)); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) @@ -2791,7 +2812,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - len = ac->ac_b_ex.fe_len; + len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata\n", block, block+len); @@ -2823,28 +2844,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_blks_set(sb, gdp, - ext4_free_blocks_after_init(sb, - ac->ac_b_ex.fe_group, gdp)); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); } - len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; - ext4_free_blks_set(sb, gdp, len); + len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_group_clusters_set(sb, gdp, len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_blocks); + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -2886,6 +2908,7 @@ static noinline_for_stack void ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; loff_t size, orig_size, start_off; @@ -2916,7 +2939,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); @@ -2988,7 +3011,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, continue; } - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); /* PA must not overlap original request */ BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || @@ -3018,9 +3042,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { ext4_lblk_t pa_end; + spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); } spin_unlock(&pa->pa_lock); @@ -3036,14 +3062,14 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; - ac->ac_g_ex.fe_len = size; + ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size))) { @@ -3112,14 +3138,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); - end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); - len = end - start; + end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), + start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); + len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; @@ -3127,7 +3155,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); - BUG_ON(start + len > pa->pa_pstart + pa->pa_len); + BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); pa->pa_free -= len; @@ -3193,6 +3221,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, static noinline_for_stack int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; @@ -3210,12 +3239,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* all fields in this condition don't change, * so we can skip locking for them */ if (ac->ac_o_ex.fe_logical < pa->pa_lstart || - ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) + ac->ac_o_ex.fe_logical >= (pa->pa_lstart + + EXT4_C2B(sbi, pa->pa_len))) continue; /* non-extent files can't have physical blocks past 2^32 */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && - pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) + (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) continue; /* found preallocated blocks, use them */ @@ -3291,7 +3322,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, while (n) { entry = rb_entry(n, struct ext4_free_data, node); - ext4_set_bits(bitmap, entry->start_blk, entry->count); + ext4_set_bits(bitmap, entry->start_cluster, entry->count); n = rb_next(n); } return; @@ -3312,7 +3343,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; - int count = 0; int len; /* all form of preallocation discards first load group, @@ -3335,7 +3365,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, BUG_ON(groupnr != group); ext4_set_bits(bitmap, start, len); preallocated += len; - count++; } mb_debug(1, "prellocated %u for group %u\n", preallocated, group); } @@ -3412,6 +3441,7 @@ static noinline_for_stack int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; @@ -3443,16 +3473,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; /* also, we should cover whole original request */ - wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; + wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); /* the smallest one defines real window */ win = min(winl, wins); - offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; + offs = ac->ac_o_ex.fe_logical % + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (offs && offs < win) win = offs; - ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - + EXT4_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); } @@ -3477,7 +3509,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); - atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); @@ -3592,7 +3624,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); - grp_blk_start = pa->pa_pstart - bit; + grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; @@ -3607,7 +3639,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); - trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit, + trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + + EXT4_C2B(sbi, bit)), next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; @@ -3690,7 +3723,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, } if (needed == 0) - needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; + needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; INIT_LIST_HEAD(&list); repeat: @@ -3958,7 +3991,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; @@ -3969,6 +4002,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) return; } + if (sbi->s_mb_group_prealloc <= 0) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + /* don't use group allocation for large files */ size = max(size, isize); if (size > sbi->s_mb_stream_request) { @@ -4007,8 +4045,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, len = ar->len; /* just a dirty hack to filter too big requests */ - if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) - len = EXT4_BLOCKS_PER_GROUP(sb) - 10; + if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10) + len = EXT4_CLUSTERS_PER_GROUP(sb) - 10; /* start searching from the goal */ goal = ar->goal; @@ -4019,18 +4057,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, /* set up allocation goals */ memset(ac, 0, sizeof(struct ext4_allocation_context)); - ac->ac_b_ex.fe_logical = ar->logical; + ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; - ac->ac_o_ex.fe_logical = ar->logical; + ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; - ac->ac_g_ex.fe_logical = ar->logical; - ac->ac_g_ex.fe_group = group; - ac->ac_g_ex.fe_start = block; - ac->ac_g_ex.fe_len = len; + ac->ac_g_ex = ac->ac_o_ex; ac->ac_flags = ar->flags; /* we have to define context: we'll we work with a file or @@ -4182,13 +4217,14 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); - pa->pa_pstart += ac->ac_b_ex.fe_len; - pa->pa_lstart += ac->ac_b_ex.fe_len; + pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); @@ -4249,13 +4285,17 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; - unsigned int reserv_blks = 0; + unsigned int reserv_clstrs = 0; sb = ar->inode->i_sb; sbi = EXT4_SB(sb); trace_ext4_request_blocks(ar); + /* Allow to use superuser reservation for quota file */ + if (IS_NOQUOTA(ar->inode)) + ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; + /* * For delayed allocation, we could skip the ENOSPC and * EDQUOT check, as blocks and quotas have been already @@ -4269,7 +4309,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * and verify allocation doesn't exceed the quota limits. */ while (ar->len && - ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { + ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ yield(); @@ -4279,12 +4319,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, *errp = -ENOSPC; return 0; } - reserv_blks = ar->len; + reserv_clstrs = ar->len; if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { - dquot_alloc_block_nofail(ar->inode, ar->len); + dquot_alloc_block_nofail(ar->inode, + EXT4_C2B(sbi, ar->len)); } else { while (ar->len && - dquot_alloc_block(ar->inode, ar->len)) { + dquot_alloc_block(ar->inode, + EXT4_C2B(sbi, ar->len))) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; @@ -4328,7 +4370,7 @@ repeat: ext4_mb_new_preallocation(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { - *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); if (*errp == -EAGAIN) { /* * drop the reference that we took @@ -4364,13 +4406,13 @@ out: if (ac) kmem_cache_free(ext4_ac_cachep, ac); if (inquota && ar->len < inquota) - dquot_free_block(ar->inode, inquota - ar->len); + dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { if (!ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, - reserv_blks); + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); } trace_ext4_allocate_blocks(ar, (unsigned long long)block); @@ -4388,7 +4430,7 @@ static int can_merge(struct ext4_free_data *entry1, { if ((entry1->t_tid == entry2->t_tid) && (entry1->group == entry2->group) && - ((entry1->start_blk + entry1->count) == entry2->start_blk)) + ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) return 1; return 0; } @@ -4398,7 +4440,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { ext4_group_t group = e4b->bd_group; - ext4_grpblk_t block; + ext4_grpblk_t cluster; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; @@ -4411,7 +4453,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, BUG_ON(e4b->bd_buddy_page == NULL); new_node = &new_entry->node; - block = new_entry->start_blk; + cluster = new_entry->start_cluster; if (!*n) { /* first free block exent. We need to @@ -4425,13 +4467,14 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_free_data, node); - if (block < entry->start_blk) + if (cluster < entry->start_cluster) n = &(*n)->rb_left; - else if (block >= (entry->start_blk + entry->count)) + else if (cluster >= (entry->start_cluster + entry->count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, - ext4_group_first_block_no(sb, group) + block, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); return 0; } @@ -4445,7 +4488,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, if (node) { entry = rb_entry(node, struct ext4_free_data, node); if (can_merge(entry, new_entry)) { - new_entry->start_blk = entry->start_blk; + new_entry->start_cluster = entry->start_cluster; new_entry->count += entry->count; rb_erase(node, &(db->bb_free_root)); spin_lock(&sbi->s_md_lock); @@ -4496,6 +4539,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; + unsigned int count_clusters; int err = 0; int ret; @@ -4544,6 +4588,38 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!ext4_should_writeback_data(inode)) flags |= EXT4_FREE_BLOCKS_METADATA; + /* + * If the extent to be freed does not begin on a cluster + * boundary, we need to deal with partial clusters at the + * beginning and end of the extent. Normally we will free + * blocks at the beginning or the end unless we are explicitly + * requested to avoid doing so. + */ + overflow = block & (sbi->s_cluster_ratio - 1); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { + overflow = sbi->s_cluster_ratio - overflow; + block += overflow; + if (count > overflow) + count -= overflow; + else + return; + } else { + block -= overflow; + count += overflow; + } + } + overflow = count & (sbi->s_cluster_ratio - 1); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { + if (count > overflow) + count -= overflow; + else + return; + } else + count += sbi->s_cluster_ratio - overflow; + } + do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -4552,10 +4628,12 @@ do_more: * Check to see if we are freeing blocks across a group * boundary. */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); + if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = EXT4_C2B(sbi, bit) + count - + EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } + count_clusters = EXT4_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) { err = -EIO; @@ -4570,9 +4648,9 @@ do_more: if (in_range(ext4_block_bitmap(sb, gdp), block, count) || in_range(ext4_inode_bitmap(sb, gdp), block, count) || in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || + EXT4_SB(sb)->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + EXT4_SB(sb)->s_itb_per_group)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); @@ -4597,11 +4675,11 @@ do_more: #ifdef AGGRESSIVE_CHECK { int i; - for (i = 0; i < count; i++) + for (i = 0; i < count_clusters; i++) BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - trace_ext4_mballoc_free(sb, inode, block_group, bit, count); + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) @@ -4618,13 +4696,13 @@ do_more: err = -ENOMEM; goto error_return; } - new_entry->start_blk = bit; + new_entry->start_cluster = bit; new_entry->group = block_group; - new_entry->count = count; + new_entry->count = count_clusters; new_entry->t_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { /* need to update group_info->bb_free and bitmap @@ -4632,25 +4710,29 @@ do_more: * them with group lock_held */ ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); - mb_free_blocks(inode, &e4b, bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + mb_free_blocks(inode, &e4b, bit, count_clusters); } - ret = ext4_free_blks_count(sb, gdp) + count; - ext4_free_blks_set(sb, gdp, ret); + ret = ext4_free_group_clusters(sb, gdp) + count_clusters; + ext4_free_group_clusters_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, count); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); freed += count; + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -4669,8 +4751,6 @@ do_more: } ext4_mark_super_dirty(sb); error_return: - if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) - dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); return; @@ -4778,16 +4858,17 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(NULL, &e4b, bit, count); - blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); - ext4_free_blks_set(sb, desc, blk_free_count); + blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); + ext4_free_group_clusters_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, blocks_freed)); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(blocks_freed, - &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(EXT4_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); @@ -4948,7 +5029,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) struct ext4_group_info *grp; ext4_group_t first_group, last_group; ext4_group_t group, ngroups = ext4_get_groups_count(sb); - ext4_grpblk_t cnt = 0, first_block, last_block; + ext4_grpblk_t cnt = 0, first_cluster, last_cluster; uint64_t start, len, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); @@ -4958,7 +5039,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) len = range->len >> sb->s_blocksize_bits; minlen = range->minlen >> sb->s_blocksize_bits; - if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) + if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) return -EINVAL; if (start + len <= first_data_blk) goto out; @@ -4969,11 +5050,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* Determine first and last group to examine based on start and len */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, - &first_group, &first_block); + &first_group, &first_cluster); ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), - &last_group, &last_block); + &last_group, &last_cluster); last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; - last_block = EXT4_BLOCKS_PER_GROUP(sb); + last_cluster = EXT4_CLUSTERS_PER_GROUP(sb); if (first_group > last_group) return -EINVAL; @@ -4993,20 +5074,20 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group in which case start + * len < EXT4_BLOCKS_PER_GROUP(sb). */ - if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) - last_block = first_block + len; - len -= last_block - first_block; + if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) + last_cluster = first_cluster + len; + len -= last_cluster - first_cluster; if (grp->bb_free >= minlen) { - cnt = ext4_trim_all_free(sb, group, first_block, - last_block, minlen); + cnt = ext4_trim_all_free(sb, group, first_cluster, + last_cluster, minlen); if (cnt < 0) { ret = cnt; break; } } trimmed += cnt; - first_block = 0; + first_cluster = 0; } range->len = trimmed * sb->s_blocksize; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 9d4a636..47705f3 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -106,7 +106,7 @@ struct ext4_free_data { ext4_group_t group; /* free block extent */ - ext4_grpblk_t start_blk; + ext4_grpblk_t start_cluster; ext4_grpblk_t count; /* transaction which freed this extent */ @@ -139,9 +139,9 @@ enum { struct ext4_free_extent { ext4_lblk_t fe_logical; - ext4_grpblk_t fe_start; + ext4_grpblk_t fe_start; /* In cluster units */ ext4_group_t fe_group; - ext4_grpblk_t fe_len; + ext4_grpblk_t fe_len; /* In cluster units */ }; /* @@ -175,7 +175,7 @@ struct ext4_allocation_context { /* the best found extent */ struct ext4_free_extent ac_b_ex; - /* copy of the bext found extent taken before preallocation efforts */ + /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; /* number of iterations done. we have to track to limit searching */ @@ -216,6 +216,7 @@ struct ext4_buddy { static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { - return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; + return ext4_group_first_block_no(sb, fex->fe_group) + + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } #endif diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b57b98f..e7d6bb0 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -12,22 +12,20 @@ * */ -#include <linux/module.h> #include <linux/slab.h> #include "ext4_jbd2.h" -#include "ext4_extents.h" /* * The contiguous blocks details which can be * represented by a single extent */ -struct list_blocks_struct { - ext4_lblk_t first_block, last_block; +struct migrate_struct { + ext4_lblk_t first_block, last_block, curr_block; ext4_fsblk_t first_pblock, last_pblock; }; static int finish_range(handle_t *handle, struct inode *inode, - struct list_blocks_struct *lb) + struct migrate_struct *lb) { int retval = 0, needed; @@ -87,8 +85,7 @@ err_out: } static int update_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t blk_num, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, struct migrate_struct *lb) { int retval; /* @@ -96,9 +93,10 @@ static int update_extent_range(handle_t *handle, struct inode *inode, */ if (lb->first_pblock && (lb->last_pblock+1 == pblock) && - (lb->last_block+1 == blk_num)) { + (lb->last_block+1 == lb->curr_block)) { lb->last_pblock = pblock; - lb->last_block = blk_num; + lb->last_block = lb->curr_block; + lb->curr_block++; return 0; } /* @@ -106,64 +104,49 @@ static int update_extent_range(handle_t *handle, struct inode *inode, */ retval = finish_range(handle, inode, lb); lb->first_pblock = lb->last_pblock = pblock; - lb->first_block = lb->last_block = blk_num; - + lb->first_block = lb->last_block = lb->curr_block; + lb->curr_block++; return retval; } static int update_ind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries; - return 0; - } - bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; i_data = (__le32 *)bh->b_data; - for (i = 0; i < max_entries; i++, blk_count++) { + for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; + } else { + lb->curr_block++; } } - - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; } static int update_dind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries * max_entries; - return 0; - } bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; @@ -172,38 +155,28 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode, for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_ind_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - &blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ - blk_count += max_entries; + lb->curr_block += max_entries; } } - - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; } static int update_tind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries * max_entries * max_entries; - return 0; - } bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; @@ -212,16 +185,14 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_dind_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - &blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; - } else + } else { /* Only update the file block number */ - blk_count += max_entries * max_entries; + lb->curr_block += max_entries * max_entries; + } } - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; @@ -462,12 +433,12 @@ int ext4_ext_migrate(struct inode *inode) handle_t *handle; int retval = 0, i; __le32 *i_data; - ext4_lblk_t blk_count = 0; struct ext4_inode_info *ei; struct inode *tmp_inode = NULL; - struct list_blocks_struct lb; + struct migrate_struct lb; unsigned long max_entries; __u32 goal; + uid_t owner[2]; /* * If the filesystem does not support extents, or the inode @@ -495,10 +466,12 @@ int ext4_ext_migrate(struct inode *inode) } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; + owner[0] = inode->i_uid; + owner[1] = inode->i_gid; tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG, NULL, goal); + S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { - retval = -ENOMEM; + retval = PTR_ERR(inode); ext4_journal_stop(handle); return retval; } @@ -507,7 +480,7 @@ int ext4_ext_migrate(struct inode *inode) * Set the i_nlink to zero so it will be deleted later * when we drop inode reference. */ - tmp_inode->i_nlink = 0; + clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); ext4_orphan_add(handle, tmp_inode); @@ -551,35 +524,32 @@ int ext4_ext_migrate(struct inode *inode) /* 32 bit block address 4 bytes */ max_entries = inode->i_sb->s_blocksize >> 2; - for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { + for (i = 0; i < EXT4_NDIR_BLOCKS; i++) { if (i_data[i]) { retval = update_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[i]), - blk_count, &lb); + le32_to_cpu(i_data[i]), &lb); if (retval) goto err_out; - } + } else + lb.curr_block++; } if (i_data[EXT4_IND_BLOCK]) { retval = update_ind_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[EXT4_IND_BLOCK]), - &blk_count, &lb); + le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); if (retval) goto err_out; } else - blk_count += max_entries; + lb.curr_block += max_entries; if (i_data[EXT4_DIND_BLOCK]) { retval = update_dind_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[EXT4_DIND_BLOCK]), - &blk_count, &lb); + le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); if (retval) goto err_out; } else - blk_count += max_entries * max_entries; + lb.curr_block += max_entries * max_entries; if (i_data[EXT4_TIND_BLOCK]) { retval = update_tind_extent_range(handle, tmp_inode, - le32_to_cpu(i_data[EXT4_TIND_BLOCK]), - &blk_count, &lb); + le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); if (retval) goto err_out; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 9bdef3f..7ea4ba4 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -109,7 +109,7 @@ static int kmmpd(void *data) mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); bdevname(bh->b_bdev, mmp->mmp_bdevname); - memcpy(mmp->mmp_nodename, init_utsname()->sysname, + memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); while (!kthread_should_stop()) { @@ -125,8 +125,9 @@ static int kmmpd(void *data) * Don't spew too many error messages. Print one every * (s_mmp_update_interval * 60) seconds. */ - if (retval && (failed_writes % 60) == 0) { - ext4_error(sb, "Error writing to MMP block"); + if (retval) { + if ((failed_writes % 60) == 0) + ext4_error(sb, "Error writing to MMP block"); failed_writes++; } @@ -295,7 +296,8 @@ skip: /* * write a new random sequence number. */ - mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); + seq = mmp_new_seq(); + mmp->mmp_seq = cpu_to_le32(seq); retval = write_mmp_block(bh); if (retval) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index f57455a..c5826c6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -17,7 +17,6 @@ #include <linux/quotaops.h> #include <linux/slab.h> #include "ext4_jbd2.h" -#include "ext4_extents.h" #include "ext4.h" /** diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1c924fa..2043f48 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1586,7 +1586,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, inode, bh2); + err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); @@ -1612,7 +1612,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); if (err) { ext4_std_error(inode->i_sb, err); goto cleanup; @@ -1694,7 +1694,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) if (is_dx(inode) && inode->i_nlink > 1) { /* limit is 16-bit i_links_count */ if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { - inode->i_nlink = 1; + set_nlink(inode, 1); EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_DIR_NLINK); } @@ -1707,9 +1707,8 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) */ static void ext4_dec_count(handle_t *handle, struct inode *inode) { - drop_nlink(inode); - if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) - inc_nlink(inode); + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); } @@ -1737,7 +1736,7 @@ static int ext4_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, +static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { handle_t *handle; @@ -1756,7 +1755,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; @@ -1771,7 +1770,7 @@ retry: } static int ext4_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; @@ -1792,7 +1791,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1807,7 +1806,7 @@ retry: return err; } -static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; @@ -1832,7 +1831,7 @@ retry: ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFDIR | mode, - &dentry->d_name, 0); + &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -1861,9 +1860,9 @@ retry: de->name_len = 2; strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); - inode->i_nlink = 2; + set_nlink(inode, 2); BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, dir_block); + err = ext4_handle_dirty_metadata(handle, inode, dir_block); if (err) goto out_clear_inode; err = ext4_mark_inode_dirty(handle, inode); @@ -2214,7 +2213,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) ext4_warning(inode->i_sb, "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = ext4_delete_entry(handle, dir, de, bh); if (retval) @@ -2279,7 +2278,7 @@ retry: ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, - &dentry->d_name, 0); + &dentry->d_name, 0, NULL); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2316,7 +2315,7 @@ retry: err = PTR_ERR(handle); goto err_drop_inode; } - inc_nlink(inode); + set_nlink(inode, 1); err = ext4_orphan_del(handle, inode); if (err) { ext4_journal_stop(handle); @@ -2530,7 +2529,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); if (retval) { ext4_std_error(old_dir->i_sb, retval); goto end_rename; @@ -2539,7 +2538,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode) { /* checked empty_dir above, can't have another parent, * ext4_dec_count() won't work for many-linked dirs */ - new_inode->i_nlink = 0; + clear_nlink(new_inode); } else { ext4_inc_count(handle, new_dir); ext4_update_dx_flag(new_dir); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 92f38ee..4758518 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -6,7 +6,6 @@ * Written by Theodore Ts'o, 2010. */ -#include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/jbd2.h> @@ -70,7 +69,6 @@ static void put_io_page(struct ext4_io_page *io_page) void ext4_free_io_end(ext4_io_end_t *io) { int i; - wait_queue_head_t *wq; BUG_ON(!io); if (io->page) @@ -78,56 +76,43 @@ void ext4_free_io_end(ext4_io_end_t *io) for (i = 0; i < io->num_io_pages; i++) put_io_page(io->pages[i]); io->num_io_pages = 0; - wq = ext4_ioend_wq(io->inode); - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && - waitqueue_active(wq)) - wake_up_all(wq); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io->inode)); kmem_cache_free(io_end_cachep, io); } /* * check a range of space and convert unwritten extents to written. + * + * Called with inode->i_mutex; we depend on this when we manipulate + * io->flag, since we could otherwise race with ext4_flush_completed_IO() */ int ext4_end_io_nolock(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; - wait_queue_head_t *wq; int ret = 0; ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - if (list_empty(&io->list)) - return ret; - - if (!(io->flag & EXT4_IO_END_UNWRITTEN)) - return ret; - ret = ext4_convert_unwritten_extents(inode, offset, size); if (ret < 0) { - printk(KERN_EMERG "%s: failed to convert unwritten " - "extents to written extents, error is %d " - "io is still on inode %lu aio dio list\n", - __func__, ret, inode->i_ino); - return ret; + ext4_msg(inode->i_sb, KERN_EMERG, + "failed to convert unwritten extents to written " + "extents -- potential data loss! " + "(inode %lu, offset %llu, size %zd, error %d)", + inode->i_ino, offset, size, ret); } if (io->iocb) aio_complete(io->iocb, io->result, 0); - /* clear the DIO AIO unwritten flag */ - if (io->flag & EXT4_IO_END_UNWRITTEN) { - io->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - wq = ext4_ioend_wq(io->inode); - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) && - waitqueue_active(wq)) { - wake_up_all(wq); - } - } + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) + wake_up_all(ext4_ioend_wq(io->inode)); return ret; } @@ -140,9 +125,15 @@ static void ext4_end_io_work(struct work_struct *work) struct inode *inode = io->inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned long flags; - int ret; + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (list_empty(&io->list)) { + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + goto free; + } if (!mutex_trylock(&inode->i_mutex)) { + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); /* * Requeue the work instead of waiting so that the work * items queued after this can be processed. @@ -159,17 +150,11 @@ static void ext4_end_io_work(struct work_struct *work) io->flag |= EXT4_IO_END_QUEUED; return; } - ret = ext4_end_io_nolock(io); - if (ret < 0) { - mutex_unlock(&inode->i_mutex); - return; - } - - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (!list_empty(&io->list)) - list_del_init(&io->list); + list_del_init(&io->list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + (void) ext4_end_io_nolock(io); mutex_unlock(&inode->i_mutex); +free: ext4_free_io_end(io); } @@ -350,10 +335,8 @@ submit_and_retry: if ((io_end->num_io_pages >= MAX_IO_PAGES) && (io_end->pages[io_end->num_io_pages-1] != io_page)) goto submit_and_retry; - if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } + if (buffer_uninit(bh)) + ext4_set_io_unwritten_flag(inode, io_end); io->io_end->size += bh->b_size; io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); @@ -401,6 +384,18 @@ int ext4_bio_write_page(struct ext4_io_submit *io, block_end = block_start + blocksize; if (block_start >= len) { + /* + * Comments copied from block_write_full_page_endio: + * + * The page straddles i_size. It must be zeroed out on + * each and every writepage invocation because it may + * be mmapped. "A file is mapped in multiples of the + * page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when + * mapped, and writes to that region are not written + * out to the file." + */ + zero_user_segment(page, block_start, block_end); clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 707d3f1..996780a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -875,7 +875,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - ext4_free_blks_set(sb, gdp, input->free_blocks_count); + ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); @@ -937,8 +937,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) input->reserved_blocks); /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeblocks_counter, - input->free_blocks_count); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, input->free_blocks_count)); percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb)); @@ -946,8 +946,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, input->group); - atomic_add(input->free_blocks_count, - &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(EXT4_B2C(sbi, input->free_blocks_count), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb), &sbi->s_flex_groups[flex_group].free_inodes); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 44d0c8d..ed3ce82 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -45,6 +45,7 @@ #include <linux/freezer.h> #include "ext4.h" +#include "ext4_extents.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" @@ -163,8 +164,8 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb, (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } -__u32 ext4_free_blks_count(struct super_block *sb, - struct ext4_group_desc *bg) +__u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? @@ -219,8 +220,8 @@ void ext4_inode_table_set(struct super_block *sb, bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } -void ext4_free_blks_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count) +void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) { bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) @@ -414,6 +415,22 @@ static void save_error_info(struct super_block *sb, const char *func, ext4_commit_super(sb, 1); } +/* + * The del_gendisk() function uninitializes the disk-specific data + * structures, including the bdi structure, without telling anyone + * else. Once this happens, any attempt to call mark_buffer_dirty() + * (for example, by ext4_commit_super), will cause a kernel OOPS. + * This is a kludge to prevent these oops until we can put in a proper + * hook in del_gendisk() to inform the VFS and file system layers. + */ +static int block_device_ejected(struct super_block *sb) +{ + struct inode *bd_inode = sb->s_bdev->bd_inode; + struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; + + return bdi->dev == NULL; +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. @@ -821,10 +838,10 @@ static void ext4_put_super(struct super_block *sb) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); ext4_kvfree(sbi->s_flex_groups); - percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -913,7 +930,6 @@ static int ext4_drop_inode(struct inode *inode) static void ext4_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } @@ -1016,11 +1032,11 @@ static inline void ext4_show_quota_options(struct seq_file *seq, * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ -static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int ext4_show_options(struct seq_file *seq, struct dentry *root) { int def_errors; unsigned long def_mount_opts; - struct super_block *sb = vfs->mnt_sb; + struct super_block *sb = root->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; @@ -1057,8 +1073,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouid32"); if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT4_FS_XATTR if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -1140,9 +1154,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",block_validity"); if (!test_opt(sb, INIT_INODE_TABLE)) - seq_puts(seq, ",noinit_inode_table"); + seq_puts(seq, ",noinit_itable"); else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) - seq_printf(seq, ",init_inode_table=%u", + seq_printf(seq, ",init_itable=%u", (unsigned) sbi->s_li_wait_mult); ext4_show_quota_options(seq, sb); @@ -1318,8 +1332,7 @@ enum { Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, - Opt_discard, Opt_nodiscard, - Opt_init_inode_table, Opt_noinit_inode_table, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, }; static const match_table_t tokens = { @@ -1392,9 +1405,9 @@ static const match_table_t tokens = { {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, - {Opt_init_inode_table, "init_itable=%u"}, - {Opt_init_inode_table, "init_itable"}, - {Opt_noinit_inode_table, "noinit_itable"}, + {Opt_init_itable, "init_itable=%u"}, + {Opt_init_itable, "init_itable"}, + {Opt_noinit_itable, "noinit_itable"}, {Opt_err, NULL}, }; @@ -1567,10 +1580,12 @@ static int parse_options(char *options, struct super_block *sb, set_opt(sb, DEBUG); break; case Opt_oldalloc: - set_opt(sb, OLDALLOC); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); break; case Opt_orlov: - clear_opt(sb, OLDALLOC); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated orlov option"); break; #ifdef CONFIG_EXT4_FS_XATTR case Opt_user_xattr: @@ -1666,7 +1681,9 @@ static int parse_options(char *options, struct super_block *sb, data_opt = EXT4_MOUNT_WRITEBACK_DATA; datacheck: if (is_remount) { - if (test_opt(sb, DATA_FLAGS) != data_opt) { + if (!sbi->s_journal) + ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); + else if (test_opt(sb, DATA_FLAGS) != data_opt) { ext4_msg(sb, KERN_ERR, "Cannot change data mode on remount"); return 0; @@ -1801,6 +1818,7 @@ set_qf_format: break; case Opt_nodelalloc: clear_opt(sb, DELALLOC); + clear_opt2(sb, EXPLICIT_DELALLOC); break; case Opt_mblk_io_submit: set_opt(sb, MBLK_IO_SUBMIT); @@ -1817,6 +1835,7 @@ set_qf_format: break; case Opt_delalloc: set_opt(sb, DELALLOC); + set_opt2(sb, EXPLICIT_DELALLOC); break; case Opt_block_validity: set_opt(sb, BLOCK_VALIDITY); @@ -1871,7 +1890,7 @@ set_qf_format: case Opt_dioread_lock: clear_opt(sb, DIOREAD_NOLOCK); break; - case Opt_init_inode_table: + case Opt_init_itable: set_opt(sb, INIT_INODE_TABLE); if (args[0].from) { if (match_int(&args[0], &option)) @@ -1882,7 +1901,7 @@ set_qf_format: return 0; sbi->s_li_wait_mult = option; break; - case Opt_noinit_inode_table: + case Opt_noinit_itable: clear_opt(sb, INIT_INODE_TABLE); break; default: @@ -1935,7 +1954,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, res = MS_RDONLY; } if (read_only) - return res; + goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); @@ -1966,6 +1985,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); +done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", @@ -2015,8 +2035,8 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); - atomic_add(ext4_free_blks_count(sb, gdp), - &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); } @@ -2134,7 +2154,8 @@ static int ext4_check_descriptors(struct super_block *sb, if (NULL != first_not_zeroed) *first_not_zeroed = grp; - ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); + ext4_free_blocks_count_set(sbi->s_es, + EXT4_C2B(sbi, ext4_count_free_clusters(sb))); sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); return 1; } @@ -2454,7 +2475,8 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, char *buf) { return snprintf(buf, PAGE_SIZE, "%llu\n", - (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + (s64) EXT4_C2B(sbi, + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); } static ssize_t session_write_kbytes_show(struct ext4_attr *a, @@ -2682,6 +2704,13 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) && + !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_msg(sb, KERN_ERR, + "Can't support bigalloc feature without " + "extents feature\n"); + return 0; + } return 1; } @@ -2853,8 +2882,7 @@ cont_thread: } mutex_unlock(&eli->li_list_mtx); - if (freezing(current)) - refrigerator(); + try_to_freeze(); cur = jiffies; if ((time_after_eq(cur, next_wakeup)) || @@ -3070,8 +3098,6 @@ static void ext4_destroy_lazyinit_thread(void) } static int ext4_fill_super(struct super_block *sb, void *data, int silent) - __releases(kernel_lock) - __acquires(kernel_lock) { char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; @@ -3087,10 +3113,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) char *cp; const char *descr; int ret = -ENOMEM; - int blocksize; + int blocksize, clustersize; unsigned int db_count; unsigned int i; - int needs_recovery, has_huge_files; + int needs_recovery, has_huge_files, has_bigalloc; __u64 blocks_count; int err; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -3224,6 +3250,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) &journal_ioprio, NULL, 0)) goto failed_mount; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " + "with data=journal disables delayed " + "allocation and O_DIRECT support!\n"); + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DELALLOC)) + clear_opt(sb, DELALLOC); + } + + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + if (test_opt(sb, DIOREAD_NOLOCK)) { + if (blocksize < PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "dioread_nolock if block size != PAGE_SIZE"); + goto failed_mount; + } + } + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); @@ -3265,8 +3318,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) goto failed_mount; - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, @@ -3369,12 +3420,53 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_dirt = 1; } - if (sbi->s_blocks_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#blocks per group too big: %lu", - sbi->s_blocks_per_group); - goto failed_mount; + /* Handle clustersize */ + clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); + has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC); + if (has_bigalloc) { + if (clustersize < blocksize) { + ext4_msg(sb, KERN_ERR, + "cluster size (%d) smaller than " + "block size (%d)", clustersize, blocksize); + goto failed_mount; + } + sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - + le32_to_cpu(es->s_log_block_size); + sbi->s_clusters_per_group = + le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + goto failed_mount; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / blocksize))) { + ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " + "clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, + sbi->s_clusters_per_group); + goto failed_mount; + } + } else { + if (clustersize != blocksize) { + ext4_warning(sb, "fragment/cluster size (%d) != " + "block size (%d)", clustersize, + blocksize); + clustersize = blocksize; + } + if (sbi->s_blocks_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + sbi->s_clusters_per_group = sbi->s_blocks_per_group; + sbi->s_cluster_bits = 0; } + sbi->s_cluster_ratio = clustersize / blocksize; + if (sbi->s_inodes_per_group > blocksize * 8) { ext4_msg(sb, KERN_ERR, "#inodes per group too big: %lu", @@ -3446,10 +3538,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } -#ifdef CONFIG_PROC_FS if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); -#endif bgl_lock_init(sbi->s_blockgroup_lock); @@ -3483,8 +3573,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.function = print_daily_error_info; sbi->s_err_report.data = (unsigned long) sb; - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); + err = percpu_counter_init(&sbi->s_freeclusters_counter, + ext4_count_free_clusters(sb)); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, ext4_count_free_inodes(sb)); @@ -3494,7 +3584,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_count_dirs(sb)); } if (!err) { - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); @@ -3609,13 +3699,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * The journal may have updated the bg summary counts, so we * need to update the global counters. */ - percpu_counter_set(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); + percpu_counter_set(&sbi->s_freeclusters_counter, + ext4_count_free_clusters(sb)); percpu_counter_set(&sbi->s_freeinodes_counter, ext4_count_free_inodes(sb)); percpu_counter_set(&sbi->s_dirs_counter, ext4_count_dirs(sb)); - percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); + percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); no_journal: /* @@ -3643,10 +3733,12 @@ no_journal: } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); + iput(root); goto failed_mount4; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { + iput(root); ext4_msg(sb, KERN_ERR, "get root dentry failed"); ret = -ENOMEM; goto failed_mount4; @@ -3679,30 +3771,11 @@ no_journal: "available"); } - if (test_opt(sb, DELALLOC) && - (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { - ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " - "requested data journaling mode"); - clear_opt(sb, DELALLOC); - } - if (test_opt(sb, DIOREAD_NOLOCK)) { - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " - "option - requested data journaling mode"); - clear_opt(sb, DIOREAD_NOLOCK); - } - if (sb->s_blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " - "option - block size is too small"); - clear_opt(sb, DIOREAD_NOLOCK); - } - } - err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); - goto failed_mount4; + goto failed_mount4a; } ext4_ext_init(sb); @@ -3710,22 +3783,19 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); - goto failed_mount4; + goto failed_mount5; } err = ext4_register_li_request(sb, first_not_zeroed); if (err) - goto failed_mount4; + goto failed_mount6; sbi->s_kobj.kset = ext4_kset; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, "%s", sb->s_id); - if (err) { - ext4_mb_release(sb); - ext4_ext_release(sb); - goto failed_mount4; - }; + if (err) + goto failed_mount7; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); @@ -3759,13 +3829,20 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; -failed_mount4: - iput(root); +failed_mount7: + ext4_unregister_li_request(sb); +failed_mount6: + ext4_mb_release(sb); +failed_mount5: + ext4_ext_release(sb); + ext4_release_system_zone(sb); +failed_mount4a: + dput(sb->s_root); sb->s_root = NULL; +failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); failed_mount_wq: - ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -3774,10 +3851,10 @@ failed_mount3: del_timer(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); - percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: @@ -4064,7 +4141,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; - if (!sbh) + if (!sbh || block_device_ejected(sb)) return error; if (buffer_write_io_error(sbh)) { /* @@ -4100,8 +4177,9 @@ static int ext4_commit_super(struct super_block *sb, int sync) else es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - ext4_free_blocks_count_set(es, percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeblocks_counter)); + ext4_free_blocks_count_set(es, + EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeclusters_counter))); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); @@ -4506,16 +4584,34 @@ restore_opts: return err; } +/* + * Note: calculating the overhead so we can be compatible with + * historical BSD practice is quite difficult in the face of + * clusters/bigalloc. This is because multiple metadata blocks from + * different block group can end up in the same allocation cluster. + * Calculating the exact overhead in the face of clustered allocation + * requires either O(all block bitmaps) in memory or O(number of block + * groups**2) in time. We will still calculate the superblock for + * older file systems --- and if we come across with a bigalloc file + * system with zero in s_overhead_clusters the estimate will be close to + * correct especially for very large cluster sizes --- but for newer + * file systems, it's better to calculate this figure once at mkfs + * time, and store it in the superblock. If the superblock value is + * present (even for non-bigalloc file systems), we will use it. + */ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + struct ext4_group_desc *gdp; u64 fsid; s64 bfree; if (test_opt(sb, MINIX_DF)) { sbi->s_overhead_last = 0; + } else if (es->s_overhead_clusters) { + sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters); } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; @@ -4530,24 +4626,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) * All of the blocks before first_data_block are * overhead */ - overhead = le32_to_cpu(es->s_first_data_block); + overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); /* - * Add the overhead attributed to the superblock and - * block group descriptors. If the sparse superblocks - * feature is turned on, then not all groups have this. + * Add the overhead found in each block group */ for (i = 0; i < ngroups; i++) { - overhead += ext4_bg_has_super(sb, i) + - ext4_bg_num_gdb(sb, i); + gdp = ext4_get_group_desc(sb, i, NULL); + overhead += ext4_num_overhead_clusters(sb, i, gdp); cond_resched(); } - - /* - * Every block group has an inode bitmap, a block - * bitmap, and an inode table. - */ - overhead += ngroups * (2 + sbi->s_itb_per_group); sbi->s_overhead_last = overhead; smp_wmb(); sbi->s_blocks_last = ext4_blocks_count(es); @@ -4555,11 +4643,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; - bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); + buf->f_blocks = (ext4_blocks_count(es) - + EXT4_C2B(sbi, sbi->s_overhead_last)); + bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ - buf->f_bfree = max_t(s64, bfree, 0); + buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; @@ -4694,7 +4783,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, return -EINVAL; /* Quotafile not on the same filesystem? */ - if (path->mnt->mnt_sb != sb) + if (path->dentry->d_sb != sb) return -EXDEV; /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { @@ -4980,13 +5069,11 @@ static int __init ext4_init_fs(void) return err; err = ext4_init_system_zone(); if (err) - goto out7; + goto out6; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); if (!ext4_kset) - goto out6; - ext4_proc_root = proc_mkdir("fs/ext4", NULL); - if (!ext4_proc_root) goto out5; + ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = ext4_init_feat_adverts(); if (err) @@ -5022,12 +5109,12 @@ out2: out3: ext4_exit_feat_adverts(); out4: - remove_proc_entry("fs/ext4", NULL); -out5: + if (ext4_proc_root) + remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); -out6: +out5: ext4_exit_system_zone(); -out7: +out6: ext4_exit_pageio(); return err; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c757adc..93a00d8 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -820,8 +820,14 @@ inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + /* + * take i_data_sem because we will test + * i_delalloc_reserved_flag in ext4_mb_new_blocks + */ + down_read((&EXT4_I(inode)->i_data_sem)); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); + up_read((&EXT4_I(inode)->i_data_sem)); if (error) goto cleanup; @@ -985,11 +991,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); - error = ext4_get_inode_loc(inode, &is.iloc); - if (error) - goto cleanup; - - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 007c3bf..b60f9f8 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -3,7 +3,6 @@ * Handler for storing security labels as extended attributes. */ -#include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/security.h> @@ -48,28 +47,32 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int -ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; - err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, - name, value, len, 0); - kfree(name); - kfree(value); return err; } +int +ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext4_initxattrs, handle); +} + const struct xattr_handler ext4_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext4_xattr_security_list, diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 37e6ebc..95f1f4a 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -5,7 +5,6 @@ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> #include <linux/string.h> #include <linux/capability.h> #include <linux/fs.h> diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 98c3753..0edb7611 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -5,7 +5,6 @@ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include "ext4_jbd2.h" diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 5efbd5d..aca191b 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -156,8 +156,8 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, } else { if (uni_xlate == 1) { *op++ = ':'; - op = pack_hex_byte(op, ec >> 8); - op = pack_hex_byte(op, ec); + op = hex_byte_pack(op, ec >> 8); + op = hex_byte_pack(op, ec); len -= 5; } else { *op++ = '?'; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index a5d3853..66994f3 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -141,7 +141,7 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode) static inline int fat_mode_can_hold_ro(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - mode_t mask; + umode_t mask; if (S_ISDIR(inode->i_mode)) { if (!sbi->options.rodir) @@ -156,8 +156,8 @@ static inline int fat_mode_can_hold_ro(struct inode *inode) } /* Convert attribute bits and a mask to the UNIX mode. */ -static inline mode_t fat_make_mode(struct msdos_sb_info *sbi, - u8 attrs, mode_t mode) +static inline umode_t fat_make_mode(struct msdos_sb_info *sbi, + u8 attrs, umode_t mode) { if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir)) mode &= ~S_IWUGO; @@ -326,15 +326,14 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent, extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); /* fat/misc.c */ -extern void -__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))) __cold; +extern __printf(3, 4) __cold +void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); #define fat_fs_error(sb, fmt, args...) \ __fat_fs_error(sb, 1, fmt , ## args) #define fat_fs_error_ratelimit(sb, fmt, args...) \ __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) -void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))) __cold; +__printf(3, 4) __cold +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, diff --git a/fs/fat/file.c b/fs/fat/file.c index c118acf..a71fe37 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -44,7 +44,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) goto out; mutex_lock(&inode->i_mutex); - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) goto out_unlock_inode; @@ -108,7 +108,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) fat_save_attrs(inode, attr); mark_inode_dirty(inode); out_drop_write: - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); out_unlock_inode: mutex_unlock(&inode->i_mutex); out: @@ -314,7 +314,7 @@ EXPORT_SYMBOL_GPL(fat_getattr); static int fat_sanitize_mode(const struct msdos_sb_info *sbi, struct inode *inode, umode_t *mode_ptr) { - mode_t mask, perm; + umode_t mask, perm; /* * Note, the basic check is already done by a caller of @@ -351,7 +351,7 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi, static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) { - mode_t allow_utime = sbi->options.allow_utime; + umode_t allow_utime = sbi->options.allow_utime; if (current_fsuid() != inode->i_uid) { if (in_group_p(inode->i_gid)) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 1726d73..3ab8410 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -379,7 +379,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) return error; MSDOS_I(inode)->mmu_private = inode->i_size; - inode->i_nlink = fat_subdirs(inode); + set_nlink(inode, fat_subdirs(inode)); } else { /* not a directory */ inode->i_generation |= 1; inode->i_mode = fat_make_mode(sbi, de->attr, @@ -518,7 +518,6 @@ static struct inode *fat_alloc_inode(struct super_block *sb) static void fat_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); } @@ -672,7 +671,7 @@ int fat_sync_inode(struct inode *inode) EXPORT_SYMBOL_GPL(fat_sync_inode); -static int fat_show_options(struct seq_file *m, struct vfsmount *mnt); +static int fat_show_options(struct seq_file *m, struct dentry *root); static const struct super_operations fat_sops = { .alloc_inode = fat_alloc_inode, .destroy_inode = fat_destroy_inode, @@ -811,9 +810,9 @@ static const struct export_operations fat_export_ops = { .get_parent = fat_get_parent, }; -static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) +static int fat_show_options(struct seq_file *m, struct dentry *root) { - struct msdos_sb_info *sbi = MSDOS_SB(mnt->mnt_sb); + struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb); struct fat_mount_options *opts = &sbi->options; int isvfat = opts->isvfat; @@ -898,7 +897,7 @@ enum { Opt_charset, Opt_shortname_lower, Opt_shortname_win95, Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, - Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, + Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err, }; @@ -928,17 +927,17 @@ static const match_table_t fat_tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_discard, "discard"}, - {Opt_obsolate, "conv=binary"}, - {Opt_obsolate, "conv=text"}, - {Opt_obsolate, "conv=auto"}, - {Opt_obsolate, "conv=b"}, - {Opt_obsolate, "conv=t"}, - {Opt_obsolate, "conv=a"}, - {Opt_obsolate, "fat=%u"}, - {Opt_obsolate, "blocksize=%u"}, - {Opt_obsolate, "cvf_format=%20s"}, - {Opt_obsolate, "cvf_options=%100s"}, - {Opt_obsolate, "posix"}, + {Opt_obsolete, "conv=binary"}, + {Opt_obsolete, "conv=text"}, + {Opt_obsolete, "conv=auto"}, + {Opt_obsolete, "conv=b"}, + {Opt_obsolete, "conv=t"}, + {Opt_obsolete, "conv=a"}, + {Opt_obsolete, "fat=%u"}, + {Opt_obsolete, "blocksize=%u"}, + {Opt_obsolete, "cvf_format=%20s"}, + {Opt_obsolete, "cvf_options=%100s"}, + {Opt_obsolete, "posix"}, {Opt_err, NULL}, }; static const match_table_t msdos_tokens = { @@ -1170,7 +1169,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, break; /* obsolete mount options */ - case Opt_obsolate: + case Opt_obsolete: fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, " "not supported now", p); break; @@ -1233,7 +1232,7 @@ static int fat_read_root(struct inode *inode) fat_save_attrs(inode, ATTR_DIR); inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; - inode->i_nlink = fat_subdirs(inode)+2; + set_nlink(inode, fat_subdirs(inode)+2); return 0; } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 66e83b8..c5938c9 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -264,7 +264,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, } /***** Create a file */ -static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, +static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct super_block *sb = dir->i_sb; @@ -346,7 +346,7 @@ out: } /***** Make a directory */ -static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; @@ -387,7 +387,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) /* the directory was completed, just return a error */ goto out; } - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mtime = inode->i_atime = inode->i_ctime = ts; /* timestamp is already written, so mark_inode_dirty() is unneeded. */ diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index bb3f29c..a81eb23 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -512,7 +512,8 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, int charlen; if (utf8) { - *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); + *outlen = utf8s_to_utf16s(name, len, UTF16_HOST_ENDIAN, + (wchar_t *) outname, FAT_LFN_LEN + 2); if (*outlen < 0) return *outlen; else if (*outlen > FAT_LFN_LEN) @@ -781,7 +782,7 @@ error: return ERR_PTR(err); } -static int vfat_create(struct inode *dir, struct dentry *dentry, int mode, +static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct super_block *sb = dir->i_sb; @@ -870,7 +871,7 @@ out: return err; } -static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -900,7 +901,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; } inode->i_version++; - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_mtime = inode->i_atime = inode->i_ctime = ts; /* timestamp is already written, so mark_inode_dirty() is unneeded. */ diff --git a/fs/fhandle.c b/fs/fhandle.c index 6b08864..a48e4a1 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -10,6 +10,7 @@ #include <linux/personality.h> #include <asm/uaccess.h> #include "internal.h" +#include "mount.h" static long do_sys_name_to_handle(struct path *path, struct file_handle __user *ufh, @@ -24,8 +25,8 @@ static long do_sys_name_to_handle(struct path *path, * We need t make sure wether the file system * support decoding of the file handle */ - if (!path->mnt->mnt_sb->s_export_op || - !path->mnt->mnt_sb->s_export_op->fh_to_dentry) + if (!path->dentry->d_sb->s_export_op || + !path->dentry->d_sb->s_export_op->fh_to_dentry) return -EOPNOTSUPP; if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) @@ -66,7 +67,8 @@ static long do_sys_name_to_handle(struct path *path, } else retval = 0; /* copy the mount id */ - if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) || + if (copy_to_user(mnt_id, &real_mount(path->mnt)->mnt_id, + sizeof(*mnt_id)) || copy_to_user(ufh, handle, sizeof(struct file_handle) + handle_bytes)) retval = -EFAULT; diff --git a/fs/file_table.c b/fs/file_table.c index c322794..20002e3 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -474,29 +474,6 @@ void file_sb_list_del(struct file *file) #endif -int fs_may_remount_ro(struct super_block *sb) -{ - struct file *file; - /* Check that no files are currently opened for writing. */ - lg_global_lock(files_lglock); - do_file_list_for_each_entry(sb, file) { - struct inode *inode = file->f_path.dentry->d_inode; - - /* File with pending delete? */ - if (inode->i_nlink == 0) - goto too_bad; - - /* Writeable file? */ - if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) - goto too_bad; - } while_file_list_for_each_entry; - lg_global_unlock(files_lglock); - return 1; /* Tis' cool bro. */ -too_bad: - lg_global_unlock(files_lglock); - return 0; -} - /** * mark_files_ro - mark all files read-only * @sb: superblock in question diff --git a/fs/filesystems.c b/fs/filesystems.c index 0845f84..96f2428 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -74,7 +74,6 @@ int register_filesystem(struct file_system_type * fs) BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; - INIT_LIST_HEAD(&fs->fs_supers); write_lock(&file_systems_lock); p = find_filesystem(fs->name, strlen(fs->name)); if (*p) diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 1a43114..cf9ef91 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -187,10 +187,10 @@ vxfs_stiget(struct super_block *sbp, ino_t ino) * vxfs_transmod returns a Linux mode_t for a given * VxFS inode structure. */ -static __inline__ mode_t +static __inline__ umode_t vxfs_transmod(struct vxfs_inode_info *vip) { - mode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK; + umode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK; if (VXFS_ISFIFO(vip)) ret |= S_IFIFO; @@ -227,7 +227,7 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip) ip->i_uid = (uid_t)vip->vii_uid; ip->i_gid = (gid_t)vip->vii_gid; - ip->i_nlink = vip->vii_nlink; + set_nlink(ip, vip->vii_nlink); ip->i_size = vip->vii_size; ip->i_atime.tv_sec = vip->vii_atime; @@ -340,7 +340,6 @@ vxfs_iget(struct super_block *sbp, ino_t ino) static void vxfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(vxfs_inode_cachep, inode->i_private); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 04cf3b9..e295150 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -25,7 +25,6 @@ #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> -#include <linux/buffer_head.h> #include <linux/tracepoint.h> #include "internal.h" @@ -41,6 +40,7 @@ struct wb_writeback_work { unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; + enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ struct completion *done; /* set if the caller waits */ @@ -115,7 +115,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic) + bool range_cyclic, enum wb_reason reason) { struct wb_writeback_work *work; @@ -135,6 +135,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; + work->reason = reason; bdi_queue_work(bdi, work); } @@ -143,6 +144,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, * bdi_start_writeback - start writeback * @bdi: the backing device to write from * @nr_pages: the number of pages to write + * @reason: reason why some writeback work was initiated * * Description: * This does WB_SYNC_NONE opportunistic writeback. The IO is only @@ -150,9 +152,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, * completion. Caller need not hold sb s_umount semaphore. * */ -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + enum wb_reason reason) { - __bdi_start_writeback(bdi, nr_pages, true); + __bdi_start_writeback(bdi, nr_pages, true, reason); } /** @@ -251,7 +254,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - unsigned long *older_than_this) + struct wb_writeback_work *work) { LIST_HEAD(tmp); struct list_head *pos, *node; @@ -262,8 +265,8 @@ static int move_expired_inodes(struct list_head *delaying_queue, while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (older_than_this && - inode_dirtied_after(inode, *older_than_this)) + if (work->older_than_this && + inode_dirtied_after(inode, *work->older_than_this)) break; if (sb && sb != inode->i_sb) do_sb_sort = 1; @@ -302,13 +305,13 @@ out: * | * +--> dequeue for IO */ -static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) +static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) { int moved; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); - trace_writeback_queue_io(wb, older_than_this, moved); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); + trace_writeback_queue_io(wb, work, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) @@ -641,31 +644,40 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, return wrote; } -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, + .reason = reason, }; spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, NULL); + queue_io(wb, &work); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); return nr_pages - work.nr_pages; } -static inline bool over_bground_thresh(void) +static bool over_bground_thresh(struct backing_dev_info *bdi) { unsigned long background_thresh, dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); - return (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) > background_thresh); + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) > background_thresh) + return true; + + if (bdi_stat(bdi, BDI_RECLAIMABLE) > + bdi_dirty_limit(bdi, background_thresh)) + return true; + + return false; } /* @@ -675,7 +687,7 @@ static inline bool over_bground_thresh(void) static void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time); } /* @@ -727,7 +739,7 @@ static long wb_writeback(struct bdi_writeback *wb, * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh()) + if (work->for_background && !over_bground_thresh(wb->bdi)) break; if (work->for_kupdate) { @@ -738,7 +750,7 @@ static long wb_writeback(struct bdi_writeback *wb, trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) - queue_io(wb, work->older_than_this); + queue_io(wb, work); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else @@ -811,13 +823,14 @@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh()) { + if (over_bground_thresh(wb->bdi)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, .sync_mode = WB_SYNC_NONE, .for_background = 1, .range_cyclic = 1, + .reason = WB_REASON_BACKGROUND, }; return wb_writeback(wb, &work); @@ -851,6 +864,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, + .reason = WB_REASON_PERIODIC, }; return wb_writeback(wb, &work); @@ -922,7 +936,7 @@ int bdi_writeback_thread(void *data) trace_writeback_thread_start(bdi); - while (!kthread_should_stop()) { + while (!kthread_freezable_should_stop(NULL)) { /* * Remove own delayed wake-up timer, since we are already awake * and we'll take care of the preriodic write-back. @@ -952,8 +966,6 @@ int bdi_writeback_thread(void *data) */ schedule(); } - - try_to_freeze(); } /* Flush any work that raced with us exiting */ @@ -969,7 +981,7 @@ int bdi_writeback_thread(void *data) * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. */ -void wakeup_flusher_threads(long nr_pages) +void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) { struct backing_dev_info *bdi; @@ -982,7 +994,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false); + __bdi_start_writeback(bdi, nr_pages, false, reason); } rcu_read_unlock(); } @@ -1198,12 +1210,15 @@ static void wait_sb_inodes(struct super_block *sb) * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock * @nr: the number of pages to write + * @reason: reason why some writeback work initiated * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ -void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) +void writeback_inodes_sb_nr(struct super_block *sb, + unsigned long nr, + enum wb_reason reason) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { @@ -1212,6 +1227,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) .tagged_writepages = 1, .done = &done, .nr_pages = nr, + .reason = reason, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); @@ -1223,29 +1239,31 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr); /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock + * @reason: reason why some writeback work was initiated * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ -void writeback_inodes_sb(struct super_block *sb) +void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { - return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); + return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(writeback_inodes_sb); /** * writeback_inodes_sb_if_idle - start writeback if none underway * @sb: the superblock + * @reason: reason why some writeback work was initiated * * Invoke writeback_inodes_sb if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ -int writeback_inodes_sb_if_idle(struct super_block *sb) +int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason) { if (!writeback_in_progress(sb->s_bdi)) { down_read(&sb->s_umount); - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, reason); up_read(&sb->s_umount); return 1; } else @@ -1257,16 +1275,18 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); * writeback_inodes_sb_if_idle - start writeback if none underway * @sb: the superblock * @nr: the number of pages to write + * @reason: reason why some writeback work was initiated * * Invoke writeback_inodes_sb if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ int writeback_inodes_sb_nr_if_idle(struct super_block *sb, - unsigned long nr) + unsigned long nr, + enum wb_reason reason) { if (!writeback_in_progress(sb->s_bdi)) { down_read(&sb->s_umount); - writeback_inodes_sb_nr(sb, nr); + writeback_inodes_sb_nr(sb, nr, reason); up_read(&sb->s_umount); return 1; } else @@ -1290,6 +1310,7 @@ void sync_inodes_sb(struct super_block *sb) .nr_pages = LONG_MAX, .range_cyclic = 0, .done = &done, + .reason = WB_REASON_SYNC, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 85542a7..42593c5 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -231,7 +231,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, if (iop) inode->i_op = iop; inode->i_fop = fop; - inode->i_nlink = nlink; + set_nlink(inode, nlink); inode->i_private = fc; d_add(dentry, inode); return dentry; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b6cca47..3426521 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -47,6 +47,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stat.h> +#include <linux/module.h> #include "fuse_i.h" diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5cb8614..2aaf3ea 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1512,7 +1512,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, else if (outarg->offset + num > file_size) num = file_size - outarg->offset; - while (num) { + while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { struct page *page; unsigned int this_num; @@ -1526,6 +1526,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, num -= this_num; total_len += this_num; + index++; } req->misc.retrieve_in.offset = outarg->offset; req->misc.retrieve_in.size = total_len; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 9f63e49..5ddd6ea8 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -369,8 +369,8 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, * If the filesystem doesn't support this, then fall back to separate * 'mknod' + 'open' requests. */ -static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, - struct nameidata *nd) +static int fuse_create_open(struct inode *dir, struct dentry *entry, + umode_t mode, struct nameidata *nd) { int err; struct inode *inode; @@ -480,7 +480,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, */ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, struct inode *dir, struct dentry *entry, - int mode) + umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; @@ -547,7 +547,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, return err; } -static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, +static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; @@ -573,7 +573,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, return create_new_entry(fc, req, dir, entry, mode); } -static int fuse_create(struct inode *dir, struct dentry *entry, int mode, +static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, struct nameidata *nd) { if (nd) { @@ -585,7 +585,7 @@ static int fuse_create(struct inode *dir, struct dentry *entry, int mode, return fuse_mknod(dir, entry, mode, 0); } -static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode) +static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 594f07a..0c84100 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1556,7 +1556,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) struct inode *inode = file->f_path.dentry->d_inode; mutex_lock(&inode->i_mutex); - if (origin != SEEK_CUR || origin != SEEK_SET) { + if (origin != SEEK_CUR && origin != SEEK_SET) { retval = fuse_update_attributes(inode, NULL, file, NULL); if (retval) goto exit; @@ -1567,6 +1567,10 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) offset += i_size_read(inode); break; case SEEK_CUR: + if (offset == 0) { + retval = file->f_pos; + goto exit; + } offset += file->f_pos; break; case SEEK_DATA: diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index cf6db0a..1964da0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -80,7 +80,7 @@ struct fuse_inode { /** The sticky bit in inode->i_mode may have been removed, so preserve the original mode */ - mode_t orig_i_mode; + umode_t orig_i_mode; /** Version of last attribute change */ u64 attr_version; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index add96f6..64cf8d0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -107,7 +107,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) static void fuse_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(fuse_inode_cachep, inode); } @@ -151,7 +150,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_ino = attr->ino; inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); - inode->i_nlink = attr->nlink; + set_nlink(inode, attr->nlink); inode->i_uid = attr->uid; inode->i_gid = attr->gid; inode->i_blocks = attr->blocks; @@ -498,9 +497,10 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) return 1; } -static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) +static int fuse_show_options(struct seq_file *m, struct dentry *root) { - struct fuse_conn *fc = get_fuse_conn_super(mnt->mnt_sb); + struct super_block *sb = root->d_sb; + struct fuse_conn *fc = get_fuse_conn_super(sb); seq_printf(m, ",user_id=%u", fc->user_id); seq_printf(m, ",group_id=%u", fc->group_id); @@ -510,9 +510,8 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",allow_other"); if (fc->max_read != ~0) seq_printf(m, ",max_read=%u", fc->max_read); - if (mnt->mnt_sb->s_bdev && - mnt->mnt_sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) - seq_printf(m, ",blksize=%lu", mnt->mnt_sb->s_blocksize); + if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", sb->s_blocksize); return 0; } @@ -1138,28 +1137,28 @@ static int __init fuse_fs_init(void) { int err; - err = register_filesystem(&fuse_fs_type); - if (err) - goto out; - - err = register_fuseblk(); - if (err) - goto out_unreg; - fuse_inode_cachep = kmem_cache_create("fuse_inode", sizeof(struct fuse_inode), 0, SLAB_HWCACHE_ALIGN, fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) - goto out_unreg2; + goto out; + + err = register_fuseblk(); + if (err) + goto out2; + + err = register_filesystem(&fuse_fs_type); + if (err) + goto out3; return 0; - out_unreg2: + out3: unregister_fuseblk(); - out_unreg: - unregister_filesystem(&fuse_fs_type); + out2: + kmem_cache_destroy(fuse_inode_cachep); out: return err; } diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 34501b6..230eb0f 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -38,8 +38,9 @@ static const char *gfs2_acl_name(int type) return NULL; } -static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) +struct posix_acl *gfs2_get_acl(struct inode *inode, int type) { + struct gfs2_inode *ip = GFS2_I(inode); struct posix_acl *acl; const char *name; char *data; @@ -67,11 +68,6 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) return acl; } -struct posix_acl *gfs2_get_acl(struct inode *inode, int type) -{ - return gfs2_acl_get(GFS2_I(inode), type); -} - static int gfs2_set_mode(struct inode *inode, umode_t mode) { int error = 0; @@ -82,7 +78,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode) iattr.ia_valid = ATTR_MODE; iattr.ia_mode = mode; - error = gfs2_setattr_simple(GFS2_I(inode), &iattr); + error = gfs2_setattr_simple(inode, &iattr); } return error; @@ -125,7 +121,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) if (S_ISLNK(inode->i_mode)) return 0; - acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT); + acl = gfs2_get_acl(&dip->i_inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) return PTR_ERR(acl); if (!acl) { @@ -160,16 +156,17 @@ out: int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) { + struct inode *inode = &ip->i_inode; struct posix_acl *acl; char *data; unsigned int len; int error; - acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS); + acl = gfs2_get_acl(&ip->i_inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (!acl) - return gfs2_setattr_simple(ip, attr); + return gfs2_setattr_simple(inode, attr); error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); if (error) @@ -215,7 +212,7 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, if (type < 0) return type; - acl = gfs2_acl_get(GFS2_I(inode), type); + acl = gfs2_get_acl(inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index f9fbbe9..501e5cb 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, unsigned int data_blocks = 0, ind_blocks = 0, rblocks; int alloc_required; int error = 0; - struct gfs2_alloc *al = NULL; + struct gfs2_qadata *qa = NULL; pgoff_t index = pos >> PAGE_CACHE_SHIFT; unsigned from = pos & (PAGE_CACHE_SIZE - 1); struct page *page; @@ -639,8 +639,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); if (alloc_required) { - al = gfs2_alloc_get(ip); - if (!al) { + qa = gfs2_qadata_get(ip); + if (!qa) { error = -ENOMEM; goto out_unlock; } @@ -649,8 +649,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (error) goto out_alloc_put; - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); + error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); if (error) goto out_qunlock; } @@ -663,7 +662,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (&ip->i_inode == sdp->sd_rindex) rblocks += 2 * RES_STATFS; if (alloc_required) - rblocks += gfs2_rg_blocks(al); + rblocks += gfs2_rg_blocks(ip); error = gfs2_trans_begin(sdp, rblocks, PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); @@ -711,7 +710,7 @@ out_trans_fail: out_qunlock: gfs2_quota_unlock(ip); out_alloc_put: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); } out_unlock: if (&ip->i_inode == sdp->sd_rindex) { @@ -787,7 +786,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, u64 to = pos + copied; void *kaddr; unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); - struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); kaddr = kmap_atomic(page, KM_USER0); @@ -804,7 +802,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, if (copied) { if (inode->i_size < to) i_size_write(inode, to); - gfs2_dinode_out(ip, di); mark_inode_dirty(inode); } @@ -850,7 +847,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct buffer_head *dibh; - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_qadata *qa = ip->i_qadata; unsigned int from = pos & (PAGE_CACHE_SIZE - 1); unsigned int to = from + len; int ret; @@ -873,10 +870,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, gfs2_page_add_databufs(ip, page, from, to); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (ret > 0) { - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(inode); - } if (inode == sdp->sd_rindex) { adjust_fs_space(inode); @@ -886,10 +879,11 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, brelse(dibh); failed: gfs2_trans_end(sdp); - if (al) { + if (ip->i_res) gfs2_inplace_release(ip); + if (qa) { gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); } if (inode == sdp->sd_rindex) { gfs2_glock_dq(&m_ip->i_gh); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 7878c47..14a7040 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -10,6 +10,7 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> +#include <linux/blkdev.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> @@ -36,11 +37,6 @@ struct metapath { __u16 mp_list[GFS2_MAX_META_HEIGHT]; }; -typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh, - struct buffer_head *bh, __be64 *top, - __be64 *bottom, unsigned int height, - void *data); - struct strip_mine { int sm_first; unsigned int sm_height; @@ -137,7 +133,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) and write it out to disk */ unsigned int n = 1; - error = gfs2_alloc_block(ip, &block, &n); + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); if (error) goto out_brelse; if (isdir) { @@ -273,6 +269,30 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; } +static void gfs2_metapath_ra(struct gfs2_glock *gl, + const struct buffer_head *bh, const __be64 *pos) +{ + struct buffer_head *rabh; + const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size); + const __be64 *t; + + for (t = pos; t < endp; t++) { + if (!*t) + continue; + + rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE); + if (trylock_buffer(rabh)) { + if (!buffer_uptodate(rabh)) { + rabh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, rabh); + continue; + } + unlock_buffer(rabh); + } + brelse(rabh); + } +} + /** * lookup_metapath - Walk the metadata tree to a specific point * @ip: The inode @@ -432,12 +452,14 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct super_block *sb = sdp->sd_vfs; struct buffer_head *dibh = mp->mp_bh[0]; u64 bn, dblock = 0; unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; unsigned dblks = 0; unsigned ptrs_per_blk; const unsigned end_of_metadata = height - 1; + int ret; int eob = 0; enum alloc_state state; __be64 *ptr; @@ -481,7 +503,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, do { int error; n = blks - alloced; - error = gfs2_alloc_block(ip, &bn, &n); + error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) return error; alloced += n; @@ -540,6 +562,15 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, dblock = bn; while (n-- > 0) *ptr++ = cpu_to_be64(bn++); + if (buffer_zeronew(bh_map)) { + ret = sb_issue_zeroout(sb, dblock, dblks, + GFP_NOFS); + if (ret) { + fs_err(sdp, + "Failed to zero data buffers\n"); + clear_buffer_zeronew(bh_map); + } + } break; } } while ((state != ALLOC_DATA) || !dblock); @@ -668,76 +699,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi } /** - * recursive_scan - recursively scan through the end of a file - * @ip: the inode - * @dibh: the dinode buffer - * @mp: the path through the metadata to the point to start - * @height: the height the recursion is at - * @block: the indirect block to look at - * @first: 1 if this is the first block - * @bc: the call to make for each piece of metadata - * @data: data opaque to this function to pass to @bc - * - * When this is first called @height and @block should be zero and - * @first should be 1. - * - * Returns: errno - */ - -static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, - struct metapath *mp, unsigned int height, - u64 block, int first, block_call_t bc, - void *data) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct buffer_head *bh = NULL; - __be64 *top, *bottom; - u64 bn; - int error; - int mh_size = sizeof(struct gfs2_meta_header); - - if (!height) { - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - return error; - dibh = bh; - - top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; - bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; - } else { - error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); - if (error) - return error; - - top = (__be64 *)(bh->b_data + mh_size) + - (first ? mp->mp_list[height] : 0); - - bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; - } - - error = bc(ip, dibh, bh, top, bottom, height, data); - if (error) - goto out; - - if (height < ip->i_height - 1) - for (; top < bottom; top++, first = 0) { - if (!*top) - continue; - - bn = be64_to_cpu(*top); - - error = recursive_scan(ip, dibh, mp, height + 1, bn, - first, bc, data); - if (error) - break; - } - -out: - brelse(bh); - return error; -} - -/** * do_strip - Look for a layer a particular layer of the file and strip it off * @ip: the inode * @dibh: the dinode buffer @@ -752,9 +713,8 @@ out: static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, struct buffer_head *bh, __be64 *top, __be64 *bottom, - unsigned int height, void *data) + unsigned int height, struct strip_mine *sm) { - struct strip_mine *sm = data; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrp_list rlist; u64 bn, bstart; @@ -783,14 +743,6 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, else if (ip->i_depth) revokes = sdp->sd_inptrs; - if (ip != GFS2_I(sdp->sd_rindex)) - error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); - else if (!sdp->sd_rgrps) - error = gfs2_ri_update(ip); - - if (error) - return error; - memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); bstart = 0; blen = 0; @@ -805,7 +757,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, blen++; else { if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); bstart = bn; blen = 1; @@ -813,7 +765,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, } if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); else goto out; /* Nothing to do */ @@ -887,12 +839,82 @@ out_rg_gunlock: out_rlist: gfs2_rlist_free(&rlist); out: - if (ip != GFS2_I(sdp->sd_rindex)) - gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); return error; } /** + * recursive_scan - recursively scan through the end of a file + * @ip: the inode + * @dibh: the dinode buffer + * @mp: the path through the metadata to the point to start + * @height: the height the recursion is at + * @block: the indirect block to look at + * @first: 1 if this is the first block + * @sm: data opaque to this function to pass to @bc + * + * When this is first called @height and @block should be zero and + * @first should be 1. + * + * Returns: errno + */ + +static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, + struct metapath *mp, unsigned int height, + u64 block, int first, struct strip_mine *sm) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh = NULL; + __be64 *top, *bottom; + u64 bn; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (!height) { + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + dibh = bh; + + top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; + bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; + } else { + error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); + if (error) + return error; + + top = (__be64 *)(bh->b_data + mh_size) + + (first ? mp->mp_list[height] : 0); + + bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; + } + + error = do_strip(ip, dibh, bh, top, bottom, height, sm); + if (error) + goto out; + + if (height < ip->i_height - 1) { + + gfs2_metapath_ra(ip->i_gl, bh, top); + + for (; top < bottom; top++, first = 0) { + if (!*top) + continue; + + bn = be64_to_cpu(*top); + + error = recursive_scan(ip, dibh, mp, height + 1, bn, + first, sm); + if (error) + break; + } + } +out: + brelse(bh); + return error; +} + + +/** * gfs2_block_truncate_page - Deal with zeroing out data for truncate * * This is partly borrowed from ext3. @@ -1019,7 +1041,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; find_metapath(sdp, lblock, &mp, ip->i_height); - if (!gfs2_alloc_get(ip)) + if (!gfs2_qadata_get(ip)) return -ENOMEM; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); @@ -1031,7 +1053,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) sm.sm_first = !!size; sm.sm_height = height; - error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); + error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm); if (error) break; } @@ -1039,7 +1061,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) gfs2_quota_unhold(ip); out: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); return error; } @@ -1141,21 +1163,20 @@ static int do_grow(struct inode *inode, u64 size) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh; - struct gfs2_alloc *al = NULL; + struct gfs2_qadata *qa = NULL; int error; if (gfs2_is_stuffed(ip) && (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { - al = gfs2_alloc_get(ip); - if (al == NULL) + qa = gfs2_qadata_get(ip); + if (qa == NULL) return -ENOMEM; error = gfs2_quota_lock_check(ip); if (error) goto do_grow_alloc_put; - al->al_requested = 1; - error = gfs2_inplace_reserve(ip); + error = gfs2_inplace_reserve(ip, 1); if (error) goto do_grow_qunlock; } @@ -1164,7 +1185,7 @@ static int do_grow(struct inode *inode, u64 size) if (error) goto do_grow_release; - if (al) { + if (qa) { error = gfs2_unstuff_dinode(ip, NULL); if (error) goto do_end_trans; @@ -1183,12 +1204,12 @@ static int do_grow(struct inode *inode, u64 size) do_end_trans: gfs2_trans_end(sdp); do_grow_release: - if (al) { + if (qa) { gfs2_inplace_release(ip); do_grow_qunlock: gfs2_quota_unlock(ip); do_grow_alloc_put: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); } return error; } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 1cc2f8e..c35573a 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -76,6 +76,8 @@ #define IS_LEAF 1 /* Hashed (leaf) directory */ #define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ +#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */ + #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) @@ -240,16 +242,15 @@ fail: return error; } -static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, - u64 offset, unsigned int size) +static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) { struct buffer_head *dibh; int error; error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - offset += sizeof(struct gfs2_dinode); - memcpy(buf, dibh->b_data + offset, size); + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); brelse(dibh); } @@ -261,13 +262,12 @@ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, * gfs2_dir_read_data - Read a data from a directory inode * @ip: The GFS2 Inode * @buf: The buffer to place result into - * @offset: File offset to begin jdata_readng from * @size: Amount of data to transfer * * Returns: The amount of data actually copied or the error */ -static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, - unsigned int size, unsigned ra) +static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); u64 lblock, dblock; @@ -275,24 +275,14 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, unsigned int o; int copied = 0; int error = 0; - u64 disksize = i_size_read(&ip->i_inode); - - if (offset >= disksize) - return 0; - - if (offset + size > disksize) - size = disksize - offset; - - if (!size) - return 0; if (gfs2_is_stuffed(ip)) - return gfs2_dir_read_stuffed(ip, buf, offset, size); + return gfs2_dir_read_stuffed(ip, buf, size); if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) return -EINVAL; - lblock = offset; + lblock = 0; o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); while (copied < size) { @@ -311,8 +301,6 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, if (error || !dblock) goto fail; BUG_ON(extlen < 1); - if (!ra) - extlen = 1; bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); } else { error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); @@ -328,7 +316,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, extlen--; memcpy(buf, bh->b_data + o, amount); brelse(bh); - buf += amount; + buf += (amount/sizeof(__be64)); copied += amount; lblock++; o = sizeof(struct gfs2_meta_header); @@ -371,7 +359,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) if (hc == NULL) return ERR_PTR(-ENOMEM); - ret = gfs2_dir_read_data(ip, (char *)hc, 0, hsize, 1); + ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { kfree(hc); return ERR_PTR(ret); @@ -835,7 +823,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct gfs2_dirent *dent; struct qstr name = { .name = "", .len = 0, .hash = 0 }; - error = gfs2_alloc_block(ip, &bn, &n); + error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) return NULL; bh = gfs2_meta_new(ip->i_gl, bn); @@ -1390,6 +1378,52 @@ out: return error; } +/** + * gfs2_dir_readahead - Issue read-ahead requests for leaf blocks. + * + * Note: we can't calculate each index like dir_e_read can because we don't + * have the leaf, and therefore we don't have the depth, and therefore we + * don't have the length. So we have to just read enough ahead to make up + * for the loss of information. + */ +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, + struct file_ra_state *f_ra) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *gl = ip->i_gl; + struct buffer_head *bh; + u64 blocknr = 0, last; + unsigned count; + + /* First check if we've already read-ahead for the whole range. */ + if (index + MAX_RA_BLOCKS < f_ra->start) + return; + + f_ra->start = max((pgoff_t)index, f_ra->start); + for (count = 0; count < MAX_RA_BLOCKS; count++) { + if (f_ra->start >= hsize) /* if exceeded the hash table */ + break; + + last = blocknr; + blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]); + f_ra->start++; + if (blocknr == last) + continue; + + bh = gfs2_getbuf(gl, blocknr, 1); + if (trylock_buffer(bh)) { + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + continue; + } + bh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, bh); + continue; + } + brelse(bh); + } +} /** * dir_e_read - Reads the entries from a directory into a filldir buffer @@ -1402,7 +1436,7 @@ out: */ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir) + filldir_t filldir, struct file_ra_state *f_ra) { struct gfs2_inode *dip = GFS2_I(inode); u32 hsize, len = 0; @@ -1416,10 +1450,14 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, hash = gfs2_dir_offset2hash(*offset); index = hash >> (32 - dip->i_depth); + if (dip->i_hash_cache == NULL) + f_ra->start = 0; lp = gfs2_dir_get_hash_table(dip); if (IS_ERR(lp)) return PTR_ERR(lp); + gfs2_dir_readahead(inode, hsize, index, f_ra); + while (index < hsize) { error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, &copied, &depth, @@ -1437,7 +1475,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, } int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir) + filldir_t filldir, struct file_ra_state *f_ra) { struct gfs2_inode *dip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1451,7 +1489,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, return 0; if (dip->i_diskflags & GFS2_DIF_EXHASH) - return dir_e_read(inode, offset, opaque, filldir); + return dir_e_read(inode, offset, opaque, filldir, f_ra); if (!gfs2_is_stuffed(dip)) { gfs2_consist_inode(dip); @@ -1695,7 +1733,6 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; - int error; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1724,22 +1761,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) } brelse(bh); - error = gfs2_meta_inode_buffer(dip, &bh); - if (error) - return error; - if (!dip->i_entries) gfs2_consist_inode(dip); - gfs2_trans_add_bh(dip->i_gl, bh, 1); dip->i_entries--; dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; if (S_ISDIR(dentry->d_inode->i_mode)) drop_nlink(&dip->i_inode); - gfs2_dinode_out(dip, bh->b_data); - brelse(bh); mark_inode_dirty(&dip->i_inode); - return error; + return 0; } /** @@ -1820,7 +1850,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (!ht) return -ENOMEM; - if (!gfs2_alloc_get(dip)) { + if (!gfs2_qadata_get(dip)) { error = -ENOMEM; goto out; } @@ -1829,10 +1859,6 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (error) goto out_put; - error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); - if (error) - goto out_qs; - /* Count the number of leaves */ bh = leaf_bh; @@ -1847,7 +1873,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (blk != leaf_no) brelse(bh); - gfs2_rlist_add(sdp, &rlist, blk); + gfs2_rlist_add(dip, &rlist, blk); l_blocks++; } @@ -1911,11 +1937,9 @@ out_rg_gunlock: gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); out_rlist: gfs2_rlist_free(&rlist); - gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); -out_qs: gfs2_quota_unhold(dip); out_put: - gfs2_alloc_put(dip); + gfs2_qadata_put(dip); out: kfree(ht); return error; diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index ff5772f..98c960b 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -25,7 +25,7 @@ extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, const struct gfs2_inode *ip); extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir); + filldir_t filldir, struct file_ra_state *f_ra); extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, const struct gfs2_inode *nip, unsigned int new_type); diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index fe9945f..70ba891 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -99,6 +99,7 @@ static int gfs2_get_name(struct dentry *parent, char *name, struct gfs2_holder gh; u64 offset = 0; int error; + struct file_ra_state f_ra = { .start = 0 }; if (!dir) return -EINVAL; @@ -118,7 +119,7 @@ static int gfs2_get_name(struct dentry *parent, char *name, if (error) return error; - error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir); + error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra); gfs2_glock_dq_uninit(&gh); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index edeb9e8..c5fb359 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -59,15 +59,24 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) struct gfs2_holder i_gh; loff_t error; - if (origin == 2) { + switch (origin) { + case SEEK_END: /* These reference inode->i_size */ + case SEEK_DATA: + case SEEK_HOLE: error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { - error = generic_file_llseek_unlocked(file, offset, origin); + error = generic_file_llseek(file, offset, origin); gfs2_glock_dq_uninit(&i_gh); } - } else - error = generic_file_llseek_unlocked(file, offset, origin); + break; + case SEEK_CUR: + case SEEK_SET: + error = generic_file_llseek(file, offset, origin); + break; + default: + error = -EINVAL; + } return error; } @@ -96,7 +105,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) return error; } - error = gfs2_dir_read(dir, &offset, dirent, filldir); + error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra); gfs2_glock_dq_uninit(&d_gh); @@ -214,7 +223,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) int error; u32 new_flags, flags; - error = mnt_want_write(filp->f_path.mnt); + error = mnt_want_write_file(filp); if (error) return error; @@ -276,7 +285,7 @@ out_trans_end: out: gfs2_glock_dq_uninit(&gh); out_drop_write: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return error; } @@ -356,9 +365,16 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; struct gfs2_holder gh; - struct gfs2_alloc *al; + struct gfs2_qadata *qa; + loff_t size; int ret; + /* Wait if fs is frozen. This is racy so we check again later on + * and retry if the fs has been frozen after the page lock has + * been acquired + */ + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) @@ -367,19 +383,25 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); - if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) + if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) { + lock_page(page); + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(page); + } goto out_unlock; + } + ret = -ENOMEM; - al = gfs2_alloc_get(ip); - if (al == NULL) + qa = gfs2_qadata_get(ip); + if (qa == NULL) goto out_unlock; ret = gfs2_quota_lock_check(ip); if (ret) goto out_alloc_put; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; - ret = gfs2_inplace_reserve(ip); + ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); if (ret) goto out_quota_unlock; @@ -388,7 +410,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) { rblocks += RES_STATFS + RES_QUOTA; - rblocks += gfs2_rg_blocks(al); + rblocks += gfs2_rg_blocks(ip); } ret = gfs2_trans_begin(sdp, rblocks, 0); if (ret) @@ -396,37 +418,51 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) lock_page(page); ret = -EINVAL; - last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; - if (page->index > last_index) - goto out_unlock_page; + size = i_size_read(inode); + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + /* Check page index against inode size */ + if (size == 0 || (page->index > last_index)) + goto out_trans_end; + + ret = -EAGAIN; + /* If truncated, we must retry the operation, we may have raced + * with the glock demotion code. + */ + if (!PageUptodate(page) || page->mapping != inode->i_mapping) + goto out_trans_end; + + /* Unstuff, if required, and allocate backing blocks for page */ ret = 0; - if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) - goto out_unlock_page; - if (gfs2_is_stuffed(ip)) { + if (gfs2_is_stuffed(ip)) ret = gfs2_unstuff_dinode(ip, page); - if (ret) - goto out_unlock_page; - } - ret = gfs2_allocate_page_backing(page); + if (ret == 0) + ret = gfs2_allocate_page_backing(page); -out_unlock_page: - unlock_page(page); +out_trans_end: + if (ret) + unlock_page(page); gfs2_trans_end(sdp); out_trans_fail: gfs2_inplace_release(ip); out_quota_unlock: gfs2_quota_unlock(ip); out_alloc_put: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else if (ret) - ret = VM_FAULT_SIGBUS; - return ret; + if (ret == 0) { + set_page_dirty(page); + /* This check must be post dropping of transaction lock */ + if (inode->i_sb->s_frozen == SB_UNFROZEN) { + wait_on_page_writeback(page); + } else { + ret = -EAGAIN; + unlock_page(page); + } + } + return block_page_mkwrite_return(ret); } static const struct vm_operations_struct gfs2_vm_ops = { @@ -551,8 +587,16 @@ static int gfs2_close(struct inode *inode, struct file *file) * @end: the end position in the file to sync * @datasync: set if we can ignore timestamp changes * - * The VFS will flush data for us. We only need to worry - * about metadata here. + * We split the data flushing here so that we don't wait for the data + * until after we've also sent the metadata to disk. Note that for + * data=ordered, we will write & wait for the data at the log flush + * stage anyway, so this is unlikely to make much of a difference + * except in the data=writeback case. + * + * If the fdatawrite fails due to any reason except -EIO, we will + * continue the remainder of the fsync, although we'll still report + * the error at the end. This is to match filemap_write_and_wait_range() + * behaviour. * * Returns: errno */ @@ -560,30 +604,34 @@ static int gfs2_close(struct inode *inode, struct file *file) static int gfs2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); struct gfs2_inode *ip = GFS2_I(inode); - int ret; + int ret = 0, ret1 = 0; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); + if (mapping->nrpages) { + ret1 = filemap_fdatawrite_range(mapping, start, end); + if (ret1 == -EIO) + return ret1; + } if (datasync) sync_state &= ~I_DIRTY_SYNC; if (sync_state) { ret = sync_inode_metadata(inode, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); + if (ret) return ret; - } - gfs2_ail_flush(ip->i_gl); + if (gfs2_is_jdata(ip)) + filemap_write_and_wait(mapping); + gfs2_ail_flush(ip->i_gl, 1); } - mutex_unlock(&inode->i_mutex); - return 0; + if (mapping->nrpages) + ret = filemap_fdatawait_range(mapping, start, end); + + return ret ? ret : ret1; } /** @@ -620,135 +668,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return generic_file_aio_write(iocb, iov, nr_segs, pos); } -static int empty_write_end(struct page *page, unsigned from, - unsigned to, int mode) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *bh; - unsigned offset, blksize = 1 << inode->i_blkbits; - pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; - - zero_user(page, from, to-from); - mark_page_accessed(page); - - if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) { - if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); - - block_commit_write(page, from, to); - return 0; - } - - offset = 0; - bh = page_buffers(page); - while (offset < to) { - if (offset >= from) { - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - clear_buffer_new(bh); - write_dirty_buffer(bh, WRITE); - } - offset += blksize; - bh = bh->b_this_page; - } - - offset = 0; - bh = page_buffers(page); - while (offset < to) { - if (offset >= from) { - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - return -EIO; - } - offset += blksize; - bh = bh->b_this_page; - } - return 0; -} - -static int needs_empty_write(sector_t block, struct inode *inode) -{ - int error; - struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; - - bh_map.b_size = 1 << inode->i_blkbits; - error = gfs2_block_map(inode, block, &bh_map, 0); - if (unlikely(error)) - return error; - return !buffer_mapped(&bh_map); -} - -static int write_empty_blocks(struct page *page, unsigned from, unsigned to, - int mode) -{ - struct inode *inode = page->mapping->host; - unsigned start, end, next, blksize; - sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - int ret; - - blksize = 1 << inode->i_blkbits; - next = end = 0; - while (next < from) { - next += blksize; - block++; - } - start = next; - do { - next += blksize; - ret = needs_empty_write(block, inode); - if (unlikely(ret < 0)) - return ret; - if (ret == 0) { - if (end) { - ret = __block_write_begin(page, start, end - start, - gfs2_block_map); - if (unlikely(ret)) - return ret; - ret = empty_write_end(page, start, end, mode); - if (unlikely(ret)) - return ret; - end = 0; - } - start = next; - } - else - end = next; - block++; - } while (next < to); - - if (end) { - ret = __block_write_begin(page, start, end - start, gfs2_block_map); - if (unlikely(ret)) - return ret; - ret = empty_write_end(page, start, end, mode); - if (unlikely(ret)) - return ret; - } - - return 0; -} - static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, int mode) { struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *dibh; int error; - u64 start = offset >> PAGE_CACHE_SHIFT; - unsigned int start_offset = offset & ~PAGE_CACHE_MASK; - u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; - pgoff_t curr; - struct page *page; - unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; - unsigned int from, to; - - if (!end_offset) - end_offset = PAGE_CACHE_SIZE; + unsigned int nr_blks; + sector_t lblock = offset >> inode->i_blkbits; error = gfs2_meta_inode_buffer(ip, &dibh); if (unlikely(error)) - goto out; + return error; gfs2_trans_add_bh(ip->i_gl, dibh, 1); @@ -758,40 +689,31 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, goto out; } - curr = start; - offset = start << PAGE_CACHE_SHIFT; - from = start_offset; - to = PAGE_CACHE_SIZE; - while (curr <= end) { - page = grab_cache_page_write_begin(inode->i_mapping, curr, - AOP_FLAG_NOFS); - if (unlikely(!page)) { - error = -ENOMEM; - goto out; - } + while (len) { + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + bh_map.b_size = len; + set_buffer_zeronew(&bh_map); - if (curr == end) - to = end_offset; - error = write_empty_blocks(page, from, to, mode); - if (!error && offset + to > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - i_size_write(inode, offset + to); - } - unlock_page(page); - page_cache_release(page); - if (error) + error = gfs2_block_map(inode, lblock, &bh_map, 1); + if (unlikely(error)) + goto out; + len -= bh_map.b_size; + nr_blks = bh_map.b_size >> inode->i_blkbits; + lblock += nr_blks; + if (!buffer_new(&bh_map)) + continue; + if (unlikely(!buffer_zeronew(&bh_map))) { + error = -EIO; goto out; - curr++; - offset += PAGE_CACHE_SIZE; - from = 0; + } } + if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) + i_size_write(inode, offset + len); - gfs2_dinode_out(ip, dibh->b_data); mark_inode_dirty(inode); - brelse(dibh); - out: + brelse(dibh); return error; } @@ -799,7 +721,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, unsigned int *data_blocks, unsigned int *ind_blocks) { const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int max_blocks = ip->i_rgd->rd_free_clone; unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); for (tmp = max_data; tmp > sdp->sd_diptrs;) { @@ -827,10 +749,13 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, struct gfs2_inode *ip = GFS2_I(inode); unsigned int data_blocks = 0, ind_blocks = 0, rblocks; loff_t bytes, max_bytes; - struct gfs2_alloc *al; + struct gfs2_qadata *qa; int error; + const loff_t pos = offset; + const loff_t count = len; loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + loff_t max_chunk_size = UINT_MAX & bsize_mask; next = (next + 1) << sdp->sd_sb.sb_bsize_shift; /* We only support the FALLOC_FL_KEEP_SIZE mode */ @@ -858,8 +783,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, while (len > 0) { if (len < bytes) bytes = len; - al = gfs2_alloc_get(ip); - if (!al) { + qa = gfs2_qadata_get(ip); + if (!qa) { error = -ENOMEM; goto out_unlock; } @@ -871,8 +796,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, retry: gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); + error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); if (error) { if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { bytes >>= 1; @@ -884,11 +808,11 @@ retry: goto out_qunlock; } max_bytes = bytes; - calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; + calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len, + &max_bytes, &data_blocks, &ind_blocks); rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + - RES_RG_HDR + gfs2_rg_blocks(al); + RES_RG_HDR + gfs2_rg_blocks(ip); if (gfs2_is_jdata(ip)) rblocks += data_blocks ? data_blocks : 1; @@ -907,8 +831,11 @@ retry: offset += max_bytes; gfs2_inplace_release(ip); gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); } + + if (error == 0) + error = generic_write_sync(file, pos, count); goto out_unlock; out_trans_fail: @@ -916,7 +843,7 @@ out_trans_fail: out_qunlock: gfs2_quota_unlock(ip); out_alloc_put: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); out_unlock: gfs2_glock_dq(&ip->i_gh); out_uninit: diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 6670711..2553b85 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -201,7 +201,7 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); -__attribute__ ((format(printf, 2, 3))) +__printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index da21eca..1656df7a 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,40 +28,55 @@ #include "trans.h" #include "dir.h" +static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) +{ + fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n", + bh, (unsigned long long)bh->b_blocknr, bh->b_state, + bh->b_page->mapping, bh->b_page->flags); + fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n", + gl->gl_name.ln_type, gl->gl_name.ln_number, + gfs2_glock2aspace(gl)); + gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n"); +} + /** * __gfs2_ail_flush - remove all buffers for a given lock from the AIL * @gl: the glock + * @fsync: set when called from fsync (not all buffers will be clean) * * None of the buffers should be dirty, locked, or pinned. */ -static void __gfs2_ail_flush(struct gfs2_glock *gl) +static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_sbd; struct list_head *head = &gl->gl_ail_list; - struct gfs2_bufdata *bd; + struct gfs2_bufdata *bd, *tmp; struct buffer_head *bh; + const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); + sector_t blocknr; + gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); - while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, - bd_ail_gl_list); + list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) { bh = bd->bd_bh; - gfs2_remove_from_ail(bd); - bd->bd_bh = NULL; + if (bh->b_state & b_state) { + if (fsync) + continue; + gfs2_ail_error(gl, bh); + } + blocknr = bh->b_blocknr; bh->b_private = NULL; - spin_unlock(&sdp->sd_ail_lock); + gfs2_remove_from_ail(bd); /* drops ref on bh */ - bd->bd_blkno = bh->b_blocknr; - gfs2_log_lock(sdp); - gfs2_assert_withdraw(sdp, !buffer_busy(bh)); - gfs2_trans_add_revoke(sdp, bd); - gfs2_log_unlock(sdp); + bd->bd_bh = NULL; + bd->bd_blkno = blocknr; - spin_lock(&sdp->sd_ail_lock); + gfs2_trans_add_revoke(sdp, bd); } - gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + BUG_ON(!fsync && atomic_read(&gl->gl_ail_count)); spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); } @@ -84,13 +99,13 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) BUG_ON(current->journal_info); current->journal_info = &tr; - __gfs2_ail_flush(gl); + __gfs2_ail_flush(gl, 0); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } -void gfs2_ail_flush(struct gfs2_glock *gl) +void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_sbd; unsigned int revokes = atomic_read(&gl->gl_ail_count); @@ -102,7 +117,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl) ret = gfs2_trans_begin(sdp, 0, revokes); if (ret) return; - __gfs2_ail_flush(gl); + __gfs2_ail_flush(gl, fsync); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } @@ -119,6 +134,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl) static void rgrp_go_sync(struct gfs2_glock *gl) { struct address_space *metamapping = gfs2_glock2aspace(gl); + struct gfs2_rgrpd *rgd; int error; if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -130,6 +146,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait(metamapping); mapping_set_error(metamapping, error); gfs2_ail_empty_gl(gl); + + spin_lock(&gl->gl_spin); + rgd = gl->gl_object; + if (rgd) + gfs2_free_clones(rgd); + spin_unlock(&gl->gl_spin); } /** @@ -277,7 +299,7 @@ static void gfs2_set_nlink(struct inode *inode, u32 nlink) if (nlink == 0) clear_nlink(inode); else - inode->i_nlink = nlink; + set_nlink(inode, nlink); } } @@ -430,33 +452,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) } /** - * rgrp_go_lock - operation done after an rgrp lock is locked by - * a first holder on this node. - * @gl: the glock - * @flags: - * - * Returns: errno - */ - -static int rgrp_go_lock(struct gfs2_holder *gh) -{ - return gfs2_rgrp_bh_get(gh->gh_gl->gl_object); -} - -/** - * rgrp_go_unlock - operation done before an rgrp lock is unlocked by - * a last holder on this node. - * @gl: the glock - * @flags: - * - */ - -static void rgrp_go_unlock(struct gfs2_holder *gh) -{ - gfs2_rgrp_bh_put(gh->gh_gl->gl_object); -} - -/** * trans_go_sync - promote/demote the transaction glock * @gl: the glock * @state: the requested state @@ -558,8 +553,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = { const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_xmote_th = rgrp_go_sync, .go_inval = rgrp_go_inval, - .go_lock = rgrp_go_lock, - .go_unlock = rgrp_go_unlock, + .go_lock = gfs2_rgrp_go_lock, + .go_unlock = gfs2_rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, .go_flags = GLOF_ASPACE, diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index 6fce409..bf95a2d 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -23,6 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops; extern const struct gfs2_glock_operations gfs2_journal_glops; extern const struct gfs2_glock_operations *gfs2_glops_list[]; -extern void gfs2_ail_flush(struct gfs2_glock *gl); +extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); #endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 892ac37..e1d3bb5 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -18,6 +18,7 @@ #include <linux/rcupdate.h> #include <linux/rculist_bl.h> #include <linux/completion.h> +#include <linux/rbtree.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -78,8 +79,7 @@ struct gfs2_bitmap { }; struct gfs2_rgrpd { - struct list_head rd_list; /* Link with superblock */ - struct list_head rd_list_mru; + struct rb_node rd_node; /* Link with superblock */ struct gfs2_glock *rd_gl; /* Glock for this rgrp */ u64 rd_addr; /* grp block disk address */ u64 rd_data0; /* first data location */ @@ -91,10 +91,7 @@ struct gfs2_rgrpd { u32 rd_dinodes; u64 rd_igeneration; struct gfs2_bitmap *rd_bits; - struct mutex rd_mutex; - struct gfs2_log_element rd_le; struct gfs2_sbd *rd_sbd; - unsigned int rd_bh_count; u32 rd_last_alloc; u32 rd_flags; #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ @@ -106,12 +103,15 @@ struct gfs2_rgrpd { enum gfs2_state_bits { BH_Pinned = BH_PrivateStart, BH_Escaped = BH_PrivateStart + 1, + BH_Zeronew = BH_PrivateStart + 2, }; BUFFER_FNS(Pinned, pinned) TAS_BUFFER_FNS(Pinned, pinned) BUFFER_FNS(Escaped, escaped) TAS_BUFFER_FNS(Escaped, escaped) +BUFFER_FNS(Zeronew, zeronew) +TAS_BUFFER_FNS(Zeronew, zeronew) struct gfs2_bufdata { struct buffer_head *bd_bh; @@ -244,29 +244,22 @@ struct gfs2_glock { #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ -struct gfs2_alloc { +struct gfs2_qadata { /* quota allocation data */ /* Quota stuff */ + struct gfs2_quota_data *qa_qd[2*MAXQUOTAS]; + struct gfs2_holder qa_qd_ghs[2*MAXQUOTAS]; + unsigned int qa_qd_num; +}; - struct gfs2_quota_data *al_qd[2*MAXQUOTAS]; - struct gfs2_holder al_qd_ghs[2*MAXQUOTAS]; - unsigned int al_qd_num; - - u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */ - u32 al_alloced; /* Filled in by gfs2_alloc_*() */ - - /* Filled in by gfs2_inplace_reserve() */ - - unsigned int al_line; - char *al_file; - struct gfs2_holder al_ri_gh; - struct gfs2_holder al_rgd_gh; - struct gfs2_rgrpd *al_rgd; - +struct gfs2_blkreserv { + u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */ + struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */ }; enum { GIF_INVALID = 0, GIF_QD_LOCKED = 1, + GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, }; @@ -281,7 +274,9 @@ struct gfs2_inode { struct gfs2_glock *i_gl; /* Move into i_gh? */ struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ - struct gfs2_alloc *i_alloc; + struct gfs2_qadata *i_qadata; /* quota allocation data */ + struct gfs2_blkreserv *i_res; /* resource group block reservation */ + struct gfs2_rgrpd *i_rgd; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; struct list_head i_trunc_list; @@ -574,9 +569,7 @@ struct gfs2_sbd { int sd_rindex_uptodate; spinlock_t sd_rindex_spin; struct mutex sd_rindex_mutex; - struct list_head sd_rindex_list; - struct list_head sd_rindex_mru_list; - struct gfs2_rgrpd *sd_rindex_forward; + struct rb_root sd_rindex_tree; unsigned int sd_rgrps; unsigned int sd_max_rg_data; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 900cf98..017960c 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -333,7 +333,7 @@ out: */ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, - unsigned int mode) + umode_t mode) { int error; @@ -364,7 +364,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, return 0; } -static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode, +static void munge_mode_uid_gid(struct gfs2_inode *dip, umode_t *mode, unsigned int *uid, unsigned int *gid) { if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && @@ -389,12 +389,13 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); int error; + int dblocks = 1; - if (gfs2_alloc_get(dip) == NULL) - return -ENOMEM; + error = gfs2_rindex_update(sdp); + if (error) + fs_warn(sdp, "rindex update returns %d\n", error); - dip->i_alloc->al_requested = RES_DINODE; - error = gfs2_inplace_reserve(dip); + error = gfs2_inplace_reserve(dip, RES_DINODE); if (error) goto out; @@ -402,14 +403,13 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) if (error) goto out_ipreserv; - error = gfs2_alloc_di(dip, no_addr, generation); + error = gfs2_alloc_blocks(dip, no_addr, &dblocks, 1, generation); gfs2_trans_end(sdp); out_ipreserv: gfs2_inplace_release(dip); out: - gfs2_alloc_put(dip); return error; } @@ -447,7 +447,7 @@ static void gfs2_init_dir(struct buffer_head *dibh, */ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, - const struct gfs2_inum_host *inum, unsigned int mode, + const struct gfs2_inum_host *inum, umode_t mode, unsigned int uid, unsigned int gid, const u64 *generation, dev_t dev, const char *symname, unsigned size, struct buffer_head **bhp) @@ -516,7 +516,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, } static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, - unsigned int mode, const struct gfs2_inum_host *inum, + umode_t mode, const struct gfs2_inum_host *inum, const u64 *generation, dev_t dev, const char *symname, unsigned int size, struct buffer_head **bhp) { @@ -525,7 +525,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, int error; munge_mode_uid_gid(dip, &mode, &uid, &gid); - if (!gfs2_alloc_get(dip)) + if (!gfs2_qadata_get(dip)) return -ENOMEM; error = gfs2_quota_lock(dip, uid, gid); @@ -547,7 +547,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, out_quota: gfs2_quota_unlock(dip); out: - gfs2_alloc_put(dip); + gfs2_qadata_put(dip); return error; } @@ -555,13 +555,13 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_alloc *al; + struct gfs2_qadata *qa; int alloc_required; struct buffer_head *dibh; int error; - al = gfs2_alloc_get(dip); - if (!al) + qa = gfs2_qadata_get(dip); + if (!qa) return -ENOMEM; error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); @@ -576,14 +576,12 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_quota_locks; - al->al_requested = sdp->sd_max_dirres; - - error = gfs2_inplace_reserve(dip); + error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); if (error) goto fail_quota_locks; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_length + + dip->i_rgd->rd_length + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -613,42 +611,39 @@ fail_end_trans: gfs2_trans_end(sdp); fail_ipreserv: - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); + gfs2_inplace_release(dip); fail_quota_locks: gfs2_quota_unlock(dip); fail: - gfs2_alloc_put(dip); + gfs2_qadata_put(dip); return error; } -static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, - const struct qstr *qstr) +static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; - - err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, - &name, &value, &len); - - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = __gfs2_xattr_set(inode, xattr->name, xattr->value, + xattr->value_len, 0, + GFS2_EATYPE_SECURITY); + if (err < 0) + break; } - - err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0, - GFS2_EATYPE_SECURITY); - kfree(value); - kfree(name); - return err; } +static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, + const struct qstr *qstr) +{ + return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, + &gfs2_initxattrs, NULL); +} + /** * gfs2_create_inode - Create a new inode * @dir: The parent directory @@ -662,8 +657,8 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, */ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, - unsigned int mode, dev_t dev, const char *symname, - unsigned int size) + umode_t mode, dev_t dev, const char *symname, + unsigned int size, int excl) { const struct qstr *name = &dentry->d_name; struct gfs2_holder ghs[2]; @@ -683,6 +678,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail; error = create_ok(dip, name, mode); + if ((error == -EEXIST) && S_ISREG(mode) && !excl) { + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + gfs2_glock_dq_uninit(ghs); + d_instantiate(dentry, inode); + return IS_ERR(inode) ? PTR_ERR(inode) : 0; + } if (error) goto fail_gunlock; @@ -725,21 +726,25 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, brelse(bh); gfs2_trans_end(sdp); - if (dip->i_alloc->al_rgd) + /* Check if we reserved space in the rgrp. Function link_dinode may + not, depending on whether alloc is required. */ + if (dip->i_res) gfs2_inplace_release(dip); gfs2_quota_unlock(dip); - gfs2_alloc_put(dip); - gfs2_glock_dq_uninit_m(2, ghs); + gfs2_qadata_put(dip); mark_inode_dirty(inode); + gfs2_glock_dq_uninit_m(2, ghs); d_instantiate(dentry, inode); return 0; fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); - if (inode && !IS_ERR(inode)) - iput(inode); fail_gunlock: gfs2_glock_dq_uninit(ghs); + if (inode && !IS_ERR(inode)) { + set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); + iput(inode); + } fail: if (bh) brelse(bh); @@ -756,26 +761,12 @@ fail: */ static int gfs2_create(struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) + umode_t mode, struct nameidata *nd) { - struct inode *inode; - int ret; - - for (;;) { - ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0); - if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL))) - return ret; - - inode = gfs2_lookupi(dir, &dentry->d_name, 0); - if (inode) { - if (!IS_ERR(inode)) - break; - return PTR_ERR(inode); - } - } - - d_instantiate(dentry, inode); - return 0; + int excl = 0; + if (nd && (nd->flags & LOOKUP_EXCL)) + excl = 1; + return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl); } /** @@ -885,8 +876,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, error = 0; if (alloc_required) { - struct gfs2_alloc *al = gfs2_alloc_get(dip); - if (!al) { + struct gfs2_qadata *qa = gfs2_qadata_get(dip); + + if (!qa) { error = -ENOMEM; goto out_gunlock; } @@ -895,14 +887,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_alloc; - al->al_requested = sdp->sd_max_dirres; - - error = gfs2_inplace_reserve(dip); + error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); if (error) goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(al) + + gfs2_rg_blocks(dip) + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -924,8 +914,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, gfs2_trans_add_bh(ip->i_gl, dibh, 1); inc_nlink(&ip->i_inode); ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(&ip->i_inode); + ihold(inode); + d_instantiate(dentry, inode); + mark_inode_dirty(inode); out_brelse: brelse(dibh); @@ -939,7 +930,7 @@ out_gunlock_q: gfs2_quota_unlock(dip); out_alloc: if (alloc_required) - gfs2_alloc_put(dip); + gfs2_qadata_put(dip); out_gunlock: gfs2_glock_dq(ghs + 1); out_child: @@ -947,11 +938,6 @@ out_child: out_parent: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); - if (!error) { - ihold(inode); - d_instantiate(dentry, inode); - mark_inode_dirty(inode); - } return error; } @@ -1024,8 +1010,6 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip, clear_nlink(inode); else drop_nlink(inode); - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); mark_inode_dirty(inode); if (inode->i_nlink == 0) gfs2_unlink_di(inode); @@ -1053,17 +1037,14 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct buffer_head *bh; struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh; - int error; - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; + int error = -EROFS; gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); + if (!rgd) + goto out_inodes; gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); @@ -1109,14 +1090,14 @@ out_end_trans: out_gunlock: gfs2_glock_dq(ghs + 2); out_rgrp: - gfs2_holder_uninit(ghs + 2); gfs2_glock_dq(ghs + 1); out_child: - gfs2_holder_uninit(ghs + 1); gfs2_glock_dq(ghs); out_parent: + gfs2_holder_uninit(ghs + 2); +out_inodes: + gfs2_holder_uninit(ghs + 1); gfs2_holder_uninit(ghs); - gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -1139,7 +1120,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) return -ENAMETOOLONG; - return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size); + return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0); } /** @@ -1151,9 +1132,9 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, * Returns: errno */ -static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0); + return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0); } /** @@ -1165,10 +1146,10 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) * */ -static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0); + return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0); } /* @@ -1234,7 +1215,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_inode *ip = GFS2_I(odentry->d_inode); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; @@ -1248,10 +1229,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, return 0; } - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; - if (odip != ndip) { error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, &r_gh); @@ -1376,8 +1353,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = 0; if (alloc_required) { - struct gfs2_alloc *al = gfs2_alloc_get(ndip); - if (!al) { + struct gfs2_qadata *qa = gfs2_qadata_get(ndip); + + if (!qa) { error = -ENOMEM; goto out_gunlock; } @@ -1386,14 +1364,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_alloc; - al->al_requested = sdp->sd_max_dirres; - - error = gfs2_inplace_reserve_ri(ndip); + error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres); if (error) goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(al) + + gfs2_rg_blocks(ndip) + 4 * RES_DINODE + 4 * RES_LEAF + RES_STATFS + RES_QUOTA + 4, 0); if (error) @@ -1449,7 +1425,7 @@ out_gunlock_q: gfs2_quota_unlock(ndip); out_alloc: if (alloc_required) - gfs2_alloc_put(ndip); + gfs2_qadata_put(ndip); out_gunlock: while (x--) { gfs2_glock_dq(ghs + x); @@ -1459,7 +1435,6 @@ out_gunlock_r: if (r_gh.gh_gl) gfs2_glock_dq_uninit(&r_gh); out: - gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -1563,21 +1538,10 @@ int gfs2_permission(struct inode *inode, int mask) return error; } -static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { - struct inode *inode = &ip->i_inode; - struct buffer_head *dibh; - int error; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - setattr_copy(inode, attr); mark_inode_dirty(inode); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); return 0; } @@ -1589,19 +1553,19 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) * Returns: errno */ -int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +int gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { int error; if (current->journal_info) - return __gfs2_setattr_simple(ip, attr); + return __gfs2_setattr_simple(inode, attr); - error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0); + error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0); if (error) return error; - error = __gfs2_setattr_simple(ip, attr); - gfs2_trans_end(GFS2_SB(&ip->i_inode)); + error = __gfs2_setattr_simple(inode, attr); + gfs2_trans_end(GFS2_SB(inode)); return error; } @@ -1622,7 +1586,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) ogid = ngid = NO_QUOTA_CHANGE; - if (!gfs2_alloc_get(ip)) + if (!gfs2_qadata_get(ip)) return -ENOMEM; error = gfs2_quota_lock(ip, nuid, ngid); @@ -1639,7 +1603,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (error) goto out_gunlock_q; - error = gfs2_setattr_simple(ip, attr); + error = gfs2_setattr_simple(inode, attr); if (error) goto out_end_trans; @@ -1654,7 +1618,7 @@ out_end_trans: out_gunlock_q: gfs2_quota_unlock(ip); out_alloc: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); return error; } @@ -1695,12 +1659,12 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) error = gfs2_acl_chmod(ip, attr); else - error = gfs2_setattr_simple(ip, attr); + error = gfs2_setattr_simple(inode, attr); out: - gfs2_glock_dq_uninit(&i_gh); if (!error) mark_inode_dirty(inode); + gfs2_glock_dq_uninit(&i_gh); return error; } diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 8d90e0c..276e7b5 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -109,7 +109,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip); extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); extern int gfs2_permission(struct inode *inode, int mask); -extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); +extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 5986464..756fae9 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -626,7 +626,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); else - submit_bh(WRITE_FLUSH_FUA | REQ_META | REQ_PRIO, bh); + submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) @@ -951,8 +951,8 @@ int gfs2_logd(void *data) wake_up(&sdp->sd_log_waitq); t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; - if (freezing(current)) - refrigerator(); + + try_to_freeze(); do { prepare_to_wait(&sdp->sd_logd_waitq, &wait, diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 05bbb12..0301be6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -60,6 +60,29 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) trace_gfs2_pin(bd, 1); } +static bool buffer_is_rgrp(const struct gfs2_bufdata *bd) +{ + return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP; +} + +static void maybe_release_space(struct gfs2_bufdata *bd) +{ + struct gfs2_glock *gl = bd->bd_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_rgrpd *rgd = gl->gl_object; + unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; + struct gfs2_bitmap *bi = rgd->rd_bits + index; + + if (bi->bi_clone == 0) + return; + if (sdp->sd_args.ar_discard) + gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi); + memcpy(bi->bi_clone + bi->bi_offset, + bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); + clear_bit(GBF_FULL, &bi->bi_flags); + rgd->rd_free_clone = rgd->rd_free; +} + /** * gfs2_unpin - Unpin a buffer * @sdp: the filesystem the buffer belongs to @@ -81,6 +104,9 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, mark_buffer_dirty(bh); clear_buffer_pinned(bh); + if (buffer_is_rgrp(bd)) + maybe_release_space(bd); + spin_lock(&sdp->sd_ail_lock); if (bd->bd_ail) { list_del(&bd->bd_ail_st_list); @@ -469,42 +495,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) gfs2_revoke_clean(sdp); } -static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) -{ - struct gfs2_rgrpd *rgd; - struct gfs2_trans *tr = current->journal_info; - - tr->tr_touched = 1; - - rgd = container_of(le, struct gfs2_rgrpd, rd_le); - - gfs2_log_lock(sdp); - if (!list_empty(&le->le_list)){ - gfs2_log_unlock(sdp); - return; - } - gfs2_rgrp_bh_hold(rgd); - sdp->sd_log_num_rg++; - list_add(&le->le_list, &sdp->sd_log_le_rg); - gfs2_log_unlock(sdp); -} - -static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) -{ - struct list_head *head = &sdp->sd_log_le_rg; - struct gfs2_rgrpd *rgd; - - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list); - list_del_init(&rgd->rd_le.le_list); - sdp->sd_log_num_rg--; - - gfs2_rgrp_repolish_clones(rgd); - gfs2_rgrp_bh_put(rgd); - } - gfs2_assert_warn(sdp, !sdp->sd_log_num_rg); -} - /** * databuf_lo_add - Add a databuf to the transaction. * @@ -705,8 +695,6 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, brelse(bh_log); brelse(bh_ip); - if (error) - break; sdp->sd_replayed_blocks++; } @@ -771,8 +759,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = { }; const struct gfs2_log_operations gfs2_rg_lops = { - .lo_add = rg_lo_add, - .lo_after_commit = rg_lo_after_commit, .lo_name = "rg", }; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 8a139ff..c150298 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -40,7 +40,8 @@ static void gfs2_init_inode_once(void *foo) inode_init_once(&ip->i_inode); init_rwsem(&ip->i_rw_mutex); INIT_LIST_HEAD(&ip->i_trunc_list); - ip->i_alloc = NULL; + ip->i_qadata = NULL; + ip->i_res = NULL; ip->i_hash_cache = NULL; } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index be29858..181586e 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -435,7 +435,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_SYNC | REQ_META | REQ_PRIO, 1, &first_bh); + ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); dblock++; extlen--; @@ -444,7 +444,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(READA, 1, &bh); + ll_rw_block(READA | REQ_META, 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 079587e..fe72e79 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -14,6 +14,7 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/kthread.h> +#include <linux/export.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/gfs2_ondisk.h> @@ -77,8 +78,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_rindex_spin); mutex_init(&sdp->sd_rindex_mutex); - INIT_LIST_HEAD(&sdp->sd_rindex_list); - INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); + sdp->sd_rindex_tree.rb_node = NULL; INIT_LIST_HEAD(&sdp->sd_jindex_list); spin_lock_init(&sdp->sd_jindex_spin); @@ -224,7 +224,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - submit_bio(READ_SYNC | REQ_META | REQ_PRIO, bio); + submit_bio(READ_SYNC | REQ_META, bio); wait_on_page_locked(page); bio_put(bio); if (!PageUptodate(page)) { @@ -652,7 +652,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) fs_err(sdp, "can't lookup journal index: %d\n", error); return PTR_ERR(sdp->sd_jindex); } - ip = GFS2_I(sdp->sd_jindex); /* Load in the journal index special file */ @@ -764,7 +763,6 @@ fail: static int init_inodes(struct gfs2_sbd *sdp, int undo) { int error = 0; - struct gfs2_inode *ip; struct inode *master = sdp->sd_master_dir->d_inode; if (undo) @@ -789,7 +787,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) fs_err(sdp, "can't get resource index inode: %d\n", error); goto fail_statfs; } - ip = GFS2_I(sdp->sd_rindex); sdp->sd_rindex_uptodate = 0; /* Read in the quota inode */ diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 0e8bb13..a45b21b 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -494,11 +494,11 @@ static void qdsb_put(struct gfs2_quota_data *qd) int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; - struct gfs2_quota_data **qd = al->al_qd; + struct gfs2_qadata *qa = ip->i_qadata; + struct gfs2_quota_data **qd = qa->qa_qd; int error; - if (gfs2_assert_warn(sdp, !al->al_qd_num) || + if (gfs2_assert_warn(sdp, !qa->qa_qd_num) || gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) return -EIO; @@ -508,20 +508,20 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd); if (error) goto out; - al->al_qd_num++; + qa->qa_qd_num++; qd++; error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd); if (error) goto out; - al->al_qd_num++; + qa->qa_qd_num++; qd++; if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) { error = qdsb_get(sdp, QUOTA_USER, uid, qd); if (error) goto out; - al->al_qd_num++; + qa->qa_qd_num++; qd++; } @@ -529,7 +529,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) error = qdsb_get(sdp, QUOTA_GROUP, gid, qd); if (error) goto out; - al->al_qd_num++; + qa->qa_qd_num++; qd++; } @@ -542,16 +542,16 @@ out: void gfs2_quota_unhold(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_qadata *qa = ip->i_qadata; unsigned int x; gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); - for (x = 0; x < al->al_qd_num; x++) { - qdsb_put(al->al_qd[x]); - al->al_qd[x] = NULL; + for (x = 0; x < qa->qa_qd_num; x++) { + qdsb_put(qa->qa_qd[x]); + qa->qa_qd[x] = NULL; } - al->al_qd_num = 0; + qa->qa_qd_num = 0; } static int sort_qd(const void *a, const void *b) @@ -638,15 +638,18 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, unsigned long index = loc >> PAGE_CACHE_SHIFT; unsigned offset = loc & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, pos; - struct buffer_head *bh, *dibh; + struct buffer_head *bh; struct page *page; void *kaddr, *ptr; struct gfs2_quota q, *qp; int err, nbytes; u64 size; - if (gfs2_is_stuffed(ip)) - gfs2_unstuff_dinode(ip, NULL); + if (gfs2_is_stuffed(ip)) { + err = gfs2_unstuff_dinode(ip, NULL); + if (err) + return err; + } memset(&q, 0, sizeof(struct gfs2_quota)); err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); @@ -709,7 +712,7 @@ get_a_page: set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); + ll_rw_block(READ | REQ_META, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; @@ -736,22 +739,13 @@ get_a_page: goto get_a_page; } - /* Update the disk inode timestamp and size (if extended) */ - err = gfs2_meta_inode_buffer(ip, &dibh); - if (err) - goto out; - size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) i_size_write(inode, size); inode->i_mtime = inode->i_atime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); mark_inode_dirty(inode); - -out: return err; + unlock_out: unlock_page(page); page_cache_release(page); @@ -768,7 +762,6 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) struct gfs2_quota_data *qd; loff_t offset; unsigned int nalloc = 0, blocks; - struct gfs2_alloc *al = NULL; int error; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), @@ -798,31 +791,24 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) nalloc++; } - al = gfs2_alloc_get(ip); - if (!al) { - error = -ENOMEM; - goto out_gunlock; - } /* * 1 blk for unstuffing inode if stuffed. We add this extra * block to the reservation unconditionally. If the inode * doesn't need unstuffing, the block will be released to the * rgrp since it won't be allocated during the transaction */ - al->al_requested = 1; /* +3 in the end for unstuffing block, inode size update block * and another block in case quota straddles page boundary and * two blocks need to be updated instead of 1 */ blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; - if (nalloc) - al->al_requested += nalloc * (data_blocks + ind_blocks); - error = gfs2_inplace_reserve(ip); + error = gfs2_inplace_reserve(ip, 1 + + (nalloc * (data_blocks + ind_blocks))); if (error) goto out_alloc; if (nalloc) - blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS; + blocks += gfs2_rg_blocks(ip) + nalloc * ind_blocks + RES_STATFS; error = gfs2_trans_begin(sdp, blocks, 0); if (error) @@ -846,8 +832,6 @@ out_end_trans: out_ipres: gfs2_inplace_release(ip); out_alloc: - gfs2_alloc_put(ip); -out_gunlock: gfs2_glock_dq_uninit(&i_gh); out: while (qx--) @@ -931,26 +915,28 @@ fail: int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_qadata *qa = ip->i_qadata; struct gfs2_quota_data *qd; unsigned int x; int error = 0; - gfs2_quota_hold(ip, uid, gid); + error = gfs2_quota_hold(ip, uid, gid); + if (error) + return error; if (capable(CAP_SYS_RESOURCE) || sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *), + sort(qa->qa_qd, qa->qa_qd_num, sizeof(struct gfs2_quota_data *), sort_qd, NULL); - for (x = 0; x < al->al_qd_num; x++) { + for (x = 0; x < qa->qa_qd_num; x++) { int force = NO_FORCE; - qd = al->al_qd[x]; + qd = qa->qa_qd[x]; if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) force = FORCE; - error = do_glock(qd, force, &al->al_qd_ghs[x]); + error = do_glock(qd, force, &qa->qa_qd_ghs[x]); if (error) break; } @@ -959,7 +945,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) set_bit(GIF_QD_LOCKED, &ip->i_flags); else { while (x--) - gfs2_glock_dq_uninit(&al->al_qd_ghs[x]); + gfs2_glock_dq_uninit(&qa->qa_qd_ghs[x]); gfs2_quota_unhold(ip); } @@ -1004,7 +990,7 @@ static int need_sync(struct gfs2_quota_data *qd) void gfs2_quota_unlock(struct gfs2_inode *ip) { - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_qadata *qa = ip->i_qadata; struct gfs2_quota_data *qda[4]; unsigned int count = 0; unsigned int x; @@ -1012,14 +998,14 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) goto out; - for (x = 0; x < al->al_qd_num; x++) { + for (x = 0; x < qa->qa_qd_num; x++) { struct gfs2_quota_data *qd; int sync; - qd = al->al_qd[x]; + qd = qa->qa_qd[x]; sync = need_sync(qd); - gfs2_glock_dq_uninit(&al->al_qd_ghs[x]); + gfs2_glock_dq_uninit(&qa->qa_qd_ghs[x]); if (sync && qd_trylock(qd)) qda[count++] = qd; @@ -1052,7 +1038,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type) int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_qadata *qa = ip->i_qadata; struct gfs2_quota_data *qd; s64 value; unsigned int x; @@ -1064,8 +1050,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - for (x = 0; x < al->al_qd_num; x++) { - qd = al->al_qd[x]; + for (x = 0; x < qa->qa_qd_num; x++) { + qd = qa->qa_qd[x]; if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags)))) @@ -1103,7 +1089,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 uid, u32 gid) { - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_qadata *qa = ip->i_qadata; struct gfs2_quota_data *qd; unsigned int x; @@ -1112,8 +1098,8 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; - for (x = 0; x < al->al_qd_num; x++) { - qd = al->al_qd[x]; + for (x = 0; x < qa->qa_qd_num; x++) { + qd = qa->qa_qd[x]; if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { @@ -1431,8 +1417,8 @@ int gfs2_quotad(void *data) /* Check for & recover partially truncated inodes */ quotad_check_trunc_list(sdp); - if (freezing(current)) - refrigerator(); + try_to_freeze(); + t = min(quotad_timeo, statfs_timeo); prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE); @@ -1533,7 +1519,6 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, unsigned int data_blocks, ind_blocks; unsigned int blocks = 0; int alloc_required; - struct gfs2_alloc *al; loff_t offset; int error; @@ -1598,16 +1583,13 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, if (gfs2_is_stuffed(ip)) alloc_required = 1; if (alloc_required) { - al = gfs2_alloc_get(ip); - if (al == NULL) - goto out_i; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); - blocks = al->al_requested = 1 + data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); + blocks = 1 + data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip, blocks); if (error) - goto out_alloc; - blocks += gfs2_rg_blocks(al); + goto out_i; + blocks += gfs2_rg_blocks(ip); } /* Some quotas span block boundaries and can update two blocks, @@ -1621,11 +1603,8 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, gfs2_trans_end(sdp); out_release: - if (alloc_required) { + if (alloc_required) gfs2_inplace_release(ip); -out_alloc: - gfs2_alloc_put(ip); - } out_i: gfs2_glock_dq_uninit(&i_gh); out_q: diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 7f8af1e..2223462 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -15,6 +15,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/prefetch.h> #include <linux/blkdev.h> +#include <linux/rbtree.h> #include "gfs2.h" #include "incore.h" @@ -64,8 +65,8 @@ static const char valid_change[16] = { }; static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, - unsigned char old_state, unsigned char new_state, - unsigned int *n); + unsigned char old_state, + struct gfs2_bitmap **rbi); /** * gfs2_setbit - Set a bit in the bitmaps @@ -328,18 +329,22 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) { - struct gfs2_rgrpd *rgd; + struct rb_node **newn; + struct gfs2_rgrpd *cur; spin_lock(&sdp->sd_rindex_spin); - - list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) { - if (rgrp_contains_block(rgd, blk)) { - list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + newn = &sdp->sd_rindex_tree.rb_node; + while (*newn) { + cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node); + if (blk < cur->rd_addr) + newn = &((*newn)->rb_left); + else if (blk >= cur->rd_data0 + cur->rd_data) + newn = &((*newn)->rb_right); + else { spin_unlock(&sdp->sd_rindex_spin); - return rgd; + return cur; } } - spin_unlock(&sdp->sd_rindex_spin); return NULL; @@ -354,8 +359,15 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) { - gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list)); - return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list); + const struct rb_node *n; + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + n = rb_first(&sdp->sd_rindex_tree); + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); + + return rgd; } /** @@ -367,47 +379,60 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) { - if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list) + struct gfs2_sbd *sdp = rgd->rd_sbd; + const struct rb_node *n; + + spin_lock(&sdp->sd_rindex_spin); + n = rb_next(&rgd->rd_node); + if (n == NULL) + n = rb_first(&sdp->sd_rindex_tree); + + if (unlikely(&rgd->rd_node == n)) { + spin_unlock(&sdp->sd_rindex_spin); return NULL; - return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list); + } + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; +} + +void gfs2_free_clones(struct gfs2_rgrpd *rgd) +{ + int x; + + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + } } -static void clear_rgrpdi(struct gfs2_sbd *sdp) +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) { - struct list_head *head; + struct rb_node *n; struct gfs2_rgrpd *rgd; struct gfs2_glock *gl; - spin_lock(&sdp->sd_rindex_spin); - sdp->sd_rindex_forward = NULL; - spin_unlock(&sdp->sd_rindex_spin); - - head = &sdp->sd_rindex_list; - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list); + while ((n = rb_first(&sdp->sd_rindex_tree))) { + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); gl = rgd->rd_gl; - list_del(&rgd->rd_list); - list_del(&rgd->rd_list_mru); + rb_erase(n, &sdp->sd_rindex_tree); if (gl) { + spin_lock(&gl->gl_spin); gl->gl_object = NULL; + spin_unlock(&gl->gl_spin); gfs2_glock_add_to_lru(gl); gfs2_glock_put(gl); } + gfs2_free_clones(rgd); kfree(rgd->rd_bits); kmem_cache_free(gfs2_rgrpd_cachep, rgd); } } -void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) -{ - mutex_lock(&sdp->sd_rindex_mutex); - clear_rgrpdi(sdp); - mutex_unlock(&sdp->sd_rindex_mutex); -} - static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) { printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); @@ -524,22 +549,34 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) return total_data; } -static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf) +static void rgd_insert(struct gfs2_rgrpd *rgd) { - const struct gfs2_rindex *str = buf; + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*newn) { + struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, + rd_node); + + parent = *newn; + if (rgd->rd_addr < cur->rd_addr) + newn = &((*newn)->rb_left); + else if (rgd->rd_addr > cur->rd_addr) + newn = &((*newn)->rb_right); + else + return; + } - rgd->rd_addr = be64_to_cpu(str->ri_addr); - rgd->rd_length = be32_to_cpu(str->ri_length); - rgd->rd_data0 = be64_to_cpu(str->ri_data0); - rgd->rd_data = be32_to_cpu(str->ri_data); - rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes); + rb_link_node(&rgd->rd_node, parent, newn); + rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree); } /** * read_rindex_entry - Pull in a new resource index entry from the disk * @gl: The glock covering the rindex inode * - * Returns: 0 on success, error code otherwise + * Returns: 0 on success, > 0 on EOF, error code otherwise */ static int read_rindex_entry(struct gfs2_inode *ip, @@ -547,44 +584,53 @@ static int read_rindex_entry(struct gfs2_inode *ip, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); - char buf[sizeof(struct gfs2_rindex)]; + struct gfs2_rindex buf; int error; struct gfs2_rgrpd *rgd; - error = gfs2_internal_read(ip, ra_state, buf, &pos, + if (pos >= i_size_read(&ip->i_inode)) + return 1; + + error = gfs2_internal_read(ip, ra_state, (char *)&buf, &pos, sizeof(struct gfs2_rindex)); - if (!error) - return 0; - if (error != sizeof(struct gfs2_rindex)) { - if (error > 0) - error = -EIO; - return error; - } + + if (error != sizeof(struct gfs2_rindex)) + return (error == 0) ? 1 : error; rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); error = -ENOMEM; if (!rgd) return error; - mutex_init(&rgd->rd_mutex); - lops_init_le(&rgd->rd_le, &gfs2_rg_lops); rgd->rd_sbd = sdp; + rgd->rd_addr = be64_to_cpu(buf.ri_addr); + rgd->rd_length = be32_to_cpu(buf.ri_length); + rgd->rd_data0 = be64_to_cpu(buf.ri_data0); + rgd->rd_data = be32_to_cpu(buf.ri_data); + rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); - list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); - list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); - - gfs2_rindex_in(rgd, buf); error = compute_bitstructs(rgd); if (error) - return error; + goto fail; error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) - return error; + goto fail; rgd->rd_gl->gl_object = rgd; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + if (rgd->rd_data > sdp->sd_max_rg_data) + sdp->sd_max_rg_data = rgd->rd_data; + spin_lock(&sdp->sd_rindex_spin); + rgd_insert(rgd); + sdp->sd_rgrps++; + spin_unlock(&sdp->sd_rindex_spin); + return error; + +fail: + kfree(rgd->rd_bits); + kmem_cache_free(gfs2_rgrpd_cachep, rgd); return error; } @@ -595,40 +641,28 @@ static int read_rindex_entry(struct gfs2_inode *ip, * Returns: 0 on successful update, error code otherwise */ -int gfs2_ri_update(struct gfs2_inode *ip) +static int gfs2_ri_update(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; - u64 rgrp_count = i_size_read(inode); - struct gfs2_rgrpd *rgd; - unsigned int max_data = 0; int error; - do_div(rgrp_count, sizeof(struct gfs2_rindex)); - clear_rgrpdi(sdp); - file_ra_state_init(&ra_state, inode->i_mapping); - for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) { + do { error = read_rindex_entry(ip, &ra_state); - if (error) { - clear_rgrpdi(sdp); - return error; - } - } + } while (error == 0); + + if (error < 0) + return error; - list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) - if (rgd->rd_data > max_data) - max_data = rgd->rd_data; - sdp->sd_max_rg_data = max_data; sdp->sd_rindex_uptodate = 1; return 0; } /** - * gfs2_rindex_hold - Grab a lock on the rindex + * gfs2_rindex_update - Update the rindex if required * @sdp: The GFS2 superblock - * @ri_gh: the glock holder * * We grab a lock on the rindex inode to make sure that it doesn't * change whilst we are performing an operation. We keep this lock @@ -640,30 +674,29 @@ int gfs2_ri_update(struct gfs2_inode *ip) * special file, which might have been updated if someone expanded the * filesystem (via gfs2_grow utility), which adds new resource groups. * - * Returns: 0 on success, error code otherwise + * Returns: 0 on succeess, error code otherwise */ -int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) +int gfs2_rindex_update(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); struct gfs2_glock *gl = ip->i_gl; - int error; - - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh); - if (error) - return error; + struct gfs2_holder ri_gh; + int error = 0; /* Read new copy from disk if we don't have the latest */ if (!sdp->sd_rindex_uptodate) { mutex_lock(&sdp->sd_rindex_mutex); - if (!sdp->sd_rindex_uptodate) { + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); + if (error) + return error; + if (!sdp->sd_rindex_uptodate) error = gfs2_ri_update(ip); - if (error) - gfs2_glock_dq_uninit(ri_gh); - } + gfs2_glock_dq_uninit(&ri_gh); mutex_unlock(&sdp->sd_rindex_mutex); } + return error; } @@ -694,7 +727,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) } /** - * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps + * gfs2_rgrp_go_lock - Read in a RG's header and bitmaps * @rgd: the struct gfs2_rgrpd describing the RG to read in * * Read in all of a Resource Group's header and bitmap blocks. @@ -703,8 +736,9 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) * Returns: errno */ -int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +int gfs2_rgrp_go_lock(struct gfs2_holder *gh) { + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_glock *gl = rgd->rd_gl; unsigned int length = rgd->rd_length; @@ -712,17 +746,6 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) unsigned int x, y; int error; - mutex_lock(&rgd->rd_mutex); - - spin_lock(&sdp->sd_rindex_spin); - if (rgd->rd_bh_count) { - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); - mutex_unlock(&rgd->rd_mutex); - return 0; - } - spin_unlock(&sdp->sd_rindex_spin); - for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); @@ -747,15 +770,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); + rgd->rd_free_clone = rgd->rd_free; } - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_free; - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); - - mutex_unlock(&rgd->rd_mutex); - return 0; fail: @@ -765,52 +782,32 @@ fail: bi->bi_bh = NULL; gfs2_assert_warn(sdp, !bi->bi_clone); } - mutex_unlock(&rgd->rd_mutex); return error; } -void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd) -{ - struct gfs2_sbd *sdp = rgd->rd_sbd; - - spin_lock(&sdp->sd_rindex_spin); - gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); -} - /** - * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get() + * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get() * @rgd: the struct gfs2_rgrpd describing the RG to read in * */ -void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) +void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) { - struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; int x, length = rgd->rd_length; - spin_lock(&sdp->sd_rindex_spin); - gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); - if (--rgd->rd_bh_count) { - spin_unlock(&sdp->sd_rindex_spin); - return; - } - for (x = 0; x < length; x++) { struct gfs2_bitmap *bi = rgd->rd_bits + x; - kfree(bi->bi_clone); - bi->bi_clone = NULL; brelse(bi->bi_bh); bi->bi_bh = NULL; } - spin_unlock(&sdp->sd_rindex_spin); } -static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, - const struct gfs2_bitmap *bi) +void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi) { struct super_block *sb = sdp->sd_vfs; struct block_device *bdev = sb->s_bdev; @@ -823,7 +820,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, unsigned int x; for (x = 0; x < bi->bi_len; x++) { - const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x; + const u8 *orig = bh->b_data + bi->bi_offset + x; const u8 *clone = bi->bi_clone + bi->bi_offset + x; u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); diff &= 0x55; @@ -862,69 +859,63 @@ fail: sdp->sd_args.ar_discard = 0; } -void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) -{ - struct gfs2_sbd *sdp = rgd->rd_sbd; - unsigned int length = rgd->rd_length; - unsigned int x; - - for (x = 0; x < length; x++) { - struct gfs2_bitmap *bi = rgd->rd_bits + x; - if (!bi->bi_clone) - continue; - if (sdp->sd_args.ar_discard) - gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); - clear_bit(GBF_FULL, &bi->bi_flags); - memcpy(bi->bi_clone + bi->bi_offset, - bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); - } +/** + * gfs2_qadata_get - get the struct gfs2_qadata structure for an inode + * @ip: the incore GFS2 inode structure + * + * Returns: the struct gfs2_qadata + */ - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_free; - spin_unlock(&sdp->sd_rindex_spin); +struct gfs2_qadata *gfs2_qadata_get(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; + BUG_ON(ip->i_qadata != NULL); + ip->i_qadata = kzalloc(sizeof(struct gfs2_qadata), GFP_NOFS); + error = gfs2_rindex_update(sdp); + if (error) + fs_warn(sdp, "rindex update returns %d\n", error); + return ip->i_qadata; } /** - * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode + * gfs2_blkrsv_get - get the struct gfs2_blkreserv structure for an inode * @ip: the incore GFS2 inode structure * - * Returns: the struct gfs2_alloc + * Returns: the struct gfs2_qadata */ -struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) +static struct gfs2_blkreserv *gfs2_blkrsv_get(struct gfs2_inode *ip) { - BUG_ON(ip->i_alloc != NULL); - ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); - return ip->i_alloc; + BUG_ON(ip->i_res != NULL); + ip->i_res = kzalloc(sizeof(struct gfs2_blkreserv), GFP_NOFS); + return ip->i_res; } /** * try_rgrp_fit - See if a given reservation will fit in a given RG * @rgd: the RG data - * @al: the struct gfs2_alloc structure describing the reservation + * @ip: the inode * * If there's room for the requested blocks to be allocated from the RG: - * Sets the $al_rgd field in @al. * * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) */ -static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) +static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip) { - struct gfs2_sbd *sdp = rgd->rd_sbd; - int ret = 0; + const struct gfs2_blkreserv *rs = ip->i_res; if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) return 0; + if (rgd->rd_free_clone >= rs->rs_requested) + return 1; + return 0; +} - spin_lock(&sdp->sd_rindex_spin); - if (rgd->rd_free_clone >= al->al_requested) { - al->al_rgd = rgd; - ret = 1; - } - spin_unlock(&sdp->sd_rindex_spin); - - return ret; +static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk) +{ + return (bi->bi_start * GFS2_NBBY) + blk; } /** @@ -940,20 +931,20 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip u32 goal = 0, block; u64 no_addr; struct gfs2_sbd *sdp = rgd->rd_sbd; - unsigned int n; struct gfs2_glock *gl; struct gfs2_inode *ip; int error; int found = 0; + struct gfs2_bitmap *bi; while (goal < rgd->rd_data) { down_write(&sdp->sd_log_flush_lock); - n = 1; - block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, - GFS2_BLKST_UNLINKED, &n); + block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, &bi); up_write(&sdp->sd_log_flush_lock); if (block == BFITNOENT) break; + + block = gfs2_bi2rgd_blk(bi, block); /* rgblk_search can return a block < goal, so we need to keep it marching forward. */ no_addr = block + rgd->rd_data0; @@ -992,76 +983,6 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip } /** - * recent_rgrp_next - get next RG from "recent" list - * @cur_rgd: current rgrp - * - * Returns: The next rgrp in the recent list - */ - -static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd) -{ - struct gfs2_sbd *sdp = cur_rgd->rd_sbd; - struct list_head *head; - struct gfs2_rgrpd *rgd; - - spin_lock(&sdp->sd_rindex_spin); - head = &sdp->sd_rindex_mru_list; - if (unlikely(cur_rgd->rd_list_mru.next == head)) { - spin_unlock(&sdp->sd_rindex_spin); - return NULL; - } - rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru); - spin_unlock(&sdp->sd_rindex_spin); - return rgd; -} - -/** - * forward_rgrp_get - get an rgrp to try next from full list - * @sdp: The GFS2 superblock - * - * Returns: The rgrp to try next - */ - -static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp) -{ - struct gfs2_rgrpd *rgd; - unsigned int journals = gfs2_jindex_size(sdp); - unsigned int rg = 0, x; - - spin_lock(&sdp->sd_rindex_spin); - - rgd = sdp->sd_rindex_forward; - if (!rgd) { - if (sdp->sd_rgrps >= journals) - rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals; - - for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg; - x++, rgd = gfs2_rgrpd_get_next(rgd)) - /* Do Nothing */; - - sdp->sd_rindex_forward = rgd; - } - - spin_unlock(&sdp->sd_rindex_spin); - - return rgd; -} - -/** - * forward_rgrp_set - set the forward rgrp pointer - * @sdp: the filesystem - * @rgd: The new forward rgrp - * - */ - -static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) -{ - spin_lock(&sdp->sd_rindex_spin); - sdp->sd_rindex_forward = rgd; - spin_unlock(&sdp->sd_rindex_spin); -} - -/** * get_local_rgrp - Choose and lock a rgrp for allocation * @ip: the inode to reserve space for * @rgp: the chosen and locked rgrp @@ -1075,15 +996,19 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; - struct gfs2_alloc *al = ip->i_alloc; - int flags = LM_FLAG_TRY; - int skipped = 0; + struct gfs2_blkreserv *rs = ip->i_res; + int error, rg_locked, flags = LM_FLAG_TRY; int loops = 0; - int error, rg_locked; - rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); + if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) + rgd = begin = ip->i_rgd; + else + rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal); - while (rgd) { + if (rgd == NULL) + return -EBADSLT; + + while (loops < 3) { rg_locked = 0; if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { @@ -1091,145 +1016,85 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) error = 0; } else { error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, - LM_FLAG_TRY, &al->al_rgd_gh); + flags, &rs->rs_rgd_gh); } switch (error) { case 0: - if (try_rgrp_fit(rgd, al)) - goto out; + if (try_rgrp_fit(rgd, ip)) { + ip->i_rgd = rgd; + return 0; + } if (rgd->rd_flags & GFS2_RDF_CHECK) try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) - gfs2_glock_dq_uninit(&al->al_rgd_gh); + gfs2_glock_dq_uninit(&rs->rs_rgd_gh); /* fall through */ case GLR_TRYFAILED: - rgd = recent_rgrp_next(rgd); - break; - - default: - return error; - } - } - - /* Go through full list of rgrps */ - - begin = rgd = forward_rgrp_get(sdp); - - for (;;) { - rg_locked = 0; - - if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { - rg_locked = 1; - error = 0; - } else { - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, - &al->al_rgd_gh); - } - switch (error) { - case 0: - if (try_rgrp_fit(rgd, al)) - goto out; - if (rgd->rd_flags & GFS2_RDF_CHECK) - try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); - if (!rg_locked) - gfs2_glock_dq_uninit(&al->al_rgd_gh); - break; - - case GLR_TRYFAILED: - skipped++; + rgd = gfs2_rgrpd_get_next(rgd); + if (rgd == begin) { + flags = 0; + loops++; + } break; - default: return error; } - - rgd = gfs2_rgrpd_get_next(rgd); - if (!rgd) - rgd = gfs2_rgrpd_get_first(sdp); - - if (rgd == begin) { - if (++loops >= 3) - return -ENOSPC; - if (!skipped) - loops++; - flags = 0; - if (loops == 2) - gfs2_log_flush(sdp, NULL); - } } -out: - if (begin) { - spin_lock(&sdp->sd_rindex_spin); - list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); - spin_unlock(&sdp->sd_rindex_spin); - rgd = gfs2_rgrpd_get_next(rgd); - if (!rgd) - rgd = gfs2_rgrpd_get_first(sdp); - forward_rgrp_set(sdp, rgd); - } + return -ENOSPC; +} - return 0; +static void gfs2_blkrsv_put(struct gfs2_inode *ip) +{ + BUG_ON(ip->i_res == NULL); + kfree(ip->i_res); + ip->i_res = NULL; } /** - * gfs2_inplace_reserve_i - Reserve space in the filesystem + * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for * * Returns: errno */ -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, - char *file, unsigned int line) +int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_blkreserv *rs; int error = 0; u64 last_unlinked = NO_BLOCK; int tries = 0; - if (gfs2_assert_warn(sdp, al->al_requested)) - return -EINVAL; + rs = gfs2_blkrsv_get(ip); + if (!rs) + return -ENOMEM; - if (hold_rindex) { - /* We need to hold the rindex unless the inode we're using is - the rindex itself, in which case it's already held. */ - if (ip != GFS2_I(sdp->sd_rindex)) - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); - else if (!sdp->sd_rgrps) /* We may not have the rindex read - in, so: */ - error = gfs2_ri_update(ip); - if (error) - return error; + rs->rs_requested = requested; + if (gfs2_assert_warn(sdp, requested)) { + error = -EINVAL; + goto out; } -try_again: do { error = get_local_rgrp(ip, &last_unlinked); - /* If there is no space, flushing the log may release some */ - if (error) { - if (ip == GFS2_I(sdp->sd_rindex) && - !sdp->sd_rindex_uptodate) { - error = gfs2_ri_update(ip); - if (error) - return error; - goto try_again; - } - gfs2_log_flush(sdp, NULL); + if (error != -ENOSPC) + break; + /* Check that fs hasn't grown if writing to rindex */ + if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + break; + continue; } - } while (error && tries++ < 3); + /* Flushing the log may release space */ + gfs2_log_flush(sdp, NULL); + } while (tries++ < 3); - if (error) { - if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) - gfs2_glock_dq_uninit(&al->al_ri_gh); - return error; - } - - /* no error, so we have the rgrp set in the inode's allocation. */ - al->al_file = file; - al->al_line = line; - - return 0; +out: + if (error) + gfs2_blkrsv_put(ip); + return error; } /** @@ -1241,20 +1106,11 @@ try_again: void gfs2_inplace_release(struct gfs2_inode *ip) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; - - if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) - fs_warn(sdp, "al_alloced = %u, al_requested = %u " - "al_file = %s, al_line = %u\n", - al->al_alloced, al->al_requested, al->al_file, - al->al_line); - - al->al_rgd = NULL; - if (al->al_rgd_gh.gh_gl) - gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl) - gfs2_glock_dq_uninit(&al->al_ri_gh); + struct gfs2_blkreserv *rs = ip->i_res; + + gfs2_blkrsv_put(ip); + if (rs->rs_rgd_gh.gh_gl) + gfs2_glock_dq_uninit(&rs->rs_rgd_gh); } /** @@ -1291,39 +1147,35 @@ static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) } /** - * rgblk_search - find a block in @old_state, change allocation - * state to @new_state + * rgblk_search - find a block in @state * @rgd: the resource group descriptor * @goal: the goal block within the RG (start here to search for avail block) - * @old_state: GFS2_BLKST_XXX the before-allocation state to find - * @new_state: GFS2_BLKST_XXX the after-allocation block state - * @n: The extent length + * @state: GFS2_BLKST_XXX the before-allocation state to find + * @dinode: TRUE if the first block we allocate is for a dinode + * @rbi: address of the pointer to the bitmap containing the block found * - * Walk rgrp's bitmap to find bits that represent a block in @old_state. - * Add the found bitmap buffer to the transaction. - * Set the found bits to @new_state to change block's allocation state. + * Walk rgrp's bitmap to find bits that represent a block in @state. * * This function never fails, because we wouldn't call it unless we * know (from reservation results, etc.) that a block is available. * - * Scope of @goal and returned block is just within rgrp, not the whole - * filesystem. + * Scope of @goal is just within rgrp, not the whole filesystem. + * Scope of @returned block is just within bitmap, not the whole filesystem. * - * Returns: the block number allocated + * Returns: the block number found relative to the bitmap rbi */ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, - unsigned char old_state, unsigned char new_state, - unsigned int *n) + unsigned char state, + struct gfs2_bitmap **rbi) { struct gfs2_bitmap *bi = NULL; const u32 length = rgd->rd_length; u32 blk = BFITNOENT; unsigned int buf, x; - const unsigned int elen = *n; const u8 *buffer = NULL; - *n = 0; + *rbi = NULL; /* Find bitmap block that contains bits for goal block */ for (buf = 0; buf < length; buf++) { bi = rgd->rd_bits + buf; @@ -1346,20 +1198,21 @@ do_search: bi = rgd->rd_bits + buf; if (test_bit(GBF_FULL, &bi->bi_flags) && - (old_state == GFS2_BLKST_FREE)) + (state == GFS2_BLKST_FREE)) goto skip; /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone bitmaps, so we must search the originals for that. */ buffer = bi->bi_bh->b_data + bi->bi_offset; - if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) + WARN_ON(!buffer_uptodate(bi->bi_bh)); + if (state != GFS2_BLKST_UNLINKED && bi->bi_clone) buffer = bi->bi_clone + bi->bi_offset; - blk = gfs2_bitfit(buffer, bi->bi_len, goal, old_state); + blk = gfs2_bitfit(buffer, bi->bi_len, goal, state); if (blk != BFITNOENT) break; - if ((goal == 0) && (old_state == GFS2_BLKST_FREE)) + if ((goal == 0) && (state == GFS2_BLKST_FREE)) set_bit(GBF_FULL, &bi->bi_flags); /* Try next bitmap block (wrap back to rgrp header if at end) */ @@ -1369,15 +1222,37 @@ skip: goal = 0; } - if (blk == BFITNOENT) - return blk; - *n = 1; - if (old_state == new_state) - goto out; + if (blk != BFITNOENT) + *rbi = bi; + return blk; +} + +/** + * gfs2_alloc_extent - allocate an extent from a given bitmap + * @rgd: the resource group descriptor + * @bi: the bitmap within the rgrp + * @blk: the block within the bitmap + * @dinode: TRUE if the first block we allocate is for a dinode + * @n: The extent length + * + * Add the found bitmap buffer to the transaction. + * Set the found bits to @new_state to change block's allocation state. + * Returns: starting block number of the extent (fs scope) + */ +static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi, + u32 blk, bool dinode, unsigned int *n) +{ + const unsigned int elen = *n; + u32 goal; + const u8 *buffer = NULL; + + *n = 0; + buffer = bi->bi_bh->b_data + bi->bi_offset; gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, - bi, blk, new_state); + bi, blk, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); + (*n)++; goal = blk; while (*n < elen) { goal++; @@ -1387,11 +1262,12 @@ skip: GFS2_BLKST_FREE) break; gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, - bi, goal, new_state); + bi, goal, GFS2_BLKST_USED); (*n)++; } -out: - return (bi->bi_start * GFS2_NBBY) + blk; + blk = gfs2_bi2rgd_blk(bi, blk); + rgd->rd_last_alloc = blk + *n - 1; + return rgd->rd_data0 + blk; } /** @@ -1479,125 +1355,93 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) } /** - * gfs2_alloc_block - Allocate one or more blocks + * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode * @ip: the inode to allocate the block for * @bn: Used to return the starting block number - * @n: requested number of blocks/extent length (value/result) + * @ndata: requested number of blocks/extent length (value/result) + * @dinode: 1 if we're allocating a dinode block, else 0 + * @generation: the generation number of the inode * * Returns: 0 or error */ -int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) +int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, + bool dinode, u64 *generation) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; - struct gfs2_alloc *al = ip->i_alloc; struct gfs2_rgrpd *rgd; - u32 goal, blk; - u64 block; + unsigned int ndata; + u32 goal, blk; /* block, within the rgrp scope */ + u64 block; /* block, within the file system scope */ int error; + struct gfs2_bitmap *bi; /* Only happens if there is a bug in gfs2, return something distinctive * to ensure that it is noticed. */ - if (al == NULL) + if (ip->i_res == NULL) return -ECANCELED; - rgd = al->al_rgd; + rgd = ip->i_rgd; - if (rgrp_contains_block(rgd, ip->i_goal)) + if (!dinode && rgrp_contains_block(rgd, ip->i_goal)) goal = ip->i_goal - rgd->rd_data0; else goal = rgd->rd_last_alloc; - blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); + blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi); /* Since all blocks are reserved in advance, this shouldn't happen */ if (blk == BFITNOENT) goto rgrp_error; - rgd->rd_last_alloc = blk; - block = rgd->rd_data0 + blk; - ip->i_goal = block; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error == 0) { - struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); - brelse(dibh); + block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks); + ndata = *nblocks; + if (dinode) + ndata--; + + if (!dinode) { + ip->i_goal = block + ndata - 1; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error == 0) { + struct gfs2_dinode *di = + (struct gfs2_dinode *)dibh->b_data; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di->di_goal_meta = di->di_goal_data = + cpu_to_be64(ip->i_goal); + brelse(dibh); + } } - if (rgd->rd_free < *n) - goto rgrp_error; - - rgd->rd_free -= *n; - - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - - al->al_alloced += *n; - - gfs2_statfs_change(sdp, 0, -(s64)*n, 0); - gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); - - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone -= *n; - spin_unlock(&sdp->sd_rindex_spin); - trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED); - *bn = block; - return 0; - -rgrp_error: - gfs2_rgrp_error(rgd); - return -EIO; -} - -/** - * gfs2_alloc_di - Allocate a dinode - * @dip: the directory that the inode is going in - * @bn: the block number which is allocated - * @generation: the generation number of the inode - * - * Returns: 0 on success or error - */ - -int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) -{ - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_alloc *al = dip->i_alloc; - struct gfs2_rgrpd *rgd = al->al_rgd; - u32 blk; - u64 block; - unsigned int n = 1; - - blk = rgblk_search(rgd, rgd->rd_last_alloc, - GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); - - /* Since all blocks are reserved in advance, this shouldn't happen */ - if (blk == BFITNOENT) + if (rgd->rd_free < *nblocks) goto rgrp_error; - rgd->rd_last_alloc = blk; - block = rgd->rd_data0 + blk; - if (rgd->rd_free == 0) - goto rgrp_error; - - rgd->rd_free--; - rgd->rd_dinodes++; - *generation = rgd->rd_igeneration++; - if (*generation == 0) + rgd->rd_free -= *nblocks; + if (dinode) { + rgd->rd_dinodes++; *generation = rgd->rd_igeneration++; + if (*generation == 0) + *generation = rgd->rd_igeneration++; + } + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - al->al_alloced++; + gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); + if (dinode) + gfs2_trans_add_unrevoke(sdp, block, 1); - gfs2_statfs_change(sdp, 0, -1, +1); - gfs2_trans_add_unrevoke(sdp, block, 1); + /* + * This needs reviewing to see why we cannot do the quota change + * at this point in the dinode case. + */ + if (ndata) + gfs2_quota_change(ip, ndata, ip->i_inode.i_uid, + ip->i_inode.i_gid); - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone--; - spin_unlock(&sdp->sd_rindex_spin); - trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); + rgd->rd_free_clone -= *nblocks; + trace_gfs2_block_alloc(ip, block, *nblocks, + dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); *bn = block; return 0; @@ -1629,8 +1473,6 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - gfs2_trans_add_rg(rgd); - /* Directories keep their data in the metadata address space */ if (meta || ip->i_depth) gfs2_meta_wipe(ip, bstart, blen); @@ -1666,7 +1508,6 @@ void gfs2_unlink_di(struct inode *inode) trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED); gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - gfs2_trans_add_rg(rgd); } static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) @@ -1688,7 +1529,6 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_statfs_change(sdp, 0, +1, -1); - gfs2_trans_add_rg(rgd); } @@ -1714,41 +1554,33 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) { struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh, rgd_gh; - struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); - int ri_locked = 0; + struct gfs2_holder rgd_gh; int error; - if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto fail; - ri_locked = 1; - } + error = gfs2_rindex_update(sdp); + if (error) + return error; error = -EINVAL; rgd = gfs2_blk2rgrpd(sdp, no_addr); if (!rgd) - goto fail_rindex; + goto fail; error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); if (error) - goto fail_rindex; + goto fail; if (gfs2_get_block_type(rgd, no_addr) != type) error = -ESTALE; gfs2_glock_dq_uninit(&rgd_gh); -fail_rindex: - if (ri_locked) - gfs2_glock_dq_uninit(&ri_gh); fail: return error; } /** * gfs2_rlist_add - add a RG to a list of RGs - * @sdp: the filesystem + * @ip: the inode * @rlist: the list of resource groups * @block: the block * @@ -1758,9 +1590,10 @@ fail: * */ -void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, +void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, u64 block) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; struct gfs2_rgrpd **tmp; unsigned int new_space; @@ -1769,12 +1602,15 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) return; - rgd = gfs2_blk2rgrpd(sdp, block); + if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block)) + rgd = ip->i_rgd; + else + rgd = gfs2_blk2rgrpd(sdp, block); if (!rgd) { - if (gfs2_consist(sdp)) - fs_err(sdp, "block = %llu\n", (unsigned long long)block); + fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block); return; } + ip->i_rgd = rgd; for (x = 0; x < rlist->rl_rgrps; x++) if (rlist->rl_rgd[x] == rgd) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index d253f9a..ceec910 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -18,39 +18,29 @@ struct gfs2_holder; extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); -struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); -struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); -struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); +extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); -extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); +extern int gfs2_rindex_update(struct gfs2_sbd *sdp); +extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); +extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); +extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); -extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); -extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); -extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); - -extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); - -extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); -static inline void gfs2_alloc_put(struct gfs2_inode *ip) +extern struct gfs2_qadata *gfs2_qadata_get(struct gfs2_inode *ip); +static inline void gfs2_qadata_put(struct gfs2_inode *ip) { - BUG_ON(ip->i_alloc == NULL); - kfree(ip->i_alloc); - ip->i_alloc = NULL; + BUG_ON(ip->i_qadata == NULL); + kfree(ip->i_qadata); + ip->i_qadata = NULL; } -extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, - char *file, unsigned int line); -#define gfs2_inplace_reserve(ip) \ - gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__) -#define gfs2_inplace_reserve_ri(ip) \ - gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__) - +extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested); extern void gfs2_inplace_release(struct gfs2_inode *ip); -extern int gfs2_ri_update(struct gfs2_inode *ip); -extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); -extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); +extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, + bool dinode, u64 *generation); extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); @@ -66,11 +56,14 @@ struct gfs2_rgrp_list { struct gfs2_holder *rl_ghs; }; -extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, +extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, u64 block); extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi); #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b7beadd..4553ce5 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -752,51 +752,77 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); struct backing_dev_info *bdi = metamapping->backing_dev_info; - struct gfs2_holder gh; + int ret = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + if (bdi->dirty_exceeded) + gfs2_ail1_flush(sdp, wbc); + else + filemap_fdatawrite(metamapping); + if (wbc->sync_mode == WB_SYNC_ALL) + ret = filemap_fdatawait(metamapping); + if (ret) + mark_inode_dirty_sync(inode); + return ret; +} + +/** + * gfs2_dirty_inode - check for atime updates + * @inode: The inode in question + * @flags: The type of dirty + * + * Unfortunately it can be called under any combination of inode + * glock and transaction lock, so we have to check carefully. + * + * At the moment this deals only with atime - it should be possible + * to expand that role in future, once a review of the locking has + * been carried out. + */ + +static void gfs2_dirty_inode(struct inode *inode, int flags) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *bh; - struct timespec atime; - struct gfs2_dinode *di; - int ret = -EAGAIN; - int unlock_required = 0; - - /* Skip timestamp update, if this is from a memalloc */ - if (current->flags & PF_MEMALLOC) - goto do_flush; + struct gfs2_holder gh; + int need_unlock = 0; + int need_endtrans = 0; + int ret; + + if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC))) + return; + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (ret) - goto do_flush; - unlock_required = 1; + if (ret) { + fs_err(sdp, "dirty_inode: glock %d\n", ret); + return; + } + need_unlock = 1; } - ret = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (ret) - goto do_unlock; + + if (current->journal_info == NULL) { + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) { + fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret); + goto out; + } + need_endtrans = 1; + } + ret = gfs2_meta_inode_buffer(ip, &bh); if (ret == 0) { - di = (struct gfs2_dinode *)bh->b_data; - atime.tv_sec = be64_to_cpu(di->di_atime); - atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); - if (timespec_compare(&inode->i_atime, &atime) > 0) { - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); - } + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); brelse(bh); } - gfs2_trans_end(sdp); -do_unlock: - if (unlock_required) + + if (need_endtrans) + gfs2_trans_end(sdp); +out: + if (need_unlock) gfs2_glock_dq_uninit(&gh); -do_flush: - if (wbc->sync_mode == WB_SYNC_ALL) - gfs2_log_flush(GFS2_SB(inode), ip->i_gl); - filemap_fdatawrite(metamapping); - if (bdi->dirty_exceeded) - gfs2_ail1_flush(sdp, wbc); - if (!ret && (wbc->sync_mode == WB_SYNC_ALL)) - ret = filemap_fdatawait(metamapping); - if (ret) - mark_inode_dirty_sync(inode); - return ret; } /** @@ -1011,7 +1037,6 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd, static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) { - struct gfs2_holder ri_gh; struct gfs2_rgrpd *rgd_next; struct gfs2_holder *gha, *gh; unsigned int slots = 64; @@ -1024,10 +1049,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host if (!gha) return -ENOMEM; - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto out; - rgd_next = gfs2_rgrpd_get_first(sdp); for (;;) { @@ -1070,9 +1091,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host yield(); } - gfs2_glock_dq_uninit(&ri_gh); - -out: kfree(gha); return error; } @@ -1124,6 +1142,10 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) struct gfs2_statfs_change_host sc; int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + if (gfs2_tune_get(sdp, gt_statfs_slow)) error = gfs2_statfs_slow(sdp, &sc); else @@ -1262,18 +1284,18 @@ static int is_ancestor(const struct dentry *d1, const struct dentry *d2) /** * gfs2_show_options - Show mount options for /proc/mounts * @s: seq_file structure - * @mnt: vfsmount + * @root: root of this (sub)tree * * Returns: 0 on success or error code */ -static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +static int gfs2_show_options(struct seq_file *s, struct dentry *root) { - struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; + struct gfs2_sbd *sdp = root->d_sb->s_fs_info; struct gfs2_args *args = &sdp->sd_args; int val; - if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) + if (is_ancestor(root, sdp->sd_master_dir)) seq_printf(s, ",meta"); if (args->ar_lockproto[0]) seq_printf(s, ",lockproto=%s", args->ar_lockproto); @@ -1377,8 +1399,9 @@ static void gfs2_final_release_pages(struct gfs2_inode *ip) static int gfs2_dinode_dealloc(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al; + struct gfs2_qadata *qa; struct gfs2_rgrpd *rgd; + struct gfs2_holder gh; int error; if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { @@ -1386,29 +1409,24 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) return -EIO; } - al = gfs2_alloc_get(ip); - if (!al) + qa = gfs2_qadata_get(ip); + if (!qa) return -ENOMEM; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) goto out; - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); - if (error) - goto out_qs; - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); if (!rgd) { gfs2_consist_inode(ip); error = -EIO; - goto out_rindex_relse; + goto out_qs; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, - &al->al_rgd_gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); if (error) - goto out_rindex_relse; + goto out_qs; error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, sdp->sd_jdesc->jd_blocks); @@ -1422,13 +1440,11 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) gfs2_trans_end(sdp); out_rg_gunlock: - gfs2_glock_dq_uninit(&al->al_rgd_gh); -out_rindex_relse: - gfs2_glock_dq_uninit(&al->al_ri_gh); + gfs2_glock_dq_uninit(&gh); out_qs: gfs2_quota_unhold(ip); out: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); return error; } @@ -1471,9 +1487,11 @@ static void gfs2_evict_inode(struct inode *inode) goto out; } - error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); - if (error) - goto out_truncate; + if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; + } if (test_bit(GIF_INVALID, &ip->i_flags)) { error = gfs2_inode_refresh(ip); @@ -1513,6 +1531,10 @@ static void gfs2_evict_inode(struct inode *inode) goto out_unlock; out_truncate: + gfs2_log_flush(sdp, ip->i_gl); + write_inode_now(inode, 1); + gfs2_ail_flush(ip->i_gl, 0); + /* Case 2 starts here */ error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) @@ -1552,6 +1574,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) if (ip) { ip->i_flags = 0; ip->i_gl = NULL; + ip->i_rgd = NULL; } return &ip->i_inode; } @@ -1559,7 +1582,6 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) static void gfs2_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(gfs2_inode_cachep, inode); } @@ -1572,6 +1594,7 @@ const struct super_operations gfs2_super_ops = { .alloc_inode = gfs2_alloc_inode, .destroy_inode = gfs2_destroy_inode, .write_inode = gfs2_write_inode, + .dirty_inode = gfs2_dirty_inode, .evict_inode = gfs2_evict_inode, .put_super = gfs2_put_super, .sync_fs = gfs2_sync_fs, diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 9ec73a8..86ac75d 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -185,8 +185,3 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) gfs2_log_unlock(sdp); } -void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) -{ - lops_add(rgd->rd_sbd, &rgd->rd_le); -} - diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index fb56b78..125d457 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -28,20 +28,20 @@ struct gfs2_glock; /* reserve either the number of blocks to be allocated plus the rg header * block, or all of the blocks in the rg, whichever is smaller */ -static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al) +static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip) { - return (al->al_requested < al->al_rgd->rd_length)? - al->al_requested + 1 : al->al_rgd->rd_length; + const struct gfs2_blkreserv *rs = ip->i_res; + if (rs->rs_requested < ip->i_rgd->rd_length) + return rs->rs_requested + 1; + return ip->i_rgd->rd_length; } -int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, - unsigned int revokes); +extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes); -void gfs2_trans_end(struct gfs2_sbd *sdp); - -void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); -void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); -void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); -void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); +extern void gfs2_trans_end(struct gfs2_sbd *sdp); +extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); +extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); #endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 439b61c0..e963659 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -321,29 +321,22 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, int leave) { - struct gfs2_alloc *al; + struct gfs2_qadata *qa; int error; - al = gfs2_alloc_get(ip); - if (!al) + qa = gfs2_qadata_get(ip); + if (!qa) return -ENOMEM; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) goto out_alloc; - error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); - if (error) - goto out_quota; - error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); - gfs2_glock_dq_uninit(&al->al_ri_gh); - -out_quota: gfs2_quota_unhold(ip); out_alloc: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); return error; } @@ -556,9 +549,10 @@ int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata) goto out; error = gfs2_ea_get_copy(ip, &el, data, len); - if (error == 0) - error = len; - *ppdata = data; + if (error < 0) + kfree(data); + else + *ppdata = data; out: brelse(el.el_bh); return error; @@ -616,7 +610,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) u64 block; int error; - error = gfs2_alloc_block(ip, &block, &n); + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); if (error) return error; gfs2_trans_add_unrevoke(sdp, block, 1); @@ -678,7 +672,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, int mh_size = sizeof(struct gfs2_meta_header); unsigned int n = 1; - error = gfs2_alloc_block(ip, &block, &n); + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); if (error) return error; gfs2_trans_add_unrevoke(sdp, block, 1); @@ -715,26 +709,24 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, unsigned int blks, ea_skeleton_call_t skeleton_call, void *private) { - struct gfs2_alloc *al; + struct gfs2_qadata *qa; struct buffer_head *dibh; int error; - al = gfs2_alloc_get(ip); - if (!al) + qa = gfs2_qadata_get(ip); + if (!qa) return -ENOMEM; error = gfs2_quota_lock_check(ip); if (error) goto out; - al->al_requested = blks; - - error = gfs2_inplace_reserve(ip); + error = gfs2_inplace_reserve(ip, blks); if (error) goto out_gunlock_q; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), - blks + gfs2_rg_blocks(al) + + blks + gfs2_rg_blocks(ip) + RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipres; @@ -758,7 +750,7 @@ out_ipres: out_gunlock_q: gfs2_quota_unlock(ip); out: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); return error; } @@ -998,7 +990,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, } else { u64 blk; unsigned int n = 1; - error = gfs2_alloc_block(ip, &blk, &n); + error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL); if (error) return error; gfs2_trans_add_unrevoke(sdp, blk, 1); @@ -1296,7 +1288,8 @@ fail: int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_ea_location el; int error; @@ -1319,7 +1312,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) if (error) return error; - error = gfs2_setattr_simple(ip, attr); + error = gfs2_setattr_simple(inode, attr); gfs2_trans_end(sdp); return error; } @@ -1362,14 +1355,14 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) blen++; else { if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); bstart = bn; blen = 1; } blks++; } if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); else goto out; @@ -1441,9 +1434,9 @@ out: static int ea_dealloc_block(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; struct gfs2_rgrpd *rgd; struct buffer_head *dibh; + struct gfs2_holder gh; int error; rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr); @@ -1452,8 +1445,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) return -EIO; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, - &al->al_rgd_gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); if (error) return error; @@ -1477,7 +1469,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) gfs2_trans_end(sdp); out_gunlock: - gfs2_glock_dq_uninit(&al->al_rgd_gh); + gfs2_glock_dq_uninit(&gh); return error; } @@ -1490,39 +1482,33 @@ out_gunlock: int gfs2_ea_dealloc(struct gfs2_inode *ip) { - struct gfs2_alloc *al; + struct gfs2_qadata *qa; int error; - al = gfs2_alloc_get(ip); - if (!al) + qa = gfs2_qadata_get(ip); + if (!qa) return -ENOMEM; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) goto out_alloc; - error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); - if (error) - goto out_quota; - error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); if (error) - goto out_rindex; + goto out_quota; if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { error = ea_dealloc_indirect(ip); if (error) - goto out_rindex; + goto out_quota; } error = ea_dealloc_block(ip); -out_rindex: - gfs2_glock_dq_uninit(&al->al_ri_gh); out_quota: gfs2_quota_unhold(ip); out_alloc: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); return error; } diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 3ebc437..1cbdeea 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -46,11 +46,26 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke case HFS_EXT_CNID: hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); + if (HFS_I(tree->inode)->alloc_blocks > + HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; case HFS_CAT_CNID: hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); + + if (!HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records " + "(0 size).\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; default: @@ -59,11 +74,6 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke } unlock_new_inode(tree->inode); - if (!HFS_I(tree->inode)->first_blocks) { - printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n"); - goto free_inode; - } - mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index b4d70b1..62fc14e 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -186,7 +186,7 @@ static int hfs_dir_release(struct inode *inode, struct file *file) * a directory and return a corresponding inode, given the inode for * the directory and the name (and its length) of the new file. */ -static int hfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -198,7 +198,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, int mode, res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { - inode->i_nlink = 0; + clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; @@ -216,7 +216,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, int mode, * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ -static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; int res; @@ -227,7 +227,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { - inode->i_nlink = 0; + clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index ad97c2d..1bf967c 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -184,7 +184,7 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; -extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); +extern struct inode *hfs_new_inode(struct inode *, struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); extern int hfs_write_inode(struct inode *, struct writeback_control *); extern int hfs_inode_setattr(struct dentry *, struct iattr *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 96a1b62..737dbeb 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -169,7 +169,7 @@ const struct address_space_operations hfs_aops = { /* * hfs_new_inode */ -struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) +struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode = new_inode(sb); @@ -183,7 +183,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; @@ -313,7 +313,7 @@ static int hfs_read_inode(struct inode *inode, void *data) /* Initialize the inode */ inode->i_uid = hsb->s_uid; inode->i_gid = hsb->s_gid; - inode->i_nlink = 1; + set_nlink(inode, 1); if (idata->key) HFS_I(inode)->cat_key = *idata->key; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 1b55f70..8137fb3 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -133,9 +133,9 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data) return 0; } -static int hfs_show_options(struct seq_file *seq, struct vfsmount *mnt) +static int hfs_show_options(struct seq_file *seq, struct dentry *root) { - struct hfs_sb_info *sbi = HFS_SB(mnt->mnt_sb); + struct hfs_sb_info *sbi = HFS_SB(root->d_sb); if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); @@ -170,7 +170,6 @@ static struct inode *hfs_alloc_inode(struct super_block *sb) static void hfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); } diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c index e673a88..b1ce4c7 100644 --- a/fs/hfs/trans.c +++ b/fs/hfs/trans.c @@ -40,6 +40,8 @@ int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in) src = in->name; srclen = in->len; + if (srclen > HFS_NAMELEN) + srclen = HFS_NAMELEN; dst = out; dstlen = HFS_MAX_NAMELEN; if (nls_io) { diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 25b2443..88e155f 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -415,7 +415,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, goto out; out_err: - inode->i_nlink = 0; + clear_nlink(inode); hfsplus_delete_inode(inode); iput(inode); out: @@ -424,7 +424,7 @@ out: } static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; @@ -440,7 +440,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); if (res) { - inode->i_nlink = 0; + clear_nlink(inode); hfsplus_delete_inode(inode); iput(inode); goto out; @@ -453,13 +453,13 @@ out: return res; } -static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode, +static int hfsplus_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return hfsplus_mknod(dir, dentry, mode, 0); } -static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0); } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index d7674d0..21a5b7f 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -402,7 +402,7 @@ void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *); void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *); int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *); int hfsplus_cat_write_inode(struct inode *); -struct inode *hfsplus_new_inode(struct super_block *, int); +struct inode *hfsplus_new_inode(struct super_block *, umode_t); void hfsplus_delete_inode(struct inode *); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); @@ -419,7 +419,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); int hfsplus_parse_options(char *, struct hfsplus_sb_info *); int hfsplus_parse_options_remount(char *input, int *force); void hfsplus_fill_defaults(struct hfsplus_sb_info *); -int hfsplus_show_options(struct seq_file *, struct vfsmount *); +int hfsplus_show_options(struct seq_file *, struct dentry *); /* super.c */ struct inode *hfsplus_iget(struct super_block *, unsigned long); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4cc1e3a..6643b24 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -378,7 +378,7 @@ static const struct file_operations hfsplus_file_operations = { .unlocked_ioctl = hfsplus_ioctl, }; -struct inode *hfsplus_new_inode(struct super_block *sb, int mode) +struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct inode *inode = new_inode(sb); @@ -391,7 +391,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; hip = HFSPLUS_I(inode); @@ -512,7 +512,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_folder)); hfsplus_get_perms(inode, &folder->permissions, 1); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); inode->i_atime = hfsp_mt2ut(folder->access_date); inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); @@ -532,11 +532,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? &file->rsrc_fork : &file->data_fork); hfsplus_get_perms(inode, &file->permissions, 0); - inode->i_nlink = 1; + set_nlink(inode, 1); if (S_ISREG(inode->i_mode)) { if (file->permissions.dev) - inode->i_nlink = - be32_to_cpu(file->permissions.dev); + set_nlink(inode, + be32_to_cpu(file->permissions.dev)); inode->i_op = &hfsplus_file_inode_operations; inode->i_fop = &hfsplus_file_operations; inode->i_mapping->a_ops = &hfsplus_aops; diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index fbaa669..f66c765 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -43,7 +43,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) unsigned int flags; int err = 0; - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) goto out; @@ -94,7 +94,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) out_unlock_inode: mutex_unlock(&inode->i_mutex); out_drop_write: - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); out: return err; } diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index bb62a5882..06fa561 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -206,9 +206,9 @@ done: return 1; } -int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) +int hfsplus_show_options(struct seq_file *seq, struct dentry *root) { - struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb); + struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index d24a9b6..edf0a80 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -558,7 +558,6 @@ static void hfsplus_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); } diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index bf15a43..3cbfa93 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -39,7 +39,7 @@ struct hostfs_iattr { unsigned int ia_valid; - mode_t ia_mode; + unsigned short ia_mode; uid_t ia_uid; gid_t ia_gid; loff_t ia_size; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 0d22afd..e130bd4 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -250,7 +250,6 @@ static void hostfs_evict_inode(struct inode *inode) static void hostfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kfree(HOSTFS_I(inode)); } @@ -259,9 +258,9 @@ static void hostfs_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, hostfs_i_callback); } -static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int hostfs_show_options(struct seq_file *seq, struct dentry *root) { - const char *root_path = vfs->mnt_sb->s_fs_info; + const char *root_path = root->d_sb->s_fs_info; size_t offset = strlen(root_ino) + 1; if (strlen(root_path) > offset) @@ -541,7 +540,7 @@ static int read_name(struct inode *ino, char *name) ino->i_ino = st.ino; ino->i_mode = st.mode; - ino->i_nlink = st.nlink; + set_nlink(ino, st.nlink); ino->i_uid = st.uid; ino->i_gid = st.gid; ino->i_atime = st.atime; @@ -552,7 +551,7 @@ static int read_name(struct inode *ino, char *name) return 0; } -int hostfs_create(struct inode *dir, struct dentry *dentry, int mode, +int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -677,7 +676,7 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) return err; } -int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode) +int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) { char *file; int err; @@ -701,7 +700,7 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry) return err; } -int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; char *name; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index d51a9838..dd7bc38 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -16,7 +16,6 @@ #include <sys/vfs.h> #include "hostfs.h" #include "os.h" -#include "user.h" #include <utime.h> static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 96a8ed9..2fa0089 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -247,7 +247,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name result->i_mode &= ~0111; result->i_op = &hpfs_file_iops; result->i_fop = &hpfs_file_ops; - result->i_nlink = 1; + set_nlink(result, 1); } unlock_new_inode(result); } diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 331b5e2..de94617 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -311,8 +311,8 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) /* super.c */ -void hpfs_error(struct super_block *, const char *, ...) - __attribute__((format (printf, 2, 3))); +__printf(2, 3) +void hpfs_error(struct super_block *, const char *, ...); int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); unsigned hpfs_count_one_bitmap(struct super_block *, secno); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 338cd83..3b2cec2 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -53,7 +53,7 @@ void hpfs_read_inode(struct inode *i) i->i_mode &= ~0111; i->i_op = &hpfs_file_iops; i->i_fop = &hpfs_file_ops; - i->i_nlink = 0;*/ + clear_nlink(i);*/ make_bad_inode(i); return; } @@ -77,7 +77,7 @@ void hpfs_read_inode(struct inode *i) i->i_mode = S_IFLNK | 0777; i->i_op = &page_symlink_inode_operations; i->i_data.a_ops = &hpfs_symlink_aops; - i->i_nlink = 1; + set_nlink(i, 1); i->i_size = ea_size; i->i_blocks = 1; brelse(bh); @@ -101,7 +101,7 @@ void hpfs_read_inode(struct inode *i) } if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { brelse(bh); - i->i_nlink = 1; + set_nlink(i, 1); i->i_size = 0; i->i_blocks = 1; init_special_inode(i, mode, @@ -125,13 +125,13 @@ void hpfs_read_inode(struct inode *i) hpfs_count_dnodes(i->i_sb, hpfs_inode->i_dno, &n_dnodes, &n_subdirs, NULL); i->i_blocks = 4 * n_dnodes; i->i_size = 2048 * n_dnodes; - i->i_nlink = 2 + n_subdirs; + set_nlink(i, 2 + n_subdirs); } else { i->i_mode |= S_IFREG; if (!hpfs_inode->i_ea_mode) i->i_mode &= ~0111; i->i_op = &hpfs_file_iops; i->i_fop = &hpfs_file_ops; - i->i_nlink = 1; + set_nlink(i, 1); i->i_size = le32_to_cpu(fnode->file_size); i->i_blocks = ((i->i_size + 511) >> 9) + 1; i->i_data.a_ops = &hpfs_aops; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 2df69e2..30dd7b1 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -8,7 +8,7 @@ #include <linux/sched.h> #include "hpfs_fn.h" -static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -56,7 +56,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) result->i_fop = &hpfs_dir_ops; result->i_blocks = 4; result->i_size = 2048; - result->i_nlink = 2; + set_nlink(result, 2); if (dee.read_only) result->i_mode &= ~0222; @@ -115,7 +115,7 @@ bail: return err; } -static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -150,7 +150,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc result->i_mode &= ~0111; result->i_op = &hpfs_file_iops; result->i_fop = &hpfs_file_ops; - result->i_nlink = 1; + set_nlink(result, 1); hpfs_i(result)->i_parent_dir = dir->i_ino; result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); result->i_ctime.tv_nsec = 0; @@ -201,7 +201,7 @@ bail: return err; } -static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -242,7 +242,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t hpfs_i(result)->i_ea_size = 0; result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); - result->i_nlink = 1; + set_nlink(result, 1); result->i_size = 0; result->i_blocks = 1; init_special_inode(result, mode, rdev); @@ -318,7 +318,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); result->i_blocks = 1; - result->i_nlink = 1; + set_nlink(result, 1); result->i_size = strlen(symlink); result->i_op = &page_symlink_inode_operations; result->i_data.a_ops = &hpfs_symlink_aops; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 98580a3..3690467c9 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -181,7 +181,6 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb) static void hpfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); } diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 970ea98..d92f4ce 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -622,7 +622,6 @@ void hppfs_evict_inode(struct inode *ino) static void hppfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kfree(HPPFS_I(inode)); } @@ -702,7 +701,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) inode->i_ctime = proc_ino->i_ctime; inode->i_ino = proc_ino->i_ino; inode->i_mode = proc_ino->i_mode; - inode->i_nlink = proc_ino->i_nlink; + set_nlink(inode, proc_ino->i_nlink); inode->i_size = proc_ino->i_size; inode->i_blocks = proc_ino->i_blocks; @@ -726,7 +725,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) sb->s_fs_info = proc_mnt; err = -ENOMEM; - root_inode = get_inode(sb, dget(proc_mnt->mnt_sb->s_root)); + root_inode = get_inode(sb, dget(proc_mnt->mnt_root)); if (!root_inode) goto out_mntput; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ec88953..e425ad9 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -447,8 +447,8 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) return 0; } -static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, - gid_t gid, int mode, dev_t dev) +static struct inode *hugetlbfs_get_root(struct super_block *sb, + struct hugetlbfs_config *config) { struct inode *inode; @@ -456,9 +456,31 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, if (inode) { struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); - inode->i_mode = mode; - inode->i_uid = uid; - inode->i_gid = gid; + inode->i_mode = S_IFDIR | config->mode; + inode->i_uid = config->uid; + inode->i_gid = config->gid; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + info = HUGETLBFS_I(inode); + mpol_shared_policy_init(&info->policy, NULL); + inode->i_op = &hugetlbfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + } + return inode; +} + +static struct inode *hugetlbfs_get_inode(struct super_block *sb, + struct inode *dir, + umode_t mode, dev_t dev) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode) { + struct hugetlbfs_inode_info *info; + inode->i_ino = get_next_ino(); + inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -500,20 +522,12 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, * File creation. Allocate an inode, and we're done.. */ static int hugetlbfs_mknod(struct inode *dir, - struct dentry *dentry, int mode, dev_t dev) + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error = -ENOSPC; - gid_t gid; - - if (dir->i_mode & S_ISGID) { - gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - gid = current_fsgid(); - } - inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(), gid, mode, dev); + + inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (inode) { dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); @@ -523,7 +537,7 @@ static int hugetlbfs_mknod(struct inode *dir, return error; } -static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); if (!retval) @@ -531,7 +545,7 @@ static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return retval; } -static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); } @@ -541,15 +555,8 @@ static int hugetlbfs_symlink(struct inode *dir, { struct inode *inode; int error = -ENOSPC; - gid_t gid; - - if (dir->i_mode & S_ISGID) - gid = dir->i_gid; - else - gid = current_fsgid(); - inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(), - gid, S_IFLNK|S_IRWXUGO, 0); + inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); @@ -666,7 +673,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) static void hugetlbfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } @@ -858,8 +864,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; - inode = hugetlbfs_get_inode(sb, config.uid, config.gid, - S_IFDIR | config.mode, 0); + inode = hugetlbfs_get_root(sb, &config); if (!inode) goto out_free; @@ -957,8 +962,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, path.mnt = mntget(hugetlbfs_vfsmount); error = -ENOSPC; - inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(), - current_fsgid(), S_IFREG | S_IRWXUGO, 0); + inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out_dentry; @@ -970,7 +974,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, d_instantiate(path.dentry, inode); inode->i_size = size; - inode->i_nlink = 0; + clear_nlink(inode); error = -ENFILE; file = alloc_file(&path, FMODE_WRITE | FMODE_READ, @@ -26,6 +26,7 @@ #include <linux/ima.h> #include <linux/cred.h> #include <linux/buffer_head.h> /* for inode_has_buffers */ +#include <linux/ratelimit.h> #include "internal.h" /* @@ -142,7 +143,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &empty_fops; - inode->i_nlink = 1; + inode->__i_nlink = 1; inode->i_opflags = 0; inode->i_uid = 0; inode->i_gid = 0; @@ -191,6 +192,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) } inode->i_private = NULL; inode->i_mapping = mapping; + INIT_LIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; #endif @@ -241,6 +243,11 @@ void __destroy_inode(struct inode *inode) BUG_ON(inode_has_buffers(inode)); security_inode_free(inode); fsnotify_inode_delete(inode); + if (!inode->i_nlink) { + WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); + atomic_long_dec(&inode->i_sb->s_remove_count); + } + #ifdef CONFIG_FS_POSIX_ACL if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) posix_acl_release(inode->i_acl); @@ -254,7 +261,6 @@ EXPORT_SYMBOL(__destroy_inode); static void i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(inode_cachep, inode); } @@ -268,6 +274,85 @@ static void destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, i_callback); } +/** + * drop_nlink - directly drop an inode's link count + * @inode: inode + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. In cases + * where we are attempting to track writes to the + * filesystem, a decrement to zero means an imminent + * write when the file is truncated and actually unlinked + * on the filesystem. + */ +void drop_nlink(struct inode *inode) +{ + WARN_ON(inode->i_nlink == 0); + inode->__i_nlink--; + if (!inode->i_nlink) + atomic_long_inc(&inode->i_sb->s_remove_count); +} +EXPORT_SYMBOL(drop_nlink); + +/** + * clear_nlink - directly zero an inode's link count + * @inode: inode + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. See + * drop_nlink() for why we care about i_nlink hitting zero. + */ +void clear_nlink(struct inode *inode) +{ + if (inode->i_nlink) { + inode->__i_nlink = 0; + atomic_long_inc(&inode->i_sb->s_remove_count); + } +} +EXPORT_SYMBOL(clear_nlink); + +/** + * set_nlink - directly set an inode's link count + * @inode: inode + * @nlink: new nlink (should be non-zero) + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. + */ +void set_nlink(struct inode *inode, unsigned int nlink) +{ + if (!nlink) { + printk_ratelimited(KERN_INFO + "set_nlink() clearing i_nlink on %s inode %li\n", + inode->i_sb->s_type->name, inode->i_ino); + clear_nlink(inode); + } else { + /* Yes, some filesystems do change nlink from zero to one */ + if (inode->i_nlink == 0) + atomic_long_dec(&inode->i_sb->s_remove_count); + + inode->__i_nlink = nlink; + } +} +EXPORT_SYMBOL(set_nlink); + +/** + * inc_nlink - directly increment an inode's link count + * @inode: inode + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. Currently, + * it is only here for parity with dec_nlink(). + */ +void inc_nlink(struct inode *inode) +{ + if (WARN_ON(inode->i_nlink == 0)) + atomic_long_dec(&inode->i_sb->s_remove_count); + + inode->__i_nlink++; +} +EXPORT_SYMBOL(inc_nlink); + void address_space_init_once(struct address_space *mapping) { memset(mapping, 0, sizeof(*mapping)); @@ -290,7 +375,6 @@ void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); - INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); @@ -634,7 +718,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) * inode to the back of the list so we don't spin on it. */ if (!spin_trylock(&inode->i_lock)) { - list_move(&inode->i_lru, &sb->s_inode_lru); + list_move_tail(&inode->i_lru, &sb->s_inode_lru); continue; } @@ -1508,7 +1592,7 @@ void file_update_time(struct file *file) if (sync_it & S_MTIME) inode->i_mtime = now; mark_inode_dirty_sync(inode); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); } EXPORT_SYMBOL(file_update_time); @@ -1647,7 +1731,7 @@ EXPORT_SYMBOL(init_special_inode); * @mode: mode of the new inode */ void inode_init_owner(struct inode *inode, const struct inode *dir, - mode_t mode) + umode_t mode) { inode->i_uid = current_fsuid(); if (dir && dir->i_mode & S_ISGID) { diff --git a/fs/internal.h b/fs/internal.h index fe327c2..9962c59 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -15,19 +15,14 @@ struct super_block; struct file_system_type; struct linux_binprm; struct path; +struct mount; /* * block_dev.c */ #ifdef CONFIG_BLOCK -extern struct super_block *blockdev_superblock; extern void __init bdev_cache_init(void); -static inline int sb_is_blkdev_sb(struct super_block *sb) -{ - return sb == blockdev_superblock; -} - extern int __sync_blockdev(struct block_device *bdev, int wait); #else @@ -35,11 +30,6 @@ static inline void bdev_cache_init(void) { } -static inline int sb_is_blkdev_sb(struct super_block *sb) -{ - return 0; -} - static inline int __sync_blockdev(struct block_device *bdev, int wait) { return 0; @@ -52,28 +42,17 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) extern void __init chrdev_init(void); /* - * exec.c - */ -extern int check_unsafe_exec(struct linux_binprm *); - -/* * namespace.c */ extern int copy_mount_options(const void __user *, unsigned long *); extern int copy_mount_string(const void __user *, char **); -extern unsigned int mnt_get_count(struct vfsmount *mnt); -extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); extern struct vfsmount *lookup_mnt(struct path *); -extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, - struct vfsmount *); -extern void release_mounts(struct list_head *); -extern void umount_tree(struct vfsmount *, int, struct list_head *); -extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern int finish_automount(struct vfsmount *, struct path *); extern void mnt_make_longterm(struct vfsmount *); extern void mnt_make_shortterm(struct vfsmount *); +extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); @@ -98,10 +77,9 @@ extern struct file *get_empty_filp(void); */ extern int do_remount_sb(struct super_block *, int, void *, int); extern bool grab_super_passive(struct super_block *sb); -extern void __put_super(struct super_block *sb); -extern void put_super(struct super_block *sb); extern struct dentry *mount_fs(struct file_system_type *, int, const char *, void *); +extern struct super_block *user_get_super(dev_t); /* * open.c @@ -111,7 +89,7 @@ extern struct file *nameidata_to_filp(struct nameidata *); extern void release_open_intent(struct nameidata *); struct open_flags { int open_flag; - int mode; + umode_t mode; int acc_mode; int intent; }; @@ -42,7 +42,7 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd, error = filp->f_op->unlocked_ioctl(filp, cmd, arg); if (error == -ENOIOCTLCMD) - error = -EINVAL; + error = -ENOTTY; out: return error; } diff --git a/fs/ioprio.c b/fs/ioprio.c index 7da2a06..f79dab8 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -21,6 +21,7 @@ */ #include <linux/gfp.h> #include <linux/kernel.h> +#include <linux/export.h> #include <linux/ioprio.h> #include <linux/blkdev.h> #include <linux/capability.h> diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index a5d0367..bd62c76 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -20,6 +20,7 @@ #include <linux/statfs.h> #include <linux/cdrom.h> #include <linux/parser.h> +#include <linux/mpage.h> #include "isofs.h" #include "zisofs.h" @@ -84,7 +85,6 @@ static struct inode *isofs_alloc_inode(struct super_block *sb) static void isofs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } @@ -169,8 +169,8 @@ struct iso9660_options{ unsigned char map; unsigned char check; unsigned int blocksize; - mode_t fmode; - mode_t dmode; + umode_t fmode; + umode_t dmode; gid_t gid; uid_t uid; char *iocharset; @@ -948,8 +948,11 @@ root_found: /* get the root dentry */ s->s_root = d_alloc_root(inode); - if (!(s->s_root)) - goto out_no_root; + if (!(s->s_root)) { + iput(inode); + error = -ENOMEM; + goto out_no_inode; + } kfree(opt.iocharset); @@ -1148,7 +1151,13 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block) static int isofs_readpage(struct file *file, struct page *page) { - return block_read_full_page(page,isofs_get_block); + return mpage_readpage(page, isofs_get_block); +} + +static int isofs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) @@ -1158,6 +1167,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) static const struct address_space_operations isofs_aops = { .readpage = isofs_readpage, + .readpages = isofs_readpages, .bmap = _isofs_bmap }; @@ -1319,7 +1329,7 @@ static int isofs_read_inode(struct inode *inode) inode->i_mode = S_IFDIR | sbi->s_dmode; else inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - inode->i_nlink = 1; /* + set_nlink(inode, 1); /* * Set to 1. We know there are 2, but * the find utility tries to optimize * if it is 2, and it screws up. It is @@ -1337,7 +1347,7 @@ static int isofs_read_inode(struct inode *inode) */ inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; } - inode->i_nlink = 1; + set_nlink(inode, 1); } inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 7d33de8..0e73f63 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -50,14 +50,14 @@ struct isofs_sb_info { unsigned int s_uid_set:1; unsigned int s_gid_set:1; - mode_t s_fmode; - mode_t s_dmode; + umode_t s_fmode; + umode_t s_dmode; gid_t s_gid; uid_t s_uid; struct nls_table *s_nls_iocharset; /* Native language support table */ }; -#define ISOFS_INVALID_MODE ((mode_t) -1) +#define ISOFS_INVALID_MODE ((umode_t) -1) static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb) { diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 1fbc7de..70e79d0 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -363,7 +363,7 @@ repeat: break; case SIG('P', 'X'): inode->i_mode = isonum_733(rr->u.PX.mode); - inode->i_nlink = isonum_733(rr->u.PX.n_links); + set_nlink(inode, isonum_733(rr->u.PX.n_links)); inode->i_uid = isonum_733(rr->u.PX.uid); inode->i_gid = isonum_733(rr->u.PX.gid); break; @@ -496,7 +496,7 @@ repeat: goto out; } inode->i_mode = reloc->i_mode; - inode->i_nlink = reloc->i_nlink; + set_nlink(inode, reloc->i_nlink); inode->i_uid = reloc->i_uid; inode->i_gid = reloc->i_gid; inode->i_rdev = reloc->i_rdev; diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index f94fc48..5d1a00a 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -537,7 +537,7 @@ int cleanup_journal_tail(journal_t *journal) * them. * * Called with j_list_lock held. - * Returns number of bufers reaped (for debug) + * Returns number of buffers reaped (for debug) */ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 8799207..f2b9a57 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -392,6 +392,12 @@ void journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 1\n"); /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + journal_clear_buffer_revoked_flags(journal); + + /* * Switch to a new revoke table. */ journal_switch_revoke_table(journal); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 9fe061f..59c09f9 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -166,7 +166,7 @@ loop: */ jbd_debug(1, "Now suspending kjournald\n"); spin_unlock(&journal->j_state_lock); - refrigerator(); + try_to_freeze(); spin_lock(&journal->j_state_lock); } else { /* @@ -721,7 +721,6 @@ static journal_t * journal_init_common (void) init_waitqueue_head(&journal->j_wait_checkpoint); init_waitqueue_head(&journal->j_wait_commit); init_waitqueue_head(&journal->j_wait_updates); - mutex_init(&journal->j_barrier); mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); @@ -1135,6 +1134,14 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + return 0; out: diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 305a907..25c713e 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -47,6 +47,10 @@ * overwriting the new data. We don't even need to clear the revoke * bit here. * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. + * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up @@ -479,6 +483,36 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) return did_revoke; } +/* + * journal_clear_revoked_flags clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffer in the next + * transaction which is going to be started. + */ +void journal_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd_revoke_record_s *)list_entry; + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 7e59c6e..7fce94b 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -426,17 +426,34 @@ int journal_restart(handle_t *handle, int nblocks) * void journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. * - * This locks out any further updates from being started, and blocks - * until all existing updates have completed, returning only once the - * journal is in a quiescent state with no updates running. - * - * The journal lock should not be held on entry. + * This locks out any further updates from being started, and blocks until all + * existing updates have completed, returning only once the journal is in a + * quiescent state with no updates running. + * + * We do not use simple mutex for synchronization as there are syscalls which + * want to return with filesystem locked and that trips up lockdep. Also + * hibernate needs to lock filesystem but locked mutex then blocks hibernation. + * Since locking filesystem is rare operation, we use simple counter and + * waitqueue for locking. */ void journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); +wait: + /* Wait for previous locked operation to finish */ + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + spin_lock(&journal->j_state_lock); + /* + * Check reliably under the lock whether we are the ones winning the race + * and locking the journal + */ + if (journal->j_barrier_count > 0) { + spin_unlock(&journal->j_state_lock); + goto wait; + } ++journal->j_barrier_count; /* Wait until there are no running updates */ @@ -460,14 +477,6 @@ void journal_lock_updates(journal_t *journal) spin_lock(&journal->j_state_lock); } spin_unlock(&journal->j_state_lock); - - /* - * We have now established a barrier against other normal updates, but - * we also need to barrier against other journal_lock_updates() calls - * to make sure that we serialise special journal-locked operations - * too. - */ - mutex_lock(&journal->j_barrier); } /** @@ -475,14 +484,11 @@ void journal_lock_updates(journal_t *journal) * @journal: Journal to release the barrier on. * * Release a transaction barrier obtained with journal_lock_updates(). - * - * Should be called without the journal lock held. */ void journal_unlock_updates (journal_t *journal) { J_ASSERT(journal->j_barrier_count != 0); - mutex_unlock(&journal->j_barrier); spin_lock(&journal->j_state_lock); --journal->j_barrier_count; spin_unlock(&journal->j_state_lock); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 16a698b..d49d202 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -565,7 +565,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * * Called with the journal locked. * Called with j_list_lock held. - * Returns number of bufers reaped (for debug) + * Returns number of buffers reaped (for debug) */ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index eef6979..68d704d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -352,7 +352,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_state == T_RUNNING); trace_jbd2_start_commit(journal, commit_transaction); - jbd_debug(1, "JBD: starting commit of transaction %d\n", + jbd_debug(1, "JBD2: starting commit of transaction %d\n", commit_transaction->t_tid); write_lock(&journal->j_state_lock); @@ -427,7 +427,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) __jbd2_journal_clean_checkpoint_list(journal); spin_unlock(&journal->j_list_lock); - jbd_debug (3, "JBD: commit phase 1\n"); + jbd_debug(3, "JBD2: commit phase 1\n"); /* * Switch to a new revoke table. @@ -447,7 +447,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) wake_up(&journal->j_wait_transaction_locked); write_unlock(&journal->j_state_lock); - jbd_debug (3, "JBD: commit phase 2\n"); + jbd_debug(3, "JBD2: commit phase 2\n"); /* * Now start flushing things to disk, in the order they appear @@ -462,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) WRITE_SYNC); blk_finish_plug(&plug); - jbd_debug(3, "JBD: commit phase 2\n"); + jbd_debug(3, "JBD2: commit phase 2\n"); /* * Way to go: we have now written out all of the data for a @@ -522,7 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT (bufs == 0); - jbd_debug(4, "JBD: get descriptor\n"); + jbd_debug(4, "JBD2: get descriptor\n"); descriptor = jbd2_journal_get_descriptor_buffer(journal); if (!descriptor) { @@ -531,7 +531,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) } bh = jh2bh(descriptor); - jbd_debug(4, "JBD: got buffer %llu (%p)\n", + jbd_debug(4, "JBD2: got buffer %llu (%p)\n", (unsigned long long)bh->b_blocknr, bh->b_data); header = (journal_header_t *)&bh->b_data[0]; header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); @@ -625,7 +625,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_buffers == NULL || space_left < tag_bytes + 16) { - jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + jbd_debug(4, "JBD2: Submit %d IOs\n", bufs); /* Write an end-of-descriptor marker before submitting the IOs. "tag" still points to @@ -707,7 +707,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD: commit phase 3\n"); + jbd_debug(3, "JBD2: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. @@ -771,7 +771,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD2: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -801,7 +801,7 @@ wait_for_iobuf: if (err) jbd2_journal_abort(journal, err); - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD2: commit phase 5\n"); write_lock(&journal->j_state_lock); J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); commit_transaction->t_state = T_COMMIT_JFLUSH; @@ -830,7 +830,7 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 6\n"); + jbd_debug(3, "JBD2: commit phase 6\n"); J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); @@ -964,7 +964,7 @@ restart_loop: /* Done with this transaction! */ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD2: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); @@ -1039,7 +1039,7 @@ restart_loop: journal->j_commit_callback(journal, commit_transaction); trace_jbd2_end_commit(journal, commit_transaction); - jbd_debug(1, "JBD: commit %d complete, head %d\n", + jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); if (to_free) kfree(commit_transaction); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f24df13..c0a5f9f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -173,7 +173,7 @@ loop: */ jbd_debug(1, "Now suspending kjournald2\n"); write_unlock(&journal->j_state_lock); - refrigerator(); + try_to_freeze(); write_lock(&journal->j_state_lock); } else { /* @@ -491,7 +491,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) */ journal->j_commit_request = target; - jbd_debug(1, "JBD: requesting commit %d/%d\n", + jbd_debug(1, "JBD2: requesting commit %d/%d\n", journal->j_commit_request, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); @@ -500,7 +500,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) /* This should never happen, but if it does, preserve the evidence before kjournald goes into a loop and increments j_commit_sequence beyond all recognition. */ - WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n", journal->j_commit_request, journal->j_commit_sequence, target, journal->j_running_transaction ? @@ -645,7 +645,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) } #endif while (tid_gt(tid, journal->j_commit_sequence)) { - jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", + jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); read_unlock(&journal->j_state_lock); @@ -1093,7 +1093,7 @@ static int journal_reset(journal_t *journal) first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { - printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", + printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n", first, last); journal_fail_superblock(journal); return -EINVAL; @@ -1139,7 +1139,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) */ if (sb->s_start == 0 && journal->j_tail_sequence == journal->j_transaction_sequence) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " "(start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); @@ -1163,7 +1163,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) } read_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", + jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); @@ -1216,8 +1216,8 @@ static int journal_get_superblock(journal_t *journal) ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - printk (KERN_ERR - "JBD: IO error reading journal superblock\n"); + printk(KERN_ERR + "JBD2: IO error reading journal superblock\n"); goto out; } } @@ -1228,7 +1228,7 @@ static int journal_get_superblock(journal_t *journal) if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { - printk(KERN_WARNING "JBD: no valid journal superblock found\n"); + printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); goto out; } @@ -1240,14 +1240,22 @@ static int journal_get_superblock(journal_t *journal) journal->j_format_version = 2; break; default: - printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); + printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); goto out; } if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) journal->j_maxlen = be32_to_cpu(sb->s_maxlen); else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { - printk (KERN_WARNING "JBD: journal file too short\n"); + printk(KERN_WARNING "JBD2: journal file too short\n"); + goto out; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD2: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); goto out; } @@ -1310,8 +1318,8 @@ int jbd2_journal_load(journal_t *journal) ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || (sb->s_feature_incompat & ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { - printk (KERN_WARNING - "JBD: Unrecognised features on journal\n"); + printk(KERN_WARNING + "JBD2: Unrecognised features on journal\n"); return -EINVAL; } } @@ -1346,7 +1354,7 @@ int jbd2_journal_load(journal_t *journal) return 0; recovery_error: - printk (KERN_WARNING "JBD: recovery failed\n"); + printk(KERN_WARNING "JBD2: recovery failed\n"); return -EIO; } @@ -1577,7 +1585,7 @@ static int journal_convert_superblock_v1(journal_t *journal, struct buffer_head *bh; printk(KERN_WARNING - "JBD: Converting superblock from version 1 to 2.\n"); + "JBD2: Converting superblock from version 1 to 2.\n"); /* Pre-initialise new fields to zero */ offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); @@ -1694,7 +1702,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (!journal->j_tail) goto no_recovery; - printk (KERN_WARNING "JBD: %s recovery information on journal\n", + printk(KERN_WARNING "JBD2: %s recovery information on journal\n", write ? "Clearing" : "Ignoring"); err = jbd2_journal_skip_recovery(journal); @@ -2020,7 +2028,7 @@ static int journal_init_jbd2_journal_head_cache(void) retval = 0; if (!jbd2_journal_head_cache) { retval = -ENOMEM; - printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); + printk(KERN_EMERG "JBD2: no memory for journal_head cache\n"); } return retval; } @@ -2383,7 +2391,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD2_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n); #endif jbd2_remove_debugfs_entry(); jbd2_remove_jbd_stats_proc_entry(); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 1cad869..da6d7ba 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -89,7 +89,7 @@ static int do_readahead(journal_t *journal, unsigned int start) err = jbd2_journal_bmap(journal, next, &blocknr); if (err) { - printk (KERN_ERR "JBD: bad block at offset %u\n", + printk(KERN_ERR "JBD2: bad block at offset %u\n", next); goto failed; } @@ -138,14 +138,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal, *bhp = NULL; if (offset >= journal->j_maxlen) { - printk(KERN_ERR "JBD: corrupted journal superblock\n"); + printk(KERN_ERR "JBD2: corrupted journal superblock\n"); return -EIO; } err = jbd2_journal_bmap(journal, offset, &blocknr); if (err) { - printk (KERN_ERR "JBD: bad block at offset %u\n", + printk(KERN_ERR "JBD2: bad block at offset %u\n", offset); return err; } @@ -163,7 +163,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, } if (!buffer_uptodate(bh)) { - printk (KERN_ERR "JBD: Failed to read block at offset %u\n", + printk(KERN_ERR "JBD2: Failed to read block at offset %u\n", offset); brelse(bh); return -EIO; @@ -251,10 +251,10 @@ int jbd2_journal_recover(journal_t *journal) if (!err) err = do_one_pass(journal, &info, PASS_REPLAY); - jbd_debug(1, "JBD: recovery, exit status %d, " + jbd_debug(1, "JBD2: recovery, exit status %d, " "recovered transactions %u to %u\n", err, info.start_transaction, info.end_transaction); - jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", + jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", info.nr_replays, info.nr_revoke_hits, info.nr_revokes); /* Restart the log at the next transaction ID, thus invalidating @@ -293,14 +293,14 @@ int jbd2_journal_skip_recovery(journal_t *journal) err = do_one_pass(journal, &info, PASS_SCAN); if (err) { - printk(KERN_ERR "JBD: error %d scanning journal\n", err); + printk(KERN_ERR "JBD2: error %d scanning journal\n", err); ++journal->j_transaction_sequence; } else { #ifdef CONFIG_JBD2_DEBUG int dropped = info.end_transaction - be32_to_cpu(journal->j_superblock->s_sequence); jbd_debug(1, - "JBD: ignoring %d transaction%s from the journal.\n", + "JBD2: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); #endif journal->j_transaction_sequence = ++info.end_transaction; @@ -338,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh, wrap(journal, *next_log_block); err = jread(&obh, journal, io_block); if (err) { - printk(KERN_ERR "JBD: IO error %d recovering block " + printk(KERN_ERR "JBD2: IO error %d recovering block " "%lu in log\n", err, io_block); return 1; } else { @@ -411,7 +411,7 @@ static int do_one_pass(journal_t *journal, * either the next descriptor block or the final commit * record. */ - jbd_debug(3, "JBD: checking block %ld\n", next_log_block); + jbd_debug(3, "JBD2: checking block %ld\n", next_log_block); err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -491,8 +491,8 @@ static int do_one_pass(journal_t *journal, /* Recover what we can, but * report failure at the end. */ success = err; - printk (KERN_ERR - "JBD: IO error %d recovering " + printk(KERN_ERR + "JBD2: IO error %d recovering " "block %ld in log\n", err, io_block); } else { @@ -520,7 +520,7 @@ static int do_one_pass(journal_t *journal, journal->j_blocksize); if (nbh == NULL) { printk(KERN_ERR - "JBD: Out of memory " + "JBD2: Out of memory " "during recovery.\n"); err = -ENOMEM; brelse(bh); @@ -689,7 +689,7 @@ static int do_one_pass(journal_t *journal, /* It's really bad news if different passes end up at * different places (but possible due to IO errors). */ if (info->end_transaction != next_commit_ID) { - printk (KERN_ERR "JBD: recovery pass %d ended at " + printk(KERN_ERR "JBD2: recovery pass %d ended at " "transaction %u, expected %u\n", pass, next_commit_ID, info->end_transaction); if (!success) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 2d71094..a0e41a4 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -27,6 +27,7 @@ #include <linux/highmem.h> #include <linux/hrtimer.h> #include <linux/backing-dev.h> +#include <linux/bug.h> #include <linux/module.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -115,7 +116,7 @@ static inline void update_t_max_wait(transaction_t *transaction, */ static int start_this_handle(journal_t *journal, handle_t *handle, - int gfp_mask) + gfp_t gfp_mask) { transaction_t *transaction, *new_transaction = NULL; tid_t tid; @@ -124,7 +125,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle, unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { - printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", + printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", current->comm, nblocks, journal->j_max_transaction_buffers); return -ENOSPC; @@ -320,7 +321,7 @@ static handle_t *new_handle(int nblocks) * Return a pointer to a newly allocated handle, or an ERR_PTR() value * on failure. */ -handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) { handle_t *handle = journal_current_handle(); int err; @@ -443,7 +444,7 @@ out: * transaction capabable of guaranteeing the requested number of * credits. */ -int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) +int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; @@ -563,7 +564,7 @@ static void warn_dirty_buffer(struct buffer_head *bh) char b[BDEVNAME_SIZE]; printk(KERN_WARNING - "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " "There's a risk of filesystem corruption in case of system " "crash.\n", bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); @@ -1049,6 +1050,10 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, * mark dirty metadata which needs to be journaled as part of the current * transaction. * + * The buffer must have previously had jbd2_journal_get_write_access() + * called so that it has a valid journal_head attached to the buffer + * head. + * * The buffer is placed on the transaction's metadata list and is marked * as belonging to the transaction. * @@ -1065,11 +1070,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; struct journal_head *jh = bh2jh(bh); + int ret = 0; jbd_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); if (is_handle_aborted(handle)) goto out; + if (!buffer_jbd(bh)) { + ret = -EUCLEAN; + goto out; + } jbd_lock_bh_state(bh); @@ -1093,8 +1103,20 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { JBUFFER_TRACE(jh, "fastpath"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_running_transaction); + if (unlikely(jh->b_transaction != + journal->j_running_transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_transaction (%llu, %p, %u) != " + "journal->j_running_transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_transaction, + jh->b_transaction ? jh->b_transaction->t_tid : 0, + journal->j_running_transaction, + journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); + ret = -EINVAL; + } goto out_unlock_bh; } @@ -1108,9 +1130,32 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + if (unlikely(jh->b_transaction != + journal->j_committing_transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_transaction (%llu, %p, %u) != " + "journal->j_committing_transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_transaction, + jh->b_transaction ? jh->b_transaction->t_tid : 0, + journal->j_committing_transaction, + journal->j_committing_transaction ? + journal->j_committing_transaction->t_tid : 0); + ret = -EINVAL; + } + if (unlikely(jh->b_next_transaction != transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_next_transaction (%llu, %p, %u) != " + "transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_next_transaction, + jh->b_next_transaction ? + jh->b_next_transaction->t_tid : 0, + transaction, transaction->t_tid); + ret = -EINVAL; + } /* And this case is illegal: we can't reuse another * transaction's data buffer, ever. */ goto out_unlock_bh; @@ -1127,7 +1172,8 @@ out_unlock_bh: jbd_unlock_bh_state(bh); out: JBUFFER_TRACE(jh, "exit"); - return 0; + WARN_ON(ret); /* All errors are bugs, so dump the stack */ + return ret; } /* diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index de42470..5b6c9d1 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -53,6 +53,78 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this, return 0; } +/* + * jffs2_selected_compress: + * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB). + * If 0, just take the first available compression mode. + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data + * @datalen: On entry, holds the amount of data available for compression. + * On exit, expected to hold the amount of data actually compressed. + * @cdatalen: On entry, holds the amount of space available for compressed + * data. On exit, expected to hold the actual size of the compressed + * data. + * + * Returns: the compression type used. Zero is used to show that the data + * could not be compressed; probably because we couldn't find the requested + * compression mode. + */ +static int jffs2_selected_compress(u8 compr, unsigned char *data_in, + unsigned char **cpage_out, u32 *datalen, u32 *cdatalen) +{ + struct jffs2_compressor *this; + int err, ret = JFFS2_COMPR_NONE; + uint32_t orig_slen, orig_dlen; + char *output_buf; + + output_buf = kmalloc(*cdatalen, GFP_KERNEL); + if (!output_buf) { + printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); + return ret; + } + orig_slen = *datalen; + orig_dlen = *cdatalen; + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + /* Skip decompress-only and disabled modules */ + if (!this->compress || this->disabled) + continue; + + /* Skip if not the desired compression type */ + if (compr && (compr != this->compr)) + continue; + + /* + * Either compression type was unspecified, or we found our + * compressor; either way, we're good to go. + */ + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + + *datalen = orig_slen; + *cdatalen = orig_dlen; + err = this->compress(data_in, output_buf, datalen, cdatalen); + + spin_lock(&jffs2_compressor_list_lock); + this->usecount--; + if (!err) { + /* Success */ + ret = this->compr; + this->stat_compr_blocks++; + this->stat_compr_orig_size += *datalen; + this->stat_compr_new_size += *cdatalen; + break; + } + } + spin_unlock(&jffs2_compressor_list_lock); + if (ret == JFFS2_COMPR_NONE) + kfree(output_buf); + else + *cpage_out = output_buf; + + return ret; +} + /* jffs2_compress: * @data_in: Pointer to uncompressed data * @cpage_out: Pointer to returned pointer to buffer for compressed data @@ -76,47 +148,23 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t *datalen, uint32_t *cdatalen) { int ret = JFFS2_COMPR_NONE; - int compr_ret; + int mode, compr_ret; struct jffs2_compressor *this, *best=NULL; unsigned char *output_buf = NULL, *tmp_buf; uint32_t orig_slen, orig_dlen; uint32_t best_slen=0, best_dlen=0; - switch (jffs2_compression_mode) { + if (c->mount_opts.override_compr) + mode = c->mount_opts.compr; + else + mode = jffs2_compression_mode; + + switch (mode) { case JFFS2_COMPR_MODE_NONE: break; case JFFS2_COMPR_MODE_PRIORITY: - output_buf = kmalloc(*cdatalen,GFP_KERNEL); - if (!output_buf) { - printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); - goto out; - } - orig_slen = *datalen; - orig_dlen = *cdatalen; - spin_lock(&jffs2_compressor_list_lock); - list_for_each_entry(this, &jffs2_compressor_list, list) { - /* Skip decompress-only backwards-compatibility and disabled modules */ - if ((!this->compress)||(this->disabled)) - continue; - - this->usecount++; - spin_unlock(&jffs2_compressor_list_lock); - *datalen = orig_slen; - *cdatalen = orig_dlen; - compr_ret = this->compress(data_in, output_buf, datalen, cdatalen); - spin_lock(&jffs2_compressor_list_lock); - this->usecount--; - if (!compr_ret) { - ret = this->compr; - this->stat_compr_blocks++; - this->stat_compr_orig_size += *datalen; - this->stat_compr_new_size += *cdatalen; - break; - } - } - spin_unlock(&jffs2_compressor_list_lock); - if (ret == JFFS2_COMPR_NONE) - kfree(output_buf); + ret = jffs2_selected_compress(0, data_in, cpage_out, datalen, + cdatalen); break; case JFFS2_COMPR_MODE_SIZE: case JFFS2_COMPR_MODE_FAVOURLZO: @@ -174,22 +222,28 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, best->stat_compr_orig_size += best_slen; best->stat_compr_new_size += best_dlen; ret = best->compr; + *cpage_out = output_buf; } spin_unlock(&jffs2_compressor_list_lock); break; + case JFFS2_COMPR_MODE_FORCELZO: + ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in, + cpage_out, datalen, cdatalen); + break; + case JFFS2_COMPR_MODE_FORCEZLIB: + ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in, + cpage_out, datalen, cdatalen); + break; default: printk(KERN_ERR "JFFS2: unknown compression mode.\n"); } - out: + if (ret == JFFS2_COMPR_NONE) { *cpage_out = data_in; *datalen = *cdatalen; none_stat_compr_blocks++; none_stat_compr_size += *datalen; } - else { - *cpage_out = output_buf; - } return ret; } diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h index 13bb759..5e91d57 100644 --- a/fs/jffs2/compr.h +++ b/fs/jffs2/compr.h @@ -40,6 +40,8 @@ #define JFFS2_COMPR_MODE_PRIORITY 1 #define JFFS2_COMPR_MODE_SIZE 2 #define JFFS2_COMPR_MODE_FAVOURLZO 3 +#define JFFS2_COMPR_MODE_FORCELZO 4 +#define JFFS2_COMPR_MODE_FORCEZLIB 5 #define FAVOUR_LZO_PERCENT 80 diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 9659b7c..973ac58 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -22,16 +22,16 @@ static int jffs2_readdir (struct file *, void *, filldir_t); -static int jffs2_create (struct inode *,struct dentry *,int, +static int jffs2_create (struct inode *,struct dentry *,umode_t, struct nameidata *); static struct dentry *jffs2_lookup (struct inode *,struct dentry *, struct nameidata *); static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); static int jffs2_symlink (struct inode *,struct dentry *,const char *); -static int jffs2_mkdir (struct inode *,struct dentry *,int); +static int jffs2_mkdir (struct inode *,struct dentry *,umode_t); static int jffs2_rmdir (struct inode *,struct dentry *); -static int jffs2_mknod (struct inode *,struct dentry *,int,dev_t); +static int jffs2_mknod (struct inode *,struct dentry *,umode_t,dev_t); static int jffs2_rename (struct inode *, struct dentry *, struct inode *, struct dentry *); @@ -169,8 +169,8 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) /***********************************************************************/ -static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, - struct nameidata *nd) +static int jffs2_create(struct inode *dir_i, struct dentry *dentry, + umode_t mode, struct nameidata *nd) { struct jffs2_raw_inode *ri; struct jffs2_inode_info *f, *dir_f; @@ -245,7 +245,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, dead_f, now); if (dead_f->inocache) - dentry->d_inode->i_nlink = dead_f->inocache->pino_nlink; + set_nlink(dentry->d_inode, dead_f->inocache->pino_nlink); if (!ret) dir_i->i_mtime = dir_i->i_ctime = ITIME(now); return ret; @@ -278,7 +278,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de if (!ret) { mutex_lock(&f->sem); - old_dentry->d_inode->i_nlink = ++f->inocache->pino_nlink; + set_nlink(old_dentry->d_inode, ++f->inocache->pino_nlink); mutex_unlock(&f->sem); d_instantiate(dentry, old_dentry->d_inode); dir_i->i_mtime = dir_i->i_ctime = ITIME(now); @@ -450,7 +450,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char } -static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) +static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; @@ -497,7 +497,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) f = JFFS2_INODE_INFO(inode); /* Directories get nlink 2 at start */ - inode->i_nlink = 2; + set_nlink(inode, 2); /* but ic->pino_nlink is the parent ino# */ f->inocache->pino_nlink = dir_i->i_ino; @@ -618,7 +618,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) return ret; } -static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, dev_t rdev) +static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode, dev_t rdev) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index e513f19..a01cdad 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -74,7 +74,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, ((struct erase_priv_struct *)instr->priv)->jeb = jeb; ((struct erase_priv_struct *)instr->priv)->c = c; - ret = c->mtd->erase(c->mtd, instr); + ret = mtd_erase(c->mtd, instr); if (!ret) return; @@ -336,12 +336,11 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl uint32_t ofs; size_t retlen; int ret = -EIO; + unsigned long *wordebuf; - if (c->mtd->point) { - unsigned long *wordebuf; - - ret = c->mtd->point(c->mtd, jeb->offset, c->sector_size, - &retlen, &ebuf, NULL); + ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen, + &ebuf, NULL); + if (ret != -EOPNOTSUPP) { if (ret) { D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); goto do_flash_read; @@ -349,7 +348,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl if (retlen < c->sector_size) { /* Don't muck about if it won't let us point to the whole erase sector */ D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", retlen)); - c->mtd->unpoint(c->mtd, jeb->offset, retlen); + mtd_unpoint(c->mtd, jeb->offset, retlen); goto do_flash_read; } wordebuf = ebuf-sizeof(*wordebuf); @@ -358,7 +357,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl if (*++wordebuf != ~0) break; } while(--retlen); - c->mtd->unpoint(c->mtd, jeb->offset, c->sector_size); + mtd_unpoint(c->mtd, jeb->offset, c->sector_size); if (retlen) { printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08tx\n", *wordebuf, jeb->offset + c->sector_size-retlen*sizeof(*wordebuf)); @@ -381,7 +380,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl *bad_offset = ofs; - ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf); + ret = mtd_read(c->mtd, ofs, readlen, &retlen, ebuf); if (ret) { printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret); ret = -EIO; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index bbcb975..2e01238 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -278,7 +278,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); - inode->i_nlink = f->inocache->pino_nlink; + set_nlink(inode, f->inocache->pino_nlink); inode->i_blocks = (inode->i_size + 511) >> 9; @@ -291,7 +291,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) case S_IFDIR: { struct jffs2_full_dirent *fd; - inode->i_nlink = 2; /* parent and '.' */ + set_nlink(inode, 2); /* parent and '.' */ for (fd=f->dents; fd; fd = fd->next) { if (fd->type == DT_DIR && fd->ino) @@ -379,7 +379,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags) jffs2_do_setattr(inode, &iattr); } -int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) +int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); @@ -453,7 +453,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r iput(inode); return ERR_PTR(ret); } - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_ino = je32_to_cpu(ri->ino); inode->i_mode = jemode_to_cpu(ri->mode); inode->i_gid = je16_to_cpu(ri->gid); @@ -466,7 +466,6 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r if (insert_inode_locked(inode) < 0) { make_bad_inode(inode); - unlock_new_inode(inode); iput(inode); return ERR_PTR(-EINVAL); } diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 0bc6a6c..55a0c1d 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -29,6 +29,11 @@ struct jffs2_inodirty; +struct jffs2_mount_opts { + bool override_compr; + unsigned int compr; +}; + /* A struct for the overall file system control. Pointers to jffs2_sb_info structs are named `c' in the source code. Nee jffs_control @@ -126,6 +131,7 @@ struct jffs2_sb_info { #endif struct jffs2_summary *summary; /* Summary information */ + struct jffs2_mount_opts mount_opts; #ifdef CONFIG_JFFS2_FS_XATTR #define XATTRINDEX_HASHSIZE (57) diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 6c1755c..ab65ee3 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -176,7 +176,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags); struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri); int jffs2_statfs (struct dentry *, struct kstatfs *); -int jffs2_remount_fs (struct super_block *, int *, char *); +int jffs2_do_remount_fs(struct super_block *, int *, char *); int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); void jffs2_gc_release_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f); diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index ee57bac..3093ac4 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -62,17 +62,15 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info #ifndef __ECOS /* TODO: instead, incapsulate point() stuff to jffs2_flash_read(), * adding and jffs2_flash_read_end() interface. */ - if (c->mtd->point) { - err = c->mtd->point(c->mtd, ofs, len, &retlen, - (void **)&buffer, NULL); - if (!err && retlen < len) { - JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize); - c->mtd->unpoint(c->mtd, ofs, retlen); - } else if (err) + err = mtd_point(c->mtd, ofs, len, &retlen, (void **)&buffer, NULL); + if (!err && retlen < len) { + JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize); + mtd_unpoint(c->mtd, ofs, retlen); + } else if (err) { + if (err != -EOPNOTSUPP) JFFS2_WARNING("MTD point failed: error code %d.\n", err); - else - pointed = 1; /* succefully pointed to device */ - } + } else + pointed = 1; /* succefully pointed to device */ #endif if (!pointed) { @@ -101,7 +99,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info kfree(buffer); #ifndef __ECOS else - c->mtd->unpoint(c->mtd, ofs, len); + mtd_unpoint(c->mtd, ofs, len); #endif if (crc != tn->data_crc) { @@ -137,7 +135,7 @@ free_out: kfree(buffer); #ifndef __ECOS else - c->mtd->unpoint(c->mtd, ofs, len); + mtd_unpoint(c->mtd, ofs, len); #endif return err; } diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 8d8cd34..f994648 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -97,15 +97,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) size_t pointlen, try_size; if (c->mtd->point) { - ret = c->mtd->point(c->mtd, 0, c->mtd->size, &pointlen, - (void **)&flashbuf, NULL); + ret = mtd_point(c->mtd, 0, c->mtd->size, &pointlen, + (void **)&flashbuf, NULL); if (!ret && pointlen < c->mtd->size) { /* Don't muck about if it won't let us point to the whole flash */ D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", pointlen)); - c->mtd->unpoint(c->mtd, 0, pointlen); + mtd_unpoint(c->mtd, 0, pointlen); flashbuf = NULL; } - if (ret) + if (ret && ret != -EOPNOTSUPP) D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); } #endif @@ -273,11 +273,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) kfree(flashbuf); #ifndef __ECOS else - c->mtd->unpoint(c->mtd, 0, c->mtd->size); + mtd_unpoint(c->mtd, 0, c->mtd->size); #endif - if (s) - kfree(s); - + kfree(s); return ret; } @@ -457,7 +455,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo if (jffs2_cleanmarker_oob(c)) { int ret; - if (c->mtd->block_isbad(c->mtd, jeb->offset)) + if (mtd_block_isbad(c->mtd, jeb->offset)) return BLK_STATE_BADBLOCK; ret = jffs2_check_nand_cleanmarker(c, jeb); diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index cfeb716..0f20208 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -22,26 +22,29 @@ #include <linux/security.h> #include "nodelist.h" -/* ---- Initial Security Label Attachment -------------- */ -int jffs2_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr) +/* ---- Initial Security Label(s) Attachment callback --- */ +int jffs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int rc; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + int err = 0; - rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len); - if (rc) { - if (rc == -EOPNOTSUPP) - return 0; - return rc; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0); + return err; +} - kfree(name); - kfree(value); - return rc; +/* ---- Initial Security Label(s) Attachment ----------- */ +int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &jffs2_initxattrs, NULL); } /* ---- XATTR Handler for "security.*" ----------------- */ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 853b8e3..f2d96b5 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -17,11 +17,13 @@ #include <linux/fs.h> #include <linux/err.h> #include <linux/mount.h> +#include <linux/parser.h> #include <linux/jffs2.h> #include <linux/pagemap.h> #include <linux/mtd/super.h> #include <linux/ctype.h> #include <linux/namei.h> +#include <linux/seq_file.h> #include <linux/exportfs.h> #include "compr.h" #include "nodelist.h" @@ -43,7 +45,6 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb) static void jffs2_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); } @@ -75,6 +76,37 @@ static void jffs2_write_super(struct super_block *sb) unlock_super(sb); } +static const char *jffs2_compr_name(unsigned int compr) +{ + switch (compr) { + case JFFS2_COMPR_MODE_NONE: + return "none"; +#ifdef CONFIG_JFFS2_LZO + case JFFS2_COMPR_MODE_FORCELZO: + return "lzo"; +#endif +#ifdef CONFIG_JFFS2_ZLIB + case JFFS2_COMPR_MODE_FORCEZLIB: + return "zlib"; +#endif + default: + /* should never happen; programmer error */ + WARN_ON(1); + return ""; + } +} + +static int jffs2_show_options(struct seq_file *s, struct dentry *root) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(root->d_sb); + struct jffs2_mount_opts *opts = &c->mount_opts; + + if (opts->override_compr) + seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr)); + + return 0; +} + static int jffs2_sync_fs(struct super_block *sb, int wait) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); @@ -133,6 +165,85 @@ static const struct export_operations jffs2_export_ops = { .fh_to_parent = jffs2_fh_to_parent, }; +/* + * JFFS2 mount options. + * + * Opt_override_compr: override default compressor + * Opt_err: just end of array marker + */ +enum { + Opt_override_compr, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_override_compr, "compr=%s"}, + {Opt_err, NULL}, +}; + +static int jffs2_parse_options(struct jffs2_sb_info *c, char *data) +{ + substring_t args[MAX_OPT_ARGS]; + char *p, *name; + + if (!data) + return 0; + + while ((p = strsep(&data, ","))) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_override_compr: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "none")) + c->mount_opts.compr = JFFS2_COMPR_MODE_NONE; +#ifdef CONFIG_JFFS2_LZO + else if (!strcmp(name, "lzo")) + c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO; +#endif +#ifdef CONFIG_JFFS2_ZLIB + else if (!strcmp(name, "zlib")) + c->mount_opts.compr = + JFFS2_COMPR_MODE_FORCEZLIB; +#endif + else { + printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"", + name); + kfree(name); + return -EINVAL; + } + kfree(name); + c->mount_opts.override_compr = true; + break; + default: + printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n", + p); + return -EINVAL; + } + } + + return 0; +} + +static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + int err; + + err = jffs2_parse_options(c, data); + if (err) + return -EINVAL; + + return jffs2_do_remount_fs(sb, flags, data); +} + static const struct super_operations jffs2_super_operations = { .alloc_inode = jffs2_alloc_inode, @@ -143,6 +254,7 @@ static const struct super_operations jffs2_super_operations = .remount_fs = jffs2_remount_fs, .evict_inode = jffs2_evict_inode, .dirty_inode = jffs2_dirty_inode, + .show_options = jffs2_show_options, .sync_fs = jffs2_sync_fs, }; @@ -166,6 +278,12 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) c->os_priv = sb; sb->s_fs_info = c; + ret = jffs2_parse_options(c, data); + if (ret) { + kfree(c); + return -EINVAL; + } + /* Initialize JFFS2 superblock locks, the further initialization will * be done later */ mutex_init(&c->alloc_sem); @@ -217,9 +335,7 @@ static void jffs2_put_super (struct super_block *sb) jffs2_flash_cleanup(c); kfree(c->inocache_list); jffs2_clear_xattr_subsystem(c); - if (c->mtd->sync) - c->mtd->sync(c->mtd); - + mtd_sync(c->mtd); D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); } diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 4515bea..30e8f47 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -228,7 +228,7 @@ static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf, size_t retlen; char *eccstr; - ret = c->mtd->read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify); + ret = mtd_read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify); if (ret && ret != -EUCLEAN && ret != -EBADMSG) { printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x failed: %d\n", c->wbuf_ofs, ret); return ret; @@ -337,7 +337,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) } /* Do the read... */ - ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf); + ret = mtd_read(c->mtd, start, c->wbuf_ofs - start, &retlen, + buf); /* ECC recovered ? */ if ((ret == -EUCLEAN || ret == -EBADMSG) && @@ -413,13 +414,12 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) if (breakme++ == 20) { printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs); breakme = 0; - c->mtd->write(c->mtd, ofs, towrite, &retlen, - brokenbuf); + mtd_write(c->mtd, ofs, towrite, &retlen, brokenbuf); ret = -EIO; } else #endif - ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, - rewrite_buf); + ret = mtd_write(c->mtd, ofs, towrite, &retlen, + rewrite_buf); if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) { /* Argh. We tried. Really we did. */ @@ -578,8 +578,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) if (!jffs2_is_writebuffered(c)) return 0; - if (mutex_trylock(&c->alloc_sem)) { - mutex_unlock(&c->alloc_sem); + if (!mutex_is_locked(&c->alloc_sem)) { printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); BUG(); } @@ -620,13 +619,14 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) if (breakme++ == 20) { printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs); breakme = 0; - c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, - brokenbuf); + mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, + brokenbuf); ret = -EIO; } else #endif - ret = c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf); + ret = mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, + &retlen, c->wbuf); if (ret) { printk(KERN_WARNING "jffs2_flush_wbuf(): Write failed with %d\n", ret); @@ -862,8 +862,8 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, v += wbuf_retlen; if (vlen >= c->wbuf_pagesize) { - ret = c->mtd->write(c->mtd, outvec_to, PAGE_DIV(vlen), - &wbuf_retlen, v); + ret = mtd_write(c->mtd, outvec_to, PAGE_DIV(vlen), + &wbuf_retlen, v); if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen)) goto outfile; @@ -949,11 +949,11 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re int ret; if (!jffs2_is_writebuffered(c)) - return c->mtd->read(c->mtd, ofs, len, retlen, buf); + return mtd_read(c->mtd, ofs, len, retlen, buf); /* Read flash */ down_read(&c->wbuf_sem); - ret = c->mtd->read(c->mtd, ofs, len, retlen, buf); + ret = mtd_read(c->mtd, ofs, len, retlen, buf); if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) { if (ret == -EBADMSG) @@ -1026,13 +1026,13 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); struct mtd_oob_ops ops; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; ops.oobbuf = c->oobbuf; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; ops.datbuf = NULL; - ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); + ret = mtd_read_oob(c->mtd, jeb->offset, &ops); if (ret || ops.oobretlen != ops.ooblen) { printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" " bytes, read %zd bytes, error %d\n", @@ -1069,13 +1069,13 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct mtd_oob_ops ops; int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = cmlen; ops.oobbuf = c->oobbuf; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; ops.datbuf = NULL; - ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); + ret = mtd_read_oob(c->mtd, jeb->offset, &ops); if (ret || ops.oobretlen != ops.ooblen) { printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" " bytes, read %zd bytes, error %d\n", @@ -1095,13 +1095,13 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct mtd_oob_ops ops; int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = cmlen; ops.oobbuf = (uint8_t *)&oob_cleanmarker; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; ops.datbuf = NULL; - ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops); + ret = mtd_write_oob(c->mtd, jeb->offset, &ops); if (ret || ops.oobretlen != ops.ooblen) { printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd" " bytes, read %zd bytes, error %d\n", @@ -1130,11 +1130,8 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock * if( ++jeb->bad_count < MAX_ERASE_FAILURES) return 0; - if (!c->mtd->block_markbad) - return 1; // What else can we do? - printk(KERN_WARNING "JFFS2: marking eraseblock at %08x\n as bad", bad_offset); - ret = c->mtd->block_markbad(c->mtd, bad_offset); + ret = mtd_block_markbad(c->mtd, bad_offset); if (ret) { D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Write failed for block at %08x: error %d\n", jeb->offset, ret)); diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c index b9276b1..a1bda9d 100644 --- a/fs/jffs2/writev.c +++ b/fs/jffs2/writev.c @@ -13,30 +13,6 @@ #include <linux/mtd/mtd.h> #include "nodelist.h" -/* This ought to be in core MTD code. All registered MTD devices - without writev should have this put in place. Bug the MTD - maintainer */ -static inline int mtd_fake_writev(struct mtd_info *mtd, const struct kvec *vecs, - unsigned long count, loff_t to, size_t *retlen) -{ - unsigned long i; - size_t totlen = 0, thislen; - int ret = 0; - - for (i=0; i<count; i++) { - if (!vecs[i].iov_len) - continue; - ret = mtd->write(mtd, to, vecs[i].iov_len, &thislen, vecs[i].iov_base); - totlen += thislen; - if (ret || thislen != vecs[i].iov_len) - break; - to += vecs[i].iov_len; - } - if (retlen) - *retlen = totlen; - return ret; -} - int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen) { @@ -50,18 +26,14 @@ int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, } } - if (c->mtd->writev) - return c->mtd->writev(c->mtd, vecs, count, to, retlen); - else { - return mtd_fake_writev(c->mtd, vecs, count, to, retlen); - } + return mtd_writev(c->mtd, vecs, count, to, retlen); } int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, const u_char *buf) { int ret; - ret = c->mtd->write(c->mtd, ofs, len, retlen, buf); + ret = mtd_write(c->mtd, ofs, len, retlen, buf); if (jffs2_sum_active()) { struct kvec vecs[1]; diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 6f98a18..f19d1e0 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -68,7 +68,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) unsigned int oldflags; int err; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setflags_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } default: diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index b78b2f9..1b6f15f 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -457,7 +457,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) /* read the page of fixed disk inode (AIT) in raw mode */ mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); if (mp == NULL) { - ip->i_nlink = 1; /* Don't want iput() deleting it */ + set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); return (NULL); } @@ -469,7 +469,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) /* copy on-disk inode to in-memory inode */ if ((copy_from_dinode(dp, ip)) != 0) { /* handle bad return by returning NULL for ip */ - ip->i_nlink = 1; /* Don't want iput() deleting it */ + set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); /* release the page */ release_metapage(mp); @@ -3076,7 +3076,7 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) ip->i_mode |= 0001; } } - ip->i_nlink = le32_to_cpu(dip->di_nlink); + set_nlink(ip, le32_to_cpu(dip->di_nlink)); jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); if (sbi->uid == -1) diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 2686531..c1a3e60 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -157,7 +157,7 @@ fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; fail_unlock: - inode->i_nlink = 0; + clear_nlink(inode); unlock_new_inode(inode); fail_put: iput(inode); diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 583636f..2eb952c 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -67,6 +67,7 @@ #include <linux/buffer_head.h> /* for sync_blockdev() */ #include <linux/bio.h> #include <linux/freezer.h> +#include <linux/export.h> #include <linux/delay.h> #include <linux/mutex.h> #include <linux/seq_file.h> @@ -2348,7 +2349,7 @@ int jfsIOWait(void *arg) if (freezing(current)) { spin_unlock_irq(&log_redrive_lock); - refrigerator(); + try_to_freeze(); } else { set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&log_redrive_lock); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index af96060..bb8b661 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2800,7 +2800,7 @@ int jfs_lazycommit(void *arg) if (freezing(current)) { LAZY_UNLOCK(flags); - refrigerator(); + try_to_freeze(); } else { DECLARE_WAITQUEUE(wq, current); @@ -2994,7 +2994,7 @@ int jfs_sync(void *arg) if (freezing(current)) { TXN_UNLOCK(); - refrigerator(); + try_to_freeze(); } else { set_current_state(TASK_INTERRUPTIBLE); TXN_UNLOCK(); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index e17545e..5f7c160 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -72,7 +72,7 @@ static inline void free_ea_wmap(struct inode *inode) * RETURN: Errors from subroutines * */ -static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, +static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int rc = 0; @@ -172,7 +172,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -205,7 +205,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, * note: * EACCESS: user needs search+write permission on the parent directory */ -static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) +static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) { int rc = 0; tid_t tid; /* transaction id */ @@ -292,7 +292,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) goto out3; } - ip->i_nlink = 2; /* for '.' */ + set_nlink(ip, 2); /* for '.' */ ip->i_op = &jfs_dir_inode_operations; ip->i_fop = &jfs_dir_operations; @@ -311,7 +311,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -844,7 +844,7 @@ static int jfs_link(struct dentry *old_dentry, rc = txCommit(tid, 2, &iplist[0], 0); if (rc) { - ip->i_nlink--; /* never instantiated */ + drop_nlink(ip); /* never instantiated */ iput(ip); } else d_instantiate(dentry, ip); @@ -1048,7 +1048,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { @@ -1353,7 +1353,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, * FUNCTION: Create a special file (device) */ static int jfs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct jfs_inode_info *jfs_ip; struct btstack btstack; @@ -1433,7 +1433,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, mutex_unlock(&JFS_IP(dir)->commit_mutex); if (rc) { free_ea_wmap(ip); - ip->i_nlink = 0; + clear_nlink(ip); unlock_new_inode(ip); iput(ip); } else { diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 06c8a67..682bca6 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -119,7 +119,6 @@ static void jfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); struct jfs_inode_info *ji = JFS_IP(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(jfs_inode_cachep, ji); } @@ -485,7 +484,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) goto out_unload; } inode->i_ino = 0; - inode->i_nlink = 1; inode->i_size = sb->s_bdev->bd_inode->i_size; inode->i_mapping->a_ops = &jfs_metapage_aops; insert_inode_hash(inode); @@ -610,9 +608,9 @@ static int jfs_sync_fs(struct super_block *sb, int wait) return 0; } -static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int jfs_show_options(struct seq_file *seq, struct dentry *root) { - struct jfs_sb_info *sbi = JFS_SBI(vfs->mnt_sb); + struct jfs_sb_info *sbi = JFS_SBI(root->d_sb); if (sbi->uid != -1) seq_printf(seq, ",uid=%d", sbi->uid); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index e87fede..26683e1 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -1089,38 +1089,37 @@ int jfs_removexattr(struct dentry *dentry, const char *name) } #ifdef CONFIG_JFS_SECURITY -int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, - const struct qstr *qstr) +int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int rc; - size_t len; - void *value; - char *suffix; + const struct xattr *xattr; + tid_t *tid = fs_info; char *name; - - rc = security_inode_init_security(inode, dir, qstr, &suffix, &value, - &len); - if (rc) { - if (rc == -EOPNOTSUPP) - return 0; - return rc; - } - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix), - GFP_NOFS); - if (!name) { - rc = -ENOMEM; - goto kmalloc_failed; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + + err = __jfs_setxattr(*tid, inode, name, + xattr->value, xattr->value_len, 0); + kfree(name); + if (err < 0) + break; } - strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - - rc = __jfs_setxattr(tid, inode, name, value, len, 0); - - kfree(name); -kmalloc_failed: - kfree(suffix); - kfree(value); + return err; +} - return rc; +int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &jfs_initxattrs, &tid); } #endif @@ -12,7 +12,7 @@ #include <linux/mutex.h> #include <linux/exportfs.h> #include <linux/writeback.h> -#include <linux/buffer_head.h> +#include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <asm/uaccess.h> @@ -490,7 +490,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); root = d_alloc_root(inode); if (!root) { iput(inode); @@ -510,8 +510,10 @@ int simple_fill_super(struct super_block *s, unsigned long magic, if (!dentry) goto out; inode = new_inode(s); - if (!inode) + if (!inode) { + dput(dentry); goto out; + } inode->i_mode = S_IFREG | files->mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_fop = files->ops; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index b7c99bf..6f29836 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -316,14 +316,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, struct hlist_node *pos; struct nlm_host *host = NULL; struct nsm_handle *nsm = NULL; - struct sockaddr_in sin = { - .sin_family = AF_INET, - }; - struct sockaddr_in6 sin6 = { - .sin6_family = AF_INET6, - }; - struct sockaddr *src_sap; - size_t src_len = rqstp->rq_addrlen; + struct sockaddr *src_sap = svc_daddr(rqstp); + size_t src_len = rqstp->rq_daddrlen; struct nlm_lookup_host_info ni = { .server = 1, .sap = svc_addr(rqstp), @@ -340,21 +334,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, mutex_lock(&nlm_host_mutex); - switch (ni.sap->sa_family) { - case AF_INET: - sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; - src_sap = (struct sockaddr *)&sin; - break; - case AF_INET6: - ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); - src_sap = (struct sockaddr *)&sin6; - break; - default: - dprintk("lockd: %s failed; unrecognized address family\n", - __func__); - goto out; - } - if (time_after_eq(jiffies, next_gc)) nlm_gc_hosts(); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index abfff9d..c061b9a 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -282,7 +282,7 @@ int lockd_up(void) /* * Create the kernel thread and wait for it to start. */ - nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); if (IS_ERR(nlmsvc_rqst)) { error = PTR_ERR(nlmsvc_rqst); nlmsvc_rqst = NULL; diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 1ca0679..2240d38 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -403,7 +403,7 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file) { struct super_block *sb = datap; - return sb == file->f_file->f_path.mnt->mnt_sb; + return sb == file->f_file->f_path.dentry->d_sb; } /** @@ -60,7 +60,7 @@ * * Initial implementation of mandatory locks. SunOS turned out to be * a rotten model, so I implemented the "obvious" semantics. - * See 'Documentation/mandatory.txt' for details. + * See 'Documentation/filesystems/mandatory-locking.txt' for details. * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. * * Don't allow mandatory locks on mmap()'ed files. Added simple functions to @@ -133,6 +133,20 @@ #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) +static bool lease_breaking(struct file_lock *fl) +{ + return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); +} + +static int target_leasetype(struct file_lock *fl) +{ + if (fl->fl_flags & FL_UNLOCK_PENDING) + return F_UNLCK; + if (fl->fl_flags & FL_DOWNGRADE_PENDING) + return F_RDLCK; + return fl->fl_type; +} + int leases_enable = 1; int lease_break_time = 45; @@ -1119,6 +1133,17 @@ int locks_mandatory_area(int read_write, struct inode *inode, EXPORT_SYMBOL(locks_mandatory_area); +static void lease_clear_pending(struct file_lock *fl, int arg) +{ + switch (arg) { + case F_UNLCK: + fl->fl_flags &= ~FL_UNLOCK_PENDING; + /* fall through: */ + case F_RDLCK: + fl->fl_flags &= ~FL_DOWNGRADE_PENDING; + } +} + /* We already had a lease on this file; just change its type */ int lease_modify(struct file_lock **before, int arg) { @@ -1127,6 +1152,7 @@ int lease_modify(struct file_lock **before, int arg) if (error) return error; + lease_clear_pending(fl, arg); locks_wake_up_blocks(fl); if (arg == F_UNLCK) locks_delete_lock(before); @@ -1135,19 +1161,25 @@ int lease_modify(struct file_lock **before, int arg) EXPORT_SYMBOL(lease_modify); +static bool past_time(unsigned long then) +{ + if (!then) + /* 0 is a special value meaning "this never expires": */ + return false; + return time_after(jiffies, then); +} + static void time_out_leases(struct inode *inode) { struct file_lock **before; struct file_lock *fl; before = &inode->i_flock; - while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) { - if ((fl->fl_break_time == 0) - || time_before(jiffies, fl->fl_break_time)) { - before = &fl->fl_next; - continue; - } - lease_modify(before, fl->fl_type & ~F_INPROGRESS); + while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { + if (past_time(fl->fl_downgrade_time)) + lease_modify(before, F_RDLCK); + if (past_time(fl->fl_break_time)) + lease_modify(before, F_UNLCK); if (fl == *before) /* lease_modify may have freed fl */ before = &fl->fl_next; } @@ -1165,7 +1197,7 @@ static void time_out_leases(struct inode *inode) */ int __break_lease(struct inode *inode, unsigned int mode) { - int error = 0, future; + int error = 0; struct file_lock *new_fl, *flock; struct file_lock *fl; unsigned long break_time; @@ -1173,6 +1205,8 @@ int __break_lease(struct inode *inode, unsigned int mode) int want_write = (mode & O_ACCMODE) != O_RDONLY; new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); + if (IS_ERR(new_fl)) + return PTR_ERR(new_fl); lock_flocks(); @@ -1182,30 +1216,13 @@ int __break_lease(struct inode *inode, unsigned int mode) if ((flock == NULL) || !IS_LEASE(flock)) goto out; + if (!locks_conflict(flock, new_fl)) + goto out; + for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) if (fl->fl_owner == current->files) i_have_this_lease = 1; - if (want_write) { - /* If we want write access, we have to revoke any lease. */ - future = F_UNLCK | F_INPROGRESS; - } else if (flock->fl_type & F_INPROGRESS) { - /* If the lease is already being broken, we just leave it */ - future = flock->fl_type; - } else if (flock->fl_type & F_WRLCK) { - /* Downgrade the exclusive lease to a read-only lease. */ - future = F_RDLCK | F_INPROGRESS; - } else { - /* the existing lease was read-only, so we can read too. */ - goto out; - } - - if (IS_ERR(new_fl) && !i_have_this_lease - && ((mode & O_NONBLOCK) == 0)) { - error = PTR_ERR(new_fl); - goto out; - } - break_time = 0; if (lease_break_time > 0) { break_time = jiffies + lease_break_time * HZ; @@ -1214,12 +1231,18 @@ int __break_lease(struct inode *inode, unsigned int mode) } for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { - if (fl->fl_type != future) { - fl->fl_type = future; + if (want_write) { + if (fl->fl_flags & FL_UNLOCK_PENDING) + continue; + fl->fl_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; - /* lease must have lmops break callback */ - fl->fl_lmops->lm_break(fl); + } else { + if (lease_breaking(flock)) + continue; + fl->fl_flags |= FL_DOWNGRADE_PENDING; + fl->fl_downgrade_time = break_time; } + fl->fl_lmops->lm_break(fl); } if (i_have_this_lease || (mode & O_NONBLOCK)) { @@ -1243,10 +1266,13 @@ restart: if (error >= 0) { if (error == 0) time_out_leases(inode); - /* Wait for the next lease that has not been broken yet */ + /* + * Wait for the next conflicting lease that has not been + * broken yet + */ for (flock = inode->i_flock; flock && IS_LEASE(flock); flock = flock->fl_next) { - if (flock->fl_type & F_INPROGRESS) + if (locks_conflict(new_fl, flock)) goto restart; } error = 0; @@ -1254,8 +1280,7 @@ restart: out: unlock_flocks(); - if (!IS_ERR(new_fl)) - locks_free_lock(new_fl); + locks_free_lock(new_fl); return error; } @@ -1314,7 +1339,7 @@ int fcntl_getlease(struct file *filp) for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_file == filp) { - type = fl->fl_type & ~F_INPROGRESS; + type = target_leasetype(fl); break; } } @@ -1322,50 +1347,23 @@ int fcntl_getlease(struct file *filp) return type; } -/** - * generic_setlease - sets a lease on an open file - * @filp: file pointer - * @arg: type of lease to obtain - * @flp: input - file_lock to use, output - file_lock inserted - * - * The (input) flp->fl_lmops->lm_break function is required - * by break_lease(). - * - * Called with file_lock_lock held. - */ -int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; - int error, rdlease_count = 0, wrlease_count = 0; + int error; lease = *flp; - error = -EACCES; - if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) - goto out; - error = -EINVAL; - if (!S_ISREG(inode->i_mode)) + error = -EAGAIN; + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) goto out; - error = security_file_lock(filp, arg); - if (error) + if ((arg == F_WRLCK) + && ((dentry->d_count > 1) + || (atomic_read(&inode->i_count) > 1))) goto out; - time_out_leases(inode); - - BUG_ON(!(*flp)->fl_lmops->lm_break); - - if (arg != F_UNLCK) { - error = -EAGAIN; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) - goto out; - if ((arg == F_WRLCK) - && ((dentry->d_count > 1) - || (atomic_read(&inode->i_count) > 1))) - goto out; - } - /* * At this point, we know that if there is an exclusive * lease on this file, then we hold it on this filp @@ -1374,27 +1372,28 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) * then the file is not open by anyone (including us) * except for this filp. */ + error = -EAGAIN; for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { - if (fl->fl_file == filp) + if (fl->fl_file == filp) { my_before = before; - else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) - /* - * Someone is in the process of opening this - * file for writing so we may not take an - * exclusive lease on it. - */ - wrlease_count++; - else - rdlease_count++; + continue; + } + /* + * No exclusive leases if someone else has a lease on + * this file: + */ + if (arg == F_WRLCK) + goto out; + /* + * Modifying our existing lease is OK, but no getting a + * new lease if someone else is opening for write: + */ + if (fl->fl_flags & FL_UNLOCK_PENDING) + goto out; } - error = -EAGAIN; - if ((arg == F_RDLCK && (wrlease_count > 0)) || - (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0))) - goto out; - if (my_before != NULL) { error = lease->fl_lmops->lm_change(my_before, arg); if (!error) @@ -1402,9 +1401,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) goto out; } - if (arg == F_UNLCK) - goto out; - error = -EINVAL; if (!leases_enable) goto out; @@ -1415,6 +1411,62 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) out: return error; } + +int generic_delete_lease(struct file *filp, struct file_lock **flp) +{ + struct file_lock *fl, **before; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + + for (before = &inode->i_flock; + ((fl = *before) != NULL) && IS_LEASE(fl); + before = &fl->fl_next) { + if (fl->fl_file != filp) + continue; + return (*flp)->fl_lmops->lm_change(before, F_UNLCK); + } + return -EAGAIN; +} + +/** + * generic_setlease - sets a lease on an open file + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: input - file_lock to use, output - file_lock inserted + * + * The (input) flp->fl_lmops->lm_break function is required + * by break_lease(). + * + * Called with file_lock_lock held. + */ +int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + int error; + + if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) + return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + error = security_file_lock(filp, arg); + if (error) + return error; + + time_out_leases(inode); + + BUG_ON(!(*flp)->fl_lmops->lm_break); + + switch (arg) { + case F_UNLCK: + return generic_delete_lease(filp, flp); + case F_RDLCK: + case F_WRLCK: + return generic_add_lease(filp, arg, flp); + default: + BUG(); + } +} EXPORT_SYMBOL(generic_setlease); static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) @@ -2126,7 +2178,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } } else if (IS_LEASE(fl)) { seq_printf(f, "LEASE "); - if (fl->fl_type & F_INPROGRESS) + if (lease_breaking(fl)) seq_printf(f, "BREAKING "); else if (fl->fl_file) seq_printf(f, "ACTIVE "); @@ -2142,7 +2194,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); } else { seq_printf(f, "%s ", - (fl->fl_type & F_INPROGRESS) + (lease_breaking(fl)) ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); } diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 339e17e..e97404d 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -13,13 +13,14 @@ #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) -static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) +static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len, + void *buf) { struct mtd_info *mtd = logfs_super(sb)->s_mtd; size_t retlen; int ret; - ret = mtd->read(mtd, ofs, len, &retlen, buf); + ret = mtd_read(mtd, ofs, len, &retlen, buf); BUG_ON(ret == -EINVAL); if (ret) return ret; @@ -31,7 +32,8 @@ static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) return 0; } -static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) +static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len, + void *buf) { struct logfs_super *super = logfs_super(sb); struct mtd_info *mtd = super->s_mtd; @@ -47,7 +49,7 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) BUG_ON(len > PAGE_CACHE_SIZE); page_start = ofs & PAGE_CACHE_MASK; page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; - ret = mtd->write(mtd, ofs, len, &retlen, buf); + ret = mtd_write(mtd, ofs, len, &retlen, buf); if (ret || (retlen != len)) return -EIO; @@ -60,14 +62,15 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) * asynchronous properties. So just to prevent the first implementor of such * a thing from breaking logfs in 2350, we do the usual pointless dance to * declare a completion variable and wait for completion before returning - * from mtd_erase(). What an exercise in futility! + * from logfs_mtd_erase(). What an exercise in futility! */ static void logfs_erase_callback(struct erase_info *ei) { complete((struct completion *)ei->priv); } -static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) +static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs, + size_t len) { struct logfs_super *super = logfs_super(sb); struct address_space *mapping = super->s_mapping_inode->i_mapping; @@ -84,7 +87,7 @@ static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) return 0; } -static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, +static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len, int ensure_write) { struct mtd_info *mtd = logfs_super(sb)->s_mtd; @@ -102,30 +105,29 @@ static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, ei.len = len; ei.callback = logfs_erase_callback; ei.priv = (long)&complete; - ret = mtd->erase(mtd, &ei); + ret = mtd_erase(mtd, &ei); if (ret) return -EIO; wait_for_completion(&complete); if (ei.state != MTD_ERASE_DONE) return -EIO; - return mtd_erase_mapping(sb, ofs, len); + return logfs_mtd_erase_mapping(sb, ofs, len); } -static void mtd_sync(struct super_block *sb) +static void logfs_mtd_sync(struct super_block *sb) { struct mtd_info *mtd = logfs_super(sb)->s_mtd; - if (mtd->sync) - mtd->sync(mtd); + mtd_sync(mtd); } -static int mtd_readpage(void *_sb, struct page *page) +static int logfs_mtd_readpage(void *_sb, struct page *page) { struct super_block *sb = _sb; int err; - err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, page_address(page)); if (err == -EUCLEAN || err == -EBADMSG) { /* -EBADMSG happens regularly on power failures */ @@ -143,18 +145,18 @@ static int mtd_readpage(void *_sb, struct page *page) return err; } -static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) +static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs) { struct logfs_super *super = logfs_super(sb); struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = mtd_readpage; + filler_t *filler = logfs_mtd_readpage; struct mtd_info *mtd = super->s_mtd; - if (!mtd->block_isbad) + if (!mtd_can_have_bb(mtd)) return NULL; *ofs = 0; - while (mtd->block_isbad(mtd, *ofs)) { + while (mtd_block_isbad(mtd, *ofs)) { *ofs += mtd->erasesize; if (*ofs >= mtd->size) return NULL; @@ -163,18 +165,18 @@ static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); } -static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) +static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs) { struct logfs_super *super = logfs_super(sb); struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = mtd_readpage; + filler_t *filler = logfs_mtd_readpage; struct mtd_info *mtd = super->s_mtd; - if (!mtd->block_isbad) + if (!mtd_can_have_bb(mtd)) return NULL; *ofs = mtd->size - mtd->erasesize; - while (mtd->block_isbad(mtd, *ofs)) { + while (mtd_block_isbad(mtd, *ofs)) { *ofs -= mtd->erasesize; if (*ofs <= 0) return NULL; @@ -184,7 +186,7 @@ static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); } -static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, +static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, size_t nr_pages) { struct logfs_super *super = logfs_super(sb); @@ -196,8 +198,8 @@ static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, page = find_lock_page(mapping, index + i); BUG_ON(!page); - err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, - page_address(page)); + err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); unlock_page(page); page_cache_release(page); if (err) @@ -206,7 +208,7 @@ static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, return 0; } -static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) +static void logfs_mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) { struct logfs_super *super = logfs_super(sb); int head; @@ -227,15 +229,15 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) len += head; } len = PAGE_ALIGN(len); - __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); + __logfs_mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); } -static void mtd_put_device(struct logfs_super *s) +static void logfs_mtd_put_device(struct logfs_super *s) { put_mtd_device(s->s_mtd); } -static int mtd_can_write_buf(struct super_block *sb, u64 ofs) +static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs) { struct logfs_super *super = logfs_super(sb); void *buf; @@ -244,7 +246,7 @@ static int mtd_can_write_buf(struct super_block *sb, u64 ofs) buf = kmalloc(super->s_writesize, GFP_KERNEL); if (!buf) return -ENOMEM; - err = mtd_read(sb, ofs, super->s_writesize, buf); + err = logfs_mtd_read(sb, ofs, super->s_writesize, buf); if (err) goto out; if (memchr_inv(buf, 0xff, super->s_writesize)) @@ -255,14 +257,14 @@ out: } static const struct logfs_device_ops mtd_devops = { - .find_first_sb = mtd_find_first_sb, - .find_last_sb = mtd_find_last_sb, - .readpage = mtd_readpage, - .writeseg = mtd_writeseg, - .erase = mtd_erase, - .can_write_buf = mtd_can_write_buf, - .sync = mtd_sync, - .put_device = mtd_put_device, + .find_first_sb = logfs_mtd_find_first_sb, + .find_last_sb = logfs_mtd_find_last_sb, + .readpage = logfs_mtd_readpage, + .writeseg = logfs_mtd_writeseg, + .erase = logfs_mtd_erase, + .can_write_buf = logfs_mtd_can_write_buf, + .sync = logfs_mtd_sync, + .put_device = logfs_mtd_put_device, }; int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index b3ff3d8..501043e 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -197,7 +197,7 @@ static int logfs_remove_inode(struct inode *inode) { int ret; - inode->i_nlink--; + drop_nlink(inode); ret = write_inode(inode); LOGFS_BUG_ON(ret, inode->i_sb); return ret; @@ -433,7 +433,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry, ta = kzalloc(sizeof(*ta), GFP_KERNEL); if (!ta) { - inode->i_nlink--; + drop_nlink(inode); iput(inode); return -ENOMEM; } @@ -456,7 +456,7 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry, abort_transaction(inode, ta); li->li_flags |= LOGFS_IF_STILLBORN; /* FIXME: truncate symlink */ - inode->i_nlink--; + drop_nlink(inode); iput(inode); goto out; } @@ -482,7 +482,7 @@ out: return ret; } -static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int logfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -501,7 +501,7 @@ static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return __logfs_create(dir, dentry, inode, NULL, 0); } -static int logfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int logfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -517,7 +517,7 @@ static int logfs_create(struct inode *dir, struct dentry *dentry, int mode, return __logfs_create(dir, dentry, inode, NULL, 0); } -static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int logfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -563,7 +563,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir, inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ihold(inode); - inode->i_nlink++; + inc_nlink(inode); mark_inode_dirty_sync(inode); return __logfs_create(dir, dentry, inode, NULL, 0); diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index edfea7a..388df1a 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -93,7 +93,7 @@ static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) /* inode->i_nlink == 0 can be true when called from * block validator */ /* set i_nlink to 0 to prevent caching */ - inode->i_nlink = 0; + clear_nlink(inode); logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; iget_failed(inode); if (!err) @@ -144,7 +144,6 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) static void logfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); } @@ -199,7 +198,6 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode) inode->i_blocks = 0; inode->i_ctime = CURRENT_TIME; inode->i_mtime = CURRENT_TIME; - inode->i_nlink = 1; li->li_refcount = 1; INIT_LIST_HEAD(&li->li_freeing_list); @@ -325,7 +323,7 @@ static void logfs_set_ino_generation(struct super_block *sb, mutex_unlock(&super->s_journal_mutex); } -struct inode *logfs_new_inode(struct inode *dir, int mode) +struct inode *logfs_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index f22d108..9263738 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -520,7 +520,7 @@ extern const struct super_operations logfs_super_operations; struct inode *logfs_iget(struct super_block *sb, ino_t ino); struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie); void logfs_safe_iput(struct inode *inode, int cookie); -struct inode *logfs_new_inode(struct inode *dir, int mode); +struct inode *logfs_new_inode(struct inode *dir, umode_t mode); struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino); struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino); int logfs_init_inode_cache(void); @@ -618,7 +618,6 @@ static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs, struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); void emergency_read_end(struct page *page); void logfs_crash_dump(struct super_block *sb); -void *memchr_inv(const void *s, int c, size_t n); int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); int logfs_check_ds(struct logfs_disk_super *ds); int logfs_write_sb(struct super_block *sb); diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index d8d0938..2ac4217 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -126,7 +126,7 @@ static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) inode->i_atime = be64_to_timespec(di->di_atime); inode->i_ctime = be64_to_timespec(di->di_ctime); inode->i_mtime = be64_to_timespec(di->di_mtime); - inode->i_nlink = be32_to_cpu(di->di_refcount); + set_nlink(inode, be32_to_cpu(di->di_refcount)); inode->i_generation = be32_to_cpu(di->di_generation); switch (inode->i_mode & S_IFMT) { diff --git a/fs/logfs/super.c b/fs/logfs/super.c index ce03a18..e795c234 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -13,6 +13,7 @@ #include <linux/bio.h> #include <linux/slab.h> #include <linux/blkdev.h> +#include <linux/module.h> #include <linux/mtd/mtd.h> #include <linux/statfs.h> #include <linux/buffer_head.h> @@ -91,28 +92,6 @@ void logfs_crash_dump(struct super_block *sb) } /* - * TODO: move to lib/string.c - */ -/** - * memchr_inv - Find a character in an area of memory. - * @s: The memory area - * @c: The byte to search for - * @n: The size of the area. - * - * returns the address of the first character other than @c, or %NULL - * if the whole buffer contains just @c. - */ -void *memchr_inv(const void *s, int c, size_t n) -{ - const unsigned char *p = s; - while (n-- != 0) - if ((unsigned char)c != *p++) - return (void *)(p - 1); - - return NULL; -} - -/* * FIXME: There should be a reserve for root, similar to ext2. */ int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 3f32bcb..4bc50da 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -16,38 +16,26 @@ #include <linux/bitops.h> #include <linux/sched.h> -static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 }; - static DEFINE_SPINLOCK(bitmap_lock); -static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) +/* + * bitmap consists of blocks filled with 16bit words + * bit set == busy, bit clear == free + * endianness is a mess, but for counting zero bits it really doesn't matter... + */ +static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits) { - unsigned i, j, sum = 0; - struct buffer_head *bh; - - for (i=0; i<numblocks-1; i++) { - if (!(bh=map[i])) - return(0); - for (j=0; j<bh->b_size; j++) - sum += nibblemap[bh->b_data[j] & 0xf] - + nibblemap[(bh->b_data[j]>>4) & 0xf]; - } + __u32 sum = 0; + unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8); - if (numblocks==0 || !(bh=map[numblocks-1])) - return(0); - i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; - for (j=0; j<i; j++) { - sum += nibblemap[bh->b_data[j] & 0xf] - + nibblemap[(bh->b_data[j]>>4) & 0xf]; + while (blocks--) { + unsigned words = blocksize / 2; + __u16 *p = (__u16 *)(*map++)->b_data; + while (words--) + sum += 16 - hweight16(*p++); } - i = numbits%16; - if (i!=0) { - i = *(__u16 *)(&bh->b_data[j]) | ~((1<<i) - 1); - sum += nibblemap[i & 0xf] + nibblemap[(i>>4) & 0xf]; - sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf]; - } - return(sum); + return sum; } void minix_free_block(struct inode *inode, unsigned long block) @@ -105,10 +93,12 @@ int minix_new_block(struct inode * inode) return 0; } -unsigned long minix_count_free_blocks(struct minix_sb_info *sbi) +unsigned long minix_count_free_blocks(struct super_block *sb) { - return (count_free(sbi->s_zmap, sbi->s_zmap_blocks, - sbi->s_nzones - sbi->s_firstdatazone + 1) + struct minix_sb_info *sbi = minix_sb(sb); + u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1); + + return (count_free(sbi->s_zmap, sb->s_blocksize, bits) << sbi->s_log_zone_size); } @@ -219,7 +209,7 @@ void minix_free_inode(struct inode * inode) mark_buffer_dirty(bh); } -struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) +struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error) { struct super_block *sb = dir->i_sb; struct minix_sb_info *sbi = minix_sb(sb); @@ -273,7 +263,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) return inode; } -unsigned long minix_count_free_inodes(struct minix_sb_info *sbi) +unsigned long minix_count_free_inodes(struct super_block *sb) { - return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1); + struct minix_sb_info *sbi = minix_sb(sb); + u32 bits = sbi->s_ninodes + 1; + + return count_free(sbi->s_imap, sb->s_blocksize, bits); } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index e7d23e25..fa8b612 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -71,7 +71,6 @@ static struct inode *minix_alloc_inode(struct super_block *sb) static void minix_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(minix_inode_cachep, minix_i(inode)); } @@ -263,6 +262,26 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) goto out_no_root; } + /* Apparently minix can create filesystems that allocate more blocks for + * the bitmaps than needed. We simply ignore that, but verify it didn't + * create one with not enough blocks and bail out if so. + */ + block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize); + if (sbi->s_imap_blocks < block) { + printk("MINIX-fs: file system does not have enough " + "imap blocks allocated. Refusing to mount\n"); + goto out_iput; + } + + block = minix_blocks_needed( + (sbi->s_nzones - (sbi->s_firstdatazone + 1)), + s->s_blocksize); + if (sbi->s_zmap_blocks < block) { + printk("MINIX-fs: file system does not have enough " + "zmap blocks allocated. Refusing to mount.\n"); + goto out_iput; + } + ret = -ENOMEM; s->s_root = d_alloc_root(root_inode); if (!s->s_root) @@ -276,9 +295,10 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) if (!(sbi->s_mount_state & MINIX_VALID_FS)) printk("MINIX-fs: mounting unchecked file system, " "running fsck is recommended\n"); - else if (sbi->s_mount_state & MINIX_ERROR_FS) + else if (sbi->s_mount_state & MINIX_ERROR_FS) printk("MINIX-fs: mounting file system with errors, " "running fsck is recommended\n"); + return 0; out_iput: @@ -339,10 +359,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; - buf->f_bfree = minix_count_free_blocks(sbi); + buf->f_bfree = minix_count_free_blocks(sb); buf->f_bavail = buf->f_bfree; buf->f_files = sbi->s_ninodes; - buf->f_ffree = minix_count_free_inodes(sbi); + buf->f_ffree = minix_count_free_inodes(sb); buf->f_namelen = sbi->s_namelen; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); @@ -446,7 +466,7 @@ static struct inode *V1_minix_iget(struct inode *inode) inode->i_mode = raw_inode->i_mode; inode->i_uid = (uid_t)raw_inode->i_uid; inode->i_gid = (gid_t)raw_inode->i_gid; - inode->i_nlink = raw_inode->i_nlinks; + set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; inode->i_mtime.tv_nsec = 0; @@ -479,7 +499,7 @@ static struct inode *V2_minix_iget(struct inode *inode) inode->i_mode = raw_inode->i_mode; inode->i_uid = (uid_t)raw_inode->i_uid; inode->i_gid = (gid_t)raw_inode->i_gid; - inode->i_nlink = raw_inode->i_nlinks; + set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; inode->i_mtime.tv_sec = raw_inode->i_mtime; inode->i_atime.tv_sec = raw_inode->i_atime; diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 341e212..c889ef0 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -46,12 +46,12 @@ struct minix_sb_info { extern struct inode *minix_iget(struct super_block *, unsigned long); extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); -extern struct inode * minix_new_inode(const struct inode *, int, int *); +extern struct inode * minix_new_inode(const struct inode *, umode_t, int *); extern void minix_free_inode(struct inode * inode); -extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); +extern unsigned long minix_count_free_inodes(struct super_block *sb); extern int minix_new_block(struct inode * inode); extern void minix_free_block(struct inode *inode, unsigned long block); -extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); +extern unsigned long minix_count_free_blocks(struct super_block *sb); extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); @@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode) return list_entry(inode, struct minix_inode_info, vfs_inode); } +static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) +{ + return DIV_ROUND_UP(bits, blocksize * 8); +} + #if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) @@ -125,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size) if (!size) return 0; - size = (size >> 4) + ((size & 15) > 0); + size >>= 4; while (*p++ == 0xffff) { if (--size == 0) return (p - addr) << 4; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 6e6777f..2f76e38 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -36,7 +36,7 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st return NULL; } -static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) { int error; struct inode *inode; @@ -54,7 +54,7 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_ return error; } -static int minix_create(struct inode * dir, struct dentry *dentry, int mode, +static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return minix_mknod(dir, dentry, mode, 0); @@ -103,7 +103,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode) +static int minix_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode) { struct inode * inode; int err = -EMLINK; diff --git a/fs/mount.h b/fs/mount.h new file mode 100644 index 0000000..4ef36d9 --- /dev/null +++ b/fs/mount.h @@ -0,0 +1,76 @@ +#include <linux/mount.h> +#include <linux/seq_file.h> +#include <linux/poll.h> + +struct mnt_namespace { + atomic_t count; + struct mount * root; + struct list_head list; + wait_queue_head_t poll; + int event; +}; + +struct mnt_pcp { + int mnt_count; + int mnt_writers; +}; + +struct mount { + struct list_head mnt_hash; + struct mount *mnt_parent; + struct dentry *mnt_mountpoint; + struct vfsmount mnt; +#ifdef CONFIG_SMP + struct mnt_pcp __percpu *mnt_pcp; + atomic_t mnt_longterm; /* how many of the refs are longterm */ +#else + int mnt_count; + int mnt_writers; +#endif + struct list_head mnt_mounts; /* list of children, anchored here */ + struct list_head mnt_child; /* and going through their mnt_child */ + struct list_head mnt_instance; /* mount instance on sb->s_mounts */ + const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ + struct list_head mnt_list; + struct list_head mnt_expire; /* link in fs-specific expiry list */ + struct list_head mnt_share; /* circular list of shared mounts */ + struct list_head mnt_slave_list;/* list of slave mounts */ + struct list_head mnt_slave; /* slave list entry */ + struct mount *mnt_master; /* slave is on master->mnt_slave_list */ + struct mnt_namespace *mnt_ns; /* containing namespace */ +#ifdef CONFIG_FSNOTIFY + struct hlist_head mnt_fsnotify_marks; + __u32 mnt_fsnotify_mask; +#endif + int mnt_id; /* mount identifier */ + int mnt_group_id; /* peer group identifier */ + int mnt_expiry_mark; /* true if marked for expiry */ + int mnt_pinned; + int mnt_ghosts; +}; + +static inline struct mount *real_mount(struct vfsmount *mnt) +{ + return container_of(mnt, struct mount, mnt); +} + +static inline int mnt_has_parent(struct mount *mnt) +{ + return mnt != mnt->mnt_parent; +} + +extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *, int); + +static inline void get_mnt_ns(struct mnt_namespace *ns) +{ + atomic_inc(&ns->count); +} + +struct proc_mounts { + struct seq_file m; /* must be the first element */ + struct mnt_namespace *ns; + struct path root; + int (*show)(struct seq_file *, struct vfsmount *); +}; + +extern const struct seq_operations mounts_op; @@ -36,6 +36,7 @@ #include <asm/uaccess.h> #include "internal.h" +#include "mount.h" /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) @@ -137,7 +138,7 @@ static int do_getname(const char __user *filename, char *page) return retval; } -static char *getname_flags(const char __user * filename, int flags) +static char *getname_flags(const char __user *filename, int flags, int *empty) { char *tmp, *result; @@ -148,6 +149,8 @@ static char *getname_flags(const char __user * filename, int flags) result = tmp; if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { __putname(tmp); result = ERR_PTR(retval); @@ -160,7 +163,7 @@ static char *getname_flags(const char __user * filename, int flags) char *getname(const char __user * filename) { - return getname_flags(filename, 0); + return getname_flags(filename, 0, 0); } #ifdef CONFIG_AUDITSYSCALL @@ -221,14 +224,12 @@ static int check_acl(struct inode *inode, int mask) } /* - * This does basic POSIX ACL permission checking + * This does the basic permission checking */ static int acl_permission_check(struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; - if (current_user_ns() != inode_userns(inode)) goto other_perms; @@ -257,7 +258,7 @@ other_perms: /** * generic_permission - check for access rights on a Posix-like filesystem * @inode: inode to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions @@ -273,7 +274,7 @@ int generic_permission(struct inode *inode, int mask) int ret; /* - * Do the basic POSIX ACL permission checks. + * Do the basic permission checks. */ ret = acl_permission_check(inode, mask); if (ret != -EACCES) @@ -331,12 +332,14 @@ static inline int do_inode_permission(struct inode *inode, int mask) /** * inode_permission - check for access rights to a given inode * @inode: inode to check permission on - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) * * Used to check for read/write/execute permissions on an inode. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. + * + * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct inode *inode, int mask) { @@ -674,36 +677,38 @@ follow_link(struct path *link, struct nameidata *nd, void **p) static int follow_up_rcu(struct path *path) { - struct vfsmount *parent; + struct mount *mnt = real_mount(path->mnt); + struct mount *parent; struct dentry *mountpoint; - parent = path->mnt->mnt_parent; - if (parent == path->mnt) + parent = mnt->mnt_parent; + if (&parent->mnt == path->mnt) return 0; - mountpoint = path->mnt->mnt_mountpoint; + mountpoint = mnt->mnt_mountpoint; path->dentry = mountpoint; - path->mnt = parent; + path->mnt = &parent->mnt; return 1; } int follow_up(struct path *path) { - struct vfsmount *parent; + struct mount *mnt = real_mount(path->mnt); + struct mount *parent; struct dentry *mountpoint; br_read_lock(vfsmount_lock); - parent = path->mnt->mnt_parent; - if (parent == path->mnt) { + parent = mnt->mnt_parent; + if (&parent->mnt == path->mnt) { br_read_unlock(vfsmount_lock); return 0; } - mntget(parent); - mountpoint = dget(path->mnt->mnt_mountpoint); + mntget(&parent->mnt); + mountpoint = dget(mnt->mnt_mountpoint); br_read_unlock(vfsmount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); - path->mnt = parent; + path->mnt = &parent->mnt; return 1; } @@ -850,7 +855,7 @@ static int follow_managed(struct path *path, unsigned flags) mntput(path->mnt); if (ret == -EISDIR) ret = 0; - return ret; + return ret < 0 ? ret : need_mntput; } int follow_down_one(struct path *path) @@ -882,7 +887,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode) { for (;;) { - struct vfsmount *mounted; + struct mount *mounted; /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. @@ -896,8 +901,9 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, mounted = __lookup_mnt(path->mnt, path->dentry, 1); if (!mounted) break; - path->mnt = mounted; - path->dentry = mounted->mnt_root; + path->mnt = &mounted->mnt; + path->dentry = mounted->mnt.mnt_root; + nd->flags |= LOOKUP_JUMPED; nd->seq = read_seqcount_begin(&path->dentry->d_seq); /* * Update the inode too. We don't need to re-check the @@ -912,12 +918,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, static void follow_mount_rcu(struct nameidata *nd) { while (d_mountpoint(nd->path.dentry)) { - struct vfsmount *mounted; + struct mount *mounted; mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); if (!mounted) break; - nd->path.mnt = mounted; - nd->path.dentry = mounted->mnt_root; + nd->path.mnt = &mounted->mnt; + nd->path.dentry = mounted->mnt.mnt_root; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } } @@ -1211,6 +1217,8 @@ retry: path_put_conditional(path, nd); return err; } + if (err) + nd->flags |= LOOKUP_JUMPED; *inode = path->dentry->d_inode; return 0; } @@ -1798,11 +1806,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return __lookup_hash(&this, base, NULL); } -int user_path_at(int dfd, const char __user *name, unsigned flags, - struct path *path) +int user_path_at_empty(int dfd, const char __user *name, unsigned flags, + struct path *path, int *empty) { struct nameidata nd; - char *tmp = getname_flags(name, flags); + char *tmp = getname_flags(name, flags, empty); int err = PTR_ERR(tmp); if (!IS_ERR(tmp)) { @@ -1816,6 +1824,12 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, return err; } +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) +{ + return user_path_at_empty(dfd, name, flags, path, 0); +} + static int user_path_parent(int dfd, const char __user *path, struct nameidata *nd, char **name) { @@ -1965,7 +1979,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } } -int vfs_create(struct inode *dir, struct dentry *dentry, int mode, +int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int error = may_create(dir, dentry); @@ -2035,10 +2049,7 @@ static int may_open(struct path *path, int acc_mode, int flag) if (flag & O_NOATIME && !inode_owner_or_capable(inode)) return -EPERM; - /* - * Ensure there are no outstanding leases on the file. - */ - return break_lease(inode, flag); + return 0; } static int handle_truncate(struct file *filp) @@ -2141,6 +2152,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } /* create side of things */ + /* + * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been + * cleared when we got to the last component we are about to look up + */ error = complete_walk(nd); if (error) return ERR_PTR(error); @@ -2165,7 +2180,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, /* Negative dentry, just create the file */ if (!dentry->d_inode) { - int mode = op->mode; + umode_t mode = op->mode; if (!IS_POSIXACL(dir->d_inode)) mode &= ~current_umask(); /* @@ -2209,6 +2224,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (error < 0) goto exit_dput; + if (error) + nd->flags |= LOOKUP_JUMPED; + error = -ENOENT; if (!path->dentry->d_inode) goto exit_dput; @@ -2218,6 +2236,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, path_to_nameidata(path, nd); nd->inode = path->dentry->d_inode; + /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ + error = complete_walk(nd); + if (error) + goto exit; error = -EISDIR; if (S_ISDIR(nd->inode->i_mode)) goto exit; @@ -2425,7 +2447,7 @@ struct dentry *user_path_create(int dfd, const char __user *pathname, struct pat } EXPORT_SYMBOL(user_path_create); -int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { int error = may_create(dir, dentry); @@ -2453,7 +2475,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) return error; } -static int may_mknod(mode_t mode) +static int may_mknod(umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: @@ -2470,7 +2492,7 @@ static int may_mknod(mode_t mode) } } -SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode, +SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned, dev) { struct dentry *dentry; @@ -2517,12 +2539,12 @@ out_dput: return error; } -SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev) +SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { return sys_mknodat(AT_FDCWD, filename, mode, dev); } -int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int error = may_create(dir, dentry); @@ -2543,7 +2565,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return error; } -SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode) +SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { struct dentry *dentry; struct path path; @@ -2571,7 +2593,7 @@ out_dput: return error; } -SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode) +SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { return sys_mkdirat(AT_FDCWD, pathname, mode); } diff --git a/fs/namespace.c b/fs/namespace.c index b4febb2..e608199 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -9,30 +9,17 @@ */ #include <linux/syscalls.h> -#include <linux/slab.h> -#include <linux/sched.h> -#include <linux/spinlock.h> -#include <linux/percpu.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/acct.h> +#include <linux/export.h> #include <linux/capability.h> -#include <linux/cpumask.h> -#include <linux/module.h> -#include <linux/sysfs.h> -#include <linux/seq_file.h> #include <linux/mnt_namespace.h> #include <linux/namei.h> -#include <linux/nsproxy.h> #include <linux/security.h> -#include <linux/mount.h> -#include <linux/ramfs.h> -#include <linux/log2.h> #include <linux/idr.h> -#include <linux/fs_struct.h> -#include <linux/fsnotify.h> -#include <asm/uaccess.h> -#include <asm/unistd.h> +#include <linux/acct.h> /* acct_auto_close_mnt */ +#include <linux/ramfs.h> /* init_rootfs */ +#include <linux/fs_struct.h> /* get_fs_root et.al. */ +#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ +#include <linux/uaccess.h> #include "pnode.h" #include "internal.h" @@ -78,7 +65,7 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) * allocation is serialized by namespace_sem, but we need the spinlock to * serialize with freeing. */ -static int mnt_alloc_id(struct vfsmount *mnt) +static int mnt_alloc_id(struct mount *mnt) { int res; @@ -95,7 +82,7 @@ retry: return res; } -static void mnt_free_id(struct vfsmount *mnt) +static void mnt_free_id(struct mount *mnt) { int id = mnt->mnt_id; spin_lock(&mnt_id_lock); @@ -110,7 +97,7 @@ static void mnt_free_id(struct vfsmount *mnt) * * mnt_group_ida is protected by namespace_sem */ -static int mnt_alloc_group_id(struct vfsmount *mnt) +static int mnt_alloc_group_id(struct mount *mnt) { int res; @@ -129,7 +116,7 @@ static int mnt_alloc_group_id(struct vfsmount *mnt) /* * Release a peer group ID */ -void mnt_release_group_id(struct vfsmount *mnt) +void mnt_release_group_id(struct mount *mnt) { int id = mnt->mnt_group_id; ida_remove(&mnt_group_ida, id); @@ -141,7 +128,7 @@ void mnt_release_group_id(struct vfsmount *mnt) /* * vfsmount lock must be held for read */ -static inline void mnt_add_count(struct vfsmount *mnt, int n) +static inline void mnt_add_count(struct mount *mnt, int n) { #ifdef CONFIG_SMP this_cpu_add(mnt->mnt_pcp->mnt_count, n); @@ -152,35 +139,10 @@ static inline void mnt_add_count(struct vfsmount *mnt, int n) #endif } -static inline void mnt_set_count(struct vfsmount *mnt, int n) -{ -#ifdef CONFIG_SMP - this_cpu_write(mnt->mnt_pcp->mnt_count, n); -#else - mnt->mnt_count = n; -#endif -} - -/* - * vfsmount lock must be held for read - */ -static inline void mnt_inc_count(struct vfsmount *mnt) -{ - mnt_add_count(mnt, 1); -} - -/* - * vfsmount lock must be held for read - */ -static inline void mnt_dec_count(struct vfsmount *mnt) -{ - mnt_add_count(mnt, -1); -} - /* * vfsmount lock must be held for write */ -unsigned int mnt_get_count(struct vfsmount *mnt) +unsigned int mnt_get_count(struct mount *mnt) { #ifdef CONFIG_SMP unsigned int count = 0; @@ -196,9 +158,9 @@ unsigned int mnt_get_count(struct vfsmount *mnt) #endif } -static struct vfsmount *alloc_vfsmnt(const char *name) +static struct mount *alloc_vfsmnt(const char *name) { - struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); + struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { int err; @@ -277,7 +239,7 @@ int __mnt_is_readonly(struct vfsmount *mnt) } EXPORT_SYMBOL_GPL(__mnt_is_readonly); -static inline void mnt_inc_writers(struct vfsmount *mnt) +static inline void mnt_inc_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_inc(mnt->mnt_pcp->mnt_writers); @@ -286,7 +248,7 @@ static inline void mnt_inc_writers(struct vfsmount *mnt) #endif } -static inline void mnt_dec_writers(struct vfsmount *mnt) +static inline void mnt_dec_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_dec(mnt->mnt_pcp->mnt_writers); @@ -295,7 +257,7 @@ static inline void mnt_dec_writers(struct vfsmount *mnt) #endif } -static unsigned int mnt_get_writers(struct vfsmount *mnt) +static unsigned int mnt_get_writers(struct mount *mnt) { #ifdef CONFIG_SMP unsigned int count = 0; @@ -311,6 +273,15 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt) #endif } +static int mnt_is_readonly(struct vfsmount *mnt) +{ + if (mnt->mnt_sb->s_readonly_remount) + return 1; + /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ + smp_rmb(); + return __mnt_is_readonly(mnt); +} + /* * Most r/o checks on a fs are for operations that take * discrete amounts of time, like a write() or unlink(). @@ -321,7 +292,7 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt) */ /** * mnt_want_write - get write access to a mount - * @mnt: the mount on which to take a write + * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is * about to be performed to it, and makes sure that @@ -329,8 +300,9 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt) * the write operation is finished, mnt_drop_write() * must be called. This is effectively a refcount. */ -int mnt_want_write(struct vfsmount *mnt) +int mnt_want_write(struct vfsmount *m) { + struct mount *mnt = real_mount(m); int ret = 0; preempt_disable(); @@ -341,7 +313,7 @@ int mnt_want_write(struct vfsmount *mnt) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (mnt->mnt_flags & MNT_WRITE_HOLD) + while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) cpu_relax(); /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will @@ -349,12 +321,10 @@ int mnt_want_write(struct vfsmount *mnt) * MNT_WRITE_HOLD is cleared. */ smp_rmb(); - if (__mnt_is_readonly(mnt)) { + if (mnt_is_readonly(m)) { mnt_dec_writers(mnt); ret = -EROFS; - goto out; } -out: preempt_enable(); return ret; } @@ -378,7 +348,7 @@ int mnt_clone_write(struct vfsmount *mnt) if (__mnt_is_readonly(mnt)) return -EROFS; preempt_disable(); - mnt_inc_writers(mnt); + mnt_inc_writers(real_mount(mnt)); preempt_enable(); return 0; } @@ -412,17 +382,23 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file); void mnt_drop_write(struct vfsmount *mnt) { preempt_disable(); - mnt_dec_writers(mnt); + mnt_dec_writers(real_mount(mnt)); preempt_enable(); } EXPORT_SYMBOL_GPL(mnt_drop_write); -static int mnt_make_readonly(struct vfsmount *mnt) +void mnt_drop_write_file(struct file *file) +{ + mnt_drop_write(file->f_path.mnt); +} +EXPORT_SYMBOL(mnt_drop_write_file); + +static int mnt_make_readonly(struct mount *mnt) { int ret = 0; br_write_lock(vfsmount_lock); - mnt->mnt_flags |= MNT_WRITE_HOLD; + mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store * should be visible before we do. @@ -448,25 +424,61 @@ static int mnt_make_readonly(struct vfsmount *mnt) if (mnt_get_writers(mnt) > 0) ret = -EBUSY; else - mnt->mnt_flags |= MNT_READONLY; + mnt->mnt.mnt_flags |= MNT_READONLY; /* * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); - mnt->mnt_flags &= ~MNT_WRITE_HOLD; + mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; br_write_unlock(vfsmount_lock); return ret; } -static void __mnt_unmake_readonly(struct vfsmount *mnt) +static void __mnt_unmake_readonly(struct mount *mnt) { br_write_lock(vfsmount_lock); - mnt->mnt_flags &= ~MNT_READONLY; + mnt->mnt.mnt_flags &= ~MNT_READONLY; + br_write_unlock(vfsmount_lock); +} + +int sb_prepare_remount_readonly(struct super_block *sb) +{ + struct mount *mnt; + int err = 0; + + /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ + if (atomic_long_read(&sb->s_remove_count)) + return -EBUSY; + + br_write_lock(vfsmount_lock); + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { + mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + smp_mb(); + if (mnt_get_writers(mnt) > 0) { + err = -EBUSY; + break; + } + } + } + if (!err && atomic_long_read(&sb->s_remove_count)) + err = -EBUSY; + + if (!err) { + sb->s_readonly_remount = 1; + smp_wmb(); + } + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) + mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + } br_write_unlock(vfsmount_lock); + + return err; } -static void free_vfsmnt(struct vfsmount *mnt) +static void free_vfsmnt(struct mount *mnt) { kfree(mnt->mnt_devname); mnt_free_id(mnt); @@ -481,20 +493,20 @@ static void free_vfsmnt(struct vfsmount *mnt) * @dir. If @dir is set return the first mount else return the last mount. * vfsmount_lock must be held for read or write. */ -struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, +struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, int dir) { struct list_head *head = mount_hashtable + hash(mnt, dentry); struct list_head *tmp = head; - struct vfsmount *p, *found = NULL; + struct mount *p, *found = NULL; for (;;) { tmp = dir ? tmp->next : tmp->prev; p = NULL; if (tmp == head) break; - p = list_entry(tmp, struct vfsmount, mnt_hash); - if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) { + p = list_entry(tmp, struct mount, mnt_hash); + if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) { found = p; break; } @@ -508,16 +520,21 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, */ struct vfsmount *lookup_mnt(struct path *path) { - struct vfsmount *child_mnt; + struct mount *child_mnt; br_read_lock(vfsmount_lock); - if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) - mntget(child_mnt); - br_read_unlock(vfsmount_lock); - return child_mnt; + child_mnt = __lookup_mnt(path->mnt, path->dentry, 1); + if (child_mnt) { + mnt_add_count(child_mnt, 1); + br_read_unlock(vfsmount_lock); + return &child_mnt->mnt; + } else { + br_read_unlock(vfsmount_lock); + return NULL; + } } -static inline int check_mnt(struct vfsmount *mnt) +static inline int check_mnt(struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } @@ -548,12 +565,12 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) * Clear dentry's mounted state if it has no remaining mounts. * vfsmount_lock must be held for write. */ -static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry) +static void dentry_reset_mounted(struct dentry *dentry) { unsigned u; for (u = 0; u < HASH_SIZE; u++) { - struct vfsmount *p; + struct mount *p; list_for_each_entry(p, &mount_hashtable[u], mnt_hash) { if (p->mnt_mountpoint == dentry) @@ -568,25 +585,26 @@ static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry) /* * vfsmount lock must be held for write */ -static void detach_mnt(struct vfsmount *mnt, struct path *old_path) +static void detach_mnt(struct mount *mnt, struct path *old_path) { old_path->dentry = mnt->mnt_mountpoint; - old_path->mnt = mnt->mnt_parent; + old_path->mnt = &mnt->mnt_parent->mnt; mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); list_del_init(&mnt->mnt_hash); - dentry_reset_mounted(old_path->mnt, old_path->dentry); + dentry_reset_mounted(old_path->dentry); } /* * vfsmount lock must be held for write */ -void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, - struct vfsmount *child_mnt) +void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry, + struct mount *child_mnt) { - child_mnt->mnt_parent = mntget(mnt); + mnt_add_count(mnt, 1); /* essentially, that's mntget */ child_mnt->mnt_mountpoint = dget(dentry); + child_mnt->mnt_parent = mnt; spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); @@ -595,15 +613,15 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, /* * vfsmount lock must be held for write */ -static void attach_mnt(struct vfsmount *mnt, struct path *path) +static void attach_mnt(struct mount *mnt, struct path *path) { - mnt_set_mountpoint(path->mnt, path->dentry, mnt); + mnt_set_mountpoint(real_mount(path->mnt), path->dentry, mnt); list_add_tail(&mnt->mnt_hash, mount_hashtable + hash(path->mnt, path->dentry)); - list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); + list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts); } -static inline void __mnt_make_longterm(struct vfsmount *mnt) +static inline void __mnt_make_longterm(struct mount *mnt) { #ifdef CONFIG_SMP atomic_inc(&mnt->mnt_longterm); @@ -611,7 +629,7 @@ static inline void __mnt_make_longterm(struct vfsmount *mnt) } /* needs vfsmount lock for write */ -static inline void __mnt_make_shortterm(struct vfsmount *mnt) +static inline void __mnt_make_shortterm(struct mount *mnt) { #ifdef CONFIG_SMP atomic_dec(&mnt->mnt_longterm); @@ -621,10 +639,10 @@ static inline void __mnt_make_shortterm(struct vfsmount *mnt) /* * vfsmount lock must be held for write */ -static void commit_tree(struct vfsmount *mnt) +static void commit_tree(struct mount *mnt) { - struct vfsmount *parent = mnt->mnt_parent; - struct vfsmount *m; + struct mount *parent = mnt->mnt_parent; + struct mount *m; LIST_HEAD(head); struct mnt_namespace *n = parent->mnt_ns; @@ -639,12 +657,12 @@ static void commit_tree(struct vfsmount *mnt) list_splice(&head, n->list.prev); list_add_tail(&mnt->mnt_hash, mount_hashtable + - hash(parent, mnt->mnt_mountpoint)); + hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); touch_mnt_namespace(n); } -static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) +static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; if (next == &p->mnt_mounts) { @@ -657,14 +675,14 @@ static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) p = p->mnt_parent; } } - return list_entry(next, struct vfsmount, mnt_child); + return list_entry(next, struct mount, mnt_child); } -static struct vfsmount *skip_mnt_tree(struct vfsmount *p) +static struct mount *skip_mnt_tree(struct mount *p) { struct list_head *prev = p->mnt_mounts.prev; while (prev != &p->mnt_mounts) { - p = list_entry(prev, struct vfsmount, mnt_child); + p = list_entry(prev, struct mount, mnt_child); prev = p->mnt_mounts.prev; } return p; @@ -673,7 +691,7 @@ static struct vfsmount *skip_mnt_tree(struct vfsmount *p) struct vfsmount * vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { - struct vfsmount *mnt; + struct mount *mnt; struct dentry *root; if (!type) @@ -684,7 +702,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void return ERR_PTR(-ENOMEM); if (flags & MS_KERNMOUNT) - mnt->mnt_flags = MNT_INTERNAL; + mnt->mnt.mnt_flags = MNT_INTERNAL; root = mount_fs(type, flags, name, data); if (IS_ERR(root)) { @@ -692,19 +710,22 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void return ERR_CAST(root); } - mnt->mnt_root = root; - mnt->mnt_sb = root->d_sb; - mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt.mnt_root = root; + mnt->mnt.mnt_sb = root->d_sb; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; - return mnt; + br_write_lock(vfsmount_lock); + list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); + br_write_unlock(vfsmount_lock); + return &mnt->mnt; } EXPORT_SYMBOL_GPL(vfs_kern_mount); -static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, +static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { - struct super_block *sb = old->mnt_sb; - struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); + struct super_block *sb = old->mnt.mnt_sb; + struct mount *mnt = alloc_vfsmnt(old->mnt_devname); if (mnt) { if (flag & (CL_SLAVE | CL_PRIVATE)) @@ -718,12 +739,15 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, goto out_free; } - mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD; + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; atomic_inc(&sb->s_active); - mnt->mnt_sb = sb; - mnt->mnt_root = dget(root); - mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt.mnt_sb = sb; + mnt->mnt.mnt_root = dget(root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; + br_write_lock(vfsmount_lock); + list_add_tail(&mnt->mnt_instance, &sb->s_mounts); + br_write_unlock(vfsmount_lock); if (flag & CL_SLAVE) { list_add(&mnt->mnt_slave, &old->mnt_slave_list); @@ -753,9 +777,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, return NULL; } -static inline void mntfree(struct vfsmount *mnt) +static inline void mntfree(struct mount *mnt) { - struct super_block *sb = mnt->mnt_sb; + struct vfsmount *m = &mnt->mnt; + struct super_block *sb = m->mnt_sb; /* * This probably indicates that somebody messed @@ -768,32 +793,32 @@ static inline void mntfree(struct vfsmount *mnt) * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); - fsnotify_vfsmount_delete(mnt); - dput(mnt->mnt_root); + fsnotify_vfsmount_delete(m); + dput(m->mnt_root); free_vfsmnt(mnt); deactivate_super(sb); } -static void mntput_no_expire(struct vfsmount *mnt) +static void mntput_no_expire(struct mount *mnt) { put_again: #ifdef CONFIG_SMP br_read_lock(vfsmount_lock); if (likely(atomic_read(&mnt->mnt_longterm))) { - mnt_dec_count(mnt); + mnt_add_count(mnt, -1); br_read_unlock(vfsmount_lock); return; } br_read_unlock(vfsmount_lock); br_write_lock(vfsmount_lock); - mnt_dec_count(mnt); + mnt_add_count(mnt, -1); if (mnt_get_count(mnt)) { br_write_unlock(vfsmount_lock); return; } #else - mnt_dec_count(mnt); + mnt_add_count(mnt, -1); if (likely(mnt_get_count(mnt))) return; br_write_lock(vfsmount_lock); @@ -802,9 +827,10 @@ put_again: mnt_add_count(mnt, mnt->mnt_pinned + 1); mnt->mnt_pinned = 0; br_write_unlock(vfsmount_lock); - acct_auto_close_mnt(mnt); + acct_auto_close_mnt(&mnt->mnt); goto put_again; } + list_del(&mnt->mnt_instance); br_write_unlock(vfsmount_lock); mntfree(mnt); } @@ -812,10 +838,11 @@ put_again: void mntput(struct vfsmount *mnt) { if (mnt) { + struct mount *m = real_mount(mnt); /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ - if (unlikely(mnt->mnt_expiry_mark)) - mnt->mnt_expiry_mark = 0; - mntput_no_expire(mnt); + if (unlikely(m->mnt_expiry_mark)) + m->mnt_expiry_mark = 0; + mntput_no_expire(m); } } EXPORT_SYMBOL(mntput); @@ -823,7 +850,7 @@ EXPORT_SYMBOL(mntput); struct vfsmount *mntget(struct vfsmount *mnt) { if (mnt) - mnt_inc_count(mnt); + mnt_add_count(real_mount(mnt), 1); return mnt; } EXPORT_SYMBOL(mntget); @@ -831,16 +858,17 @@ EXPORT_SYMBOL(mntget); void mnt_pin(struct vfsmount *mnt) { br_write_lock(vfsmount_lock); - mnt->mnt_pinned++; + real_mount(mnt)->mnt_pinned++; br_write_unlock(vfsmount_lock); } EXPORT_SYMBOL(mnt_pin); -void mnt_unpin(struct vfsmount *mnt) +void mnt_unpin(struct vfsmount *m) { + struct mount *mnt = real_mount(m); br_write_lock(vfsmount_lock); if (mnt->mnt_pinned) { - mnt_inc_count(mnt); + mnt_add_count(mnt, 1); mnt->mnt_pinned--; } br_write_unlock(vfsmount_lock); @@ -858,12 +886,12 @@ static inline void mangle(struct seq_file *m, const char *s) * * See also save_mount_options(). */ -int generic_show_options(struct seq_file *m, struct vfsmount *mnt) +int generic_show_options(struct seq_file *m, struct dentry *root) { const char *options; rcu_read_lock(); - options = rcu_dereference(mnt->mnt_sb->s_options); + options = rcu_dereference(root->d_sb->s_options); if (options != NULL && options[0]) { seq_putc(m, ','); @@ -907,10 +935,10 @@ void replace_mount_options(struct super_block *sb, char *options) EXPORT_SYMBOL(replace_mount_options); #ifdef CONFIG_PROC_FS -/* iterator */ +/* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct proc_mounts *p = m->private; + struct proc_mounts *p = container_of(m, struct proc_mounts, m); down_read(&namespace_sem); return seq_list_start(&p->ns->list, *pos); @@ -918,7 +946,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct proc_mounts *p = m->private; + struct proc_mounts *p = container_of(m, struct proc_mounts, m); return seq_list_next(v, &p->ns->list, pos); } @@ -928,221 +956,18 @@ static void m_stop(struct seq_file *m, void *v) up_read(&namespace_sem); } -int mnt_had_events(struct proc_mounts *p) -{ - struct mnt_namespace *ns = p->ns; - int res = 0; - - br_read_lock(vfsmount_lock); - if (p->m.poll_event != ns->event) { - p->m.poll_event = ns->event; - res = 1; - } - br_read_unlock(vfsmount_lock); - - return res; -} - -struct proc_fs_info { - int flag; - const char *str; -}; - -static int show_sb_opts(struct seq_file *m, struct super_block *sb) +static int m_show(struct seq_file *m, void *v) { - static const struct proc_fs_info fs_info[] = { - { MS_SYNCHRONOUS, ",sync" }, - { MS_DIRSYNC, ",dirsync" }, - { MS_MANDLOCK, ",mand" }, - { 0, NULL } - }; - const struct proc_fs_info *fs_infop; - - for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { - if (sb->s_flags & fs_infop->flag) - seq_puts(m, fs_infop->str); - } - - return security_sb_show_options(m, sb); -} - -static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) -{ - static const struct proc_fs_info mnt_info[] = { - { MNT_NOSUID, ",nosuid" }, - { MNT_NODEV, ",nodev" }, - { MNT_NOEXEC, ",noexec" }, - { MNT_NOATIME, ",noatime" }, - { MNT_NODIRATIME, ",nodiratime" }, - { MNT_RELATIME, ",relatime" }, - { 0, NULL } - }; - const struct proc_fs_info *fs_infop; - - for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { - if (mnt->mnt_flags & fs_infop->flag) - seq_puts(m, fs_infop->str); - } -} - -static void show_type(struct seq_file *m, struct super_block *sb) -{ - mangle(m, sb->s_type->name); - if (sb->s_subtype && sb->s_subtype[0]) { - seq_putc(m, '.'); - mangle(m, sb->s_subtype); - } -} - -static int show_vfsmnt(struct seq_file *m, void *v) -{ - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - int err = 0; - struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - - if (mnt->mnt_sb->s_op->show_devname) { - err = mnt->mnt_sb->s_op->show_devname(m, mnt); - if (err) - goto out; - } else { - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); - } - seq_putc(m, ' '); - seq_path(m, &mnt_path, " \t\n\\"); - seq_putc(m, ' '); - show_type(m, mnt->mnt_sb); - seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); - err = show_sb_opts(m, mnt->mnt_sb); - if (err) - goto out; - show_mnt_opts(m, mnt); - if (mnt->mnt_sb->s_op->show_options) - err = mnt->mnt_sb->s_op->show_options(m, mnt); - seq_puts(m, " 0 0\n"); -out: - return err; + struct proc_mounts *p = container_of(m, struct proc_mounts, m); + struct mount *r = list_entry(v, struct mount, mnt_list); + return p->show(m, &r->mnt); } const struct seq_operations mounts_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_vfsmnt -}; - -static int show_mountinfo(struct seq_file *m, void *v) -{ - struct proc_mounts *p = m->private; - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - struct super_block *sb = mnt->mnt_sb; - struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - struct path root = p->root; - int err = 0; - - seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, - MAJOR(sb->s_dev), MINOR(sb->s_dev)); - if (sb->s_op->show_path) - err = sb->s_op->show_path(m, mnt); - else - seq_dentry(m, mnt->mnt_root, " \t\n\\"); - if (err) - goto out; - seq_putc(m, ' '); - seq_path_root(m, &mnt_path, &root, " \t\n\\"); - if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { - /* - * Mountpoint is outside root, discard that one. Ugly, - * but less so than trying to do that in iterator in a - * race-free way (due to renames). - */ - return SEQ_SKIP; - } - seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); - show_mnt_opts(m, mnt); - - /* Tagged fields ("foo:X" or "bar") */ - if (IS_MNT_SHARED(mnt)) - seq_printf(m, " shared:%i", mnt->mnt_group_id); - if (IS_MNT_SLAVE(mnt)) { - int master = mnt->mnt_master->mnt_group_id; - int dom = get_dominating_id(mnt, &p->root); - seq_printf(m, " master:%i", master); - if (dom && dom != master) - seq_printf(m, " propagate_from:%i", dom); - } - if (IS_MNT_UNBINDABLE(mnt)) - seq_puts(m, " unbindable"); - - /* Filesystem specific data */ - seq_puts(m, " - "); - show_type(m, sb); - seq_putc(m, ' '); - if (sb->s_op->show_devname) - err = sb->s_op->show_devname(m, mnt); - else - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); - if (err) - goto out; - seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); - err = show_sb_opts(m, sb); - if (err) - goto out; - if (sb->s_op->show_options) - err = sb->s_op->show_options(m, mnt); - seq_putc(m, '\n'); -out: - return err; -} - -const struct seq_operations mountinfo_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_mountinfo, -}; - -static int show_vfsstat(struct seq_file *m, void *v) -{ - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - int err = 0; - - /* device */ - if (mnt->mnt_sb->s_op->show_devname) { - err = mnt->mnt_sb->s_op->show_devname(m, mnt); - } else { - if (mnt->mnt_devname) { - seq_puts(m, "device "); - mangle(m, mnt->mnt_devname); - } else - seq_puts(m, "no device"); - } - - /* mount point */ - seq_puts(m, " mounted on "); - seq_path(m, &mnt_path, " \t\n\\"); - seq_putc(m, ' '); - - /* file system type */ - seq_puts(m, "with fstype "); - show_type(m, mnt->mnt_sb); - - /* optional statistics */ - if (mnt->mnt_sb->s_op->show_stats) { - seq_putc(m, ' '); - if (!err) - err = mnt->mnt_sb->s_op->show_stats(m, mnt); - } - - seq_putc(m, '\n'); - return err; -} - -const struct seq_operations mountstats_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_vfsstat, + .show = m_show, }; #endif /* CONFIG_PROC_FS */ @@ -1154,11 +979,13 @@ const struct seq_operations mountstats_op = { * open files, pwds, chroots or sub mounts that are * busy. */ -int may_umount_tree(struct vfsmount *mnt) +int may_umount_tree(struct vfsmount *m) { + struct mount *mnt = real_mount(m); int actual_refs = 0; int minimum_refs = 0; - struct vfsmount *p; + struct mount *p; + BUG_ON(!m); /* write lock needed for mnt_get_count */ br_write_lock(vfsmount_lock); @@ -1194,7 +1021,7 @@ int may_umount(struct vfsmount *mnt) int ret = 1; down_read(&namespace_sem); br_write_lock(vfsmount_lock); - if (propagate_mount_busy(mnt, 2)) + if (propagate_mount_busy(real_mount(mnt), 2)) ret = 0; br_write_unlock(vfsmount_lock); up_read(&namespace_sem); @@ -1205,25 +1032,25 @@ EXPORT_SYMBOL(may_umount); void release_mounts(struct list_head *head) { - struct vfsmount *mnt; + struct mount *mnt; while (!list_empty(head)) { - mnt = list_first_entry(head, struct vfsmount, mnt_hash); + mnt = list_first_entry(head, struct mount, mnt_hash); list_del_init(&mnt->mnt_hash); - if (mnt->mnt_parent != mnt) { + if (mnt_has_parent(mnt)) { struct dentry *dentry; - struct vfsmount *m; + struct mount *m; br_write_lock(vfsmount_lock); dentry = mnt->mnt_mountpoint; m = mnt->mnt_parent; - mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; m->mnt_ghosts--; br_write_unlock(vfsmount_lock); dput(dentry); - mntput(m); + mntput(&m->mnt); } - mntput(mnt); + mntput(&mnt->mnt); } } @@ -1231,10 +1058,10 @@ void release_mounts(struct list_head *head) * vfsmount lock must be held for write * namespace_sem must be held for write */ -void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) +void umount_tree(struct mount *mnt, int propagate, struct list_head *kill) { LIST_HEAD(tmp_list); - struct vfsmount *p; + struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) list_move(&p->mnt_hash, &tmp_list); @@ -1249,24 +1076,24 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) p->mnt_ns = NULL; __mnt_make_shortterm(p); list_del_init(&p->mnt_child); - if (p->mnt_parent != p) { + if (mnt_has_parent(p)) { p->mnt_parent->mnt_ghosts++; - dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint); + dentry_reset_mounted(p->mnt_mountpoint); } change_mnt_propagation(p, MS_PRIVATE); } list_splice(&tmp_list, kill); } -static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); +static void shrink_submounts(struct mount *mnt, struct list_head *umounts); -static int do_umount(struct vfsmount *mnt, int flags) +static int do_umount(struct mount *mnt, int flags) { - struct super_block *sb = mnt->mnt_sb; + struct super_block *sb = mnt->mnt.mnt_sb; int retval; LIST_HEAD(umount_list); - retval = security_sb_umount(mnt, flags); + retval = security_sb_umount(&mnt->mnt, flags); if (retval) return retval; @@ -1277,7 +1104,7 @@ static int do_umount(struct vfsmount *mnt, int flags) * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] */ if (flags & MNT_EXPIRE) { - if (mnt == current->fs->root.mnt || + if (&mnt->mnt == current->fs->root.mnt || flags & (MNT_FORCE | MNT_DETACH)) return -EINVAL; @@ -1319,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags) * /reboot - static binary that would close all descriptors and * call reboot(9). Then init(8) could umount root and exec /reboot. */ - if (mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { + if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { /* * Special case for "unmounting" root ... * we just try to remount it readonly. @@ -1361,6 +1188,7 @@ static int do_umount(struct vfsmount *mnt, int flags) SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { struct path path; + struct mount *mnt; int retval; int lookup_flags = 0; @@ -1373,21 +1201,22 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; + mnt = real_mount(path.mnt); retval = -EINVAL; if (path.dentry != path.mnt->mnt_root) goto dput_and_out; - if (!check_mnt(path.mnt)) + if (!check_mnt(mnt)) goto dput_and_out; retval = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto dput_and_out; - retval = do_umount(path.mnt, flags); + retval = do_umount(mnt, flags); dput_and_out: /* we mustn't call path_put() as that would clear mnt_expiry_mark */ dput(path.dentry); - mntput_no_expire(path.mnt); + mntput_no_expire(mnt); out: return retval; } @@ -1422,10 +1251,10 @@ static int mount_is_safe(struct path *path) #endif } -struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, +struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { - struct vfsmount *res, *p, *q, *r, *s; + struct mount *res, *p, *q, *r; struct path path; if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) @@ -1438,6 +1267,7 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, p = mnt; list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { + struct mount *s; if (!is_subdir(r->mnt_mountpoint, dentry)) continue; @@ -1451,9 +1281,9 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, q = q->mnt_parent; } p = s; - path.mnt = q; + path.mnt = &q->mnt; path.dentry = p->mnt_mountpoint; - q = clone_mnt(p, p->mnt_root, flag); + q = clone_mnt(p, p->mnt.mnt_root, flag); if (!q) goto Enomem; br_write_lock(vfsmount_lock); @@ -1476,11 +1306,12 @@ Enomem: struct vfsmount *collect_mounts(struct path *path) { - struct vfsmount *tree; + struct mount *tree; down_write(&namespace_sem); - tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE); + tree = copy_tree(real_mount(path->mnt), path->dentry, + CL_COPY_ALL | CL_PRIVATE); up_write(&namespace_sem); - return tree; + return tree ? &tree->mnt : NULL; } void drop_collected_mounts(struct vfsmount *mnt) @@ -1488,7 +1319,7 @@ void drop_collected_mounts(struct vfsmount *mnt) LIST_HEAD(umount_list); down_write(&namespace_sem); br_write_lock(vfsmount_lock); - umount_tree(mnt, 0, &umount_list); + umount_tree(real_mount(mnt), 0, &umount_list); br_write_unlock(vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); @@ -1497,21 +1328,21 @@ void drop_collected_mounts(struct vfsmount *mnt) int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { - struct vfsmount *mnt; + struct mount *mnt; int res = f(root, arg); if (res) return res; - list_for_each_entry(mnt, &root->mnt_list, mnt_list) { - res = f(mnt, arg); + list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { + res = f(&mnt->mnt, arg); if (res) return res; } return 0; } -static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) +static void cleanup_group_ids(struct mount *mnt, struct mount *end) { - struct vfsmount *p; + struct mount *p; for (p = mnt; p != end; p = next_mnt(p, mnt)) { if (p->mnt_group_id && !IS_MNT_SHARED(p)) @@ -1519,9 +1350,9 @@ static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) } } -static int invent_group_ids(struct vfsmount *mnt, bool recurse) +static int invent_group_ids(struct mount *mnt, bool recurse) { - struct vfsmount *p; + struct mount *p; for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { @@ -1599,13 +1430,13 @@ static int invent_group_ids(struct vfsmount *mnt, bool recurse) * Must be called without spinlocks held, since this function can sleep * in allocations. */ -static int attach_recursive_mnt(struct vfsmount *source_mnt, +static int attach_recursive_mnt(struct mount *source_mnt, struct path *path, struct path *parent_path) { LIST_HEAD(tree_list); - struct vfsmount *dest_mnt = path->mnt; + struct mount *dest_mnt = real_mount(path->mnt); struct dentry *dest_dentry = path->dentry; - struct vfsmount *child, *p; + struct mount *child, *p; int err; if (IS_MNT_SHARED(dest_mnt)) { @@ -1626,7 +1457,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (parent_path) { detach_mnt(source_mnt, parent_path); attach_mnt(source_mnt, path); - touch_mnt_namespace(parent_path->mnt->mnt_ns); + touch_mnt_namespace(source_mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); commit_tree(source_mnt); @@ -1674,13 +1505,13 @@ static void unlock_mount(struct path *path) mutex_unlock(&path->dentry->d_inode->i_mutex); } -static int graft_tree(struct vfsmount *mnt, struct path *path) +static int graft_tree(struct mount *mnt, struct path *path) { - if (mnt->mnt_sb->s_flags & MS_NOUSER) + if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) return -EINVAL; if (S_ISDIR(path->dentry->d_inode->i_mode) != - S_ISDIR(mnt->mnt_root->d_inode->i_mode)) + S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) return -ENOTDIR; if (d_unlinked(path->dentry)) @@ -1711,7 +1542,8 @@ static int flags_to_propagation_type(int flags) */ static int do_change_type(struct path *path, int flag) { - struct vfsmount *m, *mnt = path->mnt; + struct mount *m; + struct mount *mnt = real_mount(path->mnt); int recurse = flag & MS_REC; int type; int err = 0; @@ -1751,7 +1583,7 @@ static int do_loopback(struct path *path, char *old_name, { LIST_HEAD(umount_list); struct path old_path; - struct vfsmount *mnt = NULL; + struct mount *mnt = NULL, *old; int err = mount_is_safe(path); if (err) return err; @@ -1765,18 +1597,20 @@ static int do_loopback(struct path *path, char *old_name, if (err) goto out; + old = real_mount(old_path.mnt); + err = -EINVAL; - if (IS_MNT_UNBINDABLE(old_path.mnt)) + if (IS_MNT_UNBINDABLE(old)) goto out2; - if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) + if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old)) goto out2; err = -ENOMEM; if (recurse) - mnt = copy_tree(old_path.mnt, old_path.dentry, 0); + mnt = copy_tree(old, old_path.dentry, 0); else - mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); + mnt = clone_mnt(old, old_path.dentry, 0); if (!mnt) goto out2; @@ -1806,9 +1640,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) return 0; if (readonly_request) - error = mnt_make_readonly(mnt); + error = mnt_make_readonly(real_mount(mnt)); else - __mnt_unmake_readonly(mnt); + __mnt_unmake_readonly(real_mount(mnt)); return error; } @@ -1822,11 +1656,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags, { int err; struct super_block *sb = path->mnt->mnt_sb; + struct mount *mnt = real_mount(path->mnt); if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!check_mnt(path->mnt)) + if (!check_mnt(mnt)) return -EINVAL; if (path->dentry != path->mnt->mnt_root) @@ -1843,22 +1678,22 @@ static int do_remount(struct path *path, int flags, int mnt_flags, err = do_remount_sb(sb, flags, data, 0); if (!err) { br_write_lock(vfsmount_lock); - mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK; - path->mnt->mnt_flags = mnt_flags; + mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; + mnt->mnt.mnt_flags = mnt_flags; br_write_unlock(vfsmount_lock); } up_write(&sb->s_umount); if (!err) { br_write_lock(vfsmount_lock); - touch_mnt_namespace(path->mnt->mnt_ns); + touch_mnt_namespace(mnt->mnt_ns); br_write_unlock(vfsmount_lock); } return err; } -static inline int tree_contains_unbindable(struct vfsmount *mnt) +static inline int tree_contains_unbindable(struct mount *mnt) { - struct vfsmount *p; + struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { if (IS_MNT_UNBINDABLE(p)) return 1; @@ -1869,7 +1704,8 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt) static int do_move_mount(struct path *path, char *old_name) { struct path old_path, parent_path; - struct vfsmount *p; + struct mount *p; + struct mount *old; int err = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1883,8 +1719,11 @@ static int do_move_mount(struct path *path, char *old_name) if (err < 0) goto out; + old = real_mount(old_path.mnt); + p = real_mount(path->mnt); + err = -EINVAL; - if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) + if (!check_mnt(p) || !check_mnt(old)) goto out1; if (d_unlinked(path->dentry)) @@ -1894,7 +1733,7 @@ static int do_move_mount(struct path *path, char *old_name) if (old_path.dentry != old_path.mnt->mnt_root) goto out1; - if (old_path.mnt == old_path.mnt->mnt_parent) + if (!mnt_has_parent(old)) goto out1; if (S_ISDIR(path->dentry->d_inode->i_mode) != @@ -1903,28 +1742,26 @@ static int do_move_mount(struct path *path, char *old_name) /* * Don't move a mount residing in a shared parent. */ - if (old_path.mnt->mnt_parent && - IS_MNT_SHARED(old_path.mnt->mnt_parent)) + if (IS_MNT_SHARED(old->mnt_parent)) goto out1; /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ - if (IS_MNT_SHARED(path->mnt) && - tree_contains_unbindable(old_path.mnt)) + if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) goto out1; err = -ELOOP; - for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent) - if (p == old_path.mnt) + for (; mnt_has_parent(p); p = p->mnt_parent) + if (p == old) goto out1; - err = attach_recursive_mnt(old_path.mnt, path, &parent_path); + err = attach_recursive_mnt(old, path, &parent_path); if (err) goto out1; /* if the mount is moved, it should no longer be expire * automatically */ - list_del_init(&old_path.mnt->mnt_expire); + list_del_init(&old->mnt_expire); out1: unlock_mount(path); out: @@ -1957,7 +1794,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) return ERR_PTR(err); } -struct vfsmount * +static struct vfsmount * do_kern_mount(const char *fstype, int flags, const char *name, void *data) { struct file_system_type *type = get_fs_type(fstype); @@ -1971,12 +1808,11 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data) put_filesystem(type); return mnt; } -EXPORT_SYMBOL_GPL(do_kern_mount); /* * add a mount into a namespace's mount tree */ -static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) +static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) { int err; @@ -1987,20 +1823,20 @@ static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flag return err; err = -EINVAL; - if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) + if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(real_mount(path->mnt))) goto unlock; /* Refuse the same filesystem on the same mount point */ err = -EBUSY; - if (path->mnt->mnt_sb == newmnt->mnt_sb && + if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path->mnt->mnt_root == path->dentry) goto unlock; err = -EINVAL; - if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) + if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) goto unlock; - newmnt->mnt_flags = mnt_flags; + newmnt->mnt.mnt_flags = mnt_flags; err = graft_tree(newmnt, path); unlock: @@ -2029,7 +1865,7 @@ static int do_new_mount(struct path *path, char *type, int flags, if (IS_ERR(mnt)) return PTR_ERR(mnt); - err = do_add_mount(mnt, path, mnt_flags); + err = do_add_mount(real_mount(mnt), path, mnt_flags); if (err) mntput(mnt); return err; @@ -2037,11 +1873,12 @@ static int do_new_mount(struct path *path, char *type, int flags, int finish_automount(struct vfsmount *m, struct path *path) { + struct mount *mnt = real_mount(m); int err; /* The new mount record should have at least 2 refs to prevent it being * expired before we get a chance to add it */ - BUG_ON(mnt_get_count(m) < 2); + BUG_ON(mnt_get_count(mnt) < 2); if (m->mnt_sb == path->mnt->mnt_sb && m->mnt_root == path->dentry) { @@ -2049,15 +1886,15 @@ int finish_automount(struct vfsmount *m, struct path *path) goto fail; } - err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); if (!err) return 0; fail: /* remove m from any expiration list it may be on */ - if (!list_empty(&m->mnt_expire)) { + if (!list_empty(&mnt->mnt_expire)) { down_write(&namespace_sem); br_write_lock(vfsmount_lock); - list_del_init(&m->mnt_expire); + list_del_init(&mnt->mnt_expire); br_write_unlock(vfsmount_lock); up_write(&namespace_sem); } @@ -2076,7 +1913,7 @@ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) down_write(&namespace_sem); br_write_lock(vfsmount_lock); - list_add_tail(&mnt->mnt_expire, expiry_list); + list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); br_write_unlock(vfsmount_lock); up_write(&namespace_sem); @@ -2090,7 +1927,7 @@ EXPORT_SYMBOL(mnt_set_expiry); */ void mark_mounts_for_expiry(struct list_head *mounts) { - struct vfsmount *mnt, *next; + struct mount *mnt, *next; LIST_HEAD(graveyard); LIST_HEAD(umounts); @@ -2113,7 +1950,7 @@ void mark_mounts_for_expiry(struct list_head *mounts) list_move(&mnt->mnt_expire, &graveyard); } while (!list_empty(&graveyard)) { - mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire); + mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, 1, &umounts); } @@ -2131,9 +1968,9 @@ EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); * search the list of submounts for a given mountpoint, and move any * shrinkable submounts to the 'graveyard' list. */ -static int select_submounts(struct vfsmount *parent, struct list_head *graveyard) +static int select_submounts(struct mount *parent, struct list_head *graveyard) { - struct vfsmount *this_parent = parent; + struct mount *this_parent = parent; struct list_head *next; int found = 0; @@ -2142,10 +1979,10 @@ repeat: resume: while (next != &this_parent->mnt_mounts) { struct list_head *tmp = next; - struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child); + struct mount *mnt = list_entry(tmp, struct mount, mnt_child); next = tmp->next; - if (!(mnt->mnt_flags & MNT_SHRINKABLE)) + if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) continue; /* * Descend a level if the d_mounts list is non-empty. @@ -2177,15 +2014,15 @@ resume: * * vfsmount_lock must be held for write */ -static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts) +static void shrink_submounts(struct mount *mnt, struct list_head *umounts) { LIST_HEAD(graveyard); - struct vfsmount *m; + struct mount *m; /* extract submounts of 'mountpoint' from the expiration list */ while (select_submounts(mnt, &graveyard)) { while (!list_empty(&graveyard)) { - m = list_first_entry(&graveyard, struct vfsmount, + m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); umount_tree(m, 1, umounts); @@ -2372,12 +2209,13 @@ static struct mnt_namespace *alloc_mnt_ns(void) void mnt_make_longterm(struct vfsmount *mnt) { - __mnt_make_longterm(mnt); + __mnt_make_longterm(real_mount(mnt)); } -void mnt_make_shortterm(struct vfsmount *mnt) +void mnt_make_shortterm(struct vfsmount *m) { #ifdef CONFIG_SMP + struct mount *mnt = real_mount(m); if (atomic_add_unless(&mnt->mnt_longterm, -1, 1)) return; br_write_lock(vfsmount_lock); @@ -2395,7 +2233,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, { struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; - struct vfsmount *p, *q; + struct mount *p, *q; + struct mount *old = mnt_ns->root; + struct mount *new; new_ns = alloc_mnt_ns(); if (IS_ERR(new_ns)) @@ -2403,15 +2243,15 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, down_write(&namespace_sem); /* First pass: copy the tree topology */ - new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root, - CL_COPY_ALL | CL_EXPIRE); - if (!new_ns->root) { + new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); + if (!new) { up_write(&namespace_sem); kfree(new_ns); return ERR_PTR(-ENOMEM); } + new_ns->root = new; br_write_lock(vfsmount_lock); - list_add_tail(&new_ns->list, &new_ns->root->mnt_list); + list_add_tail(&new_ns->list, &new->mnt_list); br_write_unlock(vfsmount_lock); /* @@ -2419,27 +2259,27 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, * as belonging to new namespace. We have already acquired a private * fs_struct, so tsk->fs->lock is not needed. */ - p = mnt_ns->root; - q = new_ns->root; + p = old; + q = new; while (p) { q->mnt_ns = new_ns; __mnt_make_longterm(q); if (fs) { - if (p == fs->root.mnt) { - fs->root.mnt = mntget(q); + if (&p->mnt == fs->root.mnt) { + fs->root.mnt = mntget(&q->mnt); __mnt_make_longterm(q); - mnt_make_shortterm(p); - rootmnt = p; + mnt_make_shortterm(&p->mnt); + rootmnt = &p->mnt; } - if (p == fs->pwd.mnt) { - fs->pwd.mnt = mntget(q); + if (&p->mnt == fs->pwd.mnt) { + fs->pwd.mnt = mntget(&q->mnt); __mnt_make_longterm(q); - mnt_make_shortterm(p); - pwdmnt = p; + mnt_make_shortterm(&p->mnt); + pwdmnt = &p->mnt; } } - p = next_mnt(p, mnt_ns->root); - q = next_mnt(q, new_ns->root); + p = next_mnt(p, old); + q = next_mnt(q, new); } up_write(&namespace_sem); @@ -2472,20 +2312,50 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, * create_mnt_ns - creates a private namespace and adds a root filesystem * @mnt: pointer to the new root filesystem mountpoint */ -struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) +static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) { - struct mnt_namespace *new_ns; - - new_ns = alloc_mnt_ns(); + struct mnt_namespace *new_ns = alloc_mnt_ns(); if (!IS_ERR(new_ns)) { + struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; __mnt_make_longterm(mnt); new_ns->root = mnt; - list_add(&new_ns->list, &new_ns->root->mnt_list); + list_add(&new_ns->list, &mnt->mnt_list); + } else { + mntput(m); } return new_ns; } -EXPORT_SYMBOL(create_mnt_ns); + +struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) +{ + struct mnt_namespace *ns; + struct super_block *s; + struct path path; + int err; + + ns = create_mnt_ns(mnt); + if (IS_ERR(ns)) + return ERR_CAST(ns); + + err = vfs_path_lookup(mnt->mnt_root, mnt, + name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + + put_mnt_ns(ns); + + if (err) + return ERR_PTR(err); + + /* trade a vfsmount reference for active sb one */ + s = path.mnt->mnt_sb; + atomic_inc(&s->s_active); + mntput(path.mnt); + /* lock the sucker */ + down_write(&s->s_umount); + /* ... and return the root of (sub)tree on it */ + return path.dentry; +} +EXPORT_SYMBOL(mount_subtree); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) @@ -2529,6 +2399,31 @@ out_type: } /* + * Return true if path is reachable from root + * + * namespace_sem or vfsmount_lock is held + */ +bool is_path_reachable(struct mount *mnt, struct dentry *dentry, + const struct path *root) +{ + while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + } + return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); +} + +int path_is_under(struct path *path1, struct path *path2) +{ + int res; + br_read_lock(vfsmount_lock); + res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); + br_read_unlock(vfsmount_lock); + return res; +} +EXPORT_SYMBOL(path_is_under); + +/* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, * makes new_root as the new root file system of the current process, and sets @@ -2556,8 +2451,8 @@ out_type: SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { - struct vfsmount *tmp; struct path new, old, parent_path, root_parent, root; + struct mount *new_mnt, *root_mnt; int error; if (!capable(CAP_SYS_ADMIN)) @@ -2581,11 +2476,13 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out3; error = -EINVAL; - if (IS_MNT_SHARED(old.mnt) || - IS_MNT_SHARED(new.mnt->mnt_parent) || - IS_MNT_SHARED(root.mnt->mnt_parent)) + new_mnt = real_mount(new.mnt); + root_mnt = real_mount(root.mnt); + if (IS_MNT_SHARED(real_mount(old.mnt)) || + IS_MNT_SHARED(new_mnt->mnt_parent) || + IS_MNT_SHARED(root_mnt->mnt_parent)) goto out4; - if (!check_mnt(root.mnt) || !check_mnt(new.mnt)) + if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) goto out4; error = -ENOENT; if (d_unlinked(new.dentry)) @@ -2599,33 +2496,22 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = -EINVAL; if (root.mnt->mnt_root != root.dentry) goto out4; /* not a mountpoint */ - if (root.mnt->mnt_parent == root.mnt) + if (!mnt_has_parent(root_mnt)) goto out4; /* not attached */ if (new.mnt->mnt_root != new.dentry) goto out4; /* not a mountpoint */ - if (new.mnt->mnt_parent == new.mnt) + if (!mnt_has_parent(new_mnt)) goto out4; /* not attached */ /* make sure we can reach put_old from new_root */ - tmp = old.mnt; - if (tmp != new.mnt) { - for (;;) { - if (tmp->mnt_parent == tmp) - goto out4; /* already mounted on put_old */ - if (tmp->mnt_parent == new.mnt) - break; - tmp = tmp->mnt_parent; - } - if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) - goto out4; - } else if (!is_subdir(old.dentry, new.dentry)) + if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new)) goto out4; br_write_lock(vfsmount_lock); - detach_mnt(new.mnt, &parent_path); - detach_mnt(root.mnt, &root_parent); + detach_mnt(new_mnt, &parent_path); + detach_mnt(root_mnt, &root_parent); /* mount old root on put_old */ - attach_mnt(root.mnt, &old); + attach_mnt(root_mnt, &old); /* mount new_root on / */ - attach_mnt(new.mnt, &root_parent); + attach_mnt(new_mnt, &root_parent); touch_mnt_namespace(current->nsproxy->mnt_ns); br_write_unlock(vfsmount_lock); chroot_fs_refs(&root, &new); @@ -2663,8 +2549,8 @@ static void __init init_mount_tree(void) init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); - root.mnt = ns->root; - root.dentry = ns->root->mnt_root; + root.mnt = mnt; + root.dentry = mnt->mnt_root; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); @@ -2677,7 +2563,7 @@ void __init mnt_init(void) init_rwsem(&namespace_sem); - mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), + mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); @@ -2717,7 +2603,6 @@ void put_mnt_ns(struct mnt_namespace *ns) release_mounts(&umount_list); kfree(ns); } -EXPORT_SYMBOL(put_mnt_ns); struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) { @@ -2743,3 +2628,8 @@ void kern_unmount(struct vfsmount *mnt) } } EXPORT_SYMBOL(kern_unmount); + +bool our_mnt(struct vfsmount *mnt) +{ + return check_mnt(real_mount(mnt)); +} diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 9c51f62..aeed93a 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -30,15 +30,15 @@ static void ncp_do_readdir(struct file *, void *, filldir_t, static int ncp_readdir(struct file *, void *, filldir_t); -static int ncp_create(struct inode *, struct dentry *, int, struct nameidata *); +static int ncp_create(struct inode *, struct dentry *, umode_t, struct nameidata *); static struct dentry *ncp_lookup(struct inode *, struct dentry *, struct nameidata *); static int ncp_unlink(struct inode *, struct dentry *); -static int ncp_mkdir(struct inode *, struct dentry *, int); +static int ncp_mkdir(struct inode *, struct dentry *, umode_t); static int ncp_rmdir(struct inode *, struct dentry *); static int ncp_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); static int ncp_mknod(struct inode * dir, struct dentry *dentry, - int mode, dev_t rdev); + umode_t mode, dev_t rdev); #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) extern int ncp_symlink(struct inode *, struct dentry *, const char *); #else @@ -919,7 +919,7 @@ out_close: goto out; } -int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode, +int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev, __le32 attributes) { struct ncp_server *server = NCP_SERVER(dir); @@ -928,7 +928,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode, int opmode; __u8 __name[NCP_MAXPATHLEN + 1]; - PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n", + PPRINTK("ncp_create_new: creating %s/%s, mode=%hx\n", dentry->d_parent->d_name.name, dentry->d_name.name, mode); ncp_age_dentry(server, dentry); @@ -979,13 +979,13 @@ out: return error; } -static int ncp_create(struct inode *dir, struct dentry *dentry, int mode, +static int ncp_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return ncp_create_new(dir, dentry, mode, 0, 0); } -static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct ncp_entry_info finfo; struct ncp_server *server = NCP_SERVER(dir); @@ -1201,12 +1201,12 @@ out: } static int ncp_mknod(struct inode * dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { if (!new_valid_dev(rdev)) return -EINVAL; if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) { - DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%o\n", mode); + DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%ho\n", mode); return ncp_create_new(dir, dentry, mode, rdev, 0); } return -EPERM; /* Strange, but true */ diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 202f370..3d1e34f 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -44,7 +44,7 @@ static void ncp_evict_inode(struct inode *); static void ncp_put_super(struct super_block *); static int ncp_statfs(struct dentry *, struct kstatfs *); -static int ncp_show_options(struct seq_file *, struct vfsmount *); +static int ncp_show_options(struct seq_file *, struct dentry *); static struct kmem_cache * ncp_inode_cachep; @@ -60,7 +60,6 @@ static struct inode *ncp_alloc_inode(struct super_block *sb) static void ncp_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); } @@ -228,7 +227,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode); - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_uid = server->m.uid; inode->i_gid = server->m.gid; @@ -323,9 +322,9 @@ static void ncp_stop_tasks(struct ncp_server *server) { flush_work_sync(&server->timeout_tq); } -static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt) +static int ncp_show_options(struct seq_file *seq, struct dentry *root) { - struct ncp_server *server = NCP_SBP(mnt->mnt_sb); + struct ncp_server *server = NCP_SBP(root->d_sb); unsigned int tmp; if (server->m.uid != 0) @@ -548,7 +547,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); if (error) - goto out_bdi; + goto out_fput; server->ncp_filp = ncp_filp; server->ncp_sock = sock; @@ -559,7 +558,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) error = -EBADF; server->info_filp = fget(data.info_fd); if (!server->info_filp) - goto out_fput; + goto out_bdi; error = -ENOTSOCK; sock_inode = server->info_filp->f_path.dentry->d_inode; if (!S_ISSOCK(sock_inode->i_mode)) @@ -746,9 +745,9 @@ out_nls: out_fput2: if (server->info_filp) fput(server->info_filp); -out_fput: - bdi_destroy(&server->bdi); out_bdi: + bdi_destroy(&server->bdi); +out_fput: /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: * * The previously used put_filp(ncp_filp); was bogus, since diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 790e92a..6958adf 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -901,7 +901,7 @@ long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ret = __ncp_ioctl(inode, cmd, arg); outDropWrite: if (need_drop_write) - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); out: return ret; } diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 09881e6..32c0658 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -114,7 +114,7 @@ int ncp_dirhandle_alloc(struct ncp_server *, __u8 vol, __le32 dirent, __u8 *dirh int ncp_dirhandle_free(struct ncp_server *, __u8 dirhandle); int ncp_create_new(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev, __le32 attributes); + umode_t mode, dev_t rdev, __le32 attributes); static inline int ncp_is_nfs_extras(struct ncp_server* server, unsigned int volnum) { #ifdef CONFIG_NCPFS_NFS_NS diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index 661f861..52439dd 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -108,7 +108,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { char *rawlink; int length, err, i, outlen; int kludge; - int mode; + umode_t mode; __le32 attr; unsigned int hdr; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 9561c8f..281ae95 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -176,17 +176,6 @@ retry: return bio; } -static void bl_set_lo_fail(struct pnfs_layout_segment *lseg) -{ - if (lseg->pls_range.iomode == IOMODE_RW) { - dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - } else { - dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); - } -} - /* This is basically copied from mpage_end_io_read */ static void bl_end_io_read(struct bio *bio, int err) { @@ -206,7 +195,7 @@ static void bl_end_io_read(struct bio *bio, int err) if (!uptodate) { if (!rdata->pnfs_error) rdata->pnfs_error = -EIO; - bl_set_lo_fail(rdata->lseg); + pnfs_set_lo_fail(rdata->lseg); } bio_put(bio); put_parallel(par); @@ -303,6 +292,7 @@ bl_read_pagelist(struct nfs_read_data *rdata) bl_end_io_read, par); if (IS_ERR(bio)) { rdata->pnfs_error = PTR_ERR(bio); + bio = NULL; goto out; } } @@ -370,7 +360,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err) if (!uptodate) { if (!wdata->pnfs_error) wdata->pnfs_error = -EIO; - bl_set_lo_fail(wdata->lseg); + pnfs_set_lo_fail(wdata->lseg); } bio_put(bio); put_parallel(par); @@ -386,7 +376,7 @@ static void bl_end_io_write(struct bio *bio, int err) if (!uptodate) { if (!wdata->pnfs_error) wdata->pnfs_error = -EIO; - bl_set_lo_fail(wdata->lseg); + pnfs_set_lo_fail(wdata->lseg); } bio_put(bio); put_parallel(par); @@ -543,6 +533,11 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) fill_invalid_ext: dprintk("%s need to zero %d pages\n", __func__, npg_zero); for (;npg_zero > 0; npg_zero--) { + if (bl_is_sector_init(be->be_inval, isect)) { + dprintk("isect %llu already init\n", + (unsigned long long)isect); + goto next_page; + } /* page ref released in bl_end_io_write_zero */ index = isect >> PAGE_CACHE_SECTOR_SHIFT; dprintk("%s zero %dth page: index %lu isect %llu\n", @@ -562,8 +557,7 @@ fill_invalid_ext: * PageUptodate: It was read before * sector_initialized: already written out */ - if (PageDirty(page) || PageWriteback(page) || - bl_is_sector_init(be->be_inval, isect)) { + if (PageDirty(page) || PageWriteback(page)) { print_page(page); unlock_page(page); page_cache_release(page); @@ -592,6 +586,7 @@ fill_invalid_ext: bl_end_io_write_zero, par); if (IS_ERR(bio)) { wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; goto out; } /* FIXME: This should be done in bi_end_io */ @@ -640,6 +635,7 @@ next_page: bl_end_io_write, par); if (IS_ERR(bio)) { wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; goto out; } isect += PAGE_CACHE_SECTORS; @@ -805,7 +801,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, struct nfs4_deviceid *d_id) { struct pnfs_device *dev; - struct pnfs_block_dev *rv = NULL; + struct pnfs_block_dev *rv; u32 max_resp_sz; int max_pages; struct page **pages = NULL; @@ -823,18 +819,20 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, dev = kmalloc(sizeof(*dev), GFP_NOFS); if (!dev) { dprintk("%s kmalloc failed\n", __func__); - return NULL; + return ERR_PTR(-ENOMEM); } pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); if (pages == NULL) { kfree(dev); - return NULL; + return ERR_PTR(-ENOMEM); } for (i = 0; i < max_pages; i++) { pages[i] = alloc_page(GFP_NOFS); - if (!pages[i]) + if (!pages[i]) { + rv = ERR_PTR(-ENOMEM); goto out_free; + } } memcpy(&dev->dev_id, d_id, sizeof(*d_id)); @@ -847,8 +845,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); rc = nfs4_proc_getdeviceinfo(server, dev); dprintk("%s getdevice info returns %d\n", __func__, rc); - if (rc) + if (rc) { + rv = ERR_PTR(rc); goto out_free; + } rv = nfs4_blk_decode_device(server, dev); out_free: @@ -866,7 +866,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) struct pnfs_devicelist *dlist = NULL; struct pnfs_block_dev *bdev; LIST_HEAD(block_disklist); - int status = 0, i; + int status, i; dprintk("%s enter\n", __func__); @@ -898,8 +898,8 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) for (i = 0; i < dlist->num_devs; i++) { bdev = nfs4_blk_get_deviceinfo(server, fh, &dlist->dev_id[i]); - if (!bdev) { - status = -ENODEV; + if (IS_ERR(bdev)) { + status = PTR_ERR(bdev); goto out_error; } spin_lock(&b_mt_id->bm_lock); @@ -960,7 +960,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { }; static const struct rpc_pipe_ops bl_upcall_ops = { - .upcall = bl_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = bl_pipe_downcall, .destroy_msg = bl_pipe_destroy_msg, }; @@ -989,17 +989,20 @@ static int __init nfs4blocklayout_init(void) mnt, NFS_PIPE_DIRNAME, 0, &path); if (ret) - goto out_remove; + goto out_putrpc; bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, &bl_upcall_ops, 0); + path_put(&path); if (IS_ERR(bl_device_pipe)) { ret = PTR_ERR(bl_device_pipe); - goto out_remove; + goto out_putrpc; } out: return ret; +out_putrpc: + rpc_put_mount(); out_remove: pnfs_unregister_layoutdriver(&blocklayout_type); return ret; @@ -1012,6 +1015,7 @@ static void __exit nfs4blocklayout_exit(void) pnfs_unregister_layoutdriver(&blocklayout_type); rpc_unlink(bl_device_pipe); + rpc_put_mount(); } MODULE_ALIAS("nfs-layouttype4-3"); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index f27d827..42acf7e 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -150,7 +150,7 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) } struct bl_dev_msg { - int status; + int32_t status; uint32_t major, minor; }; @@ -169,8 +169,6 @@ extern wait_queue_head_t bl_wq; #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ /* blocklayoutdev.c */ -ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, - char __user *, size_t); ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); void bl_pipe_destroy_msg(struct rpc_pipe_msg *); struct block_device *nfs4_blkdev_get(dev_t dev); diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index a83b393..d08ba91 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -79,28 +79,6 @@ int nfs4_blkdev_put(struct block_device *bdev) return blkdev_put(bdev, FMODE_READ); } -/* - * Shouldn't there be a rpc_generic_upcall() to do this for us? - */ -ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, - char __user *dst, size_t buflen) -{ - char *data = (char *)msg->data + msg->copied; - size_t mlen = min(msg->len - msg->copied, buflen); - unsigned long left; - - left = copy_to_user(dst, data, mlen); - if (left == mlen) { - msg->errno = -EFAULT; - return -EFAULT; - } - - mlen -= left; - msg->copied += mlen; - msg->errno = 0; - return mlen; -} - static struct bl_dev_msg bl_mount_reply; ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, @@ -131,7 +109,7 @@ struct pnfs_block_dev * nfs4_blk_decode_device(struct nfs_server *server, struct pnfs_device *dev) { - struct pnfs_block_dev *rv = NULL; + struct pnfs_block_dev *rv; struct block_device *bd = NULL; struct rpc_pipe_msg msg; struct bl_msg_hdr bl_msg = { @@ -141,7 +119,7 @@ nfs4_blk_decode_device(struct nfs_server *server, uint8_t *dataptr; DECLARE_WAITQUEUE(wq, current); struct bl_dev_msg *reply = &bl_mount_reply; - int offset, len, i; + int offset, len, i, rc; dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, @@ -168,8 +146,10 @@ nfs4_blk_decode_device(struct nfs_server *server, dprintk("%s CALLING USERSPACE DAEMON\n", __func__); add_wait_queue(&bl_wq, &wq); - if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { + rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); + if (rc < 0) { remove_wait_queue(&bl_wq, &wq); + rv = ERR_PTR(rc); goto out; } @@ -187,8 +167,9 @@ nfs4_blk_decode_device(struct nfs_server *server, bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); if (IS_ERR(bd)) { - dprintk("%s failed to open device : %ld\n", - __func__, PTR_ERR(bd)); + rc = PTR_ERR(bd); + dprintk("%s failed to open device : %d\n", __func__, rc); + rv = ERR_PTR(rc); goto out; } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index e3d2942..516f337 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -125,7 +125,7 @@ nfs4_callback_up(struct svc_serv *serv) else goto out_err; - return svc_prepare_thread(serv, &serv->sv_pools[0]); + return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); out_err: if (ret == 0) @@ -199,7 +199,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) INIT_LIST_HEAD(&serv->sv_cb_list); spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); + rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); if (IS_ERR(rqstp)) { svc_xprt_put(serv->sv_bc_xprt); serv->sv_bc_xprt = NULL; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 918ad64..726e59a 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallanyargs *args) { - __be32 *p; + uint32_t bitmap[2]; + __be32 *p, status; args->craa_addr = svc_addr(rqstp); p = read_buf(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->craa_objs_to_keep = ntohl(*p++); - p = read_buf(xdr, 4); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_BADXDR); - args->craa_type_mask = ntohl(*p); + status = decode_bitmap(xdr, bitmap); + if (unlikely(status)) + return status; + args->craa_type_mask = bitmap[0]; return 0; } @@ -986,4 +987,5 @@ struct svc_version nfs4_callback_version4 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, + .vs_hidden = 1, }; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5833fbb..873bf00 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -336,11 +336,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - sin1->sin6_scope_id != sin2->sin6_scope_id) + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) return 0; + else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) + return sin1->sin6_scope_id == sin2->sin6_scope_id; - return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); + return 1; } #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, @@ -1867,6 +1868,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v) /* display one transport per line on subsequent lines */ clp = list_entry(v, struct nfs_client, cl_share_link); + /* Check if the client is initialized */ + if (clp->cl_cons_state != NFS_CS_READY) + return 0; + seq_printf(m, "v%u %s %s %3d %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 321a66b..7f26540 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -240,7 +240,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct sizeof(delegation->stateid.data)); delegation->type = res->delegation_type; delegation->maxsize = res->maxsize; - delegation->change_attr = nfsi->change_attr; + delegation->change_attr = inode->i_version; delegation->cred = get_rpccred(cred); delegation->inode = inode; delegation->flags = 1<<NFS_DELEGATION_REFERENCED; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b238d95..fd9a872 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -47,13 +47,13 @@ static int nfs_opendir(struct inode *, struct file *); static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, void *, filldir_t); static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); -static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); -static int nfs_mkdir(struct inode *, struct dentry *, int); +static int nfs_create(struct inode *, struct dentry *, umode_t, struct nameidata *); +static int nfs_mkdir(struct inode *, struct dentry *, umode_t); static int nfs_rmdir(struct inode *, struct dentry *); static int nfs_unlink(struct inode *, struct dentry *); static int nfs_symlink(struct inode *, struct dentry *, const char *); static int nfs_link(struct dentry *, struct inode *, struct dentry *); -static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); +static int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); static int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); @@ -112,7 +112,7 @@ const struct inode_operations nfs3_dir_inode_operations = { #ifdef CONFIG_NFS_V4 static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); -static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd); +static int nfs_open_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd); const struct inode_operations nfs4_dir_inode_operations = { .create = nfs_open_create, .lookup = nfs_atomic_lookup, @@ -1368,18 +1368,7 @@ static fmode_t flags_to_mode(int flags) static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags) { - struct nfs_open_context *ctx; - struct rpc_cred *cred; - fmode_t fmode = flags_to_mode(open_flags); - - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return ERR_CAST(cred); - ctx = alloc_nfs_open_context(dentry, cred, fmode); - put_rpccred(cred); - if (ctx == NULL) - return ERR_PTR(-ENOMEM); - return ctx; + return alloc_nfs_open_context(dentry, flags_to_mode(open_flags)); } static int do_open(struct inode *inode, struct file *filp) @@ -1468,12 +1457,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ + case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; - /* case -EISDIR: */ /* case -EINVAL: */ default: res = ERR_CAST(inode); @@ -1584,8 +1573,8 @@ no_open: return nfs_lookup_revalidate(dentry, nd); } -static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) +static int nfs_open_create(struct inode *dir, struct dentry *dentry, + umode_t mode, struct nameidata *nd) { struct nfs_open_context *ctx = NULL; struct iattr attr; @@ -1675,8 +1664,8 @@ out_error: * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. */ -static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) +static int nfs_create(struct inode *dir, struct dentry *dentry, + umode_t mode, struct nameidata *nd) { struct iattr attr; int error; @@ -1704,7 +1693,7 @@ out_err: * See comments for nfs_proc_create regarding failed operations. */ static int -nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct iattr attr; int status; @@ -1730,7 +1719,7 @@ out_err: /* * See comments for nfs_proc_create regarding failed operations. */ -static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct iattr attr; int error; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 28b8c3f..606ef0f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -40,48 +40,8 @@ #define NFSDBG_FACILITY NFSDBG_FILE -static int nfs_file_open(struct inode *, struct file *); -static int nfs_file_release(struct inode *, struct file *); -static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin); -static int nfs_file_mmap(struct file *, struct vm_area_struct *); -static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, - struct pipe_inode_info *pipe, - size_t count, unsigned int flags); -static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, - struct file *filp, loff_t *ppos, - size_t count, unsigned int flags); -static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -static int nfs_file_flush(struct file *, fl_owner_t id); -static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync); -static int nfs_check_flags(int flags); -static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); -static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); -static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); - static const struct vm_operations_struct nfs_file_vm_ops; -const struct file_operations nfs_file_operations = { - .llseek = nfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = nfs_file_read, - .aio_write = nfs_file_write, - .mmap = nfs_file_mmap, - .open = nfs_file_open, - .flush = nfs_file_flush, - .release = nfs_file_release, - .fsync = nfs_file_fsync, - .lock = nfs_lock, - .flock = nfs_flock, - .splice_read = nfs_file_splice_read, - .splice_write = nfs_file_splice_write, - .check_flags = nfs_check_flags, - .setlease = nfs_setlease, -}; - const struct inode_operations nfs_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, @@ -137,11 +97,9 @@ nfs_file_open(struct inode *inode, struct file *filp) static int nfs_file_release(struct inode *inode, struct file *filp) { - struct dentry *dentry = filp->f_path.dentry; - dprintk("NFS: release(%s/%s)\n", - dentry->d_parent->d_name.name, - dentry->d_name.name); + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); @@ -180,8 +138,6 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { - loff_t loff; - dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name, @@ -191,19 +147,15 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate * the cached file length */ - if (origin != SEEK_SET || origin != SEEK_CUR) { + if (origin != SEEK_SET && origin != SEEK_CUR) { struct inode *inode = filp->f_mapping->host; int retval = nfs_revalidate_file_size(inode, filp); if (retval < 0) return (loff_t)retval; + } - spin_lock(&inode->i_lock); - loff = generic_file_llseek_unlocked(filp, offset, origin); - spin_unlock(&inode->i_lock); - } else - loff = generic_file_llseek_unlocked(filp, offset, origin); - return loff; + return generic_file_llseek(filp, offset, origin); } /* @@ -234,14 +186,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, struct dentry * dentry = iocb->ki_filp->f_path.dentry; struct inode * inode = dentry->d_inode; ssize_t result; - size_t count = iov_length(iov, nr_segs); if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_read(iocb, iov, nr_segs, pos); dprintk("NFS: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - (unsigned long) count, (unsigned long) pos); + (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { @@ -895,3 +846,54 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) file->f_path.dentry->d_name.name, arg); return -EINVAL; } + +const struct file_operations nfs_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; + +#ifdef CONFIG_NFS_V4 +static int +nfs4_file_open(struct inode *inode, struct file *filp) +{ + /* + * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to + * this point, then something is very wrong + */ + dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp); + return -ENOTDIR; +} + +const struct file_operations nfs4_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs4_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 5b10064..7cf2c46 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -212,7 +212,7 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, auxdata.ctime = nfsi->vfs_inode.i_ctime; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->change_attr; + auxdata.change_attr = nfsi->vfs_inode.i_version; if (bufmax > sizeof(auxdata)) bufmax = sizeof(auxdata); @@ -244,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, auxdata.ctime = nfsi->vfs_inode.i_ctime; if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->change_attr; + auxdata.change_attr = nfsi->vfs_inode.i_version; if (memcmp(data, &auxdata, datalen) != 0) return FSCACHE_CHECKAUX_OBSOLETE; diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index f20801a..47d1c6f 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -336,8 +336,6 @@ struct idmap { struct idmap_hashtable idmap_group_hash; }; -static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *, - char __user *, size_t); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); @@ -345,7 +343,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); static unsigned int fnvhash32(const void *, size_t); static const struct rpc_pipe_ops idmap_upcall_ops = { - .upcall = idmap_pipe_upcall, + .upcall = rpc_pipe_generic_upcall, .downcall = idmap_pipe_downcall, .destroy_msg = idmap_pipe_destroy_msg, }; @@ -595,27 +593,6 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, return ret; } -/* RPC pipefs upcall/downcall routines */ -static ssize_t -idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, - char __user *dst, size_t buflen) -{ - char *data = (char *)msg->data + msg->copied; - size_t mlen = min(msg->len, buflen); - unsigned long left; - - left = copy_to_user(dst, data, mlen); - if (left == mlen) { - msg->errno = -EFAULT; - return -EFAULT; - } - - mlen -= left; - msg->copied += mlen; - msg->errno = 0; - return mlen; -} - static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index fe12037..81db25e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -38,6 +38,7 @@ #include <linux/nfs_xdr.h> #include <linux/slab.h> #include <linux/compat.h> +#include <linux/freezer.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -77,7 +78,7 @@ int nfs_wait_bit_killable(void *word) { if (fatal_signal_pending(current)) return -ERESTARTSYS; - schedule(); + freezable_schedule(); return 0; } @@ -291,7 +292,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) */ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; if (S_ISREG(inode->i_mode)) { - inode->i_fop = &nfs_file_operations; + inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; } else if (S_ISDIR(inode->i_mode)) { @@ -318,9 +319,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) memset(&inode->i_atime, 0, sizeof(inode->i_atime)); memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); - nfsi->change_attr = 0; + inode->i_version = 0; inode->i_size = 0; - inode->i_nlink = 0; + clear_nlink(inode); inode->i_uid = -2; inode->i_gid = -2; inode->i_blocks = 0; @@ -344,7 +345,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_CHANGE) - nfsi->change_attr = fattr->change_attr; + inode->i_version = fattr->change_attr; else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA; @@ -355,7 +356,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | NFS_INO_INVALID_DATA | NFS_INO_REVAL_PAGECACHE; if (fattr->valid & NFS_ATTR_FATTR_NLINK) - inode->i_nlink = fattr->nlink; + set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, NFS_CAP_NLINK)) nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_OWNER) @@ -629,23 +630,28 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) nfs_revalidate_inode(server, inode); } -struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred, fmode_t f_mode) +struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode) { struct nfs_open_context *ctx; + struct rpc_cred *cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return ERR_CAST(cred); ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); - if (ctx != NULL) { - nfs_sb_active(dentry->d_sb); - ctx->dentry = dget(dentry); - ctx->cred = get_rpccred(cred); - ctx->state = NULL; - ctx->mode = f_mode; - ctx->flags = 0; - ctx->error = 0; - nfs_init_lock_context(&ctx->lock_context); - ctx->lock_context.open_context = ctx; - INIT_LIST_HEAD(&ctx->list); + if (!ctx) { + put_rpccred(cred); + return ERR_PTR(-ENOMEM); } + nfs_sb_active(dentry->d_sb); + ctx->dentry = dget(dentry); + ctx->cred = cred; + ctx->state = NULL; + ctx->mode = f_mode; + ctx->flags = 0; + ctx->error = 0; + nfs_init_lock_context(&ctx->lock_context); + ctx->lock_context.open_context = ctx; + INIT_LIST_HEAD(&ctx->list); return ctx; } @@ -738,15 +744,10 @@ static void nfs_file_clear_open_context(struct file *filp) int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - struct rpc_cred *cred; - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return PTR_ERR(cred); - ctx = alloc_nfs_open_context(filp->f_path.dentry, cred, filp->f_mode); - put_rpccred(cred); - if (ctx == NULL) - return -ENOMEM; + ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); nfs_fscache_set_inode_cookie(inode, filp); @@ -897,8 +898,8 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) - && nfsi->change_attr == fattr->pre_change_attr) { - nfsi->change_attr = fattr->change_attr; + && inode->i_version == fattr->pre_change_attr) { + inode->i_version = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; ret |= NFS_INO_INVALID_ATTR; @@ -952,7 +953,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return -EIO; if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && - nfsi->change_attr != fattr->change_attr) + inode->i_version != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ @@ -1163,7 +1164,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { - fattr->pre_change_attr = NFS_I(inode)->change_attr; + fattr->pre_change_attr = inode->i_version; fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && @@ -1244,13 +1245,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { - if (nfsi->change_attr != fattr->change_attr) { + if (inode->i_version != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); - nfsi->change_attr = fattr->change_attr; + inode->i_version = fattr->change_attr; } } else if (server->caps & NFS_CAP_CHANGE_ATTR) invalid |= save_cache_validity; @@ -1361,7 +1362,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR; if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; - inode->i_nlink = fattr->nlink; + set_nlink(inode, fattr->nlink); } } else if (server->caps & NFS_CAP_NLINK) invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR @@ -1464,7 +1465,6 @@ struct inode *nfs_alloc_inode(struct super_block *sb) static void nfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ab12913..3f4d957 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -299,6 +299,8 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head); +extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_readdata_release(struct nfs_read_data *rdata); @@ -457,13 +459,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } -/* - * Helper for restarting RPC calls in the possible presence of NFSv4.1 - * sessions. - */ -static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) -{ - if (nfs4_has_session(clp)) - return rpc_restart_call_prepare(task); - return rpc_restart_call(task); -} diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 85f1690..91943953 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -17,6 +17,7 @@ #include <linux/nfs_page.h> #include <linux/lockd/bind.h> #include <linux/nfs_mount.h> +#include <linux/freezer.h> #include "iostat.h" #include "internal.h" @@ -32,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EJUKEBOX && res != -EKEYEXPIRED) break; - schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; @@ -853,6 +854,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, .file_inode_ops = &nfs3_file_inode_operations, + .file_ops = &nfs_file_operations, .getroot = nfs3_proc_get_root, .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 3e93e9a..693ae22 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -13,30 +13,6 @@ struct idmap; -/* - * In a seqid-mutating op, this macro controls which error return - * values trigger incrementation of the seqid. - * - * from rfc 3010: - * The client MUST monotonically increment the sequence number for the - * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE - * operations. This is true even in the event that the previous - * operation that used the sequence number received an error. The only - * exception to this rule is if the previous operation received one of - * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID, - * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR, - * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE. - * - */ -#define seqid_mutating_err(err) \ -(((err) != NFSERR_STALE_CLIENTID) && \ - ((err) != NFSERR_STALE_STATEID) && \ - ((err) != NFSERR_BAD_STATEID) && \ - ((err) != NFSERR_BAD_SEQID) && \ - ((err) != NFSERR_BAD_XDR) && \ - ((err) != NFSERR_RESOURCE) && \ - ((err) != NFSERR_NOFILEHANDLE)) - enum nfs4_client_state { NFS4CLNT_MANAGER_RUNNING = 0, NFS4CLNT_CHECK_LEASE, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index e8915d4..a62d36b 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -31,6 +31,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> +#include <linux/module.h> #include "internal.h" #include "nfs4filelayout.h" @@ -77,19 +78,6 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } -/* For data server errors we don't recover from */ -static void -filelayout_set_lo_fail(struct pnfs_layout_segment *lseg) -{ - if (lseg->pls_range.iomode == IOMODE_RW) { - dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - } else { - dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); - } -} - static int filelayout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, @@ -135,7 +123,6 @@ static int filelayout_async_handle_error(struct rpc_task *task, static int filelayout_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { - struct nfs_client *clp = data->ds_clp; int reset = 0; dprintk("%s DS read\n", __func__); @@ -145,11 +132,10 @@ static int filelayout_read_done_cb(struct rpc_task *task, dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); nfs4_reset_read(task, data); - clp = NFS_SERVER(data->inode)->nfs_client; } - nfs_restart_rpc(task, clp); + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -216,17 +202,13 @@ static int filelayout_write_done_cb(struct rpc_task *task, if (filelayout_async_handle_error(task, data->args.context->state, data->ds_clp, &reset) == -EAGAIN) { - struct nfs_client *clp; - dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); nfs4_reset_write(task, data); - clp = NFS_SERVER(data->inode)->nfs_client; - } else - clp = data->ds_clp; - nfs_restart_rpc(task, clp); + } + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -256,9 +238,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task, __func__, data->ds_clp, data->ds_clp->cl_session); if (reset) { prepare_to_resend_writes(data); - filelayout_set_lo_fail(data->lseg); + pnfs_set_lo_fail(data->lseg); } else - nfs_restart_rpc(task, data->ds_clp); + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -468,9 +450,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, fl->dsaddr = dsaddr; - if (fl->first_stripe_index < 0 || - fl->first_stripe_index >= dsaddr->stripe_count) { - dprintk("%s Bad first_stripe_index %d\n", + if (fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %u\n", __func__, fl->first_stripe_index); goto out_put; } @@ -571,7 +552,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. * Futher checking is done in filelayout_check_layout */ - if (fl->num_fh < 0 || fl->num_fh > + if (fl->num_fh > max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) goto out_err; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4700fae..dcda0ba 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -39,6 +39,8 @@ #include <linux/delay.h> #include <linux/errno.h> #include <linux/string.h> +#include <linux/ratelimit.h> +#include <linux/printk.h> #include <linux/slab.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> @@ -53,6 +55,7 @@ #include <linux/sunrpc/bc_xprt.h> #include <linux/xattr.h> #include <linux/utsname.h> +#include <linux/freezer.h> #include "nfs4_fs.h" #include "delegation.h" @@ -73,9 +76,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); -static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, @@ -244,7 +244,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) *timeout = NFS4_POLL_RETRY_MIN; if (*timeout > NFS4_POLL_RETRY_MAX) *timeout = NFS4_POLL_RETRY_MAX; - schedule_timeout_killable(*timeout); + freezable_schedule_timeout_killable(*timeout); if (fatal_signal_pending(current)) res = -ERESTARTSYS; *timeout <<= 1; @@ -753,9 +753,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) spin_lock(&dir->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; - if (!cinfo->atomic || cinfo->before != nfsi->change_attr) + if (!cinfo->atomic || cinfo->before != dir->i_version) nfs_force_lookup_revalidate(dir); - nfsi->change_attr = cinfo->after; + dir->i_version = cinfo->after; spin_unlock(&dir->i_lock); } @@ -897,6 +897,8 @@ out: static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) { + if (delegation == NULL) + return 0; if ((delegation->type & fmode) != fmode) return 0; if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) @@ -1039,8 +1041,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) } rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - if (delegation == NULL || - !can_open_delegated(delegation, fmode)) { + if (!can_open_delegated(delegation, fmode)) { rcu_read_unlock(); break; } @@ -1094,7 +1095,12 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data if (delegation) delegation_flags = delegation->flags; rcu_read_unlock(); - if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) + if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) { + pr_err_ratelimited("NFS: Broken NFSv4 server %s is " + "returning a delegation for " + "OPEN(CLAIM_DELEGATE_CUR)\n", + NFS_CLIENT(inode)->cl_server); + } else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) nfs_inode_set_delegation(state->inode, data->owner->so_cred, &data->o_res); @@ -1426,11 +1432,9 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) goto out_no_action; rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); - if (delegation != NULL && - test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) { - rcu_read_unlock(); - goto out_no_action; - } + if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && + can_open_delegated(delegation, data->o_arg.fmode)) + goto unlock_no_action; rcu_read_unlock(); } /* Update sequence id. */ @@ -1447,6 +1451,8 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) return; rpc_call_start(task); return; +unlock_no_action: + rcu_read_unlock(); out_no_action: task->tk_action = NULL; @@ -1596,8 +1602,14 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) int status; status = nfs4_run_open_task(data, 0); - if (status != 0 || !data->rpc_done) + if (!data->rpc_done) + return status; + if (status != 0) { + if (status == -NFS4ERR_BADNAME && + !(o_arg->open_flags & O_CREAT)) + return -ENOENT; return status; + } if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); @@ -2408,14 +2420,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, return status; } -static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, - const struct nfs_fh *dirfh, const struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) { + struct nfs_server *server = NFS_SERVER(dir); int status; struct nfs4_lookup_arg args = { .bitmask = server->attr_bitmask, - .dir_fh = dirfh, + .dir_fh = NFS_FH(dir), .name = name, }; struct nfs4_lookup_res res = { @@ -2431,40 +2444,8 @@ static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, nfs_fattr_init(fattr); - dprintk("NFS call lookupfh %s\n", name->name); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); - dprintk("NFS reply lookupfh: %d\n", status); - return status; -} - -static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, - struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) -{ - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr); - /* FIXME: !!!! */ - if (err == -NFS4ERR_MOVED) { - err = -EREMOTE; - break; - } - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; -} - -static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) -{ - int status; - dprintk("NFS call lookup %s\n", name->name); - status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); - if (status == -NFS4ERR_MOVED) - status = nfs4_get_referral(dir, name, fattr, fhandle); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); dprintk("NFS reply lookup: %d\n", status); return status; } @@ -2485,11 +2466,19 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), - &exception); - if (err == -EPERM) + int status; + + status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr); + switch (status) { + case -NFS4ERR_BADNAME: + return -ENOENT; + case -NFS4ERR_MOVED: + return nfs4_get_referral(dir, name, fattr, fhandle); + case -NFS4ERR_WRONGSEC: nfs_fixup_secinfo_attributes(fattr, fhandle); + } + err = nfs4_handle_exception(NFS_SERVER(dir), + status, &exception); } while (exception.retry); return err; } @@ -3210,7 +3199,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) struct nfs_server *server = NFS_SERVER(data->inode); if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -3260,7 +3249,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { - nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } if (task->tk_status >= 0) { @@ -3317,7 +3306,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { - nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + rpc_restart_call_prepare(task); return -EAGAIN; } nfs_refresh_inode(inode, data->res.fattr); @@ -3857,7 +3846,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) default: if (nfs4_async_handle_error(task, data->res.server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, data->res.server->nfs_client); + rpc_restart_call_prepare(task); return; } } @@ -3970,7 +3959,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 static unsigned long nfs4_set_lock_task_retry(unsigned long timeout) { - schedule_timeout_killable(timeout); + freezable_schedule_timeout_killable(timeout); timeout <<= 1; if (timeout > NFS4_LOCK_MAXTIMEOUT) return NFS4_LOCK_MAXTIMEOUT; @@ -4111,8 +4100,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) break; default: if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) - nfs_restart_rpc(task, - calldata->server->nfs_client); + rpc_restart_call_prepare(task); } } @@ -4945,7 +4933,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) task->tk_status = 0; /* fall through */ case -NFS4ERR_RETRY_UNCACHED_REP: - nfs_restart_rpc(task, data->clp); + rpc_restart_call_prepare(task); return; } dprintk("<-- %s\n", __func__); @@ -5786,7 +5774,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) server = NFS_SERVER(lrp->args.inode); if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, lrp->clp); + rpc_restart_call_prepare(task); return; } spin_lock(&lo->plh_inode->i_lock); @@ -5957,7 +5945,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) } if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return; } @@ -5970,6 +5958,7 @@ static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; struct pnfs_layout_segment *lseg, *tmp; + unsigned long *bitlock = &NFS_I(data->args.inode)->flags; pnfs_cleanup_layoutcommit(data); /* Matched by references in pnfs_set_layoutcommit */ @@ -5979,6 +5968,11 @@ static void nfs4_layoutcommit_release(void *calldata) &lseg->pls_flags)) put_lseg(lseg); } + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); + put_rpccred(data->cred); kfree(data); } @@ -6267,10 +6261,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .dentry_ops = &nfs4_dentry_operations, .dir_inode_ops = &nfs4_dir_inode_operations, .file_inode_ops = &nfs4_file_inode_operations, + .file_ops = &nfs4_file_operations, .getroot = nfs4_proc_get_root, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, - .lookupfh = nfs4_proc_lookupfh, .lookup = nfs4_proc_lookup, .access = nfs4_proc_access, .readlink = nfs4_proc_readlink, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 39914be..6a7107a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1156,11 +1156,13 @@ restart: if (status >= 0) { status = nfs4_reclaim_locks(state, ops); if (status >= 0) { + spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) printk("%s: Lock reclaim failed!\n", __func__); } + spin_unlock(&state->state_lock); nfs4_put_open_state(state); goto restart; } @@ -1224,10 +1226,12 @@ static void nfs4_clear_open_state(struct nfs4_state *state) clear_bit(NFS_O_RDONLY_STATE, &state->flags); clear_bit(NFS_O_WRONLY_STATE, &state->flags); clear_bit(NFS_O_RDWR_STATE, &state->flags); + spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { lock->ls_seqid.flags = 0; lock->ls_flags &= ~NFS_LOCK_INITIALIZED; } + spin_unlock(&state->state_lock); } static void nfs4_reset_seqids(struct nfs_server *server, @@ -1350,12 +1354,14 @@ static void nfs4_warn_keyexpired(const char *s) static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { + case 0: + break; case -NFS4ERR_CB_PATH_DOWN: nfs_handle_cb_pathdown(clp); - return 0; + break; case -NFS4ERR_NO_GRACE: nfs4_state_end_reclaim_reboot(clp); - return 0; + break; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_LEASE_MOVED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); @@ -1375,13 +1381,15 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) case -NFS4ERR_SEQ_MISORDERED: set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* Zero session reset errors */ - return 0; + break; case -EKEYEXPIRED: /* Nothing we can do */ nfs4_warn_keyexpired(clp->cl_hostname); - return 0; + break; + default: + return error; } - return error; + return 0; } static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) @@ -1428,7 +1436,7 @@ static int nfs4_check_lease(struct nfs_client *clp) struct rpc_cred *cred; const struct nfs4_state_maintenance_ops *ops = clp->cl_mvops->state_renewal_ops; - int status = -NFS4ERR_EXPIRED; + int status; /* Is the client already known to have an expired lease? */ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) @@ -1438,6 +1446,7 @@ static int nfs4_check_lease(struct nfs_client *clp) spin_unlock(&clp->cl_lock); if (cred == NULL) { cred = nfs4_get_setclientid_cred(clp); + status = -ENOKEY; if (cred == NULL) goto out; } @@ -1525,16 +1534,16 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) { if (!flags) return; - else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) + if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); - else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | SEQ4_STATUS_ADMIN_STATE_REVOKED | SEQ4_STATUS_LEASE_MOVED)) nfs41_handle_state_revoked(clp); - else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) + if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) nfs41_handle_recallable_state_revoked(clp); - else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + if (flags & (SEQ4_STATUS_CB_PATH_DOWN | SEQ4_STATUS_BACKCHANNEL_FAULT | SEQ4_STATUS_CB_PATH_DOWN_SESSION)) nfs41_handle_cb_path_down(clp); @@ -1662,10 +1671,10 @@ static void nfs4_state_manager(struct nfs_client *clp) if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { status = nfs4_check_lease(clp); + if (status < 0) + goto out_error; if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) continue; - if (status < 0 && status != -NFS4ERR_CB_PATH_DOWN) - goto out_error; } /* Initialize or reset the session */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1dce12f..e6161b2 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, if (status) goto out; status = decode_secinfo(xdr, res); - if (status) - goto out; out: return status; } diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index d0cda12..c807ab9 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -38,21 +38,15 @@ */ #include <linux/module.h> -#include <scsi/osd_initiator.h> +#include <scsi/osd_ore.h> #include "objlayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD -#define _LLU(x) ((unsigned long long)x) - -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), -}; - struct objio_dev_ent { struct nfs4_deviceid_node id_node; - struct osd_dev *od; + struct ore_dev od; }; static void @@ -60,8 +54,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) { struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); - dprintk("%s: free od=%p\n", __func__, de->od); - osduld_put_device(de->od); + dprintk("%s: free od=%p\n", __func__, de->od.od); + osduld_put_device(de->od.od); kfree(de); } @@ -98,12 +92,12 @@ _dev_list_add(const struct nfs_server *nfss, nfss->pnfs_curr_ld, nfss->nfs_client, d_id); - de->od = od; + de->od.od = od; d = nfs4_insert_deviceid_node(&de->id_node); n = container_of(d, struct objio_dev_ent, id_node); if (n != de) { - dprintk("%s: Race with other n->od=%p\n", __func__, n->od); + dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od); objio_free_deviceid_node(&de->id_node); de = n; } @@ -111,28 +105,11 @@ _dev_list_add(const struct nfs_server *nfss, return de; } -struct caps_buffers { - u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; - u8 creds[OSD_CAP_LEN]; -}; - struct objio_segment { struct pnfs_layout_segment lseg; - struct pnfs_osd_object_cred *comps; - - unsigned mirrors_p1; - unsigned stripe_unit; - unsigned group_width; /* Data stripe_units without integrity comps */ - u64 group_depth; - unsigned group_count; - - unsigned max_io_size; - - unsigned comps_index; - unsigned num_comps; - /* variable length */ - struct objio_dev_ent *ods[]; + struct ore_layout layout; + struct ore_components oc; }; static inline struct objio_segment * @@ -141,59 +118,44 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) return container_of(lseg, struct objio_segment, lseg); } -struct objio_state; -typedef ssize_t (*objio_done_fn)(struct objio_state *ios); - struct objio_state { /* Generic layer */ - struct objlayout_io_state ol_state; - - struct objio_segment *layout; - - struct kref kref; - objio_done_fn done; - void *private; - - unsigned long length; - unsigned numdevs; /* Actually used devs in this IO */ - /* A per-device variable array of size numdevs */ - struct _objio_per_comp { - struct bio *bio; - struct osd_request *or; - unsigned long length; - u64 offset; - unsigned dev; - } per_dev[]; + struct objlayout_io_res oir; + + bool sync; + /*FIXME: Support for extra_bytes at ore_get_rw_state() */ + struct ore_io_state *ios; }; /* Send and wait for a get_device_info of devices in the layout, then look them up with the osd_initiator library */ -static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, unsigned comp, - gfp_t gfp_flags) +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, + gfp_t gfp_flags) { struct pnfs_osd_deviceaddr *deviceaddr; - struct nfs4_deviceid *d_id; struct objio_dev_ent *ode; struct osd_dev *od; struct osd_dev_info odi; int err; - d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id; - ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); - if (ode) - return ode; + if (ode) { + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + return 0; + } err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); if (unlikely(err)) { dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); - return ERR_PTR(err); + return err; } odi.systemid_len = deviceaddr->oda_systemid.len; if (odi.systemid_len > sizeof(odi.systemid)) { + dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", + __func__, sizeof(odi.systemid)); err = -EINVAL; goto out; } else if (odi.systemid_len) @@ -218,96 +180,53 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, gfp_flags); - + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + dprintk("Adding new dev_id(%llx:%llx)\n", + _DEVID_LO(d_id), _DEVID_HI(d_id)); out: - dprintk("%s: return=%d\n", __func__, err); objlayout_put_deviceinfo(deviceaddr); - return err ? ERR_PTR(err) : ode; + return err; } -static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, - gfp_t gfp_flags) +static void copy_single_comp(struct ore_components *oc, unsigned c, + struct pnfs_osd_object_cred *src_comp) { - unsigned i; - int err; + struct ore_comp *ocomp = &oc->comps[c]; - /* lookup all devices */ - for (i = 0; i < objio_seg->num_comps; i++) { - struct objio_dev_ent *ode; + WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ + WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); - ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); - if (unlikely(IS_ERR(ode))) { - err = PTR_ERR(ode); - goto out; - } - objio_seg->ods[i] = ode; - } - err = 0; + ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; + ocomp->obj.id = src_comp->oc_object_id.oid_object_id; -out: - dprintk("%s: return=%d\n", __func__, err); - return err; + memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); } -static int _verify_data_map(struct pnfs_osd_layout *layout) +int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, + struct objio_segment **pseg) { - struct pnfs_osd_data_map *data_map = &layout->olo_map; - u64 stripe_length; - u32 group_width; - -/* FIXME: Only raid0 for now. if not go through MDS */ - if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { - printk(KERN_ERR "Only RAID_0 for now\n"); - return -ENOTSUPP; - } - if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { - printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", - data_map->odm_num_comps, data_map->odm_mirror_cnt); - return -EINVAL; - } + struct __alloc_objio_segment { + struct objio_segment olseg; + struct ore_dev *ods[numdevs]; + struct ore_comp comps[numdevs]; + } *aolseg; - if (data_map->odm_group_width) - group_width = data_map->odm_group_width; - else - group_width = data_map->odm_num_comps / - (data_map->odm_mirror_cnt + 1); - - stripe_length = (u64)data_map->odm_stripe_unit * group_width; - if (stripe_length >= (1ULL << 32)) { - printk(KERN_ERR "Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -ENOTSUPP; + aolseg = kzalloc(sizeof(*aolseg), gfp_flags); + if (unlikely(!aolseg)) { + dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, + numdevs, sizeof(*aolseg)); + return -ENOMEM; } - if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { - printk(KERN_ERR "Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(data_map->odm_stripe_unit), PAGE_SIZE); - return -ENOTSUPP; - } + aolseg->olseg.oc.numdevs = numdevs; + aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS; + aolseg->olseg.oc.comps = aolseg->comps; + aolseg->olseg.oc.ods = aolseg->ods; + *pseg = &aolseg->olseg; return 0; } -static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp, - struct pnfs_osd_object_cred *src_comp, - struct caps_buffers *caps_p) -{ - WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key)); - WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds)); - - *cur_comp = *src_comp; - - memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, - sizeof(caps_p->caps_key)); - cur_comp->oc_cap_key.cred = caps_p->caps_key; - - memcpy(caps_p->creds, src_comp->oc_cap.cred, - sizeof(caps_p->creds)); - cur_comp->oc_cap.cred = caps_p->creds; -} - int objio_alloc_lseg(struct pnfs_layout_segment **outp, struct pnfs_layout_hdr *pnfslay, struct pnfs_layout_range *range, @@ -317,59 +236,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, struct objio_segment *objio_seg; struct pnfs_osd_xdr_decode_layout_iter iter; struct pnfs_osd_layout layout; - struct pnfs_osd_object_cred *cur_comp, src_comp; - struct caps_buffers *caps_p; + struct pnfs_osd_object_cred src_comp; + unsigned cur_comp; int err; err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); if (unlikely(err)) return err; - err = _verify_data_map(&layout); + err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); if (unlikely(err)) return err; - objio_seg = kzalloc(sizeof(*objio_seg) + - sizeof(objio_seg->ods[0]) * layout.olo_num_comps + - sizeof(*objio_seg->comps) * layout.olo_num_comps + - sizeof(struct caps_buffers) * layout.olo_num_comps, - gfp_flags); - if (!objio_seg) - return -ENOMEM; + objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; + objio_seg->layout.group_width = layout.olo_map.odm_group_width; + objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; + objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; + objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; - objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); - cur_comp = objio_seg->comps; - caps_p = (void *)(cur_comp + layout.olo_num_comps); - while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) - copy_single_comp(cur_comp++, &src_comp, caps_p++); + err = ore_verify_layout(layout.olo_map.odm_num_comps, + &objio_seg->layout); if (unlikely(err)) goto err; - objio_seg->num_comps = layout.olo_num_comps; - objio_seg->comps_index = layout.olo_comps_index; - err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); - if (err) - goto err; - - objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; - objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; - if (layout.olo_map.odm_group_width) { - objio_seg->group_width = layout.olo_map.odm_group_width; - objio_seg->group_depth = layout.olo_map.odm_group_depth; - objio_seg->group_count = layout.olo_map.odm_num_comps / - objio_seg->mirrors_p1 / - objio_seg->group_width; - } else { - objio_seg->group_width = layout.olo_map.odm_num_comps / - objio_seg->mirrors_p1; - objio_seg->group_depth = -1; - objio_seg->group_count = 1; + objio_seg->oc.first_dev = layout.olo_comps_index; + cur_comp = 0; + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { + copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); + err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, + &src_comp.oc_object_id.oid_device_id, + gfp_flags); + if (err) + goto err; + ++cur_comp; } - - /* Cache this calculation it will hit for every page */ - objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - - objio_seg->stripe_unit) * - objio_seg->group_width; + /* pnfs_osd_xdr_decode_layout_comp returns false on error */ + if (unlikely(err)) + goto err; *outp = &objio_seg->lseg; return 0; @@ -386,43 +289,63 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) int i; struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - for (i = 0; i < objio_seg->num_comps; i++) { - if (!objio_seg->ods[i]) + for (i = 0; i < objio_seg->oc.numdevs; i++) { + struct ore_dev *od = objio_seg->oc.ods[i]; + struct objio_dev_ent *ode; + + if (!od) break; - nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); + ode = container_of(od, typeof(*ode), od); + nfs4_put_deviceid_node(&ode->id_node); } kfree(objio_seg); } -int objio_alloc_io_state(struct pnfs_layout_segment *lseg, - struct objlayout_io_state **outp, - gfp_t gfp_flags) +static int +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, + struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, + loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, + struct objio_state **outp) { struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - struct objio_state *ios; - const unsigned first_size = sizeof(*ios) + - objio_seg->num_comps * sizeof(ios->per_dev[0]); - const unsigned sec_size = objio_seg->num_comps * - sizeof(ios->ol_state.ioerrs[0]); - - ios = kzalloc(first_size + sec_size, gfp_flags); - if (unlikely(!ios)) + struct ore_io_state *ios; + int ret; + struct __alloc_objio_state { + struct objio_state objios; + struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; + } *aos; + + aos = kzalloc(sizeof(*aos), gfp_flags); + if (unlikely(!aos)) return -ENOMEM; - ios->layout = objio_seg; - ios->ol_state.ioerrs = ((void *)ios) + first_size; - ios->ol_state.num_comps = objio_seg->num_comps; + objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, + aos->ioerrs, rpcdata, pnfs_layout_type); - *outp = &ios->ol_state; + ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, + offset, count, &ios); + if (unlikely(ret)) { + kfree(aos); + return ret; + } + + ios->pages = pages; + ios->pgbase = pgbase; + ios->private = aos; + BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); + + aos->objios.sync = 0; + aos->objios.ios = ios; + *outp = &aos->objios; return 0; } -void objio_free_io_state(struct objlayout_io_state *ol_state) +void objio_free_result(struct objlayout_io_res *oir) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *objios = container_of(oir, struct objio_state, oir); - kfree(ios); + ore_put_io_state(objios->ios); + kfree(objios); } enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) @@ -455,539 +378,152 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) } } -static void _clear_bio(struct bio *bio) +static void __on_dev_error(struct ore_io_state *ios, + struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, + u64 dev_offset, u64 dev_len) { - struct bio_vec *bv; - unsigned i; - - __bio_for_each_segment(bv, bio, i, 0) { - unsigned this_count = bv->bv_len; - - if (likely(PAGE_SIZE == this_count)) - clear_highpage(bv->bv_page); - else - zero_user(bv->bv_page, bv->bv_offset, this_count); - } -} - -static int _io_check(struct objio_state *ios, bool is_write) -{ - enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; - int lin_ret = 0; - int i; - - for (i = 0; i < ios->numdevs; i++) { - struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; - int ret; - - if (!or) - continue; + struct objio_state *objios = ios->private; + struct pnfs_osd_objid pooid; + struct objio_dev_ent *ode = container_of(od, typeof(*ode), od); + /* FIXME: what to do with more-then-one-group layouts. We need to + * translate from ore_io_state index to oc->comps index + */ + unsigned comp = dev_index; - ret = osd_req_decode_sense(or, &osi); - if (likely(!ret)) - continue; + pooid.oid_device_id = ode->id_node.deviceid; + pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; + pooid.oid_object_id = ios->oc->comps[comp].obj.id; - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ - BUG_ON(is_write); - _clear_bio(ios->per_dev[i].bio); - dprintk("%s: start read offset passed end of file " - "offset=0x%llx, length=0x%lx\n", __func__, - _LLU(ios->per_dev[i].offset), - ios->per_dev[i].length); - - continue; /* we recovered */ - } - objlayout_io_set_result(&ios->ol_state, i, - &ios->layout->comps[i].oc_object_id, - osd_pri_2_pnfs_err(osi.osd_err_pri), - ios->per_dev[i].offset, - ios->per_dev[i].length, - is_write); - - if (osi.osd_err_pri >= oep) { - oep = osi.osd_err_pri; - lin_ret = ret; - } - } - - return lin_ret; -} - -/* - * Common IO state helpers. - */ -static void _io_free(struct objio_state *ios) -{ - unsigned i; - - for (i = 0; i < ios->numdevs; i++) { - struct _objio_per_comp *per_dev = &ios->per_dev[i]; - - if (per_dev->or) { - osd_end_request(per_dev->or); - per_dev->or = NULL; - } - - if (per_dev->bio) { - bio_put(per_dev->bio); - per_dev->bio = NULL; - } - } -} - -struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) -{ - unsigned min_dev = ios->layout->comps_index; - unsigned max_dev = min_dev + ios->layout->num_comps; - - BUG_ON(dev < min_dev || max_dev <= dev); - return ios->layout->ods[dev - min_dev]->od; -} - -struct _striping_info { - u64 obj_offset; - u64 group_length; - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, - struct _striping_info *si) -{ - u32 stripe_unit = ios->layout->stripe_unit; - u32 group_width = ios->layout->group_width; - u64 group_depth = ios->layout->group_depth; - u32 U = stripe_unit * group_width; - - u64 T = U * group_depth; - u64 S = T * ios->layout->group_count; - u64 M = div64_u64(file_offset, S); - - /* - G = (L - (M * S)) / T - H = (L - (M * S)) % T - */ - u64 LmodU = file_offset - M * S; - u32 G = div64_u64(LmodU, T); - u64 H = LmodU - G * T; - - u32 N = div_u64(H, U); - - div_u64_rem(file_offset, stripe_unit, &si->unit_off); - si->obj_offset = si->unit_off + (N * stripe_unit) + - (M * group_depth * stripe_unit); - - /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= ios->layout->mirrors_p1; - - si->group_length = T - H; -} - -static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, - unsigned pgbase, struct _objio_per_comp *per_dev, int len, - gfp_t gfp_flags) -{ - unsigned pg = *cur_pg; - int cur_len = len; - struct request_queue *q = - osd_request_queue(_io_od(ios, per_dev->dev)); - - if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / - ios->layout->group_width; - - if (BIO_MAX_PAGES_KMALLOC < bio_size) - bio_size = BIO_MAX_PAGES_KMALLOC; - - per_dev->bio = bio_kmalloc(gfp_flags, bio_size); - if (unlikely(!per_dev->bio)) { - dprintk("Faild to allocate BIO size=%u\n", bio_size); - return -ENOMEM; - } - } - - while (cur_len > 0) { - unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); - unsigned added_len; - - BUG_ON(ios->ol_state.nr_pages <= pg); - cur_len -= pglen; - - added_len = bio_add_pc_page(q, per_dev->bio, - ios->ol_state.pages[pg], pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; - pgbase = 0; - ++pg; - } - BUG_ON(cur_len); - - per_dev->length += len; - *cur_pg = pg; - return 0; -} - -static int _prepare_one_group(struct objio_state *ios, u64 length, - struct _striping_info *si, unsigned *last_pg, - gfp_t gfp_flags) -{ - unsigned stripe_unit = ios->layout->stripe_unit; - unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; - unsigned dev = si->dev; - unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; - unsigned cur_pg = *last_pg; - int ret = 0; - - while (length) { - struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev]; - unsigned cur_len, page_off = 0; - - if (!per_dev->length) { - per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { - per_dev->offset = si->obj_offset; - cur_len = stripe_unit - si->unit_off; - page_off = si->unit_off & ~PAGE_MASK; - BUG_ON(page_off && - (page_off != ios->ol_state.pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; - cur_len = stripe_unit; - } - - if (max_comp < dev - first_dev) - max_comp = dev - first_dev; - } else { - cur_len = stripe_unit; - } - if (cur_len >= length) - cur_len = length; - - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len, gfp_flags); - if (unlikely(ret)) - goto out; - - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - - length -= cur_len; - ios->length += cur_len; - } -out: - ios->numdevs = max_comp + mirrors_p1; - *last_pg = cur_pg; - return ret; -} - -static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) -{ - u64 length = ios->ol_state.count; - u64 offset = ios->ol_state.offset; - struct _striping_info si; - unsigned last_pg = 0; - int ret = 0; - - while (length) { - _calc_stripe_info(ios, offset, &si); - - if (length < si.group_length) - si.group_length = length; - - ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); - if (unlikely(ret)) - goto out; - - offset += si.group_length; - length -= si.group_length; - } - -out: - if (!ios->length) - return ret; - - return 0; -} - -static ssize_t _sync_done(struct objio_state *ios) -{ - struct completion *waiting = ios->private; - - complete(waiting); - return 0; -} - -static void _last_io(struct kref *kref) -{ - struct objio_state *ios = container_of(kref, struct objio_state, kref); - - ios->done(ios); -} - -static void _done_io(struct osd_request *or, void *p) -{ - struct objio_state *ios = p; - - kref_put(&ios->kref, _last_io); -} - -static ssize_t _io_exec(struct objio_state *ios) -{ - DECLARE_COMPLETION_ONSTACK(wait); - ssize_t status = 0; /* sync status */ - unsigned i; - objio_done_fn saved_done_fn = ios->done; - bool sync = ios->ol_state.sync; - - if (sync) { - ios->done = _sync_done; - ios->private = &wait; - } - - kref_init(&ios->kref); - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - - if (!or) - continue; - - kref_get(&ios->kref); - osd_execute_request_async(or, _done_io, ios); - } - - kref_put(&ios->kref, _last_io); - - if (sync) { - wait_for_completion(&wait); - status = saved_done_fn(ios); - } - - return status; + objlayout_io_set_result(&objios->oir, comp, + &pooid, osd_pri_2_pnfs_err(oep), + dev_offset, dev_len, !ios->reading); } /* * read */ -static ssize_t _read_done(struct objio_state *ios) +static void _read_done(struct ore_io_state *ios, void *private) { + struct objio_state *objios = private; ssize_t status; - int ret = _io_check(ios, false); + int ret = ore_check_io(ios, &__on_dev_error); - _io_free(ios); + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ if (likely(!ret)) status = ios->length; else status = ret; - objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); - return status; + objlayout_read_done(&objios->oir, status, objios->sync); } -static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) +int objio_read_pagelist(struct nfs_read_data *rdata) { - struct osd_request *or = NULL; - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; - unsigned dev = per_dev->dev; - struct pnfs_osd_object_cred *cred = - &ios->layout->comps[cur_comp]; - struct osd_obj_id obj = { - .partition = cred->oc_object_id.oid_partition_id, - .id = cred->oc_object_id.oid_object_id, - }; + struct objio_state *objios; int ret; - or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - per_dev->or = or; - - osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); - - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); - if (ret) { - dprintk("%s: Faild to osd_finalize_request() => %d\n", - __func__, ret); - goto err; - } - - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), - per_dev->length); - -err: - return ret; -} - -static ssize_t _read_exec(struct objio_state *ios) -{ - unsigned i; - int ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - if (!ios->per_dev[i].length) - continue; - ret = _read_mirrors(ios, i); - if (unlikely(ret)) - goto err; - } - - ios->done = _read_done; - return _io_exec(ios); /* In sync mode exec returns the io status */ - -err: - _io_free(ios); - return ret; -} - -ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) -{ - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); - int ret; - - ret = _io_rw_pagelist(ios, GFP_KERNEL); + ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true, + rdata->lseg, rdata->args.pages, rdata->args.pgbase, + rdata->args.offset, rdata->args.count, rdata, + GFP_KERNEL, &objios); if (unlikely(ret)) return ret; - return _read_exec(ios); + objios->ios->done = _read_done; + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + rdata->args.offset, rdata->args.count); + return ore_read(objios->ios); } /* * write */ -static ssize_t _write_done(struct objio_state *ios) +static void _write_done(struct ore_io_state *ios, void *private) { + struct objio_state *objios = private; ssize_t status; - int ret = _io_check(ios, true); + int ret = ore_check_io(ios, &__on_dev_error); - _io_free(ios); + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ if (likely(!ret)) { /* FIXME: should be based on the OSD's persistence model * See OSD2r05 Section 4.13 Data persistence model */ - ios->ol_state.committed = NFS_FILE_SYNC; + objios->oir.committed = NFS_FILE_SYNC; status = ios->length; } else { status = ret; } - objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); - return status; + objlayout_write_done(&objios->oir, status, objios->sync); } -static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) { - struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; - unsigned dev = ios->per_dev[cur_comp].dev; - unsigned last_comp = cur_comp + ios->layout->mirrors_p1; - int ret; - - for (; cur_comp < last_comp; ++cur_comp, ++dev) { - struct osd_request *or = NULL; - struct pnfs_osd_object_cred *cred = - &ios->layout->comps[cur_comp]; - struct osd_obj_id obj = { - .partition = cred->oc_object_id.oid_partition_id, - .id = cred->oc_object_id.oid_object_id, - }; - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; - struct bio *bio; - - or = osd_start_request(_io_od(ios, dev), GFP_NOFS); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - per_dev->or = or; - - if (per_dev != master_dev) { - bio = bio_kmalloc(GFP_NOFS, - master_dev->bio->bi_max_vecs); - if (unlikely(!bio)) { - dprintk("Faild to allocate BIO size=%u\n", - master_dev->bio->bi_max_vecs); - ret = -ENOMEM; - goto err; - } - - __bio_clone(bio, master_dev->bio); - bio->bi_bdev = NULL; - bio->bi_next = NULL; - per_dev->bio = bio; - per_dev->dev = dev; - per_dev->length = master_dev->length; - per_dev->offset = master_dev->offset; - } else { - bio = master_dev->bio; - bio->bi_rw |= REQ_WRITE; - } - - osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); + struct objio_state *objios = priv; + struct nfs_write_data *wdata = objios->oir.rpcdata; + pgoff_t index = offset / PAGE_SIZE; + struct page *page = find_get_page(wdata->inode->i_mapping, index); - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); - if (ret) { - dprintk("%s: Faild to osd_finalize_request() => %d\n", - __func__, ret); - goto err; + if (!page) { + page = find_or_create_page(wdata->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s: grab_cache_page Failed index=0x%lx\n", + __func__, index); + return NULL; } - - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), - per_dev->length); + unlock_page(page); } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); + return page; +} -err: - return ret; +static void __r4w_put_page(void *priv, struct page *page) +{ + dprintk("%s: index=0x%lx\n", __func__, page->index); + page_cache_release(page); + return; } -static ssize_t _write_exec(struct objio_state *ios) +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + +int objio_write_pagelist(struct nfs_write_data *wdata, int how) { - unsigned i; + struct objio_state *objios; int ret; - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - if (!ios->per_dev[i].length) - continue; - ret = _write_mirrors(ios, i); - if (unlikely(ret)) - goto err; - } - - ios->done = _write_done; - return _io_exec(ios); /* In sync mode exec returns the io->status */ + ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false, + wdata->lseg, wdata->args.pages, wdata->args.pgbase, + wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, + &objios); + if (unlikely(ret)) + return ret; -err: - _io_free(ios); - return ret; -} + objios->sync = 0 != (how & FLUSH_SYNC); + objios->ios->r4w = &_r4w_op; -ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) -{ - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); - int ret; + if (!objios->sync) + objios->ios->done = _write_done; - /* TODO: ios->stable = stable; */ - ret = _io_rw_pagelist(ios, GFP_NOFS); + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + wdata->args.offset, wdata->args.count); + ret = ore_write(objios->ios); if (unlikely(ret)) return ret; - return _write_exec(ios); + if (objios->sync) + _write_done(objios->ios, objios); + + return 0; } static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, @@ -997,7 +533,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, return false; return pgio->pg_count + req->wb_bytes <= - OBJIO_LSEG(pgio->pg_lseg)->max_io_size; + OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; } static const struct nfs_pageio_ops objio_pg_read_ops = { diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 1d06f8e..72074e3 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -156,77 +156,39 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1 : NFS4_MAX_UINT64; } -static struct objlayout_io_state * -objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, - struct page **pages, - unsigned pgbase, - loff_t offset, - size_t count, - struct pnfs_layout_segment *lseg, - void *rpcdata, - gfp_t gfp_flags) +void _fix_verify_io_params(struct pnfs_layout_segment *lseg, + struct page ***p_pages, unsigned *p_pgbase, + u64 offset, unsigned long count) { - struct objlayout_io_state *state; u64 lseg_end_offset; - dprintk("%s: allocating io_state\n", __func__); - if (objio_alloc_io_state(lseg, &state, gfp_flags)) - return NULL; - BUG_ON(offset < lseg->pls_range.offset); lseg_end_offset = end_offset(lseg->pls_range.offset, lseg->pls_range.length); BUG_ON(offset >= lseg_end_offset); - if (offset + count > lseg_end_offset) { - count = lseg->pls_range.length - - (offset - lseg->pls_range.offset); - dprintk("%s: truncated count %Zd\n", __func__, count); - } + WARN_ON(offset + count > lseg_end_offset); - if (pgbase > PAGE_SIZE) { - pages += pgbase >> PAGE_SHIFT; - pgbase &= ~PAGE_MASK; + if (*p_pgbase > PAGE_SIZE) { + dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); + *p_pages += *p_pgbase >> PAGE_SHIFT; + *p_pgbase &= ~PAGE_MASK; } - - INIT_LIST_HEAD(&state->err_list); - state->lseg = lseg; - state->rpcdata = rpcdata; - state->pages = pages; - state->pgbase = pgbase; - state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; - state->offset = offset; - state->count = count; - state->sync = 0; - - return state; -} - -static void -objlayout_free_io_state(struct objlayout_io_state *state) -{ - dprintk("%s: freeing io_state\n", __func__); - if (unlikely(!state)) - return; - - objio_free_io_state(state); } /* * I/O done common code */ static void -objlayout_iodone(struct objlayout_io_state *state) +objlayout_iodone(struct objlayout_io_res *oir) { - dprintk("%s: state %p status\n", __func__, state); - - if (likely(state->status >= 0)) { - objlayout_free_io_state(state); + if (likely(oir->status >= 0)) { + objio_free_result(oir); } else { - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + struct objlayout *objlay = oir->objlay; spin_lock(&objlay->lock); objlay->delta_space_valid = OBJ_DSU_INVALID; - list_add(&objlay->err_list, &state->err_list); + list_add(&objlay->err_list, &oir->err_list); spin_unlock(&objlay->lock); } } @@ -238,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state) * the error for later reporting at layout-return. */ void -objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, +objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; - BUG_ON(index >= state->num_comps); + BUG_ON(index >= oir->num_comps); if (osd_error) { ioerr->oer_component = *pooid; ioerr->oer_comp_offset = offset; @@ -285,21 +247,18 @@ static void _rpc_read_complete(struct work_struct *work) } void -objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) +objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - int eof = state->eof; - struct nfs_read_data *rdata; + struct nfs_read_data *rdata = oir->rpcdata; - state->status = status; - dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); - rdata = state->rpcdata; - rdata->task.tk_status = status; - if (status >= 0) { + oir->status = rdata->task.tk_status = status; + if (status >= 0) rdata->res.count = status; - rdata->res.eof = eof; - } - objlayout_iodone(state); - /* must not use state after this point */ + objlayout_iodone(oir); + /* must not use oir after this point */ + + dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, + status, rdata->res.eof, sync); if (sync) pnfs_ld_read_done(rdata); @@ -317,40 +276,36 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) { loff_t offset = rdata->args.offset; size_t count = rdata->args.count; - struct objlayout_io_state *state; - ssize_t status = 0; + int err; loff_t eof; - dprintk("%s: Begin inode %p offset %llu count %d\n", - __func__, rdata->inode, offset, (int)count); - eof = i_size_read(rdata->inode); if (unlikely(offset + count > eof)) { if (offset >= eof) { - status = 0; + err = 0; rdata->res.count = 0; rdata->res.eof = 1; + /*FIXME: do we need to call pnfs_ld_read_done() */ goto out; } count = eof - offset; } - state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, - rdata->args.pages, rdata->args.pgbase, - offset, count, - rdata->lseg, rdata, - GFP_KERNEL); - if (unlikely(!state)) { - status = -ENOMEM; - goto out; - } + rdata->res.eof = (offset + count) >= eof; + _fix_verify_io_params(rdata->lseg, &rdata->args.pages, + &rdata->args.pgbase, + rdata->args.offset, rdata->args.count); - state->eof = state->offset + state->count >= eof; + dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", + __func__, rdata->inode->i_ino, offset, count, rdata->res.eof); - status = objio_read_pagelist(state); + err = objio_read_pagelist(rdata); out: - dprintk("%s: Return status %Zd\n", __func__, status); - rdata->pnfs_error = status; + if (unlikely(err)) { + rdata->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } return PNFS_ATTEMPTED; } @@ -371,26 +326,20 @@ static void _rpc_write_complete(struct work_struct *work) } void -objlayout_write_done(struct objlayout_io_state *state, ssize_t status, - bool sync) +objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_write_data *wdata; + struct nfs_write_data *wdata = oir->rpcdata; - dprintk("%s: Begin\n", __func__); - wdata = state->rpcdata; - state->status = status; - wdata->task.tk_status = status; + oir->status = wdata->task.tk_status = status; if (status >= 0) { wdata->res.count = status; - wdata->verf.committed = state->committed; - dprintk("%s: Return status %d committed %d\n", - __func__, wdata->task.tk_status, - wdata->verf.committed); - } else - dprintk("%s: Return status %d\n", - __func__, wdata->task.tk_status); - objlayout_iodone(state); - /* must not use state after this point */ + wdata->verf.committed = oir->committed; + } + objlayout_iodone(oir); + /* must not use oir after this point */ + + dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, + status, wdata->verf.committed, sync); if (sync) pnfs_ld_write_done(wdata); @@ -407,30 +356,18 @@ enum pnfs_try_status objlayout_write_pagelist(struct nfs_write_data *wdata, int how) { - struct objlayout_io_state *state; - ssize_t status; - - dprintk("%s: Begin inode %p offset %llu count %u\n", - __func__, wdata->inode, wdata->args.offset, wdata->args.count); - - state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, - wdata->args.pages, - wdata->args.pgbase, - wdata->args.offset, - wdata->args.count, - wdata->lseg, wdata, - GFP_NOFS); - if (unlikely(!state)) { - status = -ENOMEM; - goto out; - } + int err; - state->sync = how & FLUSH_SYNC; + _fix_verify_io_params(wdata->lseg, &wdata->args.pages, + &wdata->args.pgbase, + wdata->args.offset, wdata->args.count); - status = objio_write_pagelist(state, how & FLUSH_STABLE); - out: - dprintk("%s: Return status %Zd\n", __func__, status); - wdata->pnfs_error = status; + err = objio_write_pagelist(wdata, how); + if (unlikely(err)) { + wdata->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } return PNFS_ATTEMPTED; } @@ -537,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err, static void encode_accumulated_error(struct objlayout *objlay, __be32 *p) { - struct objlayout_io_state *state, *tmp; + struct objlayout_io_res *oir, *tmp; struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; - list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { unsigned i; - for (i = 0; i < state->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; if (!ioerr->oer_errno) continue; @@ -563,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) merge_ioerr(&accumulated_err, ioerr); } - list_del(&state->err_list); - objlayout_free_io_state(state); + list_del(&oir->err_list); + objio_free_result(oir); } pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); @@ -576,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, const struct nfs4_layoutreturn_args *args) { struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct objlayout_io_state *state, *tmp; + struct objlayout_io_res *oir, *tmp; __be32 *start; dprintk("%s: Begin\n", __func__); @@ -585,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, spin_lock(&objlay->lock); - list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { __be32 *last_xdr = NULL, *p; unsigned i; int res = 0; - for (i = 0; i < state->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; if (!ioerr->oer_errno) continue; @@ -615,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, } last_xdr = p; - pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); + pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); } /* TODO: use xdr_write_pages */ @@ -631,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, encode_accumulated_error(objlay, last_xdr); goto loop_done; } - list_del(&state->err_list); - objlayout_free_io_state(state); + list_del(&oir->err_list); + objio_free_result(oir); } loop_done: spin_unlock(&objlay->lock); diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index a8244c8..8ec3472 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -74,19 +74,11 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo) * per-I/O operation state * embedded in objects provider io_state data structure */ -struct objlayout_io_state { - struct pnfs_layout_segment *lseg; - - struct page **pages; - unsigned pgbase; - unsigned nr_pages; - unsigned long count; - loff_t offset; - bool sync; +struct objlayout_io_res { + struct objlayout *objlay; void *rpcdata; int status; /* res */ - int eof; /* res */ int committed; /* res */ /* Error reporting (layout_return) */ @@ -100,6 +92,18 @@ struct objlayout_io_state { struct pnfs_osd_ioerr *ioerrs; }; +static inline +void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, + struct pnfs_osd_ioerr *ioerrs, void *rpcdata, + struct pnfs_layout_hdr *pnfs_layout_type) +{ + oir->objlay = OBJLAYOUT(pnfs_layout_type); + oir->rpcdata = rpcdata; + INIT_LIST_HEAD(&oir->err_list); + oir->num_comps = num_comps; + oir->ioerrs = ioerrs; +} + /* * Raid engine I/O API */ @@ -110,28 +114,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); -extern int objio_alloc_io_state( - struct pnfs_layout_segment *lseg, - struct objlayout_io_state **outp, - gfp_t gfp_flags); -extern void objio_free_io_state(struct objlayout_io_state *state); +/* objio_free_result will free these @oir structs recieved from + * objlayout_{read,write}_done + */ +extern void objio_free_result(struct objlayout_io_res *oir); -extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); -extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, - bool stable); +extern int objio_read_pagelist(struct nfs_read_data *rdata); +extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); /* * callback API */ -extern void objlayout_io_set_result(struct objlayout_io_state *state, +extern void objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write); static inline void -objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) +objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) { - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); - /* If one of the I/Os errored out and the delta_space_used was * invalid we render the complete report as invalid. Protocol mandate * the DSU be accurate or not reported. @@ -144,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) spin_unlock(&objlay->lock); } -extern void objlayout_read_done(struct objlayout_io_state *state, +extern void objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync); -extern void objlayout_write_done(struct objlayout_io_state *state, +extern void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync); extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b60970c..5668f7c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -18,6 +18,7 @@ #include <linux/nfs_page.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> +#include <linux/export.h> #include "internal.h" #include "pnfs.h" @@ -41,7 +42,7 @@ nfs_page_free(struct nfs_page *p) /** * nfs_create_request - Create an NFS read/write request. - * @file: file descriptor to use + * @ctx: open context to use * @inode: inode to which the request is attached * @page: page to write * @offset: starting offset within the page for the write diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e550e88..8e672a2 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -29,6 +29,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> +#include <linux/module.h> #include "internal.h" #include "pnfs.h" #include "iostat.h" @@ -1168,23 +1169,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); /* * Called by non rpc-based layout drivers */ -int -pnfs_ld_write_done(struct nfs_write_data *data) +void pnfs_ld_write_done(struct nfs_write_data *data) { - int status; - - if (!data->pnfs_error) { + if (likely(!data->pnfs_error)) { pnfs_set_layoutcommit(data); data->mds_ops->rpc_call_done(&data->task, data); - data->mds_ops->rpc_release(data); - return 0; + } else { + put_lseg(data->lseg); + data->lseg = NULL; + dprintk("pnfs write error = %d\n", data->pnfs_error); } - - dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, - data->pnfs_error); - status = nfs_initiate_write(data, NFS_CLIENT(data->inode), - data->mds_ops, NFS_FILE_SYNC); - return status ? : -EAGAIN; + data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_write_done); @@ -1265,26 +1260,36 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) } EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); +static void pnfs_ld_handle_read_error(struct nfs_read_data *data) +{ + struct nfs_pageio_descriptor pgio; + + put_lseg(data->lseg); + data->lseg = NULL; + dprintk("pnfs write error = %d\n", data->pnfs_error); + + nfs_pageio_init_read_mds(&pgio, data->inode); + + while (!list_empty(&data->pages)) { + struct nfs_page *req = nfs_list_entry(data->pages.next); + + nfs_list_remove_request(req); + nfs_pageio_add_request(&pgio, req); + } + nfs_pageio_complete(&pgio); +} + /* * Called by non rpc-based layout drivers */ -int -pnfs_ld_read_done(struct nfs_read_data *data) +void pnfs_ld_read_done(struct nfs_read_data *data) { - int status; - - if (!data->pnfs_error) { + if (likely(!data->pnfs_error)) { __nfs4_read_done_cb(data); data->mds_ops->rpc_call_done(&data->task, data); - data->mds_ops->rpc_release(data); - return 0; - } - - dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, - data->pnfs_error); - status = nfs_initiate_read(data, NFS_CLIENT(data->inode), - data->mds_ops); - return status ? : -EAGAIN; + } else + pnfs_ld_handle_read_error(data); + data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); @@ -1381,6 +1386,18 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) } } +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_RW) { + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + } else { + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + } +} +EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); + void pnfs_set_layoutcommit(struct nfs_write_data *wdata) { @@ -1443,17 +1460,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ data = kzalloc(sizeof(*data), GFP_NOFS); if (!data) { - mark_inode_dirty_sync(inode); status = -ENOMEM; goto out; } + if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_free; + + if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { + if (!sync) { + status = -EAGAIN; + goto out_free; + } + status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (status) + goto out_free; + } + INIT_LIST_HEAD(&data->lseg_list); spin_lock(&inode->i_lock); if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); spin_unlock(&inode->i_lock); - kfree(data); - goto out; + wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); + goto out_free; } pnfs_list_write_lseg(inode, &data->lseg_list); @@ -1475,6 +1506,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = nfs4_proc_layoutcommit(data, sync); out: + if (status) + mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; +out_free: + kfree(data); + goto out; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 01cbfd5..1509530 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -178,6 +178,7 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); @@ -200,8 +201,8 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); -int pnfs_ld_write_done(struct nfs_write_data *); -int pnfs_ld_read_done(struct nfs_read_data *); +void pnfs_ld_write_done(struct nfs_write_data *); +void pnfs_ld_read_done(struct nfs_read_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 6fda522..4f359d2 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -28,6 +28,7 @@ * such damages. */ +#include <linux/export.h> #include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PNFS diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index ac40b85..0c672588 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -41,6 +41,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/lockd/bind.h> +#include <linux/freezer.h> #include "internal.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -59,7 +60,7 @@ nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EKEYEXPIRED) break; - schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; @@ -710,6 +711,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs_dir_inode_operations, .file_inode_ops = &nfs_file_inode_operations, + .file_ops = &nfs_file_operations, .getroot = nfs_proc_get_root, .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 2171c04..cfa175c 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -35,16 +35,13 @@ static const struct rpc_call_ops nfs_read_partial_ops; static const struct rpc_call_ops nfs_read_full_ops; static struct kmem_cache *nfs_rdata_cachep; -static mempool_t *nfs_rdata_mempool; - -#define MIN_POOL_READ (32) struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) { - struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); + struct nfs_read_data *p; + p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); if (p) { - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); p->npages = pagecount; if (pagecount <= ARRAY_SIZE(p->page_array)) @@ -52,7 +49,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) else { p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); if (!p->pagevec) { - mempool_free(p, nfs_rdata_mempool); + kmem_cache_free(nfs_rdata_cachep, p); p = NULL; } } @@ -64,7 +61,7 @@ void nfs_readdata_free(struct nfs_read_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); - mempool_free(p, nfs_rdata_mempool); + kmem_cache_free(nfs_rdata_cachep, p); } void nfs_readdata_release(struct nfs_read_data *rdata) @@ -112,7 +109,7 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } -static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, +void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode) { nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, @@ -276,7 +273,6 @@ nfs_async_read_error(struct list_head *head) while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - SetPageError(req->wb_page); nfs_readpage_release(req); } } @@ -322,7 +318,6 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head offset += len; } while(nbytes != 0); atomic_set(&req->wb_complete, requests); - ClearPageError(page); desc->pg_rpc_callops = &nfs_read_partial_ops; return ret; out_bad: @@ -331,7 +326,6 @@ out_bad: list_del(&data->list); nfs_readdata_free(data); } - SetPageError(page); nfs_readpage_release(req); return -ENOMEM; } @@ -357,7 +351,6 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head * req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); - ClearPageError(req->wb_page); *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); @@ -435,7 +428,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; - nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); + rpc_restart_call_prepare(task); } /* @@ -462,10 +455,10 @@ static void nfs_readpage_release_partial(void *calldata) int status = data->task.tk_status; if (status < 0) - SetPageError(page); + set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags); if (atomic_dec_and_test(&req->wb_complete)) { - if (!PageError(page)) + if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags)) SetPageUptodate(page); nfs_readpage_release(req); } @@ -648,7 +641,6 @@ readpage_async_filler(void *data, struct page *page) return 0; out_error: error = PTR_ERR(new); - SetPageError(page); out_unlock: unlock_page(page); return error; @@ -711,16 +703,10 @@ int __init nfs_init_readpagecache(void) if (nfs_rdata_cachep == NULL) return -ENOMEM; - nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, - nfs_rdata_cachep); - if (nfs_rdata_mempool == NULL) - return -ENOMEM; - return 0; } void nfs_destroy_readpagecache(void) { - mempool_destroy(nfs_rdata_mempool); kmem_cache_destroy(nfs_rdata_cachep); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 5b19b6a..e463967 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -41,7 +41,6 @@ #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> -#include <linux/mnt_namespace.h> #include <linux/namei.h> #include <linux/nfs_idmap.h> #include <linux/vfs.h> @@ -263,10 +262,10 @@ static match_table_t nfs_local_lock_tokens = { static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); -static int nfs_show_options(struct seq_file *, struct vfsmount *); -static int nfs_show_devname(struct seq_file *, struct vfsmount *); -static int nfs_show_path(struct seq_file *, struct vfsmount *); -static int nfs_show_stats(struct seq_file *, struct vfsmount *); +static int nfs_show_options(struct seq_file *, struct dentry *); +static int nfs_show_devname(struct seq_file *, struct dentry *); +static int nfs_show_path(struct seq_file *, struct dentry *); +static int nfs_show_stats(struct seq_file *, struct dentry *); static struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *); static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, @@ -721,9 +720,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, /* * Describe the mount options on this VFS mountpoint */ -static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) +static int nfs_show_options(struct seq_file *m, struct dentry *root) { - struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + struct nfs_server *nfss = NFS_SB(root->d_sb); nfs_show_mount_options(m, nfss, 0); @@ -733,18 +732,22 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } + +#ifdef CONFIG_NFS_V4 #ifdef CONFIG_NFS_V4_1 -void show_sessions(struct seq_file *m, struct nfs_server *server) +static void show_sessions(struct seq_file *m, struct nfs_server *server) { if (nfs4_has_session(server->nfs_client)) seq_printf(m, ",sessions"); } #else -void show_sessions(struct seq_file *m, struct nfs_server *server) {} +static void show_sessions(struct seq_file *m, struct nfs_server *server) {} +#endif #endif +#ifdef CONFIG_NFS_V4 #ifdef CONFIG_NFS_V4_1 -void show_pnfs(struct seq_file *m, struct nfs_server *server) +static void show_pnfs(struct seq_file *m, struct nfs_server *server) { seq_printf(m, ",pnfs="); if (server->pnfs_curr_ld) @@ -752,18 +755,19 @@ void show_pnfs(struct seq_file *m, struct nfs_server *server) else seq_printf(m, "not configured"); } -#else /* CONFIG_NFS_V4_1 */ -void show_pnfs(struct seq_file *m, struct nfs_server *server) {} -#endif /* CONFIG_NFS_V4_1 */ +#else +static void show_pnfs(struct seq_file *m, struct nfs_server *server) {} +#endif +#endif -static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) +static int nfs_show_devname(struct seq_file *m, struct dentry *root) { char *page = (char *) __get_free_page(GFP_KERNEL); char *devname, *dummy; int err = 0; if (!page) return -ENOMEM; - devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE); + devname = nfs_path(&dummy, root, page, PAGE_SIZE); if (IS_ERR(devname)) err = PTR_ERR(devname); else @@ -772,7 +776,7 @@ static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) return err; } -static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt) +static int nfs_show_path(struct seq_file *m, struct dentry *dentry) { seq_puts(m, "/"); return 0; @@ -781,10 +785,10 @@ static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt) /* * Present statistical information for this VFS mountpoint */ -static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) +static int nfs_show_stats(struct seq_file *m, struct dentry *root) { int i, cpu; - struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + struct nfs_server *nfss = NFS_SB(root->d_sb); struct rpc_auth *auth = nfss->client->cl_auth; struct nfs_iostats totals = { }; @@ -794,10 +798,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) * Display all mount option settings */ seq_printf(m, "\n\topts:\t"); - seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw"); - seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); - seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : ""); - seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); + seq_puts(m, root->d_sb->s_flags & MS_RDONLY ? "ro" : "rw"); + seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : ""); + seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); nfs_show_mount_options(m, nfss, 1); seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); @@ -2782,43 +2786,22 @@ static void nfs_referral_loop_unprotect(void) static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, const char *export_path) { - struct mnt_namespace *ns_private; - struct super_block *s; struct dentry *dentry; - struct path path; - int ret; + int err; - ns_private = create_mnt_ns(root_mnt); - ret = PTR_ERR(ns_private); - if (IS_ERR(ns_private)) - goto out_mntput; + if (IS_ERR(root_mnt)) + return ERR_CAST(root_mnt); - ret = nfs_referral_loop_protect(); - if (ret != 0) - goto out_put_mnt_ns; - - ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, - export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + err = nfs_referral_loop_protect(); + if (err) { + mntput(root_mnt); + return ERR_PTR(err); + } + dentry = mount_subtree(root_mnt, export_path); nfs_referral_loop_unprotect(); - put_mnt_ns(ns_private); - - if (ret != 0) - goto out_err; - - s = path.mnt->mnt_sb; - atomic_inc(&s->s_active); - dentry = dget(path.dentry); - path_put(&path); - down_write(&s->s_umount); return dentry; -out_put_mnt_ns: - put_mnt_ns(ns_private); -out_mntput: - mntput(root_mnt); -out_err: - return ERR_PTR(ret); } static struct dentry *nfs4_try_mount(int flags, const char *dev_name, @@ -2836,9 +2819,7 @@ static struct dentry *nfs4_try_mount(int flags, const char *dev_name, data->nfs_server.hostname); data->nfs_server.export_path = export_path; - res = ERR_CAST(root_mnt); - if (!IS_ERR(root_mnt)) - res = nfs_follow_remote_path(root_mnt, export_path); + res = nfs_follow_remote_path(root_mnt, export_path); dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n", IS_ERR(res) ? PTR_ERR(res) : 0, @@ -3099,9 +3080,7 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, flags, data, data->hostname); data->mnt_path = export_path; - res = ERR_CAST(root_mnt); - if (!IS_ERR(root_mnt)) - res = nfs_follow_remote_path(root_mnt, export_path); + res = nfs_follow_remote_path(root_mnt, export_path); dprintk("<-- nfs4_referral_mount() = %ld%s\n", IS_ERR(res) ? PTR_ERR(res) : 0, IS_ERR(res) ? " [error]" : ""); diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index b2fbbde..4f9319a 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -87,7 +87,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) struct inode *dir = data->dir; if (!NFS_PROTO(dir)->unlink_done(task, dir)) - nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client); + rpc_restart_call_prepare(task); } /** @@ -369,7 +369,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct dentry *new_dentry = data->new_dentry; if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { - nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); + rpc_restart_call_prepare(task); return; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c9bd2a6..1dda78d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -20,6 +20,7 @@ #include <linux/nfs_mount.h> #include <linux/nfs_page.h> #include <linux/backing-dev.h> +#include <linux/export.h> #include <asm/uaccess.h> @@ -390,7 +391,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); BUG_ON(error); if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) - nfsi->change_attr++; + inode->i_version++; set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); @@ -428,7 +429,6 @@ static void nfs_mark_request_dirty(struct nfs_page *req) { __set_page_dirty_nobuffers(req->wb_page); - __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC); } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) @@ -762,6 +762,8 @@ int nfs_updatepage(struct file *file, struct page *page, status = nfs_writepage_setup(ctx, page, offset, count); if (status < 0) nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); @@ -1010,7 +1012,6 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *r req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); - ClearPageError(req->wb_page); *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); @@ -1165,7 +1166,13 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) static void nfs_writeback_release_full(void *calldata) { struct nfs_write_data *data = calldata; - int status = data->task.tk_status; + int ret, status = data->task.tk_status; + struct nfs_pageio_descriptor pgio; + + if (data->pnfs_error) { + nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE); + pgio.pg_recoalesce = 1; + } /* Update attributes as result of writeback. */ while (!list_empty(&data->pages)) { @@ -1181,6 +1188,11 @@ static void nfs_writeback_release_full(void *calldata) req->wb_bytes, (long long)req_offset(req)); + if (data->pnfs_error) { + dprintk(", pnfs error = %d\n", data->pnfs_error); + goto next; + } + if (status < 0) { nfs_set_pageerror(page); nfs_context_set_write_error(req->wb_context, status); @@ -1200,7 +1212,19 @@ remove_request: next: nfs_clear_page_tag_locked(req); nfs_end_page_writeback(page); + if (data->pnfs_error) { + lock_page(page); + nfs_pageio_cond_complete(&pgio, page->index); + ret = nfs_page_async_flush(&pgio, page, 0); + if (ret) { + nfs_set_pageerror(page); + dprintk("rewrite to MDS error = %d\n", ret); + } + unlock_page(page); + } } + if (data->pnfs_error) + nfs_pageio_complete(&pgio); nfs_writedata_release(calldata); } @@ -1220,7 +1244,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; - struct nfs_server *server = NFS_SERVER(data->inode); int status; dprintk("NFS: %5u nfs_writeback_done (status %d)\n", @@ -1254,7 +1277,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) if (time_before(complain, jiffies)) { dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", - server->nfs_client->cl_hostname, + NFS_SERVER(data->inode)->nfs_client->cl_hostname, resp->verf->committed, argp->stable); complain = jiffies + 300 * HZ; } @@ -1281,7 +1304,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ argp->stable = NFS_FILE_SYNC; } - nfs_restart_rpc(task, server->nfs_client); + rpc_restart_call_prepare(task); return; } if (time_before(complain, jiffies)) { @@ -1553,6 +1576,10 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int flags = FLUSH_SYNC; int ret = 0; + /* no commits means nothing needs to be done */ + if (!nfsi->ncommit) + return ret; + if (wbc->sync_mode == WB_SYNC_NONE) { /* Don't commit yet if this is a non-blocking flush and there * are a lot of outstanding writes for this mapping. @@ -1686,34 +1713,20 @@ out_error: int nfs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page) { - struct nfs_page *req; - int ret; + /* + * If PagePrivate is set, then the page is currently associated with + * an in-progress read or write request. Don't try to migrate it. + * + * FIXME: we could do this in principle, but we'll need a way to ensure + * that we can safely release the inode reference while holding + * the page lock. + */ + if (PagePrivate(page)) + return -EBUSY; nfs_fscache_release_page(page, GFP_KERNEL); - req = nfs_find_and_lock_request(page, false); - ret = PTR_ERR(req); - if (IS_ERR(req)) - goto out; - - ret = migrate_page(mapping, newpage, page); - if (!req) - goto out; - if (ret) - goto out_unlock; - page_cache_get(newpage); - spin_lock(&mapping->host->i_lock); - req->wb_page = newpage; - SetPagePrivate(newpage); - set_page_private(newpage, (unsigned long)req); - ClearPagePrivate(page); - set_page_private(page, 0); - spin_unlock(&mapping->host->i_lock); - page_cache_release(page); -out_unlock: - nfs_clear_page_tag_locked(req); -out: - return ret; + return migrate_page(mapping, newpage, page); } #endif diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index f4cc1e2..62f3b90 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -16,7 +16,6 @@ #include <linux/module.h> #include <linux/exportfs.h> -#include <linux/nfsd/syscall.h> #include <net/ipv6.h> #include "nfsd.h" @@ -318,7 +317,6 @@ static void svc_export_put(struct kref *ref) struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); - kfree(exp->ex_pathname); nfsd4_fslocs_free(&exp->ex_fslocs); kfree(exp); } @@ -528,11 +526,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; - err = -ENOMEM; - exp.ex_pathname = kstrdup(buf, GFP_KERNEL); - if (!exp.ex_pathname) - goto out2; - /* expiry */ err = -EINVAL; exp.h.expiry_time = get_expiry(&mesg); @@ -613,8 +606,6 @@ out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); out3: - kfree(exp.ex_pathname); -out2: path_put(&exp.ex_path); out1: auth_domain_put(dom); @@ -678,7 +669,6 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_client = item->ex_client; new->ex_path.dentry = dget(item->ex_path.dentry); new->ex_path.mnt = mntget(item->ex_path.mnt); - new->ex_pathname = NULL; new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; @@ -696,8 +686,6 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) new->ex_fsid = item->ex_fsid; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; - new->ex_pathname = item->ex_pathname; - item->ex_pathname = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; item->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; @@ -1010,7 +998,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) return exp; } -static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) +struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) { u32 fsidv[2]; @@ -1030,7 +1018,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) struct svc_export *exp; __be32 rv; - exp = find_fsidzero_export(rqstp); + exp = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index ad88f1c..9c51aff 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -36,6 +36,7 @@ #include <linux/slab.h> #include <linux/nfs_fs.h> +#include <linux/export.h> #include "acl.h" diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 02eb4ed..7748d6a 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -39,6 +39,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC +static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); + #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 @@ -351,7 +353,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, __be32 *p; encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); - encode_stateid4(xdr, &dp->dl_stateid); + encode_stateid4(xdr, &dp->dl_stid.sc_stateid); p = xdr_reserve_space(xdr, 4); *p++ = xdr_zero; /* truncate */ @@ -460,6 +462,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, */ status = 0; out: + if (status) + nfsd4_mark_cb_fault(cb->cb_clp, status); return status; out_overflow: print_overflow_msg(__func__, xdr); @@ -686,6 +690,12 @@ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) warn_no_callback_path(clp, reason); } +static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) +{ + clp->cl_cb_state = NFSD4_CB_FAULT; + warn_no_callback_path(clp, reason); +} + static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) { struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); @@ -787,7 +797,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; u32 minorversion = clp->cl_minorversion; cb->cb_minorversion = minorversion; @@ -809,7 +819,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; dprintk("%s: minorversion=%d\n", __func__, clp->cl_minorversion); @@ -832,7 +842,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; struct rpc_clnt *current_rpc_client = clp->cl_cb_client; nfsd4_cb_done(task, calldata); @@ -1006,7 +1016,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w) void nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfsd4_callback *cb = &dp->dl_recall; - struct nfs4_client *clp = dp->dl_client; + struct nfs4_client *clp = dp->dl_stid.sc_client; dp->dl_retries = 1; cb->cb_op = dp; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e807776..c5e28ed8 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -35,6 +35,7 @@ #include <linux/file.h> #include <linux/slab.h> +#include "idmap.h" #include "cache.h" #include "xdr4.h" #include "vfs.h" @@ -156,6 +157,8 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) return nfserr_inval; + accmode |= NFSD_MAY_READ_IF_EXEC; + if (open->op_share_access & NFS4_SHARE_ACCESS_READ) accmode |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) @@ -168,12 +171,29 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs return status; } +static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) +{ + umode_t mode = fh->fh_dentry->d_inode->i_mode; + + if (S_ISREG(mode)) + return nfs_ok; + if (S_ISDIR(mode)) + return nfserr_isdir; + /* + * Using err_symlink as our catch-all case may look odd; but + * there's no other obvious error for this case in 4.0, and we + * happen to know that it will cause the linux v4 client to do + * the right thing on attempts to open something other than a + * regular file. + */ + return nfserr_symlink; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct svc_fh resfh; __be32 status; - int created = 0; fh_init(&resfh, NFS4_FHSIZE); open->op_truncate = 0; @@ -202,7 +222,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o open->op_fname.len, &open->op_iattr, &resfh, open->op_createmode, (u32 *)open->op_verf.data, - &open->op_truncate, &created); + &open->op_truncate, &open->op_created); /* * Following rfc 3530 14.2.16, use the returned bitmask @@ -216,6 +236,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o status = nfsd_lookup(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &resfh); fh_unlock(current_fh); + if (status) + goto out; + status = nfsd_check_obj_isreg(&resfh); } if (status) goto out; @@ -227,9 +250,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o fh_dup2(current_fh, &resfh); /* set reply cache */ - fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, &resfh.fh_handle); - if (!created) + if (!open->op_created) status = do_open_permission(rqstp, current_fh, open, NFSD_MAY_NOP); @@ -254,7 +277,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); /* set replay cache */ - fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, ¤t_fh->fh_handle); open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && @@ -283,14 +306,18 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd4_compoundres *resp; - dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", + dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", (int)open->op_fname.len, open->op_fname.data, - open->op_stateowner); + open->op_openowner); /* This check required by spec. */ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; + /* We don't yet support WANT bits: */ + open->op_share_access &= NFS4_SHARE_ACCESS_MASK; + + open->op_created = 0; /* * RFC5661 18.51.3 * Before RECLAIM_COMPLETE done, server should deny new lock @@ -309,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, resp = rqstp->rq_resp; status = nfsd4_process_open1(&resp->cstate, open); if (status == nfserr_replay_me) { - struct nfs4_replay *rp = &open->op_stateowner->so_replay; + struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; fh_put(&cstate->current_fh); fh_copy_shallow(&cstate->current_fh.fh_handle, &rp->rp_openfh); @@ -339,32 +366,23 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_NULL: - /* - * (1) set CURRENT_FH to the file being opened, - * creating it if necessary, (2) set open->op_cinfo, - * (3) set open->op_truncate if the file is to be - * truncated after opening, (4) do permission checking. - */ status = do_open_lookup(rqstp, &cstate->current_fh, open); if (status) goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - open->op_stateowner->so_confirmed = 1; - /* - * The CURRENT_FH is already set to the file being - * opened. (1) set open->op_cinfo, (2) set - * open->op_truncate if the file is to be truncated - * after opening, (3) do permission checking. - */ + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, &cstate->current_fh, open); if (status) goto out; break; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: case NFS4_OPEN_CLAIM_DELEGATE_PREV: - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; dprintk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); status = nfserr_notsupp; @@ -381,12 +399,13 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * set, (2) sets open->op_stateid, (3) sets open->op_delegation. */ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); + WARN_ON(status && open->op_created); out: - if (open->op_stateowner) { - nfs4_get_stateowner(open->op_stateowner); - cstate->replay_owner = open->op_stateowner; - } - nfs4_unlock_state(); + nfsd4_cleanup_open_state(open, status); + if (open->op_openowner) + cstate->replay_owner = &open->op_openowner->oo_owner; + else + nfs4_unlock_state(); return status; } @@ -467,17 +486,12 @@ static __be32 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_commit *commit) { - __be32 status; - u32 *p = (u32 *)commit->co_verf.data; *p++ = nfssvc_boot.tv_sec; *p++ = nfssvc_boot.tv_usec; - status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, commit->co_count); - if (status == nfserr_symlink) - status = nfserr_inval; - return status; } static __be32 @@ -492,8 +506,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_CREATE); - if (status == nfserr_symlink) - status = nfserr_notdir; if (status) return status; @@ -691,7 +703,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); - if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || + if ((cookie == 1) || (cookie == 2) || (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) return nfserr_bad_cookie; @@ -719,8 +731,6 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); - if (status == nfserr_symlink) - return nfserr_notdir; if (!status) { fh_unlock(&cstate->current_fh); set_change_info(&remove->rm_cinfo, &cstate->current_fh); @@ -751,8 +761,6 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) status = nfserr_exist; - else if (status == nfserr_symlink) - status = nfserr_notdir; if (!status) { set_change_info(&rename->rn_sinfo, &cstate->current_fh); @@ -830,7 +838,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } } - status = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt); + status = fh_want_write(&cstate->current_fh); if (status) return status; status = nfs_ok; @@ -848,7 +856,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, 0, (time_t)0); out: - mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt); + fh_drop_write(&cstate->current_fh); return status; } @@ -892,8 +900,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_bytes_written = cnt; - if (status == nfserr_symlink) - status = nfserr_inval; return status; } @@ -930,7 +936,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, count = 4 + (verify->ve_attrlen >> 2); buf = kmalloc(count << 2, GFP_KERNEL); if (!buf) - return nfserr_resource; + return nfserr_jukebox; status = nfsd4_encode_fattr(&cstate->current_fh, cstate->current_fh.fh_export, @@ -994,6 +1000,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum) typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, void *); +typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op); + enum nfsd4_op_flags { ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ @@ -1001,13 +1009,15 @@ enum nfsd4_op_flags { /* For rfc 5661 section 2.6.3.1.1: */ OP_HANDLES_WRONGSEC = 1 << 3, OP_IS_PUTFH_LIKE = 1 << 4, -}; - -struct nfsd4_operation { - nfsd4op_func op_func; - u32 op_flags; - char *op_name; /* + * These are the ops whose result size we estimate before + * encoding, to avoid performing an op then not being able to + * respond or cache a response. This includes writes and setattrs + * as well as the operations usually called "nonidempotent": + */ + OP_MODIFIES_SOMETHING = 1 << 5, + /* + * Cache compounds containing these ops in the xid-based drc: * We use the DRC for compounds containing non-idempotent * operations, *except* those that are 4.1-specific (since * sessions provide their own EOS), and except for stateful @@ -1015,7 +1025,15 @@ struct nfsd4_operation { * (since sequence numbers provide EOS for open, lock, etc in * the v4.0 case). */ - bool op_cacheresult; + OP_CACHEME = 1 << 6, +}; + +struct nfsd4_operation { + nfsd4op_func op_func; + u32 op_flags; + char *op_name; + /* Try to get response size before operation */ + nfsd4op_rsize op_rsize_bop; }; static struct nfsd4_operation nfsd4_ops[]; @@ -1062,7 +1080,7 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) bool nfsd4_cache_this_op(struct nfsd4_op *op) { - return OPDESC(op)->op_cacheresult; + return OPDESC(op)->op_flags & OP_CACHEME; } static bool need_wrongsec_check(struct svc_rqst *rqstp) @@ -1110,6 +1128,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, struct nfsd4_operation *opdesc; struct nfsd4_compound_state *cstate = &resp->cstate; int slack_bytes; + u32 plen = 0; __be32 status; resp->xbuf = &rqstp->rq_res; @@ -1188,6 +1207,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, goto encode_op; } + /* If op is non-idempotent */ + if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { + plen = opdesc->op_rsize_bop(rqstp, op); + op->status = nfsd4_check_resp_size(resp, plen); + } + + if (op->status) + goto encode_op; + if (opdesc->op_func) op->status = opdesc->op_func(rqstp, cstate, &op->u); else @@ -1217,7 +1245,7 @@ encode_op: be32_to_cpu(status)); if (cstate->replay_owner) { - nfs4_put_stateowner(cstate->replay_owner); + nfs4_unlock_state(); cstate->replay_owner = NULL; } /* XXX Ugh, we need to get rid of this kind of special case: */ @@ -1238,6 +1266,144 @@ out: return status; } +#define op_encode_hdr_size (2) +#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define op_encode_change_info_maxsz (5) +#define nfs4_fattr_bitmap_maxsz (4) + +#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz) + +#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) + +#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz) +#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \ + op_encode_ace_maxsz) + +#define op_encode_channel_attrs_maxsz (6 + 1 + 1) + +static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size) * sizeof(__be32); +} + +static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); +} + +static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_lock_denied_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz + + op_encode_change_info_maxsz + 1 + + nfs4_fattr_bitmap_maxsz + + op_encode_delegation_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = op->u.read.rd_length; + + if (rlen > maxcount) + rlen = maxcount; + + return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen; +} + +static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 rlen = op->u.readdir.rd_maxcount; + + if (rlen > PAGE_SIZE) + rlen = PAGE_SIZE; + + return (op_encode_hdr_size + op_encode_verifier_maxsz) + * sizeof(__be32) + rlen; +} + +static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + op_encode_change_info_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32); +} + +static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ + 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\ + 2 + /*eir_server_owner.so_minor_id */\ + /* eir_server_owner.so_major_id<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + /* eir_server_scope<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + 1 + /* eir_server_impl_id array length */\ + 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32); +} + +static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\ + 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32); +} + +static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\ + 2 + /* csr_sequence, csr_flags */\ + op_encode_channel_attrs_maxsz + \ + op_encode_channel_attrs_maxsz) * sizeof(__be32); +} + static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, @@ -1245,20 +1411,27 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_CLOSE] = { .op_func = (nfsd4op_func)nfsd4_close, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_CLOSE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_COMMIT] = { .op_func = (nfsd4op_func)nfsd4_commit, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_COMMIT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize, }, [OP_CREATE] = { .op_func = (nfsd4op_func)nfsd4_create, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_CREATE", - .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize, }, [OP_DELEGRETURN] = { .op_func = (nfsd4op_func)nfsd4_delegreturn, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_DELEGRETURN", + .op_rsize_bop = nfsd4_only_status_rsize, }, [OP_GETATTR] = { .op_func = (nfsd4op_func)nfsd4_getattr, @@ -1271,12 +1444,16 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, + .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING + | OP_CACHEME, .op_name = "OP_LINK", - .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize, }, [OP_LOCK] = { .op_func = (nfsd4op_func)nfsd4_lock, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LOCK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize, }, [OP_LOCKT] = { .op_func = (nfsd4op_func)nfsd4_lockt, @@ -1284,7 +1461,9 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOCKU] = { .op_func = (nfsd4op_func)nfsd4_locku, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LOCKU", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_LOOKUP] = { .op_func = (nfsd4op_func)nfsd4_lookup, @@ -1302,42 +1481,54 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, - .op_flags = OP_HANDLES_WRONGSEC, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize, }, [OP_OPEN_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_open_confirm, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN_CONFIRM", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_OPEN_DOWNGRADE] = { .op_func = (nfsd4op_func)nfsd4_open_downgrade, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN_DOWNGRADE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, }, [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTPUBFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTPUBFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_PUTROOTFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_READ] = { .op_func = (nfsd4op_func)nfsd4_read, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READ", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize, }, [OP_READDIR] = { .op_func = (nfsd4op_func)nfsd4_readdir, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READDIR", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize, }, [OP_READLINK] = { .op_func = (nfsd4op_func)nfsd4_readlink, @@ -1345,29 +1536,36 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_REMOVE", - .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize, }, [OP_RENAME] = { - .op_name = "OP_RENAME", .op_func = (nfsd4op_func)nfsd4_rename, - .op_cacheresult = true, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_RENAME", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize, }, [OP_RENEW] = { .op_func = (nfsd4op_func)nfsd4_renew, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_RENEW", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, [OP_RESTOREFH] = { .op_func = (nfsd4op_func)nfsd4_restorefh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, .op_name = "OP_RESTOREFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SAVEFH] = { .op_func = (nfsd4op_func)nfsd4_savefh, - .op_flags = OP_HANDLES_WRONGSEC, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, .op_name = "OP_SAVEFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SECINFO] = { .op_func = (nfsd4op_func)nfsd4_secinfo, @@ -1377,19 +1575,22 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, .op_name = "OP_SETATTR", - .op_cacheresult = true, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize, }, [OP_SETCLIENTID] = { .op_func = (nfsd4op_func)nfsd4_setclientid, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_SETCLIENTID", - .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize, }, [OP_SETCLIENTID_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_SETCLIENTID_CONFIRM", - .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_VERIFY] = { .op_func = (nfsd4op_func)nfsd4_verify, @@ -1397,35 +1598,46 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_WRITE", - .op_cacheresult = true, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, }, [OP_RELEASE_LOCKOWNER] = { .op_func = (nfsd4op_func)nfsd4_release_lockowner, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, .op_name = "OP_RELEASE_LOCKOWNER", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, /* NFSv4.1 operations */ [OP_EXCHANGE_ID] = { .op_func = (nfsd4op_func)nfsd4_exchange_id, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_EXCHANGE_ID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize, }, [OP_BIND_CONN_TO_SESSION] = { .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_BIND_CONN_TO_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize, }, [OP_CREATE_SESSION] = { .op_func = (nfsd4op_func)nfsd4_create_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_CREATE_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize, }, [OP_DESTROY_SESSION] = { .op_func = (nfsd4op_func)nfsd4_destroy_session, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_DESTROY_SESSION", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SEQUENCE] = { .op_func = (nfsd4op_func)nfsd4_sequence, @@ -1433,14 +1645,17 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_SEQUENCE", }, [OP_DESTROY_CLIENTID] = { - .op_func = NULL, - .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_func = (nfsd4op_func)nfsd4_destroy_clientid, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, .op_name = "OP_DESTROY_CLIENTID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_RECLAIM_COMPLETE] = { .op_func = (nfsd4op_func)nfsd4_reclaim_complete, - .op_flags = ALLOWED_WITHOUT_FH, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, .op_name = "OP_RECLAIM_COMPLETE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SECINFO_NO_NAME] = { .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, @@ -1454,8 +1669,9 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_FREE_STATEID] = { .op_func = (nfsd4op_func)nfsd4_free_stateid, - .op_flags = ALLOWED_WITHOUT_FH, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, .op_name = "OP_FREE_STATEID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, }; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 29d77f6..80a0be9 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -45,6 +45,7 @@ /* Globals */ static struct file *rec_file; +static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static int nfs4_save_creds(const struct cred **original_creds) @@ -88,7 +89,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) struct xdr_netobj cksum; struct hash_desc desc; struct scatterlist sg; - __be32 status = nfserr_resource; + __be32 status = nfserr_jukebox; dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", clname->len, clname->data); @@ -129,6 +130,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (!rec_file || clp->cl_firststate) return 0; + clp->cl_firststate = 1; status = nfs4_save_creds(&original_cred); if (status < 0) return status; @@ -143,25 +145,25 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) goto out_unlock; } status = -EEXIST; - if (dentry->d_inode) { - dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); + if (dentry->d_inode) goto out_put; - } - status = mnt_want_write(rec_file->f_path.mnt); + status = mnt_want_write_file(rec_file); if (status) goto out_put; status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); - mnt_drop_write(rec_file->f_path.mnt); + mnt_drop_write_file(rec_file); out_put: dput(dentry); out_unlock: mutex_unlock(&dir->d_inode->i_mutex); - if (status == 0) { - clp->cl_firststate = 1; + if (status == 0) vfs_fsync(rec_file, 0); - } + else + printk(KERN_ERR "NFSD: failed to write recovery record" + " (err %d); please check that %s exists" + " and is writeable", status, + user_recovery_dirname); nfs4_reset_creds(original_cred); - dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); return status; } @@ -266,7 +268,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (!rec_file || !clp->cl_firststate) return; - status = mnt_want_write(rec_file->f_path.mnt); + status = mnt_want_write_file(rec_file); if (status) goto out; clp->cl_firststate = 0; @@ -279,7 +281,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) nfs4_reset_creds(original_cred); if (status == 0) vfs_fsync(rec_file, 0); - mnt_drop_write(rec_file->f_path.mnt); + mnt_drop_write_file(rec_file); out: if (status) printk("NFSD: Failed to remove expired client state directory" @@ -309,13 +311,13 @@ nfsd4_recdir_purge_old(void) { if (!rec_file) return; - status = mnt_want_write(rec_file->f_path.mnt); + status = mnt_want_write_file(rec_file); if (status) goto out; status = nfsd4_list_rec_dir(purge_old); if (status == 0) vfs_fsync(rec_file, 0); - mnt_drop_write(rec_file->f_path.mnt); + mnt_drop_write_file(rec_file); out: if (status) printk("nfsd4: failed to purge old clients from recovery" @@ -354,13 +356,13 @@ nfsd4_recdir_load(void) { */ void -nfsd4_init_recdir(char *rec_dirname) +nfsd4_init_recdir() { const struct cred *original_cred; int status; printk("NFSD: Using %s as the NFSv4 state recovery directory\n", - rec_dirname); + user_recovery_dirname); BUG_ON(rec_file); @@ -372,10 +374,10 @@ nfsd4_init_recdir(char *rec_dirname) return; } - rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0); + rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); if (IS_ERR(rec_file)) { printk("NFSD: unable to find recovery directory %s\n", - rec_dirname); + user_recovery_dirname); rec_file = NULL; } @@ -390,3 +392,30 @@ nfsd4_shutdown_recdir(void) fput(rec_file); rec_file = NULL; } + +/* + * Change the NFSv4 recovery directory to recdir. + */ +int +nfs4_reset_recoverydir(char *recdir) +{ + int status; + struct path path; + + status = kern_path(recdir, LOOKUP_FOLLOW, &path); + if (status) + return status; + status = -ENOTDIR; + if (S_ISDIR(path.dentry->d_inode->i_mode)) { + strcpy(user_recovery_dirname, recdir); + status = 0; + } + path_put(&path); + return status; +} + +char * +nfs4_recoverydir(void) +{ + return user_recovery_dirname; +} diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3787ec1..9ca16dc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -49,9 +49,6 @@ time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; static time_t boot_time; -static u32 current_ownerid = 1; -static u32 current_fileid = 1; -static u32 current_delegid = 1; static stateid_t zerostateid; /* bits all 0 */ static stateid_t onestateid; /* bits all 1 */ static u64 current_sessionid = 1; @@ -60,13 +57,7 @@ static u64 current_sessionid = 1; #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) /* forward declarations */ -static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); -static struct nfs4_stateid * search_for_stateid(stateid_t *stid); -static struct nfs4_delegation * search_for_delegation(stateid_t *stid); -static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); -static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; -static void nfs4_set_recdir(char *recdir); -static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner); +static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); /* Locking: */ @@ -80,7 +71,8 @@ static DEFINE_MUTEX(client_mutex); */ static DEFINE_SPINLOCK(recall_lock); -static struct kmem_cache *stateowner_slab = NULL; +static struct kmem_cache *openowner_slab = NULL; +static struct kmem_cache *lockowner_slab = NULL; static struct kmem_cache *file_slab = NULL; static struct kmem_cache *stateid_slab = NULL; static struct kmem_cache *deleg_slab = NULL; @@ -112,6 +104,11 @@ opaque_hashval(const void *ptr, int nbytes) static struct list_head del_recall_lru; +static void nfsd4_free_file(struct nfs4_file *f) +{ + kmem_cache_free(file_slab, f); +} + static inline void put_nfs4_file(struct nfs4_file *fi) { @@ -119,7 +116,7 @@ put_nfs4_file(struct nfs4_file *fi) list_del(&fi->fi_hash); spin_unlock(&recall_lock); iput(fi->fi_inode); - kmem_cache_free(file_slab, fi); + nfsd4_free_file(fi); } } @@ -136,35 +133,33 @@ unsigned int max_delegations; * Open owner state (share locks) */ -/* hash tables for nfs4_stateowner */ -#define OWNER_HASH_BITS 8 -#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) -#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) +/* hash tables for open owners */ +#define OPEN_OWNER_HASH_BITS 8 +#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) +#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) -#define ownerid_hashval(id) \ - ((id) & OWNER_HASH_MASK) -#define ownerstr_hashval(clientid, ownername) \ - (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) +static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) +{ + unsigned int ret; -static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; + ret = opaque_hashval(ownername->data, ownername->len); + ret += clientid; + return ret & OPEN_OWNER_HASH_MASK; +} + +static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; /* hash table for nfs4_file */ #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -/* hash table for (open)nfs4_stateid */ -#define STATEID_HASH_BITS 10 -#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) - -#define file_hashval(x) \ - hash_ptr(x, FILE_HASH_BITS) -#define stateid_hashval(owner_id, file_id) \ - (((owner_id) + (file_id)) & STATEID_HASH_MASK) +static unsigned int file_hashval(struct inode *ino) +{ + /* XXX: why are we hashing on inode pointer, anyway? */ + return hash_ptr(ino, FILE_HASH_BITS); +} static struct list_head file_hashtbl[FILE_HASH_SIZE]; -static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) { @@ -192,8 +187,15 @@ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) { if (atomic_dec_and_test(&fp->fi_access[oflag])) { - nfs4_file_put_fd(fp, O_RDWR); nfs4_file_put_fd(fp, oflag); + /* + * It's also safe to get rid of the RDWR open *if* + * we no longer have need of the other kind of access + * or if we already have the other kind of open: + */ + if (fp->fi_fds[1-oflag] + || atomic_read(&fp->fi_access[1 - oflag]) == 0) + nfs4_file_put_fd(fp, O_RDWR); } } @@ -206,8 +208,73 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) __nfs4_file_put_access(fp, oflag); } +static inline int get_new_stid(struct nfs4_stid *stid) +{ + static int min_stateid = 0; + struct idr *stateids = &stid->sc_client->cl_stateids; + int new_stid; + int error; + + error = idr_get_new_above(stateids, stid, min_stateid, &new_stid); + /* + * Note: the necessary preallocation was done in + * nfs4_alloc_stateid(). The idr code caps the number of + * preallocations that can exist at a time, but the state lock + * prevents anyone from using ours before we get here: + */ + BUG_ON(error); + /* + * It shouldn't be a problem to reuse an opaque stateid value. + * I don't think it is for 4.1. But with 4.0 I worry that, for + * example, a stray write retransmission could be accepted by + * the server when it should have been rejected. Therefore, + * adopt a trick from the sctp code to attempt to maximize the + * amount of time until an id is reused, by ensuring they always + * "increase" (mod INT_MAX): + */ + + min_stateid = new_stid+1; + if (min_stateid == INT_MAX) + min_stateid = 0; + return new_stid; +} + +static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) +{ + stateid_t *s = &stid->sc_stateid; + int new_id; + + stid->sc_type = type; + stid->sc_client = cl; + s->si_opaque.so_clid = cl->cl_clientid; + new_id = get_new_stid(stid); + s->si_opaque.so_id = (u32)new_id; + /* Will be incremented before return to client: */ + s->si_generation = 0; +} + +static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) +{ + struct idr *stateids = &cl->cl_stateids; + + if (!idr_pre_get(stateids, GFP_KERNEL)) + return NULL; + /* + * Note: if we fail here (or any time between now and the time + * we actually get the new idr), we won't need to undo the idr + * preallocation, since the idr code caps the number of + * preallocated entries. + */ + return kmem_cache_alloc(slab, GFP_KERNEL); +} + +static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) +{ + return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); +} + static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) +alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) { struct nfs4_delegation *dp; struct nfs4_file *fp = stp->st_file; @@ -224,21 +291,23 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f return NULL; if (num_delegations > max_delegations) return NULL; - dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); + dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); if (dp == NULL) return dp; + init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); + /* + * delegation seqid's are never incremented. The 4.1 special + * meaning of seqid 0 isn't meaningful, really, but let's avoid + * 0 anyway just for consistency and use 1: + */ + dp->dl_stid.sc_stateid.si_generation = 1; num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); - dp->dl_client = clp; get_nfs4_file(fp); dp->dl_file = fp; dp->dl_type = type; - dp->dl_stateid.si_boot = boot_time; - dp->dl_stateid.si_stateownerid = current_delegid++; - dp->dl_stateid.si_fileid = 0; - dp->dl_stateid.si_generation = 0; fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); @@ -267,10 +336,18 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) } } +static void unhash_stid(struct nfs4_stid *s) +{ + struct idr *stateids = &s->sc_client->cl_stateids; + + idr_remove(stateids, s->sc_stateid.si_opaque.so_id); +} + /* Called under the state lock. */ static void unhash_delegation(struct nfs4_delegation *dp) { + unhash_stid(&dp->dl_stid); list_del_init(&dp->dl_perclnt); spin_lock(&recall_lock); list_del_init(&dp->dl_perfile); @@ -292,10 +369,16 @@ static DEFINE_SPINLOCK(client_lock); #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) #define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) -#define clientid_hashval(id) \ - ((id) & CLIENT_HASH_MASK) -#define clientstr_hashval(name) \ - (opaque_hashval((name), 8) & CLIENT_HASH_MASK) +static unsigned int clientid_hashval(u32 id) +{ + return id & CLIENT_HASH_MASK; +} + +static unsigned int clientstr_hashval(const char *name) +{ + return opaque_hashval(name, 8) & CLIENT_HASH_MASK; +} + /* * reclaim_str_hashtbl[] holds known client info from previous reset/reboot * used in reboot/reset lease grace period processing @@ -362,7 +445,7 @@ set_deny(unsigned int *deny, unsigned long bmap) { } static int -test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { +test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { unsigned int access, deny; set_access(&access, stp->st_access_bmap); @@ -385,14 +468,13 @@ static int nfs4_access_to_omode(u32 access) BUG(); } -static void unhash_generic_stateid(struct nfs4_stateid *stp) +static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) { - list_del(&stp->st_hash); list_del(&stp->st_perfile); list_del(&stp->st_perstateowner); } -static void free_generic_stateid(struct nfs4_stateid *stp) +static void close_generic_stateid(struct nfs4_ol_stateid *stp) { int i; @@ -401,84 +483,106 @@ static void free_generic_stateid(struct nfs4_stateid *stp) if (test_bit(i, &stp->st_access_bmap)) nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(i)); + __clear_bit(i, &stp->st_access_bmap); } } put_nfs4_file(stp->st_file); + stp->st_file = NULL; +} + +static void free_generic_stateid(struct nfs4_ol_stateid *stp) +{ kmem_cache_free(stateid_slab, stp); } -static void release_lock_stateid(struct nfs4_stateid *stp) +static void release_lock_stateid(struct nfs4_ol_stateid *stp) { struct file *file; unhash_generic_stateid(stp); + unhash_stid(&stp->st_stid); file = find_any_file(stp->st_file); if (file) - locks_remove_posix(file, (fl_owner_t)stp->st_stateowner); + locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); + close_generic_stateid(stp); free_generic_stateid(stp); } -static void unhash_lockowner(struct nfs4_stateowner *sop) +static void unhash_lockowner(struct nfs4_lockowner *lo) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - list_del(&sop->so_perstateid); - while (!list_empty(&sop->so_stateids)) { - stp = list_first_entry(&sop->so_stateids, - struct nfs4_stateid, st_perstateowner); + list_del(&lo->lo_owner.so_strhash); + list_del(&lo->lo_perstateid); + while (!list_empty(&lo->lo_owner.so_stateids)) { + stp = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); release_lock_stateid(stp); } } -static void release_lockowner(struct nfs4_stateowner *sop) +static void release_lockowner(struct nfs4_lockowner *lo) { - unhash_lockowner(sop); - nfs4_put_stateowner(sop); + unhash_lockowner(lo); + nfs4_free_lockowner(lo); } static void -release_stateid_lockowners(struct nfs4_stateid *open_stp) +release_stateid_lockowners(struct nfs4_ol_stateid *open_stp) { - struct nfs4_stateowner *lock_sop; + struct nfs4_lockowner *lo; while (!list_empty(&open_stp->st_lockowners)) { - lock_sop = list_entry(open_stp->st_lockowners.next, - struct nfs4_stateowner, so_perstateid); - /* list_del(&open_stp->st_lockowners); */ - BUG_ON(lock_sop->so_is_open_owner); - release_lockowner(lock_sop); + lo = list_entry(open_stp->st_lockowners.next, + struct nfs4_lockowner, lo_perstateid); + release_lockowner(lo); } } -static void release_open_stateid(struct nfs4_stateid *stp) +static void unhash_open_stateid(struct nfs4_ol_stateid *stp) { unhash_generic_stateid(stp); release_stateid_lockowners(stp); + close_generic_stateid(stp); +} + +static void release_open_stateid(struct nfs4_ol_stateid *stp) +{ + unhash_open_stateid(stp); + unhash_stid(&stp->st_stid); free_generic_stateid(stp); } -static void unhash_openowner(struct nfs4_stateowner *sop) +static void unhash_openowner(struct nfs4_openowner *oo) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - list_del(&sop->so_perclient); - list_del(&sop->so_perstateid); /* XXX: necessary? */ - while (!list_empty(&sop->so_stateids)) { - stp = list_first_entry(&sop->so_stateids, - struct nfs4_stateid, st_perstateowner); + list_del(&oo->oo_owner.so_strhash); + list_del(&oo->oo_perclient); + while (!list_empty(&oo->oo_owner.so_stateids)) { + stp = list_first_entry(&oo->oo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); release_open_stateid(stp); } } -static void release_openowner(struct nfs4_stateowner *sop) +static void release_last_closed_stateid(struct nfs4_openowner *oo) { - unhash_openowner(sop); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); + struct nfs4_ol_stateid *s = oo->oo_last_closed_stid; + + if (s) { + unhash_stid(&s->st_stid); + free_generic_stateid(s); + oo->oo_last_closed_stid = NULL; + } +} + +static void release_openowner(struct nfs4_openowner *oo) +{ + unhash_openowner(oo); + list_del(&oo->oo_close_lru); + release_last_closed_stateid(oo); + nfs4_free_openowner(oo); } #define SESSION_HASH_SIZE 512 @@ -554,7 +658,7 @@ static int nfsd4_sanitize_slot_size(u32 size) /* * XXX: If we run out of reserved DRC memory we could (up to a point) * re-negotiate active sessions and reduce their slot usage to make - * rooom for new connections. For now we just fail the create session. + * room for new connections. For now we just fail the create session. */ static int nfsd4_get_drc_mem(int slotsize, u32 num) { @@ -843,9 +947,6 @@ renew_client_locked(struct nfs4_client *clp) return; } - /* - * Move client to the end to the LRU list. - */ dprintk("renewing client (clientid %08x/%08x)\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); @@ -943,7 +1044,7 @@ unhash_client_locked(struct nfs4_client *clp) static void expire_client(struct nfs4_client *clp) { - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head reaplist; @@ -961,8 +1062,8 @@ expire_client(struct nfs4_client *clp) unhash_delegation(dp); } while (!list_empty(&clp->cl_openowners)) { - sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); - release_openowner(sop); + oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); + release_openowner(oo); } nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) @@ -1038,6 +1139,23 @@ static void gen_confirm(struct nfs4_client *clp) *p++ = i++; } +static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) +{ + return idr_find(&cl->cl_stateids, t->si_opaque.so_id); +} + +static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) +{ + struct nfs4_stid *s; + + s = find_stateid(cl, t); + if (!s) + return NULL; + if (typemask & s->sc_type) + return s; + return NULL; +} + static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -1060,6 +1178,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, } } + idr_init(&clp->cl_stateids); memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_refcount, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; @@ -1083,17 +1202,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, return clp; } -static int check_name(struct xdr_netobj name) -{ - if (name.len == 0) - return 0; - if (name.len > NFS4_OPAQUE_LIMIT) { - dprintk("NFSD: check_name: name too long(%d)!\n", name.len); - return 0; - } - return 1; -} - static void add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) { @@ -1125,8 +1233,10 @@ find_confirmed_client(clientid_t *clid) unsigned int idhashval = clientid_hashval(clid->cl_id); list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { - if (same_clid(&clp->cl_clientid, clid)) + if (same_clid(&clp->cl_clientid, clid)) { + renew_client(clp); return clp; + } } return NULL; } @@ -1173,20 +1283,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) return NULL; } -static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) -{ - switch (family) { - case AF_INET: - ((struct sockaddr_in *)sa)->sin_family = AF_INET; - ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr; - return; - case AF_INET6: - ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6; - ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6; - return; - } -} - static void gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) { @@ -1218,7 +1314,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r conn->cb_prog = se->se_callback_prog; conn->cb_ident = se->se_callback_ident; - rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr); + memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen); return; out_err: conn->cb_addr.ss_family = AF_UNSPEC; @@ -1350,7 +1446,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, __func__, rqstp, exid, exid->clname.len, exid->clname.data, addr_str, exid->flags, exid->spa_how); - if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) + if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; /* Currently only support SP4_NONE */ @@ -1849,8 +1945,16 @@ out: nfsd4_get_session(cstate->session); atomic_inc(&clp->cl_refcount); - if (clp->cl_cb_state == NFSD4_CB_DOWN) - seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + default: + seq->status_flags = 0; + } } kfree(conn); spin_unlock(&client_lock); @@ -1858,6 +1962,50 @@ out: return status; } +static inline bool has_resources(struct nfs4_client *clp) +{ + return !list_empty(&clp->cl_openowners) + || !list_empty(&clp->cl_delegations) + || !list_empty(&clp->cl_sessions); +} + +__be32 +nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc) +{ + struct nfs4_client *conf, *unconf, *clp; + int status = 0; + + nfs4_lock_state(); + unconf = find_unconfirmed_client(&dc->clientid); + conf = find_confirmed_client(&dc->clientid); + + if (conf) { + clp = conf; + + if (!is_client_expired(conf) && has_resources(conf)) { + status = nfserr_clientid_busy; + goto out; + } + + /* rfc5661 18.50.3 */ + if (cstate->session && conf == cstate->session->se_client) { + status = nfserr_clientid_busy; + goto out; + } + } else if (unconf) + clp = unconf; + else { + status = nfserr_stale_clientid; + goto out; + } + + expire_client(clp); +out: + nfs4_unlock_state(); + dprintk("%s return %d\n", __func__, ntohl(status)); + return status; +} + __be32 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) { @@ -1900,19 +2048,13 @@ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { - struct xdr_netobj clname = { - .len = setclid->se_namelen, - .data = setclid->se_name, - }; + struct xdr_netobj clname = setclid->se_name; nfs4_verifier clverifier = setclid->se_verf; unsigned int strhashval; struct nfs4_client *conf, *unconf, *new; __be32 status; char dname[HEXDIR_LEN]; - if (!check_name(clname)) - return nfserr_inval; - status = nfs4_make_rec_clidname(dname, &clname); if (status) return status; @@ -1946,7 +2088,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * of 5 bullet points, labeled as CASE0 - CASE4 below. */ unconf = find_unconfirmed_client_by_str(dname, strhashval); - status = nfserr_resource; + status = nfserr_jukebox; if (!conf) { /* * RFC 3530 14.2.33 CASE 4: @@ -2116,31 +2258,28 @@ out: return status; } +static struct nfs4_file *nfsd4_alloc_file(void) +{ + return kmem_cache_alloc(file_slab, GFP_KERNEL); +} + /* OPEN Share state helper functions */ -static inline struct nfs4_file * -alloc_init_file(struct inode *ino) +static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) { - struct nfs4_file *fp; unsigned int hashval = file_hashval(ino); - fp = kmem_cache_alloc(file_slab, GFP_KERNEL); - if (fp) { - atomic_set(&fp->fi_ref, 1); - INIT_LIST_HEAD(&fp->fi_hash); - INIT_LIST_HEAD(&fp->fi_stateids); - INIT_LIST_HEAD(&fp->fi_delegations); - fp->fi_inode = igrab(ino); - fp->fi_id = current_fileid++; - fp->fi_had_conflict = false; - fp->fi_lease = NULL; - memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); - memset(fp->fi_access, 0, sizeof(fp->fi_access)); - spin_lock(&recall_lock); - list_add(&fp->fi_hash, &file_hashtbl[hashval]); - spin_unlock(&recall_lock); - return fp; - } - return NULL; + atomic_set(&fp->fi_ref, 1); + INIT_LIST_HEAD(&fp->fi_hash); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); + fp->fi_inode = igrab(ino); + fp->fi_had_conflict = false; + fp->fi_lease = NULL; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); } static void @@ -2155,7 +2294,8 @@ nfsd4_free_slab(struct kmem_cache **slab) void nfsd4_free_slabs(void) { - nfsd4_free_slab(&stateowner_slab); + nfsd4_free_slab(&openowner_slab); + nfsd4_free_slab(&lockowner_slab); nfsd4_free_slab(&file_slab); nfsd4_free_slab(&stateid_slab); nfsd4_free_slab(&deleg_slab); @@ -2164,16 +2304,20 @@ nfsd4_free_slabs(void) static int nfsd4_init_slabs(void) { - stateowner_slab = kmem_cache_create("nfsd4_stateowners", - sizeof(struct nfs4_stateowner), 0, 0, NULL); - if (stateowner_slab == NULL) + openowner_slab = kmem_cache_create("nfsd4_openowners", + sizeof(struct nfs4_openowner), 0, 0, NULL); + if (openowner_slab == NULL) + goto out_nomem; + lockowner_slab = kmem_cache_create("nfsd4_lockowners", + sizeof(struct nfs4_openowner), 0, 0, NULL); + if (lockowner_slab == NULL) goto out_nomem; file_slab = kmem_cache_create("nfsd4_files", sizeof(struct nfs4_file), 0, 0, NULL); if (file_slab == NULL) goto out_nomem; stateid_slab = kmem_cache_create("nfsd4_stateids", - sizeof(struct nfs4_stateid), 0, 0, NULL); + sizeof(struct nfs4_ol_stateid), 0, 0, NULL); if (stateid_slab == NULL) goto out_nomem; deleg_slab = kmem_cache_create("nfsd4_delegations", @@ -2187,97 +2331,94 @@ out_nomem: return -ENOMEM; } -void -nfs4_free_stateowner(struct kref *kref) +void nfs4_free_openowner(struct nfs4_openowner *oo) { - struct nfs4_stateowner *sop = - container_of(kref, struct nfs4_stateowner, so_ref); - kfree(sop->so_owner.data); - kmem_cache_free(stateowner_slab, sop); + kfree(oo->oo_owner.so_owner.data); + kmem_cache_free(openowner_slab, oo); } -static inline struct nfs4_stateowner * -alloc_stateowner(struct xdr_netobj *owner) +void nfs4_free_lockowner(struct nfs4_lockowner *lo) { - struct nfs4_stateowner *sop; + kfree(lo->lo_owner.so_owner.data); + kmem_cache_free(lockowner_slab, lo); +} - if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) { - if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) { - memcpy(sop->so_owner.data, owner->data, owner->len); - sop->so_owner.len = owner->len; - kref_init(&sop->so_ref); - return sop; - } - kmem_cache_free(stateowner_slab, sop); - } - return NULL; +static void init_nfs4_replay(struct nfs4_replay *rp) +{ + rp->rp_status = nfserr_serverfault; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; } -static struct nfs4_stateowner * -alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { +static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) +{ struct nfs4_stateowner *sop; - struct nfs4_replay *rp; - unsigned int idhashval; - if (!(sop = alloc_stateowner(&open->op_owner))) + sop = kmem_cache_alloc(slab, GFP_KERNEL); + if (!sop) + return NULL; + + sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL); + if (!sop->so_owner.data) { + kmem_cache_free(slab, sop); return NULL; - idhashval = ownerid_hashval(current_ownerid); - INIT_LIST_HEAD(&sop->so_idhash); - INIT_LIST_HEAD(&sop->so_strhash); - INIT_LIST_HEAD(&sop->so_perclient); + } + sop->so_owner.len = owner->len; + INIT_LIST_HEAD(&sop->so_stateids); - INIT_LIST_HEAD(&sop->so_perstateid); /* not used */ - INIT_LIST_HEAD(&sop->so_close_lru); - sop->so_time = 0; - list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perclient, &clp->cl_openowners); - sop->so_is_open_owner = 1; - sop->so_id = current_ownerid++; sop->so_client = clp; - sop->so_seqid = open->op_seqid; - sop->so_confirmed = 0; - rp = &sop->so_replay; - rp->rp_status = nfserr_serverfault; - rp->rp_buflen = 0; - rp->rp_buf = rp->rp_ibuf; + init_nfs4_replay(&sop->so_replay); return sop; } -static inline void -init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { - struct nfs4_stateowner *sop = open->op_stateowner; - unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); +static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) +{ + list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); + list_add(&oo->oo_perclient, &clp->cl_openowners); +} - INIT_LIST_HEAD(&stp->st_hash); - INIT_LIST_HEAD(&stp->st_perstateowner); +static struct nfs4_openowner * +alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { + struct nfs4_openowner *oo; + + oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); + if (!oo) + return NULL; + oo->oo_owner.so_is_open_owner = 1; + oo->oo_owner.so_seqid = open->op_seqid; + oo->oo_flags = NFS4_OO_NEW; + oo->oo_time = 0; + oo->oo_last_closed_stid = NULL; + INIT_LIST_HEAD(&oo->oo_close_lru); + hash_openowner(oo, clp, strhashval); + return oo; +} + +static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { + struct nfs4_openowner *oo = open->op_openowner; + struct nfs4_client *clp = oo->oo_owner.so_client; + + init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); INIT_LIST_HEAD(&stp->st_lockowners); - INIT_LIST_HEAD(&stp->st_perfile); - list_add(&stp->st_hash, &stateid_hashtbl[hashval]); - list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); - stp->st_stateowner = sop; + stp->st_stateowner = &oo->oo_owner; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = sop->so_id; - stp->st_stateid.si_fileid = fp->fi_id; - stp->st_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; - __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, - &stp->st_access_bmap); + __set_bit(open->op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); stp->st_openstp = NULL; } static void -move_to_close_lru(struct nfs4_stateowner *sop) +move_to_close_lru(struct nfs4_openowner *oo) { - dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); + dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); - list_move_tail(&sop->so_close_lru, &close_lru); - sop->so_time = get_seconds(); + list_move_tail(&oo->oo_close_lru, &close_lru); + oo->oo_time = get_seconds(); } static int @@ -2289,14 +2430,18 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, (sop->so_client->cl_clientid.cl_id == clid->cl_id); } -static struct nfs4_stateowner * +static struct nfs4_openowner * find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) { - struct nfs4_stateowner *so = NULL; + struct nfs4_stateowner *so; + struct nfs4_openowner *oo; - list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { - if (same_owner_str(so, &open->op_owner, &open->op_clientid)) - return so; + list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { + oo = openowner(so); + renew_client(oo->oo_owner.so_client); + return oo; + } } return NULL; } @@ -2320,31 +2465,6 @@ find_file(struct inode *ino) return NULL; } -static inline int access_valid(u32 x, u32 minorversion) -{ - if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) - return 0; - if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH) - return 0; - x &= ~NFS4_SHARE_ACCESS_MASK; - if (minorversion && x) { - if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL) - return 0; - if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED) - return 0; - x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK); - } - if (x) - return 0; - return 1; -} - -static inline int deny_valid(u32 x) -{ - /* Note: unlike access bits, deny bits may be zero. */ - return x <= NFS4_SHARE_DENY_BOTH; -} - /* * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid @@ -2354,7 +2474,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) { struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_file *fp; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; __be32 ret; dprintk("NFSD: nfs4_share_conflict\n"); @@ -2429,6 +2549,16 @@ static const struct lock_manager_operations nfsd_lease_mng_ops = { .lm_change = nfsd_change_deleg_cb, }; +static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid) +{ + if (nfsd4_has_session(cstate)) + return nfs_ok; + if (seqid == so->so_seqid - 1) + return nfserr_replay_me; + if (seqid == so->so_seqid) + return nfs_ok; + return nfserr_bad_seqid; +} __be32 nfsd4_process_open1(struct nfsd4_compound_state *cstate, @@ -2437,57 +2567,49 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, clientid_t *clientid = &open->op_clientid; struct nfs4_client *clp = NULL; unsigned int strhashval; - struct nfs4_stateowner *sop = NULL; - - if (!check_name(open->op_owner)) - return nfserr_inval; + struct nfs4_openowner *oo = NULL; + __be32 status; if (STALE_CLIENTID(&open->op_clientid)) return nfserr_stale_clientid; + /* + * In case we need it later, after we've already created the + * file and don't want to risk a further failure: + */ + open->op_file = nfsd4_alloc_file(); + if (open->op_file == NULL) + return nfserr_jukebox; - strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner); - sop = find_openstateowner_str(strhashval, open); - open->op_stateowner = sop; - if (!sop) { - /* Make sure the client's lease hasn't expired. */ + strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); + oo = find_openstateowner_str(strhashval, open); + open->op_openowner = oo; + if (!oo) { clp = find_confirmed_client(clientid); if (clp == NULL) return nfserr_expired; - goto renew; + goto new_owner; } - /* When sessions are used, skip open sequenceid processing */ - if (nfsd4_has_session(cstate)) - goto renew; - if (!sop->so_confirmed) { + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { /* Replace unconfirmed owners without checking for replay. */ - clp = sop->so_client; - release_openowner(sop); - open->op_stateowner = NULL; - goto renew; - } - if (open->op_seqid == sop->so_seqid - 1) { - if (sop->so_replay.rp_buflen) - return nfserr_replay_me; - /* The original OPEN failed so spectacularly - * that we don't even have replay data saved! - * Therefore, we have no choice but to continue - * processing this OPEN; presumably, we'll - * fail again for the same reason. - */ - dprintk("nfsd4_process_open1: replay with no replay cache\n"); - goto renew; - } - if (open->op_seqid != sop->so_seqid) - return nfserr_bad_seqid; -renew: - if (open->op_stateowner == NULL) { - sop = alloc_init_open_stateowner(strhashval, clp, open); - if (sop == NULL) - return nfserr_resource; - open->op_stateowner = sop; + clp = oo->oo_owner.so_client; + release_openowner(oo); + open->op_openowner = NULL; + goto new_owner; } - list_del_init(&sop->so_close_lru); - renew_client(sop->so_client); + status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); + if (status) + return status; + clp = oo->oo_owner.so_client; + goto alloc_stateid; +new_owner: + oo = alloc_init_open_stateowner(strhashval, clp, open); + if (oo == NULL) + return nfserr_jukebox; + open->op_openowner = oo; +alloc_stateid: + open->op_stp = nfs4_alloc_stateid(clp); + if (!open->op_stp) + return nfserr_jukebox; return nfs_ok; } @@ -2500,36 +2622,37 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) return nfs_ok; } -static struct nfs4_delegation * -find_delegation_file(struct nfs4_file *fp, stateid_t *stid) +static int share_access_to_flags(u32 share_access) { - struct nfs4_delegation *dp; + share_access &= ~NFS4_SHARE_WANT_MASK; - spin_lock(&recall_lock); - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) - if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) { - spin_unlock(&recall_lock); - return dp; - } - spin_unlock(&recall_lock); - return NULL; + return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } -static int share_access_to_flags(u32 share_access) +static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) { - share_access &= ~NFS4_SHARE_WANT_MASK; + struct nfs4_stid *ret; - return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; + ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID); + if (!ret) + return NULL; + return delegstateid(ret); +} + +static bool nfsd4_is_deleg_cur(struct nfsd4_open *open) +{ + return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR || + open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH; } static __be32 -nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, +nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_delegation **dp) { int flags; __be32 status = nfserr_bad_stateid; - *dp = find_delegation_file(fp, &open->op_delegate_stateid); + *dp = find_deleg_stateid(cl, &open->op_delegate_stateid); if (*dp == NULL) goto out; flags = share_access_to_flags(open->op_share_access); @@ -2537,41 +2660,37 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, if (status) *dp = NULL; out: - if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR) + if (!nfsd4_is_deleg_cur(open)) return nfs_ok; if (status) return status; - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; return nfs_ok; } static __be32 -nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) +nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp) { - struct nfs4_stateid *local; - __be32 status = nfserr_share_denied; - struct nfs4_stateowner *sop = open->op_stateowner; + struct nfs4_ol_stateid *local; + struct nfs4_openowner *oo = open->op_openowner; list_for_each_entry(local, &fp->fi_stateids, st_perfile) { /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; /* remember if we have seen this open owner */ - if (local->st_stateowner == sop) + if (local->st_stateowner == &oo->oo_owner) *stpp = local; /* check for conflicting share reservations */ if (!test_share(local, open)) - goto out; + return nfserr_share_denied; } - status = 0; -out: - return status; + return nfs_ok; } -static inline struct nfs4_stateid * -nfs4_alloc_stateid(void) +static void nfs4_free_stateid(struct nfs4_ol_stateid *s) { - return kmem_cache_alloc(stateid_slab, GFP_KERNEL); + kmem_cache_free(stateid_slab, s); } static inline int nfs4_access_to_access(u32 nfs4_access) @@ -2592,12 +2711,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, int oflag = nfs4_access_to_omode(open->op_share_access); int access = nfs4_access_to_access(open->op_share_access); - /* CLAIM_DELEGATE_CUR is used in response to a broken lease; - * allowing it to break the lease and return EAGAIN leaves the - * client unable to make progress in returning the delegation */ - if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) - access |= NFSD_MAY_NOT_BREAK_LEASE; - if (!fp->fi_fds[oflag]) { status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &fp->fi_fds[oflag]); @@ -2609,27 +2722,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, return nfs_ok; } -static __be32 -nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, - struct nfs4_file *fp, struct svc_fh *cur_fh, - struct nfsd4_open *open) -{ - struct nfs4_stateid *stp; - __be32 status; - - stp = nfs4_alloc_stateid(); - if (stp == NULL) - return nfserr_resource; - - status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); - if (status) { - kmem_cache_free(stateid_slab, stp); - return status; - } - *stpp = stp; - return 0; -} - static inline __be32 nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, struct nfsd4_open *open) @@ -2646,9 +2738,9 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, } static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { - u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; + u32 op_share_access = open->op_share_access; bool new_access; __be32 status; @@ -2677,8 +2769,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c static void nfs4_set_claim_prev(struct nfsd4_open *open) { - open->op_stateowner->so_confirmed = 1; - open->op_stateowner->so_client->cl_firststate = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + open->op_openowner->oo_owner.so_client->cl_firststate = 1; } /* Should we give out recallable state?: */ @@ -2721,7 +2813,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag) if (!fl) return -ENOMEM; fl->fl_file = find_readable_file(fp); - list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); if (status) { list_del_init(&dp->dl_perclnt); @@ -2750,7 +2842,7 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) atomic_inc(&fp->fi_delegees); list_add(&dp->dl_perfile, &fp->fi_delegations); spin_unlock(&recall_lock); - list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); return 0; } @@ -2758,14 +2850,14 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) * Attempt to hand out a delegation. */ static void -nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp) +nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp) { struct nfs4_delegation *dp; - struct nfs4_stateowner *sop = stp->st_stateowner; + struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); int cb_up; int status, flag = 0; - cb_up = nfsd4_cb_channel_good(sop->so_client); + cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); flag = NFS4_OPEN_DELEGATE_NONE; open->op_recall = 0; switch (open->op_claim_type) { @@ -2781,7 +2873,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta * had the chance to reclaim theirs.... */ if (locks_in_grace()) goto out; - if (!cb_up || !sop->so_confirmed) + if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) flag = NFS4_OPEN_DELEGATE_WRITE; @@ -2792,17 +2884,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta goto out; } - dp = alloc_init_deleg(sop->so_client, stp, fh, flag); + dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag); if (dp == NULL) goto out_no_deleg; status = nfs4_set_delegation(dp, flag); if (status) goto out_free; - memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); + memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", - STATEID_VAL(&dp->dl_stateid)); + STATEID_VAL(&dp->dl_stid.sc_stateid)); out: if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && flag == NFS4_OPEN_DELEGATE_NONE @@ -2824,16 +2916,13 @@ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; struct inode *ino = current_fh->fh_dentry->d_inode; - struct nfs4_stateid *stp = NULL; + struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; - status = nfserr_inval; - if (!access_valid(open->op_share_access, resp->cstate.minorversion) - || !deny_valid(open->op_share_deny)) - goto out; /* * Lookup file; if found, lookup stateid and check open request, * and check for delegations in the process of being recalled. @@ -2843,17 +2932,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (fp) { if ((status = nfs4_check_open(fp, open, &stp))) goto out; - status = nfs4_check_deleg(fp, open, &dp); + status = nfs4_check_deleg(cl, fp, open, &dp); if (status) goto out; } else { status = nfserr_bad_stateid; - if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) - goto out; - status = nfserr_resource; - fp = alloc_init_file(ino); - if (fp == NULL) + if (nfsd4_is_deleg_cur(open)) goto out; + status = nfserr_jukebox; + fp = open->op_file; + open->op_file = NULL; + nfsd4_init_file(fp, ino); } /* @@ -2865,24 +2954,24 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) goto out; - update_stateid(&stp->st_stateid); } else { - status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, open); if (status) goto out; - init_stateid(stp, fp, open); + stp = open->op_stp; + open->op_stp = NULL; + init_open_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); if (status) { release_open_stateid(stp); goto out; } - if (nfsd4_has_session(&resp->cstate)) - update_stateid(&stp->st_stateid); } - memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); if (nfsd4_has_session(&resp->cstate)) - open->op_stateowner->so_confirmed = 1; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; /* * Attempt to hand out a delegation. No error return, because the @@ -2893,7 +2982,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs_ok; dprintk("%s: stateid=" STATEID_FMT "\n", __func__, - STATEID_VAL(&stp->st_stateid)); + STATEID_VAL(&stp->st_stid.sc_stateid)); out: if (fp) put_nfs4_file(fp); @@ -2903,13 +2992,34 @@ out: * To finish the open response, we just need to set the rflags. */ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!open->op_stateowner->so_confirmed && + if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && !nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; return status; } +void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) +{ + if (open->op_openowner) { + struct nfs4_openowner *oo = open->op_openowner; + + if (!list_empty(&oo->oo_owner.so_stateids)) + list_del_init(&oo->oo_close_lru); + if (oo->oo_flags & NFS4_OO_NEW) { + if (status) { + release_openowner(oo); + open->op_openowner = NULL; + } else + oo->oo_flags &= ~NFS4_OO_NEW; + } + } + if (open->op_file) + nfsd4_free_file(open->op_file); + if (open->op_stp) + nfs4_free_stateid(open->op_stp); +} + __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, clientid_t *clid) @@ -2930,7 +3040,6 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("nfsd4_renew: clientid not found!\n"); goto out; } - renew_client(clp); status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) @@ -2962,7 +3071,7 @@ static time_t nfs4_laundromat(void) { struct nfs4_client *clp; - struct nfs4_stateowner *sop; + struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nfsd4_lease; @@ -3019,16 +3128,14 @@ nfs4_laundromat(void) } test_val = nfsd4_lease; list_for_each_safe(pos, next, &close_lru) { - sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); - if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { - u = sop->so_time - cutoff; + oo = container_of(pos, struct nfs4_openowner, oo_close_lru); + if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { + u = oo->oo_time - cutoff; if (test_val > u) test_val = u; break; } - dprintk("NFSD: purging unused open stateowner (so_id %d)\n", - sop->so_id); - release_openowner(sop); + release_openowner(oo); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -3050,30 +3157,17 @@ laundromat_main(struct work_struct *not_used) queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); } -static struct nfs4_stateowner * -search_close_lru(u32 st_id, int flags) +static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) { - struct nfs4_stateowner *local = NULL; - - if (flags & CLOSE_STATE) { - list_for_each_entry(local, &close_lru, so_close_lru) { - if (local->so_id == st_id) - return local; - } - } - return NULL; -} - -static inline int -nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) -{ - return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; + if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode) + return nfserr_bad_stateid; + return nfs_ok; } static int STALE_STATEID(stateid_t *stateid) { - if (stateid->si_boot == boot_time) + if (stateid->si_opaque.so_clid.cl_boot == boot_time) return 0; dprintk("NFSD: stale stateid " STATEID_FMT "!\n", STATEID_VAL(stateid)); @@ -3096,7 +3190,7 @@ access_permit_write(unsigned long access_bmap) } static -__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags) +__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags) { __be32 status = nfserr_openmode; @@ -3139,68 +3233,80 @@ grace_disallows_io(struct inode *inode) return locks_in_grace() && mandatory_lock(inode); } -static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags) +/* Returns true iff a is later than b: */ +static bool stateid_generation_after(stateid_t *a, stateid_t *b) +{ + return (s32)a->si_generation - (s32)b->si_generation > 0; +} + +static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* * When sessions are used the stateid generation number is ignored * when it is zero. */ - if ((flags & HAS_SESSION) && in->si_generation == 0) - goto out; + if (has_session && in->si_generation == 0) + return nfs_ok; + + if (in->si_generation == ref->si_generation) + return nfs_ok; /* If the client sends us a stateid from the future, it's buggy: */ - if (in->si_generation > ref->si_generation) + if (stateid_generation_after(in, ref)) return nfserr_bad_stateid; /* - * The following, however, can happen. For example, if the - * client sends an open and some IO at the same time, the open - * may bump si_generation while the IO is still in flight. - * Thanks to hard links and renames, the client never knows what - * file an open will affect. So it could avoid that situation - * only by serializing all opens and IO from the same open - * owner. To recover from the old_stateid error, the client - * will just have to retry the IO: + * However, we could see a stateid from the past, even from a + * non-buggy client. For example, if the client sends a lock + * while some IO is outstanding, the lock may bump si_generation + * while the IO is still in flight. The client could avoid that + * situation by waiting for responses on all the IO requests, + * but better performance may result in retrying IO that + * receives an old_stateid error if requests are rarely + * reordered in flight: */ - if (in->si_generation < ref->si_generation) - return nfserr_old_stateid; -out: - return nfs_ok; + return nfserr_old_stateid; } -static int is_delegation_stateid(stateid_t *stateid) +__be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { - return stateid->si_fileid == 0; -} + struct nfs4_stid *s; + struct nfs4_ol_stateid *ols; + __be32 status; -static int is_open_stateid(struct nfs4_stateid *stateid) -{ - return stateid->st_openstp == NULL; + if (STALE_STATEID(stateid)) + return nfserr_stale_stateid; + + s = find_stateid(cl, stateid); + if (!s) + return nfserr_stale_stateid; + status = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (status) + return status; + if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID))) + return nfs_ok; + ols = openlockstateid(s); + if (ols->st_stateowner->so_is_open_owner + && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; + return nfs_ok; } -__be32 nfs4_validate_stateid(stateid_t *stateid, int flags) +static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s) { - struct nfs4_stateid *stp = NULL; - __be32 status = nfserr_stale_stateid; + struct nfs4_client *cl; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return nfserr_bad_stateid; if (STALE_STATEID(stateid)) - goto out; - - status = nfserr_expired; - stp = search_for_stateid(stateid); - if (!stp) - goto out; - status = nfserr_bad_stateid; - - if (!stp->st_stateowner->so_confirmed) - goto out; - - status = check_stateid_generation(stateid, &stp->st_stateid, flags); - if (status) - goto out; + return nfserr_stale_stateid; + cl = find_confirmed_client(&stateid->si_opaque.so_clid); + if (!cl) + return nfserr_expired; + *s = find_stateid_by_type(cl, stateid, typemask); + if (!*s) + return nfserr_bad_stateid; + return nfs_ok; - status = nfs_ok; -out: - return status; } /* @@ -3210,7 +3316,8 @@ __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filpp) { - struct nfs4_stateid *stp = NULL; + struct nfs4_stid *s; + struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; struct svc_fh *current_fh = &cstate->current_fh; struct inode *ino = current_fh->fh_dentry->d_inode; @@ -3222,60 +3329,47 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (grace_disallows_io(ino)) return nfserr_grace; - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(current_fh, stateid, flags); - status = nfserr_stale_stateid; - if (STALE_STATEID(stateid)) + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s); + if (status) + return status; + status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); + if (status) goto out; - - /* - * We assume that any stateid that has the current boot time, - * but that we can't find, is expired: - */ - status = nfserr_expired; - if (is_delegation_stateid(stateid)) { - dp = find_delegation_stateid(ino, stateid); - if (!dp) - goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, - flags); - if (status) - goto out; + switch (s->sc_type) { + case NFS4_DELEG_STID: + dp = delegstateid(s); status = nfs4_check_delegmode(dp, flags); if (status) goto out; - renew_client(dp->dl_client); if (filpp) { *filpp = dp->dl_file->fi_deleg_file; BUG_ON(!*filpp); } - } else { /* open or lock stateid */ - stp = find_stateid(stateid, flags); - if (!stp) - goto out; - status = nfserr_bad_stateid; - if (nfs4_check_fh(current_fh, stp)) - goto out; - if (!stp->st_stateowner->so_confirmed) - goto out; - status = check_stateid_generation(stateid, &stp->st_stateid, - flags); + break; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + stp = openlockstateid(s); + status = nfs4_check_fh(current_fh, stp); if (status) goto out; + if (stp->st_stateowner->so_is_open_owner + && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + goto out; status = nfs4_check_openmode(stp, flags); if (status) goto out; - renew_client(stp->st_stateowner->so_client); if (filpp) { if (flags & RD_STATE) *filpp = find_readable_file(stp->st_file); else *filpp = find_writeable_file(stp->st_file); } + break; + default: + return nfserr_bad_stateid; } status = nfs_ok; out: @@ -3283,18 +3377,9 @@ out: } static __be32 -nfsd4_free_delegation_stateid(stateid_t *stateid) +nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) { - struct nfs4_delegation *dp = search_for_delegation(stateid); - if (dp) - return nfserr_locks_held; - return nfserr_bad_stateid; -} - -static __be32 -nfsd4_free_lock_stateid(struct nfs4_stateid *stp) -{ - if (check_for_locks(stp->st_file, stp->st_stateowner)) + if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) return nfserr_locks_held; release_lock_stateid(stp); return nfs_ok; @@ -3307,51 +3392,40 @@ __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_test_stateid *test_stateid) { - test_stateid->ts_has_session = nfsd4_has_session(cstate); + /* real work is done during encoding */ return nfs_ok; } -/* - * Free a state id - */ __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *free_stateid) { stateid_t *stateid = &free_stateid->fr_stateid; - struct nfs4_stateid *stp; - __be32 ret; + struct nfs4_stid *s; + struct nfs4_client *cl = cstate->session->se_client; + __be32 ret = nfserr_bad_stateid; nfs4_lock_state(); - if (is_delegation_stateid(stateid)) { - ret = nfsd4_free_delegation_stateid(stateid); - goto out; - } - - stp = search_for_stateid(stateid); - if (!stp) { - ret = nfserr_bad_stateid; + s = find_stateid(cl, stateid); + if (!s) goto out; - } - if (stateid->si_generation != 0) { - if (stateid->si_generation < stp->st_stateid.si_generation) { - ret = nfserr_old_stateid; - goto out; - } - if (stateid->si_generation > stp->st_stateid.si_generation) { - ret = nfserr_bad_stateid; - goto out; - } - } - - if (is_open_stateid(stp)) { + switch (s->sc_type) { + case NFS4_DELEG_STID: ret = nfserr_locks_held; goto out; - } else { - ret = nfsd4_free_lock_stateid(stp); - goto out; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + goto out; + if (s->sc_type == NFS4_LOCK_STID) + ret = nfsd4_free_lock_stateid(openlockstateid(s)); + else + ret = nfserr_locks_held; + break; + default: + ret = nfserr_bad_stateid; } - out: nfs4_unlock_state(); return ret; @@ -3364,124 +3438,64 @@ setlkflg (int type) RD_STATE : WR_STATE; } +static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + struct nfs4_stateowner *sop = stp->st_stateowner; + __be32 status; + + status = nfsd4_check_seqid(cstate, sop, seqid); + if (status) + return status; + if (stp->st_stid.sc_type == NFS4_CLOSED_STID) + /* + * "Closed" stateid's exist *only* to return + * nfserr_replay_me from the previous step. + */ + return nfserr_bad_stateid; + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); + if (status) + return status; + return nfs4_check_fh(current_fh, stp); +} + /* * Checks for sequence id mutating operations. */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, - stateid_t *stateid, int flags, - struct nfs4_stateowner **sopp, - struct nfs4_stateid **stpp, struct nfsd4_lock *lock) + stateid_t *stateid, char typemask, + struct nfs4_ol_stateid **stpp) { - struct nfs4_stateid *stp; - struct nfs4_stateowner *sop; - struct svc_fh *current_fh = &cstate->current_fh; __be32 status; + struct nfs4_stid *s; dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, seqid, STATEID_VAL(stateid)); *stpp = NULL; - *sopp = NULL; - - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { - dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); - return nfserr_bad_stateid; - } - - if (STALE_STATEID(stateid)) - return nfserr_stale_stateid; - - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; - - /* - * We return BAD_STATEID if filehandle doesn't match stateid, - * the confirmed flag is incorrecly set, or the generation - * number is incorrect. - */ - stp = find_stateid(stateid, flags); - if (stp == NULL) { - /* - * Also, we should make sure this isn't just the result of - * a replayed close: - */ - sop = search_close_lru(stateid->si_stateownerid, flags); - /* It's not stale; let's assume it's expired: */ - if (sop == NULL) - return nfserr_expired; - *sopp = sop; - goto check_replay; - } - - *stpp = stp; - *sopp = sop = stp->st_stateowner; - - if (lock) { - clientid_t *lockclid = &lock->v.new.clientid; - struct nfs4_client *clp = sop->so_client; - int lkflg = 0; - __be32 status; - - lkflg = setlkflg(lock->lk_type); - - if (lock->lk_is_new) { - if (!sop->so_is_open_owner) - return nfserr_bad_stateid; - if (!(flags & HAS_SESSION) && - !same_clid(&clp->cl_clientid, lockclid)) - return nfserr_bad_stateid; - /* stp is the open stateid */ - status = nfs4_check_openmode(stp, lkflg); - if (status) - return status; - } else { - /* stp is the lock stateid */ - status = nfs4_check_openmode(stp->st_openstp, lkflg); - if (status) - return status; - } - } + status = nfsd4_lookup_stateid(stateid, typemask, &s); + if (status) + return status; + *stpp = openlockstateid(s); + cstate->replay_owner = (*stpp)->st_stateowner; - if (nfs4_check_fh(current_fh, stp)) { - dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); - return nfserr_bad_stateid; - } + return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); +} - /* - * We now validate the seqid and stateid generation numbers. - * For the moment, we ignore the possibility of - * generation number wraparound. - */ - if (!(flags & HAS_SESSION) && seqid != sop->so_seqid) - goto check_replay; +static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp) +{ + __be32 status; + struct nfs4_openowner *oo; - if (sop->so_confirmed && flags & CONFIRM) { - dprintk("NFSD: preprocess_seqid_op: expected" - " unconfirmed stateowner!\n"); - return nfserr_bad_stateid; - } - if (!sop->so_confirmed && !(flags & CONFIRM)) { - dprintk("NFSD: preprocess_seqid_op: stateowner not" - " confirmed yet!\n"); - return nfserr_bad_stateid; - } - status = check_stateid_generation(stateid, &stp->st_stateid, flags); + status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, + NFS4_OPEN_STID, stpp); if (status) return status; - renew_client(sop->so_client); + oo = openowner((*stpp)->st_stateowner); + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; return nfs_ok; - -check_replay: - if (seqid == sop->so_seqid - 1) { - dprintk("NFSD: preprocess_seqid_op: retransmission?\n"); - /* indicate replay to calling function */ - return nfserr_replay_me; - } - dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", - sop->so_seqid, seqid); - *sopp = NULL; - return nfserr_bad_seqid; } __be32 @@ -3489,8 +3503,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open_confirm *oc) { __be32 status; - struct nfs4_stateowner *sop; - struct nfs4_stateid *stp; + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3502,38 +3516,52 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, + status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, - CONFIRM | OPEN_STATE, - &oc->oc_stateowner, &stp, NULL))) - goto out; - - sop = oc->oc_stateowner; - sop->so_confirmed = 1; - update_stateid(&stp->st_stateid); - memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); + NFS4_OPEN_STID, &stp); + if (status) + goto out; + oo = openowner(stp->st_stateowner); + status = nfserr_bad_stateid; + if (oo->oo_flags & NFS4_OO_CONFIRMED) + goto out; + oo->oo_flags |= NFS4_OO_CONFIRMED; + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", - __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); - nfsd4_create_clid_dir(sop->so_client); + nfsd4_create_clid_dir(oo->oo_owner.so_client); + status = nfs_ok; out: - if (oc->oc_stateowner) { - nfs4_get_stateowner(oc->oc_stateowner); - cstate->replay_owner = oc->oc_stateowner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } -static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access) +static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access) { - int i; + if (!test_bit(access, &stp->st_access_bmap)) + return; + nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access)); + __clear_bit(access, &stp->st_access_bmap); +} - for (i = 1; i < 4; i++) { - if (test_bit(i, &stp->st_access_bmap) && !(i & to_access)) { - nfs4_file_put_access(stp->st_file, i); - __clear_bit(i, &stp->st_access_bmap); - } +static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access) +{ + switch (to_access) { + case NFS4_SHARE_ACCESS_READ: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_WRITE: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + BUG(); } } @@ -3553,24 +3581,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, struct nfsd4_open_downgrade *od) { __be32 status; - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, cstate->current_fh.fh_dentry->d_name.name); - if (!access_valid(od->od_share_access, cstate->minorversion) - || !deny_valid(od->od_share_deny)) - return nfserr_inval; + /* We don't yet support WANT bits: */ + od->od_share_access &= NFS4_SHARE_ACCESS_MASK; nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, - od->od_seqid, - &od->od_stateid, - OPEN_STATE, - &od->od_stateowner, &stp, NULL))) + status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, + &od->od_stateid, &stp); + if (status) goto out; - status = nfserr_inval; if (!test_bit(od->od_share_access, &stp->st_access_bmap)) { dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n", @@ -3582,22 +3606,45 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, stp->st_deny_bmap, od->od_share_deny); goto out; } - nfs4_file_downgrade(stp, od->od_share_access); + nfs4_stateid_downgrade(stp, od->od_share_access); reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); - update_stateid(&stp->st_stateid); - memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); status = nfs_ok; out: - if (od->od_stateowner) { - nfs4_get_stateowner(od->od_stateowner); - cstate->replay_owner = od->od_stateowner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } +void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so) +{ + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *s; + + if (!so->so_is_open_owner) + return; + oo = openowner(so); + s = oo->oo_last_closed_stid; + if (!s) + return; + if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) { + /* Release the last_closed_stid on the next seqid bump: */ + oo->oo_flags |= NFS4_OO_PURGE_CLOSE; + return; + } + oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE; + release_last_closed_stateid(oo); +} + +static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) +{ + unhash_open_stateid(s); + s->st_stid.sc_type = NFS4_CLOSED_STID; +} + /* * nfs4_unlock_state() called after encode */ @@ -3606,39 +3653,37 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_close *close) { __be32 status; - struct nfs4_stateid *stp; + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *stp; dprintk("NFSD: nfsd4_close on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, cstate->current_fh.fh_dentry->d_name.name); nfs4_lock_state(); - /* check close_lru for replay */ - if ((status = nfs4_preprocess_seqid_op(cstate, - close->cl_seqid, - &close->cl_stateid, - OPEN_STATE | CLOSE_STATE, - &close->cl_stateowner, &stp, NULL))) + status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, + &close->cl_stateid, + NFS4_OPEN_STID|NFS4_CLOSED_STID, + &stp); + if (status) goto out; + oo = openowner(stp->st_stateowner); status = nfs_ok; - update_stateid(&stp->st_stateid); - memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - /* release_stateid() calls nfsd_close() if needed */ - release_open_stateid(stp); + nfsd4_close_open_stateid(stp); + oo->oo_last_closed_stid = stp; /* place unused nfs4_stateowners on so_close_lru list to be * released by the laundromat service after the lease period * to enable us to handle CLOSE replay */ - if (list_empty(&close->cl_stateowner->so_stateids)) - move_to_close_lru(close->cl_stateowner); + if (list_empty(&oo->oo_owner.so_stateids)) + move_to_close_lru(oo); out: - if (close->cl_stateowner) { - nfs4_get_stateowner(close->cl_stateowner); - cstate->replay_owner = close->cl_stateowner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } @@ -3648,34 +3693,22 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfs4_delegation *dp; stateid_t *stateid = &dr->dr_stateid; + struct nfs4_stid *s; struct inode *inode; __be32 status; - int flags = 0; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; inode = cstate->current_fh.fh_dentry->d_inode; - if (nfsd4_has_session(cstate)) - flags |= HAS_SESSION; nfs4_lock_state(); - status = nfserr_bad_stateid; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - goto out; - status = nfserr_stale_stateid; - if (STALE_STATEID(stateid)) - goto out; - status = nfserr_bad_stateid; - if (!is_delegation_stateid(stateid)) - goto out; - status = nfserr_expired; - dp = find_delegation_stateid(inode, stateid); - if (!dp) + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s); + if (status) goto out; - status = check_stateid_generation(stateid, &dp->dl_stateid, flags); + dp = delegstateid(s); + status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; - renew_client(dp->dl_client); unhash_delegation(dp); out: @@ -3713,9 +3746,6 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1: NFS4_MAX_UINT64; } -#define lockownerid_hashval(id) \ - ((id) & LOCK_HASH_MASK) - static inline unsigned int lock_ownerstr_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername) @@ -3725,101 +3755,7 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id, & LOCK_HASH_MASK; } -static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; -static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; - -static int -same_stateid(stateid_t *id_one, stateid_t *id_two) -{ - if (id_one->si_stateownerid != id_two->si_stateownerid) - return 0; - return id_one->si_fileid == id_two->si_fileid; -} - -static struct nfs4_stateid * -find_stateid(stateid_t *stid, int flags) -{ - struct nfs4_stateid *local; - u32 st_id = stid->si_stateownerid; - u32 f_id = stid->si_fileid; - unsigned int hashval; - - dprintk("NFSD: find_stateid flags 0x%x\n",flags); - if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) { - hashval = stateid_hashval(st_id, f_id); - list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { - if ((local->st_stateid.si_stateownerid == st_id) && - (local->st_stateid.si_fileid == f_id)) - return local; - } - } - - if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) { - hashval = stateid_hashval(st_id, f_id); - list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { - if ((local->st_stateid.si_stateownerid == st_id) && - (local->st_stateid.si_fileid == f_id)) - return local; - } - } - return NULL; -} - -static struct nfs4_stateid * -search_for_stateid(stateid_t *stid) -{ - struct nfs4_stateid *local; - unsigned int hashval = stateid_hashval(stid->si_stateownerid, stid->si_fileid); - - list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { - if (same_stateid(&local->st_stateid, stid)) - return local; - } - - list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { - if (same_stateid(&local->st_stateid, stid)) - return local; - } - return NULL; -} - -static struct nfs4_delegation * -search_for_delegation(stateid_t *stid) -{ - struct nfs4_file *fp; - struct nfs4_delegation *dp; - struct list_head *pos; - int i; - - for (i = 0; i < FILE_HASH_SIZE; i++) { - list_for_each_entry(fp, &file_hashtbl[i], fi_hash) { - list_for_each(pos, &fp->fi_delegations) { - dp = list_entry(pos, struct nfs4_delegation, dl_perfile); - if (same_stateid(&dp->dl_stateid, stid)) - return dp; - } - } - } - return NULL; -} - -static struct nfs4_delegation * -find_delegation_stateid(struct inode *ino, stateid_t *stid) -{ - struct nfs4_file *fp; - struct nfs4_delegation *dl; - - dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__, - STATEID_VAL(stid)); - - fp = find_file(ino); - if (!fp) - return NULL; - dl = find_delegation_file(fp, stid); - put_nfs4_file(fp); - return dl; -} /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that @@ -3846,15 +3782,21 @@ static const struct lock_manager_operations nfsd_posix_mng_ops = { static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { - struct nfs4_stateowner *sop; + struct nfs4_lockowner *lo; if (fl->fl_lmops == &nfsd_posix_mng_ops) { - sop = (struct nfs4_stateowner *) fl->fl_owner; - kref_get(&sop->so_ref); - deny->ld_sop = sop; - deny->ld_clientid = sop->so_client->cl_clientid; + lo = (struct nfs4_lockowner *) fl->fl_owner; + deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data, + lo->lo_owner.so_owner.len, GFP_KERNEL); + if (!deny->ld_owner.data) + /* We just don't care that much */ + goto nevermind; + deny->ld_owner.len = lo->lo_owner.so_owner.len; + deny->ld_clientid = lo->lo_owner.so_client->cl_clientid; } else { - deny->ld_sop = NULL; +nevermind: + deny->ld_owner.len = 0; + deny->ld_owner.data = NULL; deny->ld_clientid.cl_boot = 0; deny->ld_clientid.cl_id = 0; } @@ -3867,8 +3809,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) deny->ld_type = NFS4_WRITE_LT; } -static struct nfs4_stateowner * -find_lockstateowner_str(struct inode *inode, clientid_t *clid, +static struct nfs4_lockowner * +find_lockowner_str(struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) { unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); @@ -3876,11 +3818,17 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { if (same_owner_str(op, owner, clid)) - return op; + return lockowner(op); } return NULL; } +static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) +{ + list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); + list_add(&lo->lo_perstateid, &open_stp->st_lockowners); +} + /* * Alloc a lock owner structure. * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has @@ -3889,67 +3837,40 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, * strhashval = lock_ownerstr_hashval */ -static struct nfs4_stateowner * -alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { - struct nfs4_stateowner *sop; - struct nfs4_replay *rp; - unsigned int idhashval; +static struct nfs4_lockowner * +alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) { + struct nfs4_lockowner *lo; - if (!(sop = alloc_stateowner(&lock->lk_new_owner))) + lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); + if (!lo) return NULL; - idhashval = lockownerid_hashval(current_ownerid); - INIT_LIST_HEAD(&sop->so_idhash); - INIT_LIST_HEAD(&sop->so_strhash); - INIT_LIST_HEAD(&sop->so_perclient); - INIT_LIST_HEAD(&sop->so_stateids); - INIT_LIST_HEAD(&sop->so_perstateid); - INIT_LIST_HEAD(&sop->so_close_lru); /* not used */ - sop->so_time = 0; - list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); - list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perstateid, &open_stp->st_lockowners); - sop->so_is_open_owner = 0; - sop->so_id = current_ownerid++; - sop->so_client = clp; + INIT_LIST_HEAD(&lo->lo_owner.so_stateids); + lo->lo_owner.so_is_open_owner = 0; /* It is the openowner seqid that will be incremented in encode in the * case of new lockowners; so increment the lock seqid manually: */ - sop->so_seqid = lock->lk_new_lock_seqid + 1; - sop->so_confirmed = 1; - rp = &sop->so_replay; - rp->rp_status = nfserr_serverfault; - rp->rp_buflen = 0; - rp->rp_buf = rp->rp_ibuf; - return sop; + lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1; + hash_lockowner(lo, strhashval, clp, open_stp); + return lo; } -static struct nfs4_stateid * -alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp) +static struct nfs4_ol_stateid * +alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) { - struct nfs4_stateid *stp; - unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); + struct nfs4_ol_stateid *stp; + struct nfs4_client *clp = lo->lo_owner.so_client; - stp = nfs4_alloc_stateid(); + stp = nfs4_alloc_stateid(clp); if (stp == NULL) - goto out; - INIT_LIST_HEAD(&stp->st_hash); - INIT_LIST_HEAD(&stp->st_perfile); - INIT_LIST_HEAD(&stp->st_perstateowner); - INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ - list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + return NULL; + init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); list_add(&stp->st_perfile, &fp->fi_stateids); - list_add(&stp->st_perstateowner, &sop->so_stateids); - stp->st_stateowner = sop; + list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); + stp->st_stateowner = &lo->lo_owner; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stateid.si_boot = boot_time; - stp->st_stateid.si_stateownerid = sop->so_id; - stp->st_stateid.si_fileid = fp->fi_id; - stp->st_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - -out: return stp; } @@ -3960,7 +3881,7 @@ check_lock_length(u64 offset, u64 length) LOFF_OVERFLOW(offset, length))); } -static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access) +static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) { struct nfs4_file *fp = lock_stp->st_file; int oflag = nfs4_access_to_omode(access); @@ -3978,15 +3899,16 @@ __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_lock *lock) { - struct nfs4_stateowner *open_sop = NULL; - struct nfs4_stateowner *lock_sop = NULL; - struct nfs4_stateid *lock_stp; + struct nfs4_openowner *open_sop = NULL; + struct nfs4_lockowner *lock_sop = NULL; + struct nfs4_ol_stateid *lock_stp; struct nfs4_file *fp; struct file *filp = NULL; struct file_lock file_lock; struct file_lock conflock; __be32 status = 0; unsigned int strhashval; + int lkflg; int err; dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", @@ -4010,7 +3932,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Use open owner and open stateid to create lock owner and * lock stateid. */ - struct nfs4_stateid *open_stp = NULL; + struct nfs4_ol_stateid *open_stp = NULL; status = nfserr_stale_clientid; if (!nfsd4_has_session(cstate) && @@ -4018,26 +3940,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; /* validate and update open stateid and open seqid */ - status = nfs4_preprocess_seqid_op(cstate, + status = nfs4_preprocess_confirmed_seqid_op(cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, - OPEN_STATE, - &lock->lk_replay_owner, &open_stp, - lock); + &open_stp); if (status) goto out; - open_sop = lock->lk_replay_owner; + open_sop = openowner(open_stp->st_stateowner); + status = nfserr_bad_stateid; + if (!nfsd4_has_session(cstate) && + !same_clid(&open_sop->oo_owner.so_client->cl_clientid, + &lock->v.new.clientid)) + goto out; /* create lockowner and lock stateid */ fp = open_stp->st_file; - strhashval = lock_ownerstr_hashval(fp->fi_inode, - open_sop->so_client->cl_clientid.cl_id, + strhashval = lock_ownerstr_hashval(fp->fi_inode, + open_sop->oo_owner.so_client->cl_clientid.cl_id, &lock->v.new.owner); /* XXX: Do we need to check for duplicate stateowners on * the same file, or should they just be allowed (and * create new stateids)? */ - status = nfserr_resource; + status = nfserr_jukebox; lock_sop = alloc_init_lock_stateowner(strhashval, - open_sop->so_client, open_stp, lock); + open_sop->oo_owner.so_client, open_stp, lock); if (lock_sop == NULL) goto out; lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); @@ -4046,16 +3971,20 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } else { /* lock (lock owner + lock stateid) already exists */ status = nfs4_preprocess_seqid_op(cstate, - lock->lk_old_lock_seqid, - &lock->lk_old_lock_stateid, - LOCK_STATE, - &lock->lk_replay_owner, &lock_stp, lock); + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, + NFS4_LOCK_STID, &lock_stp); if (status) goto out; - lock_sop = lock->lk_replay_owner; + lock_sop = lockowner(lock_stp->st_stateowner); fp = lock_stp->st_file; } - /* lock->lk_replay_owner and lock_stp have been created or found */ + /* lock_sop and lock_stp have been created or found */ + + lkflg = setlkflg(lock->lk_type); + status = nfs4_check_openmode(lock_stp, lkflg); + if (status) + goto out; status = nfserr_grace; if (locks_in_grace() && !lock->lk_reclaim) @@ -4106,8 +4035,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); switch (-err) { case 0: /* success! */ - update_stateid(&lock_stp->st_stateid); - memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, + update_stateid(&lock_stp->st_stid.sc_stateid); + memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, sizeof(stateid_t)); status = 0; break; @@ -4119,19 +4048,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, case (EDEADLK): status = nfserr_deadlock; break; - default: + default: dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); - status = nfserr_resource; + status = nfserrno(err); break; } out: if (status && lock->lk_is_new && lock_sop) release_lockowner(lock_sop); - if (lock->lk_replay_owner) { - nfs4_get_stateowner(lock->lk_replay_owner); - cstate->replay_owner = lock->lk_replay_owner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; } @@ -4163,6 +4089,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct inode *inode; struct file_lock file_lock; + struct nfs4_lockowner *lo; int error; __be32 status; @@ -4172,19 +4099,14 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_lock_length(lockt->lt_offset, lockt->lt_length)) return nfserr_inval; - lockt->lt_stateowner = NULL; nfs4_lock_state(); status = nfserr_stale_clientid; if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) goto out; - if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { - dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n"); - if (status == nfserr_symlink) - status = nfserr_inval; + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) goto out; - } inode = cstate->current_fh.fh_dentry->d_inode; locks_init_lock(&file_lock); @@ -4203,10 +4125,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - lockt->lt_stateowner = find_lockstateowner_str(inode, - &lockt->lt_clientid, &lockt->lt_owner); - if (lockt->lt_stateowner) - file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; + lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); + if (lo) + file_lock.fl_owner = (fl_owner_t)lo; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; @@ -4234,7 +4155,7 @@ __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_locku *locku) { - struct nfs4_stateid *stp; + struct nfs4_ol_stateid *stp; struct file *filp = NULL; struct file_lock file_lock; __be32 status; @@ -4249,13 +4170,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(cstate, - locku->lu_seqid, - &locku->lu_stateid, - LOCK_STATE, - &locku->lu_stateowner, &stp, NULL))) + status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, + &locku->lu_stateid, NFS4_LOCK_STID, &stp); + if (status) goto out; - filp = find_any_file(stp->st_file); if (!filp) { status = nfserr_lock_range; @@ -4264,7 +4182,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, BUG_ON(!filp); locks_init_lock(&file_lock); file_lock.fl_type = F_UNLCK; - file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner; + file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; @@ -4285,15 +4203,12 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* * OK, unlock succeeded; the only thing left to do is update the stateid. */ - update_stateid(&stp->st_stateid); - memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t)); + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); out: - if (locku->lu_stateowner) { - nfs4_get_stateowner(locku->lu_stateowner); - cstate->replay_owner = locku->lu_stateowner; - } - nfs4_unlock_state(); + if (!cstate->replay_owner) + nfs4_unlock_state(); return status; out_nfserr: @@ -4307,7 +4222,7 @@ out_nfserr: * 0: no locks held by lockowner */ static int -check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner) +check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) { struct file_lock **flpp; struct inode *inode = filp->fi_inode; @@ -4332,7 +4247,8 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, { clientid_t *clid = &rlockowner->rl_clientid; struct nfs4_stateowner *sop; - struct nfs4_stateid *stp; + struct nfs4_lockowner *lo; + struct nfs4_ol_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; struct list_head matches; int i; @@ -4356,16 +4272,15 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, * data structures. */ INIT_LIST_HEAD(&matches); for (i = 0; i < LOCK_HASH_SIZE; i++) { - list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { + list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) { if (!same_owner_str(sop, owner, clid)) continue; list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { - if (check_for_locks(stp->st_file, sop)) + lo = lockowner(sop); + if (check_for_locks(stp->st_file, lo)) goto out; - /* Note: so_perclient unused for lockowners, - * so it's OK to fool with here. */ - list_add(&sop->so_perclient, &matches); + list_add(&lo->lo_list, &matches); } } } @@ -4374,12 +4289,12 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, * have been checked. */ status = nfs_ok; while (!list_empty(&matches)) { - sop = list_entry(matches.next, struct nfs4_stateowner, - so_perclient); + lo = list_entry(matches.next, struct nfs4_lockowner, + lo_list); /* unhash_stateowner deletes so_perclient only * for openowners. */ - list_del(&sop->so_perclient); - release_lockowner(sop); + list_del(&lo->lo_list); + release_lockowner(lo); } out: nfs4_unlock_state(); @@ -4501,16 +4416,10 @@ nfs4_state_init(void) for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } - for (i = 0; i < OWNER_HASH_SIZE; i++) { - INIT_LIST_HEAD(&ownerstr_hashtbl[i]); - INIT_LIST_HEAD(&ownerid_hashtbl[i]); - } - for (i = 0; i < STATEID_HASH_SIZE; i++) { - INIT_LIST_HEAD(&stateid_hashtbl[i]); - INIT_LIST_HEAD(&lockstateid_hashtbl[i]); + for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { + INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); } for (i = 0; i < LOCK_HASH_SIZE; i++) { - INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]); INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); } memset(&onestateid, ~0, sizeof(stateid_t)); @@ -4527,7 +4436,7 @@ nfsd4_load_reboot_recovery_data(void) int status; nfs4_lock_state(); - nfsd4_init_recdir(user_recovery_dirname); + nfsd4_init_recdir(); status = nfsd4_recdir_load(); nfs4_unlock_state(); if (status) @@ -4636,40 +4545,3 @@ nfs4_state_shutdown(void) nfs4_unlock_state(); nfsd4_destroy_callback_queue(); } - -/* - * user_recovery_dirname is protected by the nfsd_mutex since it's only - * accessed when nfsd is starting. - */ -static void -nfs4_set_recdir(char *recdir) -{ - strcpy(user_recovery_dirname, recdir); -} - -/* - * Change the NFSv4 recovery directory to recdir. - */ -int -nfs4_reset_recoverydir(char *recdir) -{ - int status; - struct path path; - - status = kern_path(recdir, LOOKUP_FOLLOW, &path); - if (status) - return status; - status = -ENOTDIR; - if (S_ISDIR(path.dentry->d_inode->i_mode)) { - nfs4_set_recdir(recdir); - status = 0; - } - path_put(&path); - return status; -} - -char * -nfs4_recoverydir(void) -{ - return user_recovery_dirname; -} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c8bf405..b6fa792 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -456,7 +456,6 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) { DECODE_HEAD; - close->cl_stateowner = NULL; READ_BUF(4); READ32(close->cl_seqid); return nfsd4_decode_stateid(argp, &close->cl_stateid); @@ -551,7 +550,6 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) { DECODE_HEAD; - lock->lk_replay_owner = NULL; /* * type, reclaim(boolean), offset, length, new_lock_owner(boolean) */ @@ -611,7 +609,6 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) { DECODE_HEAD; - locku->lu_stateowner = NULL; READ_BUF(8); READ32(locku->lu_type); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) @@ -642,6 +639,83 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup DECODE_TAIL; } +static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) +{ + __be32 *p; + u32 w; + + READ_BUF(4); + READ32(w); + *x = w; + switch (w & NFS4_SHARE_ACCESS_MASK) { + case NFS4_SHARE_ACCESS_READ: + case NFS4_SHARE_ACCESS_WRITE: + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + return nfserr_bad_xdr; + } + w &= ~NFS4_SHARE_ACCESS_MASK; + if (!w) + return nfs_ok; + if (!argp->minorversion) + return nfserr_bad_xdr; + switch (w & NFS4_SHARE_WANT_MASK) { + case NFS4_SHARE_WANT_NO_PREFERENCE: + case NFS4_SHARE_WANT_READ_DELEG: + case NFS4_SHARE_WANT_WRITE_DELEG: + case NFS4_SHARE_WANT_ANY_DELEG: + case NFS4_SHARE_WANT_NO_DELEG: + case NFS4_SHARE_WANT_CANCEL: + break; + default: + return nfserr_bad_xdr; + } + w &= ~NFS4_SHARE_WANT_MASK; + if (!w) + return nfs_ok; + switch (w) { + case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL: + case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED: + case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL | + NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED): + return nfs_ok; + } +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) +{ + __be32 *p; + + READ_BUF(4); + READ32(*x); + /* Note: unlinke access bits, deny bits may be zero. */ + if (*x & ~NFS4_SHARE_DENY_BOTH) + return nfserr_bad_xdr; + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) +{ + __be32 *p; + + READ_BUF(4); + READ32(o->len); + + if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) + return nfserr_bad_xdr; + + READ_BUF(o->len); + SAVEMEM(o->data, o->len); + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + static __be32 nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) { @@ -649,19 +723,23 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) memset(open->op_bmval, 0, sizeof(open->op_bmval)); open->op_iattr.ia_valid = 0; - open->op_stateowner = NULL; + open->op_openowner = NULL; /* seqid, share_access, share_deny, clientid, ownerlen */ - READ_BUF(16 + sizeof(clientid_t)); + READ_BUF(4); READ32(open->op_seqid); - READ32(open->op_share_access); - READ32(open->op_share_deny); + status = nfsd4_decode_share_access(argp, &open->op_share_access); + if (status) + goto xdr_error; + status = nfsd4_decode_share_deny(argp, &open->op_share_deny); + if (status) + goto xdr_error; + READ_BUF(sizeof(clientid_t)); COPYMEM(&open->op_clientid, sizeof(clientid_t)); - READ32(open->op_owner.len); - - /* owner, open_flag */ - READ_BUF(open->op_owner.len + 4); - SAVEMEM(open->op_owner.data, open->op_owner.len); + status = nfsd4_decode_opaque(argp, &open->op_owner); + if (status) + goto xdr_error; + READ_BUF(4); READ32(open->op_create); switch (open->op_create) { case NFS4_OPEN_NOCREATE: @@ -727,6 +805,19 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) return status; break; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + if (argp->minorversion < 1) + goto xdr_error; + /* void */ + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + if (argp->minorversion < 1) + goto xdr_error; + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + break; default: goto xdr_error; } @@ -739,7 +830,6 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con { DECODE_HEAD; - open_conf->oc_stateowner = NULL; status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); if (status) return status; @@ -754,15 +844,17 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d { DECODE_HEAD; - open_down->od_stateowner = NULL; status = nfsd4_decode_stateid(argp, &open_down->od_stateid); if (status) return status; - READ_BUF(12); + READ_BUF(4); READ32(open_down->od_seqid); - READ32(open_down->od_share_access); - READ32(open_down->od_share_deny); - + status = nfsd4_decode_share_access(argp, &open_down->od_share_access); + if (status) + return status; + status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny); + if (status) + return status; DECODE_TAIL; } @@ -903,12 +995,13 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient { DECODE_HEAD; - READ_BUF(12); + READ_BUF(8); COPYMEM(setclientid->se_verf.data, 8); - READ32(setclientid->se_namelen); - READ_BUF(setclientid->se_namelen + 8); - SAVEMEM(setclientid->se_name, setclientid->se_namelen); + status = nfsd4_decode_opaque(argp, &setclientid->se_name); + if (status) + return nfserr_bad_xdr; + READ_BUF(8); READ32(setclientid->se_callback_prog); READ32(setclientid->se_callback_netid_len); @@ -1051,11 +1144,9 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, READ_BUF(NFS4_VERIFIER_SIZE); COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); - READ_BUF(4); - READ32(exid->clname.len); - - READ_BUF(exid->clname.len); - SAVEMEM(exid->clname.data, exid->clname.len); + status = nfsd4_decode_opaque(argp, &exid->clname); + if (status) + return nfserr_bad_xdr; READ_BUF(4); READ32(exid->flags); @@ -1326,6 +1417,16 @@ xdr_error: goto out; } +static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc) +{ + DECODE_HEAD; + + READ_BUF(8); + COPYMEM(&dc->clientid, 8); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) { DECODE_HEAD; @@ -1447,7 +1548,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, }; @@ -1630,15 +1731,20 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) * we know whether the error to be returned is a sequence id mutating error. */ -#define ENCODE_SEQID_OP_TAIL(stateowner) do { \ - if (seqid_mutating_err(nfserr) && stateowner) { \ - stateowner->so_seqid++; \ - stateowner->so_replay.rp_status = nfserr; \ - stateowner->so_replay.rp_buflen = \ - (((char *)(resp)->p - (char *)save)); \ - memcpy(stateowner->so_replay.rp_buf, save, \ - stateowner->so_replay.rp_buflen); \ - } } while (0); +static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr) +{ + struct nfs4_stateowner *stateowner = resp->cstate.replay_owner; + + if (seqid_mutating_err(ntohl(nfserr)) && stateowner) { + stateowner->so_seqid++; + stateowner->so_replay.rp_status = nfserr; + stateowner->so_replay.rp_buflen = + (char *)resp->p - (char *)save; + memcpy(stateowner->so_replay.rp_buf, save, + stateowner->so_replay.rp_buflen); + nfsd4_purge_closed_stateid(stateowner); + } +} /* Encode as an array of strings the string given with components * separated @sep. @@ -1697,36 +1803,89 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, } /* - * Return the path to an export point in the pseudo filesystem namespace - * Returned string is safe to use as long as the caller holds a reference - * to @exp. + * Encode a path in RFC3530 'pathname4' format */ -static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) +static __be32 nfsd4_encode_path(const struct path *root, + const struct path *path, __be32 **pp, int *buflen) { - struct svc_fh tmp_fh; - char *path = NULL, *rootpath; - size_t rootlen; + struct path cur = { + .mnt = path->mnt, + .dentry = path->dentry, + }; + __be32 *p = *pp; + struct dentry **components = NULL; + unsigned int ncomponents = 0; + __be32 err = nfserr_jukebox; - fh_init(&tmp_fh, NFS4_FHSIZE); - *stat = exp_pseudoroot(rqstp, &tmp_fh); - if (*stat) - return NULL; - rootpath = tmp_fh.fh_export->ex_pathname; + dprintk("nfsd4_encode_components("); - path = exp->ex_pathname; + path_get(&cur); + /* First walk the path up to the nfsd root, and store the + * dentries/path components in an array. + */ + for (;;) { + if (cur.dentry == root->dentry && cur.mnt == root->mnt) + break; + if (cur.dentry == cur.mnt->mnt_root) { + if (follow_up(&cur)) + continue; + goto out_free; + } + if ((ncomponents & 15) == 0) { + struct dentry **new; + new = krealloc(components, + sizeof(*new) * (ncomponents + 16), + GFP_KERNEL); + if (!new) + goto out_free; + components = new; + } + components[ncomponents++] = cur.dentry; + cur.dentry = dget_parent(cur.dentry); + } - rootlen = strlen(rootpath); - if (strncmp(path, rootpath, rootlen)) { - dprintk("nfsd: fs_locations failed;" - "%s is not contained in %s\n", path, rootpath); - *stat = nfserr_notsupp; - path = NULL; - goto out; + *buflen -= 4; + if (*buflen < 0) + goto out_free; + WRITE32(ncomponents); + + while (ncomponents) { + struct dentry *dentry = components[ncomponents - 1]; + unsigned int len = dentry->d_name.len; + + *buflen -= 4 + (XDR_QUADLEN(len) << 2); + if (*buflen < 0) + goto out_free; + WRITE32(len); + WRITEMEM(dentry->d_name.name, len); + dprintk("/%s", dentry->d_name.name); + dput(dentry); + ncomponents--; } - path += rootlen; -out: - fh_put(&tmp_fh); - return path; + + *pp = p; + err = 0; +out_free: + dprintk(")\n"); + while (ncomponents) + dput(components[--ncomponents]); + kfree(components); + path_put(&cur); + return err; +} + +static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, + const struct path *path, __be32 **pp, int *buflen) +{ + struct svc_export *exp_ps; + __be32 res; + + exp_ps = rqst_find_fsidzero_export(rqstp); + if (IS_ERR(exp_ps)) + return nfserrno(PTR_ERR(exp_ps)); + res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen); + exp_put(exp_ps); + return res; } /* @@ -1740,11 +1899,8 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, int i; __be32 *p = *pp; struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; - char *root = nfsd4_path(rqstp, exp, &status); - if (status) - return status; - status = nfsd4_encode_components('/', root, &p, buflen); + status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen); if (status) return status; if ((*buflen -= 4) < 0) @@ -1760,12 +1916,19 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, return 0; } -static u32 nfs4_ftypes[16] = { - NF4BAD, NF4FIFO, NF4CHR, NF4BAD, - NF4DIR, NF4BAD, NF4BLK, NF4BAD, - NF4REG, NF4BAD, NF4LNK, NF4BAD, - NF4SOCK, NF4BAD, NF4LNK, NF4BAD, -}; +static u32 nfs4_file_type(umode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFIFO: return NF4FIFO; + case S_IFCHR: return NF4CHR; + case S_IFDIR: return NF4DIR; + case S_IFBLK: return NF4BLK; + case S_IFLNK: return NF4LNK; + case S_IFREG: return NF4REG; + case S_IFSOCK: return NF4SOCK; + default: return NF4BAD; + }; +} static __be32 nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, @@ -1954,7 +2117,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (bmval0 & FATTR4_WORD0_TYPE) { if ((buflen -= 4) < 0) goto out_resource; - dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12]; + dummy = nfs4_file_type(stat.mode); if (dummy == NF4BAD) goto out_serverfault; WRITE32(dummy); @@ -2488,7 +2651,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c if (!nfserr) nfsd4_encode_stateid(resp, &close->cl_stateid); - ENCODE_SEQID_OP_TAIL(close->cl_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2564,17 +2727,18 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh static void nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) { + struct xdr_netobj *conf = &ld->ld_owner; __be32 *p; - RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0)); + RESERVE_SPACE(32 + XDR_LEN(conf->len)); WRITE64(ld->ld_start); WRITE64(ld->ld_length); WRITE32(ld->ld_type); - if (ld->ld_sop) { + if (conf->len) { WRITEMEM(&ld->ld_clientid, 8); - WRITE32(ld->ld_sop->so_owner.len); - WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len); - kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner); + WRITE32(conf->len); + WRITEMEM(conf->data, conf->len); + kfree(conf->data); } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ WRITE64((u64)0); /* clientid */ WRITE32(0); /* length of owner name */ @@ -2592,7 +2756,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo else if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lock->lk_denied); - ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2612,7 +2776,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l if (!nfserr) nfsd4_encode_stateid(resp, &locku->lu_stateid); - ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2693,7 +2857,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op } /* XXX save filehandle here */ out: - ENCODE_SEQID_OP_TAIL(open->op_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2705,7 +2869,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct if (!nfserr) nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); - ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2717,7 +2881,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc if (!nfserr) nfsd4_encode_stateid(resp, &od->od_stateid); - ENCODE_SEQID_OP_TAIL(od->od_stateowner); + encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2759,8 +2923,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount); - if (nfserr == nfserr_symlink) - nfserr = nfserr_inval; if (nfserr) return nfserr; eof = (read->rd_offset + maxcount >= @@ -2886,8 +3048,6 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 readdir->common.err == nfserr_toosmall && readdir->buffer == page) nfserr = nfserr_toosmall; - if (nfserr == nfserr_symlink) - nfserr = nfserr_notdir; if (nfserr) goto err_no_verf; @@ -3218,9 +3378,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); WRITE32(seq->seqid); WRITE32(seq->slotid); - WRITE32(seq->maxslots); - /* For now: target_maxslots = maxslots */ - WRITE32(seq->maxslots); + /* Note slotid's are numbered from zero: */ + WRITE32(seq->maxslots - 1); /* sr_highest_slotid */ + WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */ WRITE32(seq->status_flags); ADJUST_ARGS(); @@ -3233,6 +3393,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_test_stateid *test_stateid) { struct nfsd4_compoundargs *argp; + struct nfs4_client *cl = resp->cstate.session->se_client; stateid_t si; __be32 *p; int i; @@ -3248,7 +3409,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, nfs4_lock_state(); for (i = 0; i < test_stateid->ts_num_ids; i++) { nfsd4_decode_stateid(argp, &si); - valid = nfs4_validate_stateid(&si, test_stateid->ts_has_session); + valid = nfs4_validate_stateid(cl, &si); RESERVE_SPACE(4); *p++ = htonl(valid); resp->p = p; @@ -3334,34 +3495,29 @@ static nfsd4_enc nfsd4_enc_ops[] = { /* * Calculate the total amount of memory that the compound response has taken - * after encoding the current operation. + * after encoding the current operation with pad. * - * pad: add on 8 bytes for the next operation's op_code and status so that - * there is room to cache a failure on the next operation. + * pad: if operation is non-idempotent, pad was calculate by op_rsize_bop() + * which was specified at nfsd4_operation, else pad is zero. * - * Compare this length to the session se_fmaxresp_cached. + * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached. * * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so * will be at least a page and will therefore hold the xdr_buf head. */ -static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) +int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) { - int status = 0; struct xdr_buf *xb = &resp->rqstp->rq_res; - struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; struct nfsd4_session *session = NULL; struct nfsd4_slot *slot = resp->cstate.slot; - u32 length, tlen = 0, pad = 8; + u32 length, tlen = 0; if (!nfsd4_has_session(&resp->cstate)) - return status; + return 0; session = resp->cstate.session; - if (session == NULL || slot->sl_cachethis == 0) - return status; - - if (resp->opcnt >= args->opcnt) - pad = 0; /* this is the last operation */ + if (session == NULL) + return 0; if (xb->page_len == 0) { length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; @@ -3374,10 +3530,14 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, length, xb->page_len, tlen, pad); - if (length <= session->se_fchannel.maxresp_cached) - return status; - else + if (length > session->se_fchannel.maxresp_sz) + return nfserr_rep_too_big; + + if (slot->sl_cachethis == 1 && + length > session->se_fchannel.maxresp_cached) return nfserr_rep_too_big_to_cache; + + return 0; } void @@ -3397,8 +3557,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) !nfsd4_enc_ops[op->opnum]); op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); /* nfsd4_check_drc_limit guarantees enough room for error status */ - if (!op->status && nfsd4_check_drc_limit(resp)) - op->status = nfserr_rep_too_big_to_cache; + if (!op->status) + op->status = nfsd4_check_resp_size(resp, 0); status: /* * Note: We write the status directly, instead of using WRITE32(), diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c771614..bb4a11d 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -9,11 +9,11 @@ #include <linux/ctype.h> #include <linux/sunrpc/svcsock.h> -#include <linux/nfsd/syscall.h> #include <linux/lockd/lockd.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/gss_krb5_enctypes.h> +#include <linux/module.h> #include "idmap.h" #include "nfsd.h" @@ -272,7 +272,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) * 2. Is that directory a mount point, or * 3. Is that directory the root of an exported file system? */ - error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); + error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb); path_put(&path); return error; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 7ecfa24..58134a2 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -11,13 +11,39 @@ #include <linux/types.h> #include <linux/mount.h> +#include <linux/nfs.h> +#include <linux/nfs2.h> +#include <linux/nfs3.h> +#include <linux/nfs4.h> +#include <linux/sunrpc/msg_prot.h> + #include <linux/nfsd/debug.h> #include <linux/nfsd/export.h> #include <linux/nfsd/stats.h> + /* * nfsd version */ #define NFSD_SUPPORTED_MINOR_VERSION 1 +/* + * Maximum blocksizes supported by daemon under various circumstances. + */ +#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD +/* NFSv2 is limited by the protocol specification, see RFC 1094 */ +#define NFSSVC_MAXBLKSIZE_V2 (8*1024) + + +/* + * Largest number of bytes we need to allocate for an NFS + * call or reply. Used to control buffer sizes. We use + * the length of v3 WRITE, READDIR and READDIR replies + * which are an RPC header, up to 26 XDR units of reply + * data, and some page data. + * + * Note that accuracy here doesn't matter too much as the + * size is rounded up to a page size when allocating space. + */ +#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ @@ -335,6 +361,13 @@ static inline u32 nfsd_suppattrs2(u32 minorversion) #define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ NFSD_WRITEABLE_ATTRS_WORD2 +extern int nfsd4_is_junction(struct dentry *dentry); +#else +static inline int nfsd4_is_junction(struct dentry *dentry) +{ + return 0; +} + #endif /* CONFIG_NFSD_V4 */ #endif /* LINUX_NFSD_NFSD_H */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 90c6aa6..68454e7 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -59,28 +59,25 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) * the write call). */ static inline __be32 -nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) +nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, umode_t requested) { - /* Type can be negative when creating hardlinks - not to a dir */ - if (type > 0 && (mode & S_IFMT) != type) { - if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) - return nfserr_symlink; - else if (type == S_IFDIR) - return nfserr_notdir; - else if ((mode & S_IFMT) == S_IFDIR) - return nfserr_isdir; - else - return nfserr_inval; - } - if (type < 0 && (mode & S_IFMT) == -type) { - if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) - return nfserr_symlink; - else if (type == -S_IFDIR) - return nfserr_isdir; - else - return nfserr_notdir; - } - return 0; + mode &= S_IFMT; + + if (requested == 0) /* the caller doesn't care */ + return nfs_ok; + if (mode == requested) + return nfs_ok; + /* + * v4 has an error more specific than err_notdir which we should + * return in preference to err_notdir: + */ + if (rqstp->rq_vers == 4 && mode == S_IFLNK) + return nfserr_symlink; + if (requested == S_IFDIR) + return nfserr_notdir; + if (mode == S_IFDIR) + return nfserr_isdir; + return nfserr_inval; } static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, @@ -296,7 +293,7 @@ out: * include/linux/nfsd/nfsd.h. */ __be32 -fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) +fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) { struct svc_export *exp; struct dentry *dentry; diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index c16f8d8..e5e6707 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -102,7 +102,7 @@ extern char * SVCFH_fmt(struct svc_fh *fhp); /* * Function prototypes */ -__be32 fh_verify(struct svc_rqst *, struct svc_fh *, int, int); +__be32 fh_verify(struct svc_rqst *, struct svc_fh *, umode_t, int); __be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); __be32 fh_update(struct svc_fh *); void fh_put(struct svc_fh *); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index dc5a1bf..eda7d7e 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/freezer.h> +#include <linux/module.h> #include <linux/fs_struct.h> #include <linux/swap.h> @@ -256,6 +257,8 @@ static void nfsd_last_thread(struct svc_serv *serv) nfsd_serv = NULL; nfsd_shutdown(); + svc_rpcb_cleanup(serv); + printk(KERN_WARNING "nfsd: last server has exited, flushing export " "cache\n"); nfsd_export_flush(); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4eefaf1..a3cf384 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -35,6 +35,7 @@ #ifndef _NFSD4_STATE_H #define _NFSD4_STATE_H +#include <linux/idr.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/nfsd/nfsfh.h> #include "nfsfh.h" @@ -45,24 +46,20 @@ typedef struct { } clientid_t; typedef struct { - u32 so_boot; - u32 so_stateownerid; - u32 so_fileid; + clientid_t so_clid; + u32 so_id; } stateid_opaque_t; typedef struct { u32 si_generation; stateid_opaque_t si_opaque; } stateid_t; -#define si_boot si_opaque.so_boot -#define si_stateownerid si_opaque.so_stateownerid -#define si_fileid si_opaque.so_fileid #define STATEID_FMT "(%08x/%08x/%08x/%08x)" #define STATEID_VAL(s) \ - (s)->si_boot, \ - (s)->si_stateownerid, \ - (s)->si_fileid, \ + (s)->si_opaque.so_clid.cl_boot, \ + (s)->si_opaque.so_clid.cl_id, \ + (s)->si_opaque.so_id, \ (s)->si_generation struct nfsd4_callback { @@ -76,17 +73,27 @@ struct nfsd4_callback { bool cb_done; }; +struct nfs4_stid { +#define NFS4_OPEN_STID 1 +#define NFS4_LOCK_STID 2 +#define NFS4_DELEG_STID 4 +/* For an open stateid kept around *only* to process close replays: */ +#define NFS4_CLOSED_STID 8 + unsigned char sc_type; + stateid_t sc_stateid; + struct nfs4_client *sc_client; +}; + struct nfs4_delegation { + struct nfs4_stid dl_stid; /* must be first field */ struct list_head dl_perfile; struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ atomic_t dl_count; /* ref count */ - struct nfs4_client *dl_client; struct nfs4_file *dl_file; u32 dl_type; time_t dl_time; /* For recall: */ - stateid_t dl_stateid; struct knfsd_fh dl_fh; int dl_retries; struct nfsd4_callback dl_recall; @@ -104,6 +111,11 @@ struct nfs4_cb_conn { struct svc_xprt *cb_xprt; /* minorversion 1 only */ }; +static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_delegation, dl_stid); +} + /* Maximum number of slots per session. 160 is useful for long haul TCP */ #define NFSD_MAX_SLOTS_PER_SESSION 160 /* Maximum number of operations per session compound */ @@ -220,6 +232,7 @@ struct nfs4_client { struct list_head cl_idhash; /* hash by cl_clientid.id */ struct list_head cl_strhash; /* hash by cl_name */ struct list_head cl_openowners; + struct idr cl_stateids; /* stateid lookup */ struct list_head cl_delegations; struct list_head cl_lru; /* tail queue */ struct xdr_netobj cl_name; /* id generated by client */ @@ -245,6 +258,7 @@ struct nfs4_client { #define NFSD4_CB_UP 0 #define NFSD4_CB_UNKNOWN 1 #define NFSD4_CB_DOWN 2 +#define NFSD4_CB_FAULT 3 int cl_cb_state; struct nfsd4_callback cl_cb_null; struct nfsd4_session *cl_cb_session; @@ -293,6 +307,9 @@ static inline void update_stateid(stateid_t *stateid) { stateid->si_generation++; + /* Wraparound recommendation from 3530bis-13 9.1.3.2: */ + if (stateid->si_generation == 0) + stateid->si_generation = 1; } /* A reasonable value for REPLAY_ISIZE was estimated as follows: @@ -312,49 +329,57 @@ struct nfs4_replay { __be32 rp_status; unsigned int rp_buflen; char *rp_buf; - unsigned intrp_allocated; struct knfsd_fh rp_openfh; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; -/* -* nfs4_stateowner can either be an open_owner, or a lock_owner -* -* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[] -* for lock_owner -* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[] -* for lock_owner -* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client -* struct is reaped. -* so_perfilestate: heads the list of nfs4_stateid (either open or lock) -* and is used to ensure no dangling nfs4_stateid references when we -* release a stateowner. -* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when -* close is called to reap associated byte-range locks -* so_close_lru: (open) stateowner is placed on this list instead of being -* reaped (when so_perfilestate is empty) to hold the last close replay. -* reaped by laundramat thread after lease period. -*/ struct nfs4_stateowner { - struct kref so_ref; - struct list_head so_idhash; /* hash by so_id */ struct list_head so_strhash; /* hash by op_name */ - struct list_head so_perclient; struct list_head so_stateids; - struct list_head so_perstateid; /* for lockowners only */ - struct list_head so_close_lru; /* tail queue */ - time_t so_time; /* time of placement on so_close_lru */ - int so_is_open_owner; /* 1=openowner,0=lockowner */ - u32 so_id; struct nfs4_client * so_client; /* after increment in ENCODE_SEQID_OP_TAIL, represents the next * sequence id expected from the client: */ u32 so_seqid; struct xdr_netobj so_owner; /* open owner name */ - int so_confirmed; /* successful OPEN_CONFIRM? */ struct nfs4_replay so_replay; + bool so_is_open_owner; }; +struct nfs4_openowner { + struct nfs4_stateowner oo_owner; /* must be first field */ + struct list_head oo_perclient; + /* + * We keep around openowners a little while after last close, + * which saves clients from having to confirm, and allows us to + * handle close replays if they come soon enough. The close_lru + * is a list of such openowners, to be reaped by the laundromat + * thread eventually if they remain unused: + */ + struct list_head oo_close_lru; + struct nfs4_ol_stateid *oo_last_closed_stid; + time_t oo_time; /* time of placement on so_close_lru */ +#define NFS4_OO_CONFIRMED 1 +#define NFS4_OO_PURGE_CLOSE 2 +#define NFS4_OO_NEW 4 + unsigned char oo_flags; +}; + +struct nfs4_lockowner { + struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_perstateid; /* for lockowners only */ + struct list_head lo_list; /* for temporary uses */ +}; + +static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_openowner, oo_owner); +} + +static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_lockowner, lo_owner); +} + /* * nfs4_file: a file opened by some number of (open) nfs4_stateowners. * o fi_perfile list is used to search for conflicting @@ -368,17 +393,17 @@ struct nfs4_file { /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ struct file * fi_fds[3]; /* - * Each open or lock stateid contributes 1 to either - * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending - * on open or lock mode: + * Each open or lock stateid contributes 0-4 to the counts + * below depending on which bits are set in st_access_bitmap: + * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCES_READ is set + * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set + * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set. */ atomic_t fi_access[2]; struct file *fi_deleg_file; struct file_lock *fi_lease; atomic_t fi_delegees; struct inode *fi_inode; - u32 fi_id; /* used with stateowner->so_id - * for stateid_hashtbl hash */ bool fi_had_conflict; }; @@ -408,50 +433,27 @@ static inline struct file *find_any_file(struct nfs4_file *f) return f->fi_fds[O_RDONLY]; } -/* -* nfs4_stateid can either be an open stateid or (eventually) a lock stateid -* -* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file -* -* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry -* st_perfile: file_hashtbl[] entry. -* st_perfile_state: nfs4_stateowner->so_perfilestate -* st_perlockowner: (open stateid) list of lock nfs4_stateowners -* st_access_bmap: used only for open stateid -* st_deny_bmap: used only for open stateid -* st_openstp: open stateid lock stateid was derived from -* -* XXX: open stateids and lock stateids have diverged sufficiently that -* we should consider defining separate structs for the two cases. -*/ - -struct nfs4_stateid { - struct list_head st_hash; +/* "ol" stands for "Open or Lock". Better suggestions welcome. */ +struct nfs4_ol_stateid { + struct nfs4_stid st_stid; /* must be first field */ struct list_head st_perfile; struct list_head st_perstateowner; struct list_head st_lockowners; struct nfs4_stateowner * st_stateowner; struct nfs4_file * st_file; - stateid_t st_stateid; unsigned long st_access_bmap; unsigned long st_deny_bmap; - struct nfs4_stateid * st_openstp; + struct nfs4_ol_stateid * st_openstp; }; +static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_ol_stateid, st_stid); +} + /* flags for preprocess_seqid_op() */ -#define HAS_SESSION 0x00000001 -#define CONFIRM 0x00000002 -#define OPEN_STATE 0x00000004 -#define LOCK_STATE 0x00000008 #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 -#define CLOSE_STATE 0x00000040 - -#define seqid_mutating_err(err) \ - (((err) != nfserr_stale_clientid) && \ - ((err) != nfserr_bad_seqid) && \ - ((err) != nfserr_stale_stateid) && \ - ((err) != nfserr_bad_stateid)) struct nfsd4_compound_state; @@ -461,7 +463,8 @@ extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); extern int nfs4_in_grace(void); extern __be32 nfs4_check_open_reclaim(clientid_t *clid); -extern void nfs4_free_stateowner(struct kref *kref); +extern void nfs4_free_openowner(struct nfs4_openowner *); +extern void nfs4_free_lockowner(struct nfs4_lockowner *); extern int set_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); @@ -473,7 +476,7 @@ extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); -extern void nfsd4_init_recdir(char *recdir_name); +extern void nfsd4_init_recdir(void); extern int nfsd4_recdir_load(void); extern void nfsd4_shutdown_recdir(void); extern int nfs4_client_to_reclaim(const char *name); @@ -482,18 +485,7 @@ extern void nfsd4_recdir_purge_old(void); extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); -extern __be32 nfs4_validate_stateid(stateid_t *, int); - -static inline void -nfs4_put_stateowner(struct nfs4_stateowner *so) -{ - kref_put(&so->so_ref, nfs4_free_stateowner); -} - -static inline void -nfs4_get_stateowner(struct nfs4_stateowner *so) -{ - kref_get(&so->so_ref); -} +extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); +extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index fd0acca..d25a723 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -168,6 +168,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) { if (d_mountpoint(dentry)) return 1; + if (nfsd4_is_junction(dentry)) + return 1; if (!(exp->ex_flags & NFSEXP_V4ROOT)) return 0; return dentry->d_inode != NULL; @@ -305,7 +307,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, struct dentry *dentry; struct inode *inode; int accmode = NFSD_MAY_SATTR; - int ftype = 0; + umode_t ftype = 0; __be32 err; int host_err; int size_change = 0; @@ -502,7 +504,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int flags = 0; /* Get inode */ - error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); if (error) return error; @@ -592,6 +594,22 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac return error; } +#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction." +#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type" +int nfsd4_is_junction(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (inode == NULL) + return 0; + if (inode->i_mode & S_IXUGO) + return 0; + if (!(inode->i_mode & S_ISVTX)) + return 0; + if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0) + return 0; + return 1; +} #endif /* defined(CONFIG_NFSD_V4) */ #ifdef CONFIG_NFSD_V3 @@ -712,7 +730,7 @@ static int nfsd_open_break_lease(struct inode *inode, int access) * N.B. After this call fhp needs an fh_put */ __be32 -nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, +nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access, struct file **filp) { struct dentry *dentry; @@ -1282,7 +1300,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + host_err = fh_want_write(fhp); if (host_err) goto out_nfserr; @@ -1307,7 +1325,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, break; } if (host_err < 0) { - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); goto out_nfserr; } @@ -1321,7 +1339,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err2 = nfserrno(commit_metadata(fhp)); if (err2) err = err2; - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); /* * Update the file handle to get the new inode info. */ @@ -1352,7 +1370,7 @@ __be32 do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, char *fname, int flen, struct iattr *iap, struct svc_fh *resfhp, int createmode, u32 *verifier, - int *truncp, int *created) + bool *truncp, bool *created) { struct dentry *dentry, *dchild = NULL; struct inode *dirp; @@ -1412,7 +1430,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, v_atime = verifier[1]&0x7fffffff; } - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + host_err = fh_want_write(fhp); if (host_err) goto out_nfserr; if (dchild->d_inode) { @@ -1451,13 +1469,13 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, case NFS3_CREATE_GUARDED: err = nfserr_exist; } - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); goto out; } host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); if (host_err < 0) { - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); goto out_nfserr; } if (created) @@ -1485,7 +1503,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!err) err = nfserrno(commit_metadata(fhp)); - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); /* * Update the filehandle to get the new inode info. */ @@ -1582,7 +1600,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + host_err = fh_want_write(fhp); if (host_err) goto out_nfserr; @@ -1603,7 +1621,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserrno(commit_metadata(fhp)); fh_unlock(fhp); - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); dput(dnew); @@ -1632,10 +1650,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; - err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); + err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP); if (err) goto out; - + err = nfserr_isdir; + if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode)) + goto out; err = nfserr_perm; if (!len) goto out; @@ -1654,7 +1674,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, dold = tfhp->fh_dentry; - host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt); + host_err = fh_want_write(tfhp); if (host_err) { err = nfserrno(host_err); goto out_dput; @@ -1679,7 +1699,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserrno(host_err); } out_drop_write: - mnt_drop_write(tfhp->fh_export->ex_path.mnt); + fh_drop_write(tfhp); out_dput: dput(dnew); out_unlock: @@ -1756,7 +1776,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, host_err = -EXDEV; if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) goto out_dput_new; - host_err = mnt_want_write(ffhp->fh_export->ex_path.mnt); + host_err = fh_want_write(ffhp); if (host_err) goto out_dput_new; @@ -1775,7 +1795,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, host_err = commit_metadata(ffhp); } out_drop_write: - mnt_drop_write(ffhp->fh_export->ex_path.mnt); + fh_drop_write(ffhp); out_dput_new: dput(ndentry); out_dput_old: @@ -1834,7 +1854,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = rdentry->d_inode->i_mode & S_IFMT; - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + host_err = fh_want_write(fhp); if (host_err) goto out_put; @@ -1848,7 +1868,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!host_err) host_err = commit_metadata(fhp); out_drop_write: - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); out_put: dput(rdentry); @@ -2114,7 +2134,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && - acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) + (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || + acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) err = inode_permission(inode, MAY_EXEC); return err? nfserrno(err) : 0; @@ -2249,7 +2270,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) } else size = 0; - error = mnt_want_write(fhp->fh_export->ex_path.mnt); + error = fh_want_write(fhp); if (error) goto getout; if (size) @@ -2263,7 +2284,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) error = 0; } } - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); getout: kfree(value); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index e0bbac0..1dcd238 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -10,21 +10,22 @@ /* * Flags for nfsd_permission */ -#define NFSD_MAY_NOP 0 -#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ -#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ -#define NFSD_MAY_READ 4 /* == MAY_READ */ -#define NFSD_MAY_SATTR 8 -#define NFSD_MAY_TRUNC 16 -#define NFSD_MAY_LOCK 32 -#define NFSD_MAY_MASK 63 +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */ +#define NFSD_MAY_READ 0x004 /* == MAY_READ */ +#define NFSD_MAY_SATTR 0x008 +#define NFSD_MAY_TRUNC 0x010 +#define NFSD_MAY_LOCK 0x020 +#define NFSD_MAY_MASK 0x03f /* extra hints to permission and open routines: */ -#define NFSD_MAY_OWNER_OVERRIDE 64 -#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ -#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 -#define NFSD_MAY_NOT_BREAK_LEASE 512 -#define NFSD_MAY_BYPASS_GSS 1024 +#define NFSD_MAY_OWNER_OVERRIDE 0x040 +#define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100 +#define NFSD_MAY_NOT_BREAK_LEASE 0x200 +#define NFSD_MAY_BYPASS_GSS 0x400 +#define NFSD_MAY_READ_IF_EXEC 0x800 #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) @@ -61,11 +62,11 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, struct svc_fh *res, int createmode, - u32 *verifier, int *truncp, int *created); + u32 *verifier, bool *truncp, bool *created); __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, loff_t, unsigned long); #endif /* CONFIG_NFSD_V3 */ -__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int, +__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); void nfsd_close(struct file *); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, @@ -105,4 +106,14 @@ struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); #endif +static inline int fh_want_write(struct svc_fh *fh) +{ + return mnt_want_write(fh->fh_export->ex_path.mnt); +} + +static inline void fh_drop_write(struct svc_fh *fh) +{ + mnt_drop_write(fh->fh_export->ex_path.mnt); +} + #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d2a8d044..2364747 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -81,7 +81,6 @@ struct nfsd4_access { struct nfsd4_close { u32 cl_seqid; /* request */ stateid_t cl_stateid; /* request+response */ - struct nfs4_stateowner * cl_stateowner; /* response */ }; struct nfsd4_commit { @@ -131,7 +130,7 @@ struct nfsd4_link { struct nfsd4_lock_denied { clientid_t ld_clientid; - struct nfs4_stateowner *ld_sop; + struct xdr_netobj ld_owner; u64 ld_start; u64 ld_length; u32 ld_type; @@ -165,9 +164,6 @@ struct nfsd4_lock { } ok; struct nfsd4_lock_denied denied; } u; - /* The lk_replay_owner is the open owner in the open_to_lock_owner - * case and the lock owner otherwise: */ - struct nfs4_stateowner *lk_replay_owner; }; #define lk_new_open_seqid v.new.open_seqid #define lk_new_open_stateid v.new.open_stateid @@ -188,7 +184,6 @@ struct nfsd4_lockt { struct xdr_netobj lt_owner; u64 lt_offset; u64 lt_length; - struct nfs4_stateowner * lt_stateowner; struct nfsd4_lock_denied lt_denied; }; @@ -199,7 +194,6 @@ struct nfsd4_locku { stateid_t lu_stateid; u64 lu_offset; u64 lu_length; - struct nfs4_stateowner *lu_stateowner; }; @@ -232,8 +226,11 @@ struct nfsd4_open { u32 op_recall; /* recall */ struct nfsd4_change_info op_cinfo; /* response */ u32 op_rflags; /* response */ - int op_truncate; /* used during processing */ - struct nfs4_stateowner *op_stateowner; /* used during processing */ + bool op_truncate; /* used during processing */ + bool op_created; /* used during processing */ + struct nfs4_openowner *op_openowner; /* used during processing */ + struct nfs4_file *op_file; /* used during processing */ + struct nfs4_ol_stateid *op_stp; /* used during processing */ struct nfs4_acl *op_acl; }; #define op_iattr iattr @@ -243,7 +240,6 @@ struct nfsd4_open_confirm { stateid_t oc_req_stateid /* request */; u32 oc_seqid /* request */; stateid_t oc_resp_stateid /* response */; - struct nfs4_stateowner * oc_stateowner; /* response */ }; struct nfsd4_open_downgrade { @@ -251,7 +247,6 @@ struct nfsd4_open_downgrade { u32 od_seqid; u32 od_share_access; u32 od_share_deny; - struct nfs4_stateowner *od_stateowner; }; @@ -325,8 +320,7 @@ struct nfsd4_setattr { struct nfsd4_setclientid { nfs4_verifier se_verf; /* request */ - u32 se_namelen; /* request */ - char * se_name; /* request */ + struct xdr_netobj se_name; u32 se_callback_prog; /* request */ u32 se_callback_netid_len; /* request */ char * se_callback_netid_val; /* request */ @@ -351,7 +345,6 @@ struct nfsd4_saved_compoundargs { struct nfsd4_test_stateid { __be32 ts_num_ids; - __be32 ts_has_session; struct nfsd4_compoundargs *ts_saved_args; struct nfsd4_saved_compoundargs ts_savedp; }; @@ -405,6 +398,10 @@ struct nfsd4_destroy_session { struct nfs4_sessionid sessionid; }; +struct nfsd4_destroy_clientid { + clientid_t clientid; +}; + struct nfsd4_reclaim_complete { u32 rca_one_fs; }; @@ -532,6 +529,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, struct nfsd4_compoundargs *); int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, struct nfsd4_compoundres *); +int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, @@ -558,11 +556,13 @@ extern __be32 nfsd4_sequence(struct svc_rqst *, extern __be32 nfsd4_destroy_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_session *); +extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *); __be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, struct nfsd4_open *open); extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); +extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); extern __be32 nfsd4_close(struct svc_rqst *rqstp, diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 3a19239..ca35b3a 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -251,7 +251,7 @@ nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 666628b..8f7b95a 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -291,7 +291,7 @@ const struct address_space_operations nilfs_aops = { .is_partially_uptodate = block_is_partially_uptodate, }; -struct inode *nilfs_new_inode(struct inode *dir, int mode) +struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct the_nilfs *nilfs = sb->s_fs_info; @@ -354,7 +354,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode) failed_acl: failed_bmap: - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); /* raw_inode will be deleted through generic_delete_inode() */ goto failed; @@ -396,7 +396,7 @@ int nilfs_read_inode_common(struct inode *inode, inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le64_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 41d6743..8866496 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -27,7 +27,7 @@ #include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ #include <linux/vmalloc.h> #include <linux/compat.h> /* compat_ptr() */ -#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */ +#include <linux/mount.h> /* mnt_want_write_file(), mnt_drop_write_file() */ #include <linux/buffer_head.h> #include <linux/nilfs2_fs.h> #include "nilfs.h" @@ -119,7 +119,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, if (get_user(flags, (int __user *)argp)) return -EFAULT; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -154,7 +154,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, ret = nilfs_transaction_commit(inode->i_sb); out: mutex_unlock(&inode->i_mutex); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } @@ -174,7 +174,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -194,7 +194,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, up_read(&inode->i_sb->s_umount); out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } @@ -210,7 +210,7 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -225,7 +225,7 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, else nilfs_transaction_commit(inode->i_sb); /* never fails */ out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } @@ -591,7 +591,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -625,6 +625,9 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment) goto out_free; + if (argv[n].v_nmembs >= UINT_MAX / argv[n].v_size) + goto out_free; + len = argv[n].v_size * argv[n].v_nmembs; base = (void __user *)(unsigned long)argv[n].v_base; if (len == 0) { @@ -672,7 +675,7 @@ out_free: vfree(kbufs[n]); kfree(kbufs[4]); out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } @@ -707,7 +710,7 @@ static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) goto out; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) goto out; @@ -718,7 +721,7 @@ static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, ret = nilfs_resize_fs(inode->i_sb, newsize); out_drop_write: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); out: return ret; } @@ -842,6 +845,19 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; + case NILFS_IOCTL_CHANGE_CPMODE: + case NILFS_IOCTL_DELETE_CHECKPOINT: + case NILFS_IOCTL_GET_CPINFO: + case NILFS_IOCTL_GET_CPSTAT: + case NILFS_IOCTL_GET_SUINFO: + case NILFS_IOCTL_GET_SUSTAT: + case NILFS_IOCTL_GET_VINFO: + case NILFS_IOCTL_GET_BDESCS: + case NILFS_IOCTL_CLEAN_SEGMENTS: + case NILFS_IOCTL_SYNC: + case NILFS_IOCTL_RESIZE: + case NILFS_IOCTL_SET_ALLOC_RANGE: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index a314199..1cd3f62 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -84,7 +84,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -112,7 +112,7 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode, } static int -nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; struct nilfs_transaction_info ti; @@ -213,7 +213,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, return err; } -static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; struct nilfs_transaction_info ti; @@ -289,7 +289,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) nilfs_warning(inode->i_sb, __func__, "deleting nonexistent file (%lu), %d\n", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } err = nilfs_delete_entry(de, page); if (err) diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 255d5e1..250add8 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -246,7 +246,7 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, /* inode.c */ void nilfs_inode_add_blocks(struct inode *inode, int n); void nilfs_inode_sub_blocks(struct inode *inode, int n); -extern struct inode *nilfs_new_inode(struct inode *, int); +extern struct inode *nilfs_new_inode(struct inode *, umode_t); extern void nilfs_free_inode(struct inode *); extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern void nilfs_set_inode_flags(struct inode *); @@ -276,10 +276,10 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* super.c */ extern struct inode *nilfs_alloc_inode(struct super_block *); extern void nilfs_destroy_inode(struct inode *); -extern void nilfs_error(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void nilfs_warning(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void nilfs_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void nilfs_warning(struct super_block *, const char *, const char *, ...); extern struct nilfs_super_block * nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); extern int nilfs_store_magic_and_option(struct super_block *, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index bb24ab6..0e72ad6 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2470,7 +2470,7 @@ static int nilfs_segctor_thread(void *arg) if (freezing(current)) { spin_unlock(&sci->sc_state_lock); - refrigerator(); + try_to_freeze(); spin_lock(&sci->sc_state_lock); } else { DEFINE_WAIT(wait); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8351c44..08e3d4f 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -175,8 +175,6 @@ static void nilfs_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct nilfs_mdt_info *mdi = NILFS_MDT(inode); - INIT_LIST_HEAD(&inode->i_dentry); - if (mdi) { kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ kfree(mdi); @@ -650,11 +648,11 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry) { - struct super_block *sb = vfs->mnt_sb; + struct super_block *sb = dentry->d_sb; struct the_nilfs *nilfs = sb->s_fs_info; - struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root; + struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root; if (!nilfs_test_opt(nilfs, BARRIER)) seq_puts(seq, ",nobarrier"); diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 44a88a9..fea6bd5 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -52,7 +52,7 @@ static const struct utf8_table utf8_table[] = #define SURROGATE_LOW 0x00000400 #define SURROGATE_BITS 0x000003ff -int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) +int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu) { unsigned long l; int c0, c, nc; @@ -71,7 +71,7 @@ int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) *pu = (unicode_t) l; return nc; } - if (len <= nc) + if (inlen <= nc) return -1; s++; c = (*s ^ 0x80) & 0xFF; @@ -83,7 +83,7 @@ int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) } EXPORT_SYMBOL(utf8_to_utf32); -int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) +int utf32_to_utf8(unicode_t u, u8 *s, int maxout) { unsigned long l; int c, nc; @@ -97,7 +97,7 @@ int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) return -1; nc = 0; - for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { + for (t = utf8_table; t->cmask && maxout; t++, maxout--) { nc++; if (l <= t->lmask) { c = t->shift; @@ -114,34 +114,57 @@ int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) } EXPORT_SYMBOL(utf32_to_utf8); -int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) +static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + *s = (wchar_t) c; + break; + case UTF16_LITTLE_ENDIAN: + *s = __cpu_to_le16(c); + break; + case UTF16_BIG_ENDIAN: + *s = __cpu_to_be16(c); + break; + } +} + +int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian, + wchar_t *pwcs, int maxout) { u16 *op; int size; unicode_t u; op = pwcs; - while (*s && len > 0) { + while (inlen > 0 && maxout > 0 && *s) { if (*s & 0x80) { - size = utf8_to_utf32(s, len, &u); + size = utf8_to_utf32(s, inlen, &u); if (size < 0) return -EINVAL; + s += size; + inlen -= size; if (u >= PLANE_SIZE) { + if (maxout < 2) + break; u -= PLANE_SIZE; - *op++ = (wchar_t) (SURROGATE_PAIR | - ((u >> 10) & SURROGATE_BITS)); - *op++ = (wchar_t) (SURROGATE_PAIR | + put_utf16(op++, SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS), + endian); + put_utf16(op++, SURROGATE_PAIR | SURROGATE_LOW | - (u & SURROGATE_BITS)); + (u & SURROGATE_BITS), + endian); + maxout -= 2; } else { - *op++ = (wchar_t) u; + put_utf16(op++, u, endian); + maxout--; } - s += size; - len -= size; } else { - *op++ = *s++; - len--; + put_utf16(op++, *s++, endian); + inlen--; + maxout--; } } return op - pwcs; @@ -160,27 +183,27 @@ static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) } } -int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, - u8 *s, int maxlen) +int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian, + u8 *s, int maxout) { u8 *op; int size; unsigned long u, v; op = s; - while (len > 0 && maxlen > 0) { + while (inlen > 0 && maxout > 0) { u = get_utf16(*pwcs, endian); if (!u) break; pwcs++; - len--; + inlen--; if (u > 0x7f) { if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { if (u & SURROGATE_LOW) { /* Ignore character and move on */ continue; } - if (len <= 0) + if (inlen <= 0) break; v = get_utf16(*pwcs, endian); if ((v & SURROGATE_MASK) != SURROGATE_PAIR || @@ -191,18 +214,18 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + (v & SURROGATE_BITS); pwcs++; - len--; + inlen--; } - size = utf32_to_utf8(u, op, maxlen); + size = utf32_to_utf8(u, op, maxout); if (size == -1) { /* Ignore character and move on */ } else { op += size; - maxlen -= size; + maxout -= size; } } else { *op++ = (u8) u; - maxlen--; + maxout--; } } return op - s; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9fde1c0..3568c8a 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -16,6 +16,8 @@ #include <asm/ioctls.h> +#include "../../mount.h" + #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_DEFAULT_MAX_MARKS 8192 #define FANOTIFY_DEFAULT_MAX_LISTENERS 128 @@ -546,7 +548,7 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags); fsnotify_put_mark(fsn_mark); - if (removed & mnt->mnt_fsnotify_mask) + if (removed & real_mount(mnt)->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); return 0; @@ -623,7 +625,7 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - if (added & ~mnt->mnt_fsnotify_mask) + if (added & ~real_mount(mnt)->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); err: fsnotify_put_mark(fsn_mark); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 79b47cb..ccb14d3 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -26,6 +26,7 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" +#include "../mount.h" /* * Clear all of the marks on an inode when it is being evicted from core @@ -205,13 +206,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; struct fsnotify_event *event = NULL; - struct vfsmount *mnt; + struct mount *mnt; int idx, ret = 0; /* global tests shouldn't care about events on child only the specific event */ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); if (data_is == FSNOTIFY_EVENT_PATH) - mnt = ((struct path *)data)->mnt; + mnt = real_mount(((struct path *)data)->mnt); else mnt = NULL; @@ -262,11 +263,11 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, /* we didn't use the vfsmount_mark */ vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { - ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, + ret = send_to_group(to_tell, &mnt->mnt, NULL, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); inode_group = NULL; } else { - ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, + ret = send_to_group(to_tell, &mnt->mnt, inode_mark, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); } diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 778fe6c..b7b4b0e 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -28,15 +28,17 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" +#include "../mount.h" void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { struct fsnotify_mark *mark, *lmark; struct hlist_node *pos, *n; + struct mount *m = real_mount(mnt); LIST_HEAD(free_list); spin_lock(&mnt->mnt_root->d_lock); - hlist_for_each_entry_safe(mark, pos, n, &mnt->mnt_fsnotify_marks, m.m_list) { + hlist_for_each_entry_safe(mark, pos, n, &m->mnt_fsnotify_marks, m.m_list) { list_add(&mark->m.free_m_list, &free_list); hlist_del_init_rcu(&mark->m.m_list); fsnotify_get_mark(mark); @@ -59,15 +61,16 @@ void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) */ static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt) { + struct mount *m = real_mount(mnt); struct fsnotify_mark *mark; struct hlist_node *pos; __u32 new_mask = 0; assert_spin_locked(&mnt->mnt_root->d_lock); - hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) + hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) new_mask |= mark->mask; - mnt->mnt_fsnotify_mask = new_mask; + m->mnt_fsnotify_mask = new_mask; } /* @@ -101,12 +104,13 @@ void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group, struct vfsmount *mnt) { + struct mount *m = real_mount(mnt); struct fsnotify_mark *mark; struct hlist_node *pos; assert_spin_locked(&mnt->mnt_root->d_lock); - hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) { + hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) { if (mark->group == group) { fsnotify_get_mark(mark); return mark; @@ -140,6 +144,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct vfsmount *mnt, int allow_dups) { + struct mount *m = real_mount(mnt); struct fsnotify_mark *lmark; struct hlist_node *node, *last = NULL; int ret = 0; @@ -154,13 +159,13 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, mark->m.mnt = mnt; /* is mark the first mark? */ - if (hlist_empty(&mnt->mnt_fsnotify_marks)) { - hlist_add_head_rcu(&mark->m.m_list, &mnt->mnt_fsnotify_marks); + if (hlist_empty(&m->mnt_fsnotify_marks)) { + hlist_add_head_rcu(&mark->m.m_list, &m->mnt_fsnotify_marks); goto out; } /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, node, &mnt->mnt_fsnotify_marks, m.m_list) { + hlist_for_each_entry(lmark, node, &m->mnt_fsnotify_marks, m.m_list) { last = node; if ((lmark->group == group) && !allow_dups) { diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h index 2142b1c..53c27ea 100644 --- a/fs/ntfs/debug.h +++ b/fs/ntfs/debug.h @@ -30,8 +30,9 @@ extern int debug_msgs; -extern void __ntfs_debug(const char *file, int line, const char *function, - const char *format, ...) __attribute__ ((format (printf, 4, 5))); +extern __printf(4, 5) +void __ntfs_debug(const char *file, int line, const char *function, + const char *format, ...); /** * ntfs_debug - write a debug level message to syslog * @f: a printf format string containing the message @@ -52,12 +53,14 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl); #endif /* !DEBUG */ -extern void __ntfs_warning(const char *function, const struct super_block *sb, - const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...); #define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) -extern void __ntfs_error(const char *function, const struct super_block *sb, - const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...); #define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) #endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 1371487..2eaa666 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -335,7 +335,6 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb) static void ntfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); } @@ -612,7 +611,7 @@ static int ntfs_read_locked_inode(struct inode *vi) * might be tricky due to vfs interactions. Need to think about this * some more when implementing the unlink command. */ - vi->i_nlink = le16_to_cpu(m->link_count); + set_nlink(vi, le16_to_cpu(m->link_count)); /* * FIXME: Reparse points can have the directory bit set even though * they would be S_IFLNK. Need to deal with this further below when we @@ -634,7 +633,7 @@ static int ntfs_read_locked_inode(struct inode *vi) vi->i_mode &= ~vol->dmask; /* Things break without this kludge! */ if (vi->i_nlink > 1) - vi->i_nlink = 1; + set_nlink(vi, 1); } else { vi->i_mode |= S_IFREG; /* Apply the file permissions mask set in the mount options. */ @@ -1242,7 +1241,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) vi->i_version = base_vi->i_version; vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; - vi->i_nlink = base_vi->i_nlink; + set_nlink(vi, base_vi->i_nlink); vi->i_mtime = base_vi->i_mtime; vi->i_ctime = base_vi->i_ctime; vi->i_atime = base_vi->i_atime; @@ -1508,7 +1507,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) vi->i_version = base_vi->i_version; vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; - vi->i_nlink = base_vi->i_nlink; + set_nlink(vi, base_vi->i_nlink); vi->i_mtime = base_vi->i_mtime; vi->i_ctime = base_vi->i_ctime; vi->i_atime = base_vi->i_atime; @@ -2301,16 +2300,16 @@ void ntfs_evict_big_inode(struct inode *vi) /** * ntfs_show_options - show mount options in /proc/mounts * @sf: seq_file in which to write our mount options - * @mnt: vfs mount whose mount options to display + * @root: root of the mounted tree whose mount options to display * * Called by the VFS once for each mounted ntfs volume when someone reads * /proc/mounts in order to display the NTFS specific mount options of each - * mount. The mount options of the vfs mount @mnt are written to the seq file + * mount. The mount options of fs specified by @root are written to the seq file * @sf and success is returned. */ -int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt) +int ntfs_show_options(struct seq_file *sf, struct dentry *root) { - ntfs_volume *vol = NTFS_SB(mnt->mnt_sb); + ntfs_volume *vol = NTFS_SB(root->d_sb); int i; seq_printf(sf, ",uid=%i", vol->uid); diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index fe8e7e9..db29695 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -298,7 +298,7 @@ extern void ntfs_clear_extent_inode(ntfs_inode *ni); extern int ntfs_read_inode_mount(struct inode *vi); -extern int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt); +extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); #ifdef NTFS_RW diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index b52706d..608be45 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -104,7 +104,7 @@ static bool parse_options(ntfs_volume *vol, char *opt) int errors = 0, sloppy = 0; uid_t uid = (uid_t)-1; gid_t gid = (gid_t)-1; - mode_t fmask = (mode_t)-1, dmask = (mode_t)-1; + umode_t fmask = (umode_t)-1, dmask = (umode_t)-1; int mft_zone_multiplier = -1, on_errors = -1; int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; struct nls_table *nls_map = NULL, *old_nls; @@ -287,9 +287,9 @@ no_mount_options: vol->uid = uid; if (gid != (gid_t)-1) vol->gid = gid; - if (fmask != (mode_t)-1) + if (fmask != (umode_t)-1) vol->fmask = fmask; - if (dmask != (mode_t)-1) + if (dmask != (umode_t)-1) vol->dmask = dmask; if (show_sys_files != -1) { if (show_sys_files) diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h index 406ab55..15e3ba8 100644 --- a/fs/ntfs/volume.h +++ b/fs/ntfs/volume.h @@ -48,8 +48,8 @@ typedef struct { unsigned long flags; /* Miscellaneous flags, see below. */ uid_t uid; /* uid that files will be mounted as. */ gid_t gid; /* gid that files will be mounted as. */ - mode_t fmask; /* The mask for file permissions. */ - mode_t dmask; /* The mask for directory + umode_t fmask; /* The mask for file permissions. */ + umode_t dmask; /* The mask for directory permissions. */ u8 mft_zone_multiplier; /* Initial mft zone multiplier. */ u8 on_errors; /* What to do on filesystem errors. */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ed553c6..3165aeb 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5699,7 +5699,7 @@ int ocfs2_remove_btree_range(struct inode *inode, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out; + goto out_commit; } dquot_free_space_nodirty(inode, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c1efe93..78b68af 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -290,7 +290,15 @@ static int ocfs2_readpage(struct file *file, struct page *page) } if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + /* + * Unlock the page and cycle ip_alloc_sem so that we don't + * busyloop waiting for ip_alloc_sem to unlock + */ ret = AOP_TRUNCATED_PAGE; + unlock_page(page); + unlock = 0; + down_read(&oi->ip_alloc_sem); + up_read(&oi->ip_alloc_sem); goto out_inode_unlock; } @@ -563,6 +571,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; int level; + wait_queue_head_t *wq = ocfs2_ioend_wq(inode); /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); @@ -570,6 +579,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (ocfs2_iocb_is_sem_locked(iocb)) ocfs2_iocb_clear_sem_locked(iocb); + if (ocfs2_iocb_is_unaligned_aio(iocb)) { + ocfs2_iocb_clear_unaligned_aio(iocb); + + if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && + waitqueue_active(wq)) { + wake_up_all(wq); + } + } + ocfs2_iocb_clear_rw_locked(iocb); level = ocfs2_iocb_rw_locked_level(iocb); @@ -863,6 +881,12 @@ struct ocfs2_write_ctxt { struct page *w_target_page; /* + * w_target_locked is used for page_mkwrite path indicating no unlocking + * against w_target_page in ocfs2_write_end_nolock. + */ + unsigned int w_target_locked:1; + + /* * ocfs2_write_end() uses this to know what the real range to * write in the target should be. */ @@ -895,6 +919,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) { + int i; + + /* + * w_target_locked is only set to true in the page_mkwrite() case. + * The intent is to allow us to lock the target page from write_begin() + * to write_end(). The caller must hold a ref on w_target_page. + */ + if (wc->w_target_locked) { + BUG_ON(!wc->w_target_page); + for (i = 0; i < wc->w_num_pages; i++) { + if (wc->w_target_page == wc->w_pages[i]) { + wc->w_pages[i] = NULL; + break; + } + } + mark_page_accessed(wc->w_target_page); + page_cache_release(wc->w_target_page); + } ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); brelse(wc->w_di_bh); @@ -1132,20 +1174,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, */ lock_page(mmap_page); + /* Exit and let the caller retry */ if (mmap_page->mapping != mapping) { + WARN_ON(mmap_page->mapping); unlock_page(mmap_page); - /* - * Sanity check - the locking in - * ocfs2_pagemkwrite() should ensure - * that this code doesn't trigger. - */ - ret = -EINVAL; - mlog_errno(ret); + ret = -EAGAIN; goto out; } page_cache_get(mmap_page); wc->w_pages[i] = mmap_page; + wc->w_target_locked = true; } else { wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); @@ -1160,6 +1199,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, wc->w_target_page = wc->w_pages[i]; } out: + if (ret) + wc->w_target_locked = false; return ret; } @@ -1817,11 +1858,23 @@ try_again: */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, cluster_of_pages, mmap_page); - if (ret) { + if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out_quota; } + /* + * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock + * the target page. In this case, we exit with no error and no target + * page. This will trigger the caller, page_mkwrite(), to re-try + * the operation. + */ + if (ret == -EAGAIN) { + BUG_ON(wc->w_target_page); + ret = 0; + goto out_quota; + } + ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, len); if (ret) { diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 75cf3ad..ffb2da3 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits { OCFS2_IOCB_RW_LOCK = 0, OCFS2_IOCB_RW_LOCK_LEVEL, OCFS2_IOCB_SEM, + OCFS2_IOCB_UNALIGNED_IO, OCFS2_IOCB_NUM_LOCKS }; @@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits { clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) #define ocfs2_iocb_is_sem_locked(iocb) \ test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) + +#define ocfs2_iocb_set_unaligned_aio(iocb) \ + set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) +#define ocfs2_iocb_clear_unaligned_aio(iocb) \ + clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) +#define ocfs2_iocb_is_unaligned_aio(iocb) \ + test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) + +#define OCFS2_IOEND_WQ_HASH_SZ 37 +#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\ + OCFS2_IOEND_WQ_HASH_SZ]) +extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; + #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 9a3e6bb..a4e855e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -216,6 +216,7 @@ struct o2hb_region { struct list_head hr_all_item; unsigned hr_unclean_stop:1, + hr_aborted_start:1, hr_item_pinned:1, hr_item_dropped:1; @@ -254,6 +255,10 @@ struct o2hb_region { * a more complete api that doesn't lead to this sort of fragility. */ atomic_t hr_steady_iterations; + /* terminate o2hb thread if it does not reach steady state + * (hr_steady_iterations == 0) within hr_unsteady_iterations */ + atomic_t hr_unsteady_iterations; + char hr_dev_name[BDEVNAME_SIZE]; unsigned int hr_timeout_ms; @@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work) static void o2hb_arm_write_timeout(struct o2hb_region *reg) { + /* Arm writeout only after thread reaches steady state */ + if (atomic_read(®->hr_steady_iterations) != 0) + return; + mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); @@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg, return read == computed; } -/* We want to make sure that nobody is heartbeating on top of us -- - * this will help detect an invalid configuration. */ -static void o2hb_check_last_timestamp(struct o2hb_region *reg) +/* + * Compare the slot data with what we wrote in the last iteration. + * If the match fails, print an appropriate error message. This is to + * detect errors like... another node hearting on the same slot, + * flaky device that is losing writes, etc. + * Returns 1 if check succeeds, 0 otherwise. + */ +static int o2hb_check_own_slot(struct o2hb_region *reg) { struct o2hb_disk_slot *slot; struct o2hb_disk_heartbeat_block *hb_block; @@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg) slot = ®->hr_slots[o2nm_this_node()]; /* Don't check on our 1st timestamp */ if (!slot->ds_last_time) - return; + return 0; hb_block = slot->ds_raw_block; if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && hb_block->hb_node == slot->ds_node_num) - return; + return 1; #define ERRSTR1 "Another node is heartbeating on device" #define ERRSTR2 "Heartbeat generation mismatch on device" @@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg) (unsigned long long)slot->ds_last_time, hb_block->hb_node, (unsigned long long)le64_to_cpu(hb_block->hb_generation), (unsigned long long)le64_to_cpu(hb_block->hb_seq)); + + return 0; } static inline void o2hb_prepare_block(struct o2hb_region *reg, @@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) o2nm_node_put(node); } -static void o2hb_set_quorum_device(struct o2hb_region *reg, - struct o2hb_disk_slot *slot) +static void o2hb_set_quorum_device(struct o2hb_region *reg) { - assert_spin_locked(&o2hb_live_lock); - if (!o2hb_global_heartbeat_active()) return; - if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + /* Prevent race with o2hb_heartbeat_group_drop_item() */ + if (kthread_should_stop()) + return; + + /* Tag region as quorum only after thread reaches steady state */ + if (atomic_read(®->hr_steady_iterations) != 0) return; + spin_lock(&o2hb_live_lock); + + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + goto unlock; + /* * A region can be added to the quorum only when it sees all * live nodes heartbeat on it. In other words, the region has been @@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg, */ if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, sizeof(o2hb_live_node_bitmap))) - return; - - if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD) - return; + goto unlock; - printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", - config_item_name(®->hr_item)); + printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n", + config_item_name(®->hr_item), reg->hr_dev_name); set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); @@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg, if (o2hb_pop_count(&o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) o2hb_region_unpin(NULL); +unlock: + spin_unlock(&o2hb_live_lock); } static int o2hb_check_slot(struct o2hb_region *reg, @@ -925,8 +947,6 @@ fire_callbacks: slot->ds_equal_samples = 0; } out: - o2hb_set_quorum_device(reg, slot); - spin_unlock(&o2hb_live_lock); o2hb_run_event_list(&event); @@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes, static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { - int i, ret, highest_node, change = 0; + int i, ret, highest_node; + int membership_change = 0, own_slot_ok = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct o2hb_bio_wait_ctxt write_wc; @@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) sizeof(configured_nodes)); if (ret) { mlog_errno(ret); - return ret; + goto bail; } /* @@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { - mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); - return -EINVAL; + mlog(ML_NOTICE, "o2hb: No configured nodes found!\n"); + ret = -EINVAL; + goto bail; } /* No sense in reading the slots of nodes that don't exist @@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_read_slots(reg, highest_node + 1); if (ret < 0) { mlog_errno(ret); - return ret; + goto bail; } /* With an up to date view of the slots, we can check that no * other node has been improperly configured to heartbeat in * our slot. */ - o2hb_check_last_timestamp(reg); + own_slot_ok = o2hb_check_own_slot(reg); /* fill in the proper info for our next heartbeat */ o2hb_prepare_block(reg, reg->hr_generation); - /* And fire off the write. Note that we don't wait on this I/O - * until later. */ ret = o2hb_issue_node_write(reg, &write_wc); if (ret < 0) { mlog_errno(ret); - return ret; + goto bail; } i = -1; while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { - change |= o2hb_check_slot(reg, ®->hr_slots[i]); + membership_change |= o2hb_check_slot(reg, ®->hr_slots[i]); } /* @@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * disk */ mlog(ML_ERROR, "Write error %d on device \"%s\"\n", write_wc.wc_error, reg->hr_dev_name); - return write_wc.wc_error; + ret = write_wc.wc_error; + goto bail; } - o2hb_arm_write_timeout(reg); + /* Skip disarming the timeout if own slot has stale/bad data */ + if (own_slot_ok) { + o2hb_set_quorum_device(reg); + o2hb_arm_write_timeout(reg); + } +bail: /* let the person who launched us know when things are steady */ - if (!change && (atomic_read(®->hr_steady_iterations) != 0)) { - if (atomic_dec_and_test(®->hr_steady_iterations)) + if (atomic_read(®->hr_steady_iterations) != 0) { + if (!ret && own_slot_ok && !membership_change) { + if (atomic_dec_and_test(®->hr_steady_iterations)) + wake_up(&o2hb_steady_queue); + } + } + + if (atomic_read(®->hr_steady_iterations) != 0) { + if (atomic_dec_and_test(®->hr_unsteady_iterations)) { + printk(KERN_NOTICE "o2hb: Unable to stabilize " + "heartbeart on region %s (%s)\n", + config_item_name(®->hr_item), + reg->hr_dev_name); + atomic_set(®->hr_steady_iterations, 0); + reg->hr_aborted_start = 1; wake_up(&o2hb_steady_queue); + ret = -EIO; + } } - return 0; + return ret; } /* Subtract b from a, storing the result in a. a *must* have a larger @@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data) /* Pin node */ o2nm_depend_this_node(); - while (!kthread_should_stop() && !reg->hr_unclean_stop) { + while (!kthread_should_stop() && + !reg->hr_unclean_stop && !reg->hr_aborted_start) { /* We track the time spent inside * o2hb_do_disk_heartbeat so that we avoid more than * hr_timeout_ms between disk writes. On busy systems @@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data) * likely to time itself out. */ do_gettimeofday(&before_hb); - i = 0; - do { - ret = o2hb_do_disk_heartbeat(reg); - } while (ret && ++i < 2); + ret = o2hb_do_disk_heartbeat(reg); do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); @@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data) after_hb.tv_sec, (unsigned long) after_hb.tv_usec, elapsed_msec); - if (elapsed_msec < reg->hr_timeout_ms) { + if (!kthread_should_stop() && + elapsed_msec < reg->hr_timeout_ms) { /* the kthread api has blocked signals for us so no * need to record the return value. */ msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); @@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data) * to timeout on this region when we could just as easily * write a clear generation - thus indicating to them that * this node has left this region. - * - * XXX: Should we skip this on unclean_stop? */ - o2hb_prepare_block(reg, 0); - ret = o2hb_issue_node_write(reg, &write_wc); - if (ret == 0) { - o2hb_wait_on_io(reg, &write_wc); - } else { - mlog_errno(ret); + */ + if (!reg->hr_unclean_stop && !reg->hr_aborted_start) { + o2hb_prepare_block(reg, 0); + ret = o2hb_issue_node_write(reg, &write_wc); + if (ret == 0) + o2hb_wait_on_io(reg, &write_wc); + else + mlog_errno(ret); } /* Unpin node */ o2nm_undepend_this_node(); - mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); + mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n"); return 0; } @@ -1158,6 +1198,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) struct o2hb_debug_buf *db = inode->i_private; struct o2hb_region *reg; unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long lts; char *buf = NULL; int i = -1; int out = 0; @@ -1194,9 +1235,11 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) case O2HB_DB_TYPE_REGION_ELAPSED_TIME: reg = (struct o2hb_region *)db->db_data; - out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", - jiffies_to_msecs(jiffies - - reg->hr_last_timeout_start)); + lts = reg->hr_last_timeout_start; + /* If 0, it has never been set before */ + if (lts) + lts = jiffies_to_msecs(jiffies - lts); + out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); goto done; case O2HB_DB_TYPE_REGION_PINNED: @@ -1426,6 +1469,8 @@ static void o2hb_region_release(struct config_item *item) struct page *page; struct o2hb_region *reg = to_o2hb_region(item); + mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); + if (reg->hr_tmp_block) kfree(reg->hr_tmp_block); @@ -1792,7 +1837,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, live_threshold <<= 1; spin_unlock(&o2hb_live_lock); } - atomic_set(®->hr_steady_iterations, live_threshold + 1); + ++live_threshold; + atomic_set(®->hr_steady_iterations, live_threshold); + /* unsteady_iterations is double the steady_iterations */ + atomic_set(®->hr_unsteady_iterations, (live_threshold << 1)); hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", reg->hr_item.ci_name); @@ -1809,14 +1857,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, ret = wait_event_interruptible(o2hb_steady_queue, atomic_read(®->hr_steady_iterations) == 0); if (ret) { - /* We got interrupted (hello ptrace!). Clean up */ - spin_lock(&o2hb_live_lock); - hb_task = reg->hr_task; - reg->hr_task = NULL; - spin_unlock(&o2hb_live_lock); + atomic_set(®->hr_steady_iterations, 0); + reg->hr_aborted_start = 1; + } - if (hb_task) - kthread_stop(hb_task); + if (reg->hr_aborted_start) { + ret = -EIO; goto out; } @@ -1833,8 +1879,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, ret = -EIO; if (hb_task && o2hb_global_heartbeat_active()) - printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", - config_item_name(®->hr_item)); + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n", + config_item_name(®->hr_item), reg->hr_dev_name); out: if (filp) @@ -2092,13 +2138,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, /* stop the thread when the user removes the region dir */ spin_lock(&o2hb_live_lock); - if (o2hb_global_heartbeat_active()) { - clear_bit(reg->hr_region_num, o2hb_region_bitmap); - clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); - if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) - quorum_region = 1; - clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); - } hb_task = reg->hr_task; reg->hr_task = NULL; reg->hr_item_dropped = 1; @@ -2107,19 +2146,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, if (hb_task) kthread_stop(hb_task); + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + clear_bit(reg->hr_region_num, o2hb_region_bitmap); + clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + quorum_region = 1; + clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); + spin_unlock(&o2hb_live_lock); + printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n", + ((atomic_read(®->hr_steady_iterations) == 0) ? + "stopped" : "start aborted"), config_item_name(item), + reg->hr_dev_name); + } + /* * If we're racing a dev_write(), we need to wake them. They will * check reg->hr_task */ if (atomic_read(®->hr_steady_iterations) != 0) { + reg->hr_aborted_start = 1; atomic_set(®->hr_steady_iterations, 0); wake_up(&o2hb_steady_queue); } - if (o2hb_global_heartbeat_active()) - printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", - config_item_name(®->hr_item)); - config_item_put(item); if (!o2hb_global_heartbeat_active() || !quorum_region) diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 3a583590..73ba819 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -47,6 +47,7 @@ #define SC_DEBUG_NAME "sock_containers" #define NST_DEBUG_NAME "send_tracking" #define STATS_DEBUG_NAME "stats" +#define NODES_DEBUG_NAME "connected_nodes" #define SHOW_SOCK_CONTAINERS 0 #define SHOW_SOCK_STATS 1 @@ -55,6 +56,7 @@ static struct dentry *o2net_dentry; static struct dentry *sc_dentry; static struct dentry *nst_dentry; static struct dentry *stats_dentry; +static struct dentry *nodes_dentry; static DEFINE_SPINLOCK(o2net_debug_lock); @@ -491,53 +493,87 @@ static const struct file_operations sc_seq_fops = { .release = sc_fop_release, }; -int o2net_debugfs_init(void) +static int o2net_fill_bitmap(char *buf, int len) { - o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); - if (!o2net_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } + unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int i = -1, out = 0; - nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, - o2net_dentry, NULL, - &nst_seq_fops); - if (!nst_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } + o2net_fill_node_map(map, sizeof(map)); - sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, - o2net_dentry, NULL, - &sc_seq_fops); - if (!sc_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } + while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) + out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += snprintf(buf + out, PAGE_SIZE - out, "\n"); - stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, - o2net_dentry, NULL, - &stats_seq_fops); - if (!stats_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } + return out; +} + +static int nodes_fop_open(struct inode *inode, struct file *file) +{ + char *buf; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE)); + + file->private_data = buf; return 0; -bail: - debugfs_remove(stats_dentry); - debugfs_remove(sc_dentry); - debugfs_remove(nst_dentry); - debugfs_remove(o2net_dentry); - return -ENOMEM; } +static int o2net_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t o2net_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} + +static const struct file_operations nodes_fops = { + .open = nodes_fop_open, + .release = o2net_debug_release, + .read = o2net_debug_read, + .llseek = generic_file_llseek, +}; + void o2net_debugfs_exit(void) { + debugfs_remove(nodes_dentry); debugfs_remove(stats_dentry); debugfs_remove(sc_dentry); debugfs_remove(nst_dentry); debugfs_remove(o2net_dentry); } +int o2net_debugfs_init(void) +{ + umode_t mode = S_IFREG|S_IRUSR; + + o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); + if (o2net_dentry) + nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode, + o2net_dentry, NULL, &nst_seq_fops); + if (nst_dentry) + sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode, + o2net_dentry, NULL, &sc_seq_fops); + if (sc_dentry) + stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode, + o2net_dentry, NULL, &stats_seq_fops); + if (stats_dentry) + nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode, + o2net_dentry, NULL, &nodes_fops); + if (nodes_dentry) + return 0; + + o2net_debugfs_exit(); + mlog_errno(-ENOMEM); + return -ENOMEM; +} + #endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index db5ee4b..044e7b5 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -59,6 +59,7 @@ #include <linux/idr.h> #include <linux/kref.h> #include <linux/net.h> +#include <linux/export.h> #include <net/tcp.h> #include <asm/uaccess.h> @@ -545,7 +546,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, } if (was_valid && !valid) { - printk(KERN_NOTICE "o2net: no longer connected to " + printk(KERN_NOTICE "o2net: No longer connected to " SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); o2net_complete_nodes_nsw(nn); } @@ -555,7 +556,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, cancel_delayed_work(&nn->nn_connect_expired); printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", o2nm_this_node() > sc->sc_node->nd_num ? - "connected to" : "accepted connection from", + "Connected to" : "Accepted connection from", SC_NODEF_ARGS(sc)); } @@ -643,7 +644,7 @@ static void o2net_state_change(struct sock *sk) o2net_sc_queue_work(sc, &sc->sc_connect_work); break; default: - printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT + printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT " shutdown, state %d\n", SC_NODEF_ARGS(sc), sk->sk_state); o2net_sc_queue_work(sc, &sc->sc_shutdown_work); @@ -1034,6 +1035,25 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, return ret; } +/* Get a map of all nodes to which this node is currently connected to */ +void o2net_fill_node_map(unsigned long *map, unsigned bytes) +{ + struct o2net_sock_container *sc; + int node, ret; + + BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); + + memset(map, 0, bytes); + for (node = 0; node < O2NM_MAX_NODES; ++node) { + o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret); + if (!ret) { + set_bit(node, map); + sc_put(sc); + } + } +} +EXPORT_SYMBOL_GPL(o2net_fill_node_map); + int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, size_t caller_veclen, u8 target_node, int *status) { @@ -1284,11 +1304,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { - mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " - "version %llu but %llu is required, disconnecting\n", - SC_NODEF_ARGS(sc), - (unsigned long long)be64_to_cpu(hand->protocol_version), - O2NET_PROTOCOL_VERSION); + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net " + "protocol version %llu but %llu is required. " + "Disconnecting.\n", SC_NODEF_ARGS(sc), + (unsigned long long)be64_to_cpu(hand->protocol_version), + O2NET_PROTOCOL_VERSION); /* don't bother reconnecting if its the wrong version. */ o2net_ensure_shutdown(nn, sc, -ENOTCONN); @@ -1302,33 +1322,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) */ if (be32_to_cpu(hand->o2net_idle_timeout_ms) != o2net_idle_timeout()) { - mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " - "%u ms, but we use %u ms locally. disconnecting\n", - SC_NODEF_ARGS(sc), - be32_to_cpu(hand->o2net_idle_timeout_ms), - o2net_idle_timeout()); + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network " + "idle timeout of %u ms, but we use %u ms locally. " + "Disconnecting.\n", SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_idle_timeout_ms), + o2net_idle_timeout()); o2net_ensure_shutdown(nn, sc, -ENOTCONN); return -1; } if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != o2net_keepalive_delay()) { - mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " - "%u ms, but we use %u ms locally. disconnecting\n", - SC_NODEF_ARGS(sc), - be32_to_cpu(hand->o2net_keepalive_delay_ms), - o2net_keepalive_delay()); + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive " + "delay of %u ms, but we use %u ms locally. " + "Disconnecting.\n", SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_keepalive_delay_ms), + o2net_keepalive_delay()); o2net_ensure_shutdown(nn, sc, -ENOTCONN); return -1; } if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != O2HB_MAX_WRITE_TIMEOUT_MS) { - mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " - "%u ms, but we use %u ms locally. disconnecting\n", - SC_NODEF_ARGS(sc), - be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), - O2HB_MAX_WRITE_TIMEOUT_MS); + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat " + "timeout of %u ms, but we use %u ms locally. " + "Disconnecting.\n", SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), + O2HB_MAX_WRITE_TIMEOUT_MS); o2net_ensure_shutdown(nn, sc, -ENOTCONN); return -1; } @@ -1539,28 +1559,16 @@ static void o2net_idle_timer(unsigned long data) { struct o2net_sock_container *sc = (struct o2net_sock_container *)data; struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); - #ifdef CONFIG_DEBUG_FS - ktime_t now = ktime_get(); + unsigned long msecs = ktime_to_ms(ktime_get()) - + ktime_to_ms(sc->sc_tv_timer); +#else + unsigned long msecs = o2net_idle_timeout(); #endif - printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " - "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), - o2net_idle_timeout() / 1000, - o2net_idle_timeout() % 1000); - -#ifdef CONFIG_DEBUG_FS - mlog(ML_NOTICE, "Here are some times that might help debug the " - "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, " - "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n", - (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now), - (long long)ktime_to_us(sc->sc_tv_data_ready), - (long long)ktime_to_us(sc->sc_tv_advance_start), - (long long)ktime_to_us(sc->sc_tv_advance_stop), - sc->sc_msg_key, sc->sc_msg_type, - (long long)ktime_to_us(sc->sc_tv_func_start), - (long long)ktime_to_us(sc->sc_tv_func_stop)); -#endif + printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " + "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), + msecs / 1000, msecs % 1000); /* * Initialize the nn_timeout so that the next connection attempt @@ -1693,8 +1701,8 @@ static void o2net_start_connect(struct work_struct *work) out: if (ret) { - mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " - "with errno %d\n", SC_NODEF_ARGS(sc), ret); + printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT + " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); /* 0 err so that another will be queued and attempted * from set_nn_state */ if (sc) @@ -1717,8 +1725,8 @@ static void o2net_connect_expired(struct work_struct *work) spin_lock(&nn->nn_lock); if (!nn->nn_sc_valid) { - mlog(ML_ERROR, "no connection established with node %u after " - "%u.%u seconds, giving up and returning errors.\n", + printk(KERN_NOTICE "o2net: No connection established with " + "node %u after %u.%u seconds, giving up.\n", o2net_num_from_nn(nn), o2net_idle_timeout() / 1000, o2net_idle_timeout() % 1000); @@ -1861,21 +1869,21 @@ static int o2net_accept_one(struct socket *sock) node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); if (node == NULL) { - mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n", - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + printk(KERN_NOTICE "o2net: Attempt to connect from unknown " + "node at %pI4:%d\n", &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); ret = -EINVAL; goto out; } if (o2nm_this_node() >= node->nd_num) { local_node = o2nm_get_node_by_num(o2nm_this_node()); - mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' (" - "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n", - local_node->nd_name, local_node->nd_num, - &(local_node->nd_ipv4_address), - ntohs(local_node->nd_ipv4_port), - node->nd_name, node->nd_num, &sin.sin_addr.s_addr, - ntohs(sin.sin_port)); + printk(KERN_NOTICE "o2net: Unexpected connect attempt seen " + "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " + "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, + &(local_node->nd_ipv4_address), + ntohs(local_node->nd_ipv4_port), node->nd_name, + node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); ret = -EINVAL; goto out; } @@ -1900,10 +1908,10 @@ static int o2net_accept_one(struct socket *sock) ret = 0; spin_unlock(&nn->nn_lock); if (ret) { - mlog(ML_NOTICE, "attempt to connect from node '%s' at " - "%pI4:%d but it already has an open connection\n", - node->nd_name, &sin.sin_addr.s_addr, - ntohs(sin.sin_port)); + printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' " + "at %pI4:%d but it already has an open connection\n", + node->nd_name, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); goto out; } @@ -1983,7 +1991,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) { - mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); + printk(KERN_ERR "o2net: Error %d while creating socket\n", ret); goto out; } @@ -2000,16 +2008,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) sock->sk->sk_reuse = 1; ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); if (ret < 0) { - mlog(ML_ERROR, "unable to bind socket at %pI4:%u, " - "ret=%d\n", &addr, ntohs(port), ret); + printk(KERN_ERR "o2net: Error %d while binding socket at " + "%pI4:%u\n", ret, &addr, ntohs(port)); goto out; } ret = sock->ops->listen(sock, 64); - if (ret < 0) { - mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n", - &addr, ntohs(port), ret); - } + if (ret < 0) + printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n", + ret, &addr, ntohs(port)); out: if (ret) { diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index fd6179e..5bada2a 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, struct list_head *unreg_list); void o2net_unregister_handler_list(struct list_head *list); +void o2net_fill_node_map(unsigned long *map, unsigned bytes); + struct o2nm_node; int o2net_register_hb_callbacks(void); void o2net_unregister_hb_callbacks(void); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8582e3f..8fe4e28 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1184,8 +1184,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, if (pde) le16_add_cpu(&pde->rec_len, le16_to_cpu(de->rec_len)); - else - de->inode = 0; + de->inode = 0; dir->i_version++; ocfs2_journal_dirty(handle, bh); goto bail; @@ -2292,7 +2291,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, di_bh); i_size_write(inode, size); - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = ocfs2_inode_sector_count(inode); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); @@ -2354,7 +2353,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, new_bh); i_size_write(inode, inode->i_sb->s_blocksize); - inode->i_nlink = 2; + set_nlink(inode, 2); inode->i_blocks = ocfs2_inode_sector_count(inode); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index d602abb..a5952ce 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -859,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); void dlm_wait_for_recovery(struct dlm_ctxt *dlm); void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); -int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); -int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); +void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); +void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); void dlm_put(struct dlm_ctxt *dlm); struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); @@ -877,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res) kref_get(&res->refs); } void dlm_lockres_put(struct dlm_lock_resource *res); -void __dlm_unhash_lockres(struct dlm_lock_resource *res); -void __dlm_insert_lockres(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res); +void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, const char *name, unsigned int len, @@ -902,46 +901,15 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int namelen); -#define dlm_lockres_set_refmap_bit(bit,res) \ - __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) -#define dlm_lockres_clear_refmap_bit(bit,res) \ - __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) +void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, int bit); +void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, int bit); -static inline void __dlm_lockres_set_refmap_bit(int bit, - struct dlm_lock_resource *res, - const char *file, - int line) -{ - //printk("%s:%d:%.*s: setting bit %d\n", file, line, - // res->lockname.len, res->lockname.name, bit); - set_bit(bit, res->refmap); -} - -static inline void __dlm_lockres_clear_refmap_bit(int bit, - struct dlm_lock_resource *res, - const char *file, - int line) -{ - //printk("%s:%d:%.*s: clearing bit %d\n", file, line, - // res->lockname.len, res->lockname.name, bit); - clear_bit(bit, res->refmap); -} - -void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - const char *file, - int line); -void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - int new_lockres, - const char *file, - int line); -#define dlm_lockres_drop_inflight_ref(d,r) \ - __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__) -#define dlm_lockres_grab_inflight_ref(d,r) \ - __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__) -#define dlm_lockres_grab_inflight_ref_new(d,r) \ - __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__) +void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 56f82cb..0e28e24 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -30,6 +30,7 @@ #include <linux/sysctl.h> #include <linux/spinlock.h> #include <linux/debugfs.h> +#include <linux/export.h> #include "cluster/heartbeat.h" #include "cluster/nodemanager.h" diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6ed6b95..92f2ead 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -157,16 +157,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing, static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); -void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) +void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - if (!hlist_unhashed(&lockres->hash_node)) { - hlist_del_init(&lockres->hash_node); - dlm_lockres_put(lockres); - } + if (hlist_unhashed(&res->hash_node)) + return; + + mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + hlist_del_init(&res->hash_node); + dlm_lockres_put(res); } -void __dlm_insert_lockres(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res) +void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { struct hlist_head *bucket; struct qstr *q; @@ -180,6 +182,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, dlm_lockres_get(res); hlist_add_head(&res->hash_node, bucket); + + mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); } struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, @@ -539,17 +544,17 @@ again: static void __dlm_print_nodes(struct dlm_ctxt *dlm) { - int node = -1; + int node = -1, num = 0; assert_spin_locked(&dlm->spinlock); - printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); - + printk("( "); while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1)) < O2NM_MAX_NODES) { printk("%d ", node); + ++num; } - printk("\n"); + printk(") %u nodes\n", num); } static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, @@ -566,11 +571,10 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, node = exit_msg->node_idx; - printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name); - spin_lock(&dlm->spinlock); clear_bit(node, dlm->domain_map); clear_bit(node, dlm->exit_domain_map); + printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name); __dlm_print_nodes(dlm); /* notify anything attached to the heartbeat events */ @@ -755,6 +759,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) dlm_mark_domain_leaving(dlm); dlm_leave_domain(dlm); + printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name); dlm_force_free_mles(dlm); dlm_complete_dlm_shutdown(dlm); } @@ -970,7 +975,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, clear_bit(assert->node_idx, dlm->exit_domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); - printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", + printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ", assert->node_idx, dlm->name); __dlm_print_nodes(dlm); @@ -1701,8 +1706,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) bail: spin_lock(&dlm->spinlock); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); - if (!status) + if (!status) { + printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name); __dlm_print_nodes(dlm); + } spin_unlock(&dlm->spinlock); if (ctxt) { @@ -2131,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain, goto leave; } - if (!o2hb_check_local_node_heartbeating()) { - mlog(ML_ERROR, "the local node has not been configured, or is " - "not heartbeating\n"); - ret = -EPROTO; - goto leave; - } - mlog(0, "register called for domain \"%s\"\n", domain); retry: diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 8d39e0fd6..975810b 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -183,10 +183,6 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, kick_thread = 1; } } - /* reduce the inflight count, this may result in the lockres - * being purged below during calc_usage */ - if (lock->ml.node == dlm->node_num) - dlm_lockres_drop_inflight_ref(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -231,10 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, lock->ml.type, res->lockname.len, res->lockname.name, flags); + /* + * Wait if resource is getting recovered, remastered, etc. + * If the resource was remastered and new owner is self, then exit. + */ spin_lock(&res->spinlock); - - /* will exit this call with spinlock held */ __dlm_wait_on_lockres(res); + if (res->owner == dlm->node_num) { + spin_unlock(&res->spinlock); + return DLM_RECOVERING; + } res->state |= DLM_LOCK_RES_IN_PROGRESS; /* add lock to local (secondary) queue */ @@ -319,27 +321,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, sizeof(create), res->owner, &status); if (tmpret >= 0) { - // successfully sent and received - ret = status; // this is already a dlm_status + ret = status; if (ret == DLM_REJECTED) { - mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " - "no longer owned by %u. that node is coming back " - "up currently.\n", dlm->name, create.namelen, + mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer " + "owned by node %u. That node is coming back up " + "currently.\n", dlm->name, create.namelen, create.name, res->owner); dlm_print_one_lock_resource(res); BUG(); } } else { - mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " - "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, - res->owner); - if (dlm_is_host_down(tmpret)) { + mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to " + "node %u\n", dlm->name, create.namelen, create.name, + tmpret, res->owner); + if (dlm_is_host_down(tmpret)) ret = DLM_RECOVERING; - mlog(0, "node %u died so returning DLM_RECOVERING " - "from lock message!\n", res->owner); - } else { + else ret = dlm_err_to_dlm_status(tmpret); - } } return ret; @@ -440,7 +438,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, /* zero memory only if kernel-allocated */ lksb = kzalloc(sizeof(*lksb), GFP_NOFS); if (!lksb) { - kfree(lock); + kmem_cache_free(dlm_lock_cache, lock); return NULL; } kernel_allocated = 1; @@ -718,18 +716,10 @@ retry_lock: if (status == DLM_RECOVERING || status == DLM_MIGRATING || status == DLM_FORWARD) { - mlog(0, "retrying lock with migration/" - "recovery/in progress\n"); msleep(100); - /* no waiting for dlm_reco_thread */ if (recovery) { if (status != DLM_RECOVERING) goto retry_lock; - - mlog(0, "%s: got RECOVERING " - "for $RECOVERY lock, master " - "was %u\n", dlm->name, - res->owner); /* wait to see the node go down, then * drop down and allow the lockres to * get cleaned up. need to remaster. */ @@ -741,6 +731,14 @@ retry_lock: } } + /* Inflight taken in dlm_get_lock_resource() is dropped here */ + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + + dlm_lockres_calc_usage(dlm, res); + dlm_kick_thread(dlm, res); + if (status != DLM_NORMAL) { lock->lksb->flags &= ~DLM_LKSB_GET_LVB; if (status != DLM_NOTQUEUED) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 11eefb8..005261c 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -631,39 +631,54 @@ error: return NULL; } -void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - int new_lockres, - const char *file, - int line) +void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, int bit) { - if (!new_lockres) - assert_spin_locked(&res->spinlock); + assert_spin_locked(&res->spinlock); + + mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len, + res->lockname.name, bit, __builtin_return_address(0)); + + set_bit(bit, res->refmap); +} + +void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, int bit) +{ + assert_spin_locked(&res->spinlock); + + mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len, + res->lockname.name, bit, __builtin_return_address(0)); + + clear_bit(bit, res->refmap); +} + + +void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); - if (!test_bit(dlm->node_num, res->refmap)) { - BUG_ON(res->inflight_locks != 0); - dlm_lockres_set_refmap_bit(dlm->node_num, res); - } res->inflight_locks++; - mlog(0, "%s:%.*s: inflight++: now %u\n", - dlm->name, res->lockname.len, res->lockname.name, - res->inflight_locks); + + mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, + res->lockname.len, res->lockname.name, res->inflight_locks, + __builtin_return_address(0)); } -void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - const char *file, - int line) +void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) { assert_spin_locked(&res->spinlock); BUG_ON(res->inflight_locks == 0); + res->inflight_locks--; - mlog(0, "%s:%.*s: inflight--: now %u\n", - dlm->name, res->lockname.len, res->lockname.name, - res->inflight_locks); - if (res->inflight_locks == 0) - dlm_lockres_clear_refmap_bit(dlm->node_num, res); + + mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name, + res->lockname.len, res->lockname.name, res->inflight_locks, + __builtin_return_address(0)); + wake_up(&res->wq); } @@ -697,7 +712,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, unsigned int hash; int tries = 0; int bit, wait_on_recovery = 0; - int drop_inflight_if_nonlocal = 0; BUG_ON(!lockid); @@ -709,36 +723,33 @@ lookup: spin_lock(&dlm->spinlock); tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); if (tmpres) { - int dropping_ref = 0; - spin_unlock(&dlm->spinlock); - spin_lock(&tmpres->spinlock); - /* We wait for the other thread that is mastering the resource */ + /* Wait on the thread that is mastering the resource */ if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { __dlm_wait_on_lockres(tmpres); BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; } - if (tmpres->owner == dlm->node_num) { - BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); - dlm_lockres_grab_inflight_ref(dlm, tmpres); - } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) - dropping_ref = 1; - spin_unlock(&tmpres->spinlock); - - /* wait until done messaging the master, drop our ref to allow - * the lockres to be purged, start over. */ - if (dropping_ref) { - spin_lock(&tmpres->spinlock); - __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); + /* Wait on the resource purge to complete before continuing */ + if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) { + BUG_ON(tmpres->owner == dlm->node_num); + __dlm_wait_on_lockres_flags(tmpres, + DLM_LOCK_RES_DROPPING_REF); spin_unlock(&tmpres->spinlock); dlm_lockres_put(tmpres); tmpres = NULL; goto lookup; } - mlog(0, "found in hash!\n"); + /* Grab inflight ref to pin the resource */ + dlm_lockres_grab_inflight_ref(dlm, tmpres); + + spin_unlock(&tmpres->spinlock); if (res) dlm_lockres_put(res); res = tmpres; @@ -829,8 +840,8 @@ lookup: * but they might own this lockres. wait on them. */ bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); if (bit < O2NM_MAX_NODES) { - mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " - "recover before lock mastery can begin\n", + mlog(0, "%s: res %.*s, At least one node (%d) " + "to recover before lock mastery can begin\n", dlm->name, namelen, (char *)lockid, bit); wait_on_recovery = 1; } @@ -843,12 +854,11 @@ lookup: /* finally add the lockres to its hash bucket */ __dlm_insert_lockres(dlm, res); - /* since this lockres is new it doesn't not require the spinlock */ - dlm_lockres_grab_inflight_ref_new(dlm, res); - /* if this node does not become the master make sure to drop - * this inflight reference below */ - drop_inflight_if_nonlocal = 1; + /* Grab inflight ref to pin the resource */ + spin_lock(&res->spinlock); + dlm_lockres_grab_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); /* get an extra ref on the mle in case this is a BLOCK * if so, the creator of the BLOCK may try to put the last @@ -864,8 +874,8 @@ redo_request: * dlm spinlock would be detectable be a change on the mle, * so we only need to clear out the recovery map once. */ if (dlm_is_recovery_lock(lockid, namelen)) { - mlog(ML_NOTICE, "%s: recovery map is not empty, but " - "must master $RECOVERY lock now\n", dlm->name); + mlog(0, "%s: Recovery map is not empty, but must " + "master $RECOVERY lock now\n", dlm->name); if (!dlm_pre_master_reco_lockres(dlm, res)) wait_on_recovery = 0; else { @@ -883,8 +893,8 @@ redo_request: spin_lock(&dlm->spinlock); bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); if (bit < O2NM_MAX_NODES) { - mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " - "recover before lock mastery can begin\n", + mlog(0, "%s: res %.*s, At least one node (%d) " + "to recover before lock mastery can begin\n", dlm->name, namelen, (char *)lockid, bit); wait_on_recovery = 1; } else @@ -913,8 +923,8 @@ redo_request: * yet, keep going until it does. this is how the * master will know that asserts are needed back to * the lower nodes. */ - mlog(0, "%s:%.*s: requests only up to %u but master " - "is %u, keep going\n", dlm->name, namelen, + mlog(0, "%s: res %.*s, Requests only up to %u but " + "master is %u, keep going\n", dlm->name, namelen, lockid, nodenum, mle->master); } } @@ -924,13 +934,12 @@ wait: ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); if (ret < 0) { wait_on_recovery = 1; - mlog(0, "%s:%.*s: node map changed, redo the " - "master request now, blocked=%d\n", - dlm->name, res->lockname.len, + mlog(0, "%s: res %.*s, Node map changed, redo the master " + "request now, blocked=%d\n", dlm->name, res->lockname.len, res->lockname.name, blocked); if (++tries > 20) { - mlog(ML_ERROR, "%s:%.*s: spinning on " - "dlm_wait_for_lock_mastery, blocked=%d\n", + mlog(ML_ERROR, "%s: res %.*s, Spinning on " + "dlm_wait_for_lock_mastery, blocked = %d\n", dlm->name, res->lockname.len, res->lockname.name, blocked); dlm_print_one_lock_resource(res); @@ -940,7 +949,8 @@ wait: goto redo_request; } - mlog(0, "lockres mastered by %u\n", res->owner); + mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len, + res->lockname.name, res->owner); /* make sure we never continue without this */ BUG_ON(res->owner == O2NM_MAX_NODES); @@ -952,8 +962,6 @@ wait: wake_waiters: spin_lock(&res->spinlock); - if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) - dlm_lockres_drop_inflight_ref(dlm, res); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -1426,9 +1434,7 @@ way_up_top: } if (res->owner == dlm->node_num) { - mlog(0, "%s:%.*s: setting bit %u in refmap\n", - dlm->name, namelen, name, request->node_idx); - dlm_lockres_set_refmap_bit(request->node_idx, res); + dlm_lockres_set_refmap_bit(dlm, res, request->node_idx); spin_unlock(&res->spinlock); response = DLM_MASTER_RESP_YES; if (mle) @@ -1493,10 +1499,8 @@ way_up_top: * go back and clean the mles on any * other nodes */ dispatch_assert = 1; - dlm_lockres_set_refmap_bit(request->node_idx, res); - mlog(0, "%s:%.*s: setting bit %u in refmap\n", - dlm->name, namelen, name, - request->node_idx); + dlm_lockres_set_refmap_bit(dlm, res, + request->node_idx); } else response = DLM_MASTER_RESP_NO; } else { @@ -1702,7 +1706,7 @@ again: "lockres, set the bit in the refmap\n", namelen, lockname, to); spin_lock(&res->spinlock); - dlm_lockres_set_refmap_bit(to, res); + dlm_lockres_set_refmap_bit(dlm, res, to); spin_unlock(&res->spinlock); } } @@ -2187,8 +2191,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) namelen = res->lockname.len; BUG_ON(namelen > O2NM_MAX_NAME_LEN); - mlog(0, "%s:%.*s: sending deref to %d\n", - dlm->name, namelen, lockname, res->owner); memset(&deref, 0, sizeof(deref)); deref.node_idx = dlm->node_num; deref.namelen = namelen; @@ -2197,14 +2199,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, &deref, sizeof(deref), res->owner, &r); if (ret < 0) - mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " - "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, - res->owner); + mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n", + dlm->name, namelen, lockname, ret, res->owner); else if (r < 0) { /* BAD. other node says I did not have a ref. */ - mlog(ML_ERROR,"while dropping ref on %s:%.*s " - "(master=%u) got %d.\n", dlm->name, namelen, - lockname, res->owner, r); + mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", + dlm->name, namelen, lockname, res->owner, r); dlm_print_one_lock_resource(res); BUG(); } @@ -2260,7 +2260,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, else { BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); if (test_bit(node, res->refmap)) { - dlm_lockres_clear_refmap_bit(node, res); + dlm_lockres_clear_refmap_bit(dlm, res, node); cleared = 1; } } @@ -2320,7 +2320,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); if (test_bit(node, res->refmap)) { __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); - dlm_lockres_clear_refmap_bit(node, res); + dlm_lockres_clear_refmap_bit(dlm, res, node); cleared = 1; } spin_unlock(&res->spinlock); @@ -2802,7 +2802,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, BUG_ON(!list_empty(&lock->bast_list)); BUG_ON(lock->ast_pending); BUG_ON(lock->bast_pending); - dlm_lockres_clear_refmap_bit(lock->ml.node, res); + dlm_lockres_clear_refmap_bit(dlm, res, + lock->ml.node); list_del_init(&lock->list); dlm_lock_put(lock); /* In a normal unlock, we would have added a @@ -2823,7 +2824,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, mlog(0, "%s:%.*s: node %u had a ref to this " "migrating lockres, clearing\n", dlm->name, res->lockname.len, res->lockname.name, bit); - dlm_lockres_clear_refmap_bit(bit, res); + dlm_lockres_clear_refmap_bit(dlm, res, bit); } bit++; } @@ -2916,9 +2917,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, &migrate, sizeof(migrate), nodenum, &status); if (ret < 0) { - mlog(ML_ERROR, "Error %d when sending message %u (key " - "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, - dlm->key, nodenum); + mlog(ML_ERROR, "%s: res %.*s, Error %d send " + "MIGRATE_REQUEST to node %u\n", dlm->name, + migrate.namelen, migrate.name, ret, nodenum); if (!dlm_is_host_down(ret)) { mlog(ML_ERROR, "unhandled error=%d!\n", ret); BUG(); @@ -2937,7 +2938,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, dlm->name, res->lockname.len, res->lockname.name, nodenum); spin_lock(&res->spinlock); - dlm_lockres_set_refmap_bit(nodenum, res); + dlm_lockres_set_refmap_bit(dlm, res, nodenum); spin_unlock(&res->spinlock); } } @@ -3271,7 +3272,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, * mastery reference here since old_master will briefly have * a reference after the migration completes */ spin_lock(&res->spinlock); - dlm_lockres_set_refmap_bit(old_master, res); + dlm_lockres_set_refmap_bit(dlm, res, old_master); spin_unlock(&res->spinlock); mlog(0, "now time to do a migrate request to other nodes\n"); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 7efab6d..01ebfd0 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -362,40 +362,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) } -int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) +void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) { - if (timeout) { - mlog(ML_NOTICE, "%s: waiting %dms for notification of " - "death of node %u\n", dlm->name, timeout, node); + if (dlm_is_node_dead(dlm, node)) + return; + + printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in " + "domain %s\n", node, dlm->name); + + if (timeout) wait_event_timeout(dlm->dlm_reco_thread_wq, - dlm_is_node_dead(dlm, node), - msecs_to_jiffies(timeout)); - } else { - mlog(ML_NOTICE, "%s: waiting indefinitely for notification " - "of death of node %u\n", dlm->name, node); + dlm_is_node_dead(dlm, node), + msecs_to_jiffies(timeout)); + else wait_event(dlm->dlm_reco_thread_wq, dlm_is_node_dead(dlm, node)); - } - /* for now, return 0 */ - return 0; } -int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) +void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) { - if (timeout) { - mlog(0, "%s: waiting %dms for notification of " - "recovery of node %u\n", dlm->name, timeout, node); + if (dlm_is_node_recovered(dlm, node)) + return; + + printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in " + "domain %s\n", node, dlm->name); + + if (timeout) wait_event_timeout(dlm->dlm_reco_thread_wq, - dlm_is_node_recovered(dlm, node), - msecs_to_jiffies(timeout)); - } else { - mlog(0, "%s: waiting indefinitely for notification " - "of recovery of node %u\n", dlm->name, node); + dlm_is_node_recovered(dlm, node), + msecs_to_jiffies(timeout)); + else wait_event(dlm->dlm_reco_thread_wq, dlm_is_node_recovered(dlm, node)); - } - /* for now, return 0 */ - return 0; } /* callers of the top-level api calls (dlmlock/dlmunlock) should @@ -430,6 +428,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm) { spin_lock(&dlm->spinlock); BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); + printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", + dlm->name, dlm->reco.dead_node); dlm->reco.state |= DLM_RECO_STATE_ACTIVE; spin_unlock(&dlm->spinlock); } @@ -440,9 +440,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm) BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; spin_unlock(&dlm->spinlock); + printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name); wake_up(&dlm->reco.event); } +static void dlm_print_recovery_master(struct dlm_ctxt *dlm) +{ + printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the " + "dead node %u in domain %s\n", dlm->reco.new_master, + (dlm->node_num == dlm->reco.new_master ? "me" : "he"), + dlm->reco.dead_node, dlm->name); +} + static int dlm_do_recovery(struct dlm_ctxt *dlm) { int status = 0; @@ -505,9 +514,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) } mlog(0, "another node will master this recovery session.\n"); } - mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", - dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, - dlm->node_num, dlm->reco.dead_node); + + dlm_print_recovery_master(dlm); /* it is safe to start everything back up here * because all of the dead node's lock resources @@ -518,15 +526,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) return 0; master_here: - mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node " - "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task), - dlm->node_num, dlm->reco.dead_node, dlm->name); + dlm_print_recovery_master(dlm); status = dlm_remaster_locks(dlm, dlm->reco.dead_node); if (status < 0) { /* we should never hit this anymore */ - mlog(ML_ERROR, "error %d remastering locks for node %u, " - "retrying.\n", status, dlm->reco.dead_node); + mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, " + "retrying.\n", dlm->name, status, dlm->reco.dead_node); /* yield a bit to allow any final network messages * to get handled on remaining nodes */ msleep(100); @@ -567,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); ndata->state = DLM_RECO_NODE_DATA_REQUESTING; - mlog(0, "requesting lock info from node %u\n", + mlog(0, "%s: Requesting lock info from node %u\n", dlm->name, ndata->node_num); if (ndata->node_num == dlm->node_num) { @@ -640,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) spin_unlock(&dlm_reco_state_lock); } - mlog(0, "done requesting all lock info\n"); + mlog(0, "%s: Done requesting all lock info\n", dlm->name); /* nodes should be sending reco data now * just need to wait */ @@ -802,10 +808,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, /* negative status is handled by caller */ if (ret < 0) - mlog(ML_ERROR, "Error %d when sending message %u (key " - "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, - dlm->key, request_from); - + mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u " + "to recover dead node %u\n", dlm->name, ret, + request_from, dead_node); // return from here, then // sleep until all received or error return ret; @@ -956,9 +961,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, sizeof(done_msg), send_to, &tmpret); if (ret < 0) { - mlog(ML_ERROR, "Error %d when sending message %u (key " - "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, - dlm->key, send_to); + mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u " + "to recover dead node %u\n", dlm->name, ret, send_to, + dead_node); if (!dlm_is_host_down(ret)) { BUG(); } @@ -1127,9 +1132,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, if (ret < 0) { /* XXX: negative status is not handled. * this will end up killing this node. */ - mlog(ML_ERROR, "Error %d when sending message %u (key " - "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, - dlm->key, send_to); + mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to " + "node %u (%s)\n", dlm->name, mres->lockname_len, + mres->lockname, ret, send_to, + (orig_flags & DLM_MRES_MIGRATION ? + "migration" : "recovery")); } else { /* might get an -ENOMEM back here */ ret = status; @@ -1767,7 +1774,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, dlm->name, mres->lockname_len, mres->lockname, from); spin_lock(&res->spinlock); - dlm_lockres_set_refmap_bit(from, res); + dlm_lockres_set_refmap_bit(dlm, res, from); spin_unlock(&res->spinlock); added++; break; @@ -1965,7 +1972,7 @@ skip_lvb: mlog(0, "%s:%.*s: added lock for node %u, " "setting refmap bit\n", dlm->name, res->lockname.len, res->lockname.name, ml->node); - dlm_lockres_set_refmap_bit(ml->node, res); + dlm_lockres_set_refmap_bit(dlm, res, ml->node); added++; } spin_unlock(&res->spinlock); @@ -2084,6 +2091,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { if (res->owner == dead_node) { + mlog(0, "%s: res %.*s, Changing owner from %u to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->owner, new_master); list_del_init(&res->recovering); spin_lock(&res->spinlock); /* new_master has our reference from @@ -2105,40 +2115,30 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, hash_iter, bucket, hash_node) { - if (res->state & DLM_LOCK_RES_RECOVERING) { - if (res->owner == dead_node) { - mlog(0, "(this=%u) res %.*s owner=%u " - "was not on recovering list, but " - "clearing state anyway\n", - dlm->node_num, res->lockname.len, - res->lockname.name, new_master); - } else if (res->owner == dlm->node_num) { - mlog(0, "(this=%u) res %.*s owner=%u " - "was not on recovering list, " - "owner is THIS node, clearing\n", - dlm->node_num, res->lockname.len, - res->lockname.name, new_master); - } else - continue; + if (!(res->state & DLM_LOCK_RES_RECOVERING)) + continue; - if (!list_empty(&res->recovering)) { - mlog(0, "%s:%.*s: lockres was " - "marked RECOVERING, owner=%u\n", - dlm->name, res->lockname.len, - res->lockname.name, res->owner); - list_del_init(&res->recovering); - dlm_lockres_put(res); - } - spin_lock(&res->spinlock); - /* new_master has our reference from - * the lock state sent during recovery */ - dlm_change_lockres_owner(dlm, res, new_master); - res->state &= ~DLM_LOCK_RES_RECOVERING; - if (__dlm_lockres_has_locks(res)) - __dlm_dirty_lockres(dlm, res); - spin_unlock(&res->spinlock); - wake_up(&res->wq); + if (res->owner != dead_node && + res->owner != dlm->node_num) + continue; + + if (!list_empty(&res->recovering)) { + list_del_init(&res->recovering); + dlm_lockres_put(res); } + + /* new_master has our reference from + * the lock state sent during recovery */ + mlog(0, "%s: res %.*s, Changing owner from %u to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->owner, new_master); + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, new_master); + res->state &= ~DLM_LOCK_RES_RECOVERING; + if (__dlm_lockres_has_locks(res)) + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); } } } @@ -2252,12 +2252,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, res->lockname.len, res->lockname.name, freed, dead_node); __dlm_print_one_lock_resource(res); } - dlm_lockres_clear_refmap_bit(dead_node, res); + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); } else if (test_bit(dead_node, res->refmap)) { mlog(0, "%s:%.*s: dead node %u had a ref, but had " "no locks and had not purged before dying\n", dlm->name, res->lockname.len, res->lockname.name, dead_node); - dlm_lockres_clear_refmap_bit(dead_node, res); + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); } /* do not kick thread yet */ @@ -2324,9 +2324,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dlm_revalidate_lvb(dlm, res, dead_node); if (res->owner == dead_node) { if (res->state & DLM_LOCK_RES_DROPPING_REF) { - mlog(ML_NOTICE, "Ignore %.*s for " + mlog(ML_NOTICE, "%s: res %.*s, Skip " "recovery as it is being freed\n", - res->lockname.len, + dlm->name, res->lockname.len, res->lockname.name); } else dlm_move_lockres_to_recovery_list(dlm, diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 1d6d1d2..e73c833 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -94,24 +94,26 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) { int bit; + assert_spin_locked(&res->spinlock); + if (__dlm_lockres_has_locks(res)) return 0; + /* Locks are in the process of being created */ + if (res->inflight_locks) + return 0; + if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) return 0; if (res->state & DLM_LOCK_RES_RECOVERING) return 0; + /* Another node has this resource with this node as the master */ bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); if (bit < O2NM_MAX_NODES) return 0; - /* - * since the bit for dlm->node_num is not set, inflight_locks better - * be zero - */ - BUG_ON(res->inflight_locks != 0); return 1; } @@ -185,8 +187,6 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, /* clear our bit from the master's refmap, ignore errors */ ret = dlm_drop_lockres_ref(dlm, res); if (ret < 0) { - mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name, - res->lockname.len, res->lockname.name, ret); if (!dlm_is_host_down(ret)) BUG(); } @@ -209,7 +209,7 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, BUG(); } - __dlm_unhash_lockres(res); + __dlm_unhash_lockres(dlm, res); /* lockres is not in the hash now. drop the flag and wake up * any processes waiting in dlm_get_lock_resource. */ diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b420767..abfac0d 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -354,7 +354,6 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb) static void dlmfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); } @@ -401,16 +400,14 @@ static struct backing_dev_info dlmfs_backing_dev_info = { static struct inode *dlmfs_get_root_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); - int mode = S_IFDIR | 0755; + umode_t mode = S_IFDIR | 0755; struct dlmfs_inode_private *ip; if (inode) { ip = DLMFS_I(inode); inode->i_ino = get_next_ino(); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, NULL, mode); inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inc_nlink(inode); @@ -424,7 +421,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) static struct inode *dlmfs_get_inode(struct inode *parent, struct dentry *dentry, - int mode) + umode_t mode) { struct super_block *sb = parent->i_sb; struct inode * inode = new_inode(sb); @@ -434,9 +431,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, return NULL; inode->i_ino = get_next_ino(); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, parent, mode); inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -473,13 +468,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inc_nlink(inode); break; } - - if (parent->i_mode & S_ISGID) { - inode->i_gid = parent->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } - return inode; } @@ -489,7 +477,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, /* SMP-safe */ static int dlmfs_mkdir(struct inode * dir, struct dentry * dentry, - int mode) + umode_t mode) { int status; struct inode *inode = NULL; @@ -537,7 +525,7 @@ bail: static int dlmfs_create(struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, struct nameidata *nd) { int status = 0; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 7642d7c..81a4cd2 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1692,7 +1692,7 @@ int ocfs2_open_lock(struct inode *inode) mlog(0, "inode %llu take PRMODE open lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); - if (ocfs2_mount_local(osb)) + if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) goto out; lockres = &OCFS2_I(inode)->ip_open_lockres; @@ -1718,6 +1718,12 @@ int ocfs2_try_open_lock(struct inode *inode, int write) (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); + if (ocfs2_is_hard_readonly(osb)) { + if (write) + status = -EROFS; + goto out; + } + if (ocfs2_mount_local(osb)) goto out; @@ -2092,7 +2098,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) inode->i_uid = be32_to_cpu(lvb->lvb_iuid); inode->i_gid = be32_to_cpu(lvb->lvb_igid); inode->i_mode = be16_to_cpu(lvb->lvb_imode); - inode->i_nlink = be16_to_cpu(lvb->lvb_inlink); + set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); ocfs2_unpack_timespec(&inode->i_atime, be64_to_cpu(lvb->lvb_iatime_packed)); ocfs2_unpack_timespec(&inode->i_mtime, @@ -2298,7 +2304,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, if (ocfs2_is_hard_readonly(osb)) { if (ex) status = -EROFS; - goto bail; + goto getbh; } if (ocfs2_mount_local(osb)) @@ -2356,7 +2362,7 @@ local: mlog_errno(status); goto bail; } - +getbh: if (ret_bh) { status = ocfs2_assign_bh(inode, ret_bh, local_bh); if (status < 0) { @@ -2628,8 +2634,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex) BUG_ON(!dl); - if (ocfs2_is_hard_readonly(osb)) - return -EROFS; + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + return -EROFS; + return 0; + } if (ocfs2_mount_local(osb)) return 0; @@ -2647,7 +2656,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex) struct ocfs2_dentry_lock *dl = dentry->d_fsdata; struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - if (!ocfs2_mount_local(osb)) + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); } diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 23457b4..2f5b92e 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -832,6 +832,102 @@ out: return ret; } +int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int ret; + unsigned int is_last = 0, is_data = 0; + u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 cpos, cend, clen, hole_size; + u64 extoff, extlen; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + if (*offset >= inode->i_size) { + ret = -ENXIO; + goto out_unlock; + } + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (origin == SEEK_HOLE) + *offset = inode->i_size; + goto out_unlock; + } + + clen = 0; + cpos = *offset >> cs_bits; + cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); + + while (cpos < cend && !is_last) { + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, + &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + extoff = cpos; + extoff <<= cs_bits; + + if (rec.e_blkno == 0ULL) { + clen = hole_size; + is_data = 0; + } else { + clen = le16_to_cpu(rec.e_leaf_clusters) - + (cpos - le32_to_cpu(rec.e_cpos)); + is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1; + } + + if ((!is_data && origin == SEEK_HOLE) || + (is_data && origin == SEEK_DATA)) { + if (extoff > *offset) + *offset = extoff; + goto out_unlock; + } + + if (!is_last) + cpos += clen; + } + + if (origin == SEEK_HOLE) { + extoff = cpos; + extoff <<= cs_bits; + extlen = clen; + extlen <<= cs_bits; + + if ((extoff + extlen) > inode->i_size) + extlen = inode->i_size - extoff; + extoff += extlen; + if (extoff > *offset) + *offset = extoff; + goto out_unlock; + } + + ret = -ENXIO; + +out_unlock: + + brelse(di_bh); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_inode_unlock(inode, 0); +out: + if (ret && ret != -ENXIO) + ret = -ENXIO; + return ret; +} + int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index e79d41c..67ea57d 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len); +int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin); + int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, struct ocfs2_extent_list *el, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index de4ea1a..061591a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1950,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (ret < 0) mlog_errno(ret); + if (file->f_flags & O_SYNC) + handle->h_sync = 1; + ocfs2_commit_trans(osb, handle); out_inode_unlock: @@ -2052,6 +2055,23 @@ out: return ret; } +static void ocfs2_aiodio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = ocfs2_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); +} + +static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) +{ + int blockmask = inode->i_sb->s_blocksize - 1; + loff_t final_size = pos + count; + + if ((pos & blockmask) || (final_size & blockmask)) + return 1; + return 0; +} + static int ocfs2_prepare_inode_for_refcount(struct inode *inode, struct file *file, loff_t pos, size_t count, @@ -2108,7 +2128,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * remove_suid() calls ->setattr without any hint that * we may have already done our cluster locking. Since * ocfs2_setattr() *must* take cluster locks to - * proceeed, this will lead us to recursively lock the + * proceed, this will lead us to recursively lock the * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. */ @@ -2230,6 +2250,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); + int unaligned_dio = 0; trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2297,6 +2318,10 @@ relock: goto out; } + if (direct_io && !is_sync_kiocb(iocb)) + unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, + *ppos); + /* * We can't complete the direct I/O as requested, fall back to * buffered I/O. @@ -2311,6 +2336,18 @@ relock: goto relock; } + if (unaligned_dio) { + /* + * Wait on previous unaligned aio to complete before + * proceeding. + */ + ocfs2_aiodio_wait(inode); + + /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ + atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); + ocfs2_iocb_set_unaligned_aio(iocb); + } + /* * To later detect whether a journal commit for sync writes is * necessary, we sample i_size, and cluster count here. @@ -2382,8 +2419,12 @@ out_dio: if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { rw_level = -1; have_alloc_sem = 0; + unaligned_dio = 0; } + if (unaligned_dio) + atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); + out: if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); @@ -2591,6 +2632,57 @@ bail: return ret; } +/* Refer generic_file_llseek_unlocked() */ +static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + switch (origin) { + case SEEK_SET: + break; + case SEEK_END: + offset += inode->i_size; + break; + case SEEK_CUR: + if (offset == 0) { + offset = file->f_pos; + goto out; + } + offset += file->f_pos; + break; + case SEEK_DATA: + case SEEK_HOLE: + ret = ocfs2_seek_data_hole_offset(file, &offset, origin); + if (ret) + goto out; + break; + default: + ret = -EINVAL; + goto out; + } + + if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + ret = -EINVAL; + if (!ret && offset > inode->i_sb->s_maxbytes) + ret = -EINVAL; + if (ret) + goto out; + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + +out: + mutex_unlock(&inode->i_mutex); + if (ret) + return ret; + return offset; +} + const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, @@ -2615,7 +2707,7 @@ const struct inode_operations ocfs2_special_file_iops = { * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! */ const struct file_operations ocfs2_fops = { - .llseek = generic_file_llseek, + .llseek = ocfs2_file_llseek, .read = do_sync_read, .write = do_sync_write, .mmap = ocfs2_mmap, @@ -2663,7 +2755,7 @@ const struct file_operations ocfs2_dops = { * the cluster. */ const struct file_operations ocfs2_fops_no_plocks = { - .llseek = generic_file_llseek, + .llseek = ocfs2_file_llseek, .read = do_sync_read, .write = do_sync_write, .mmap = ocfs2_mmap, diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index b4c8bb6..17454a9 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -291,7 +291,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)le64_to_cpu(fe->i_blkno)); - inode->i_nlink = ocfs2_read_links_count(fe); + set_nlink(inode, ocfs2_read_links_count(fe)); trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, le32_to_cpu(fe->i_flags)); @@ -951,7 +951,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode, trace_ocfs2_cleanup_delete_inode( (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); if (sync_data) - write_inode_now(inode, 1); + filemap_write_and_wait(inode->i_mapping); truncate_inode_pages(&inode->i_data, 0); } @@ -1290,7 +1290,7 @@ void ocfs2_refresh_inode(struct inode *inode, OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); ocfs2_set_inode_flags(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); - inode->i_nlink = ocfs2_read_links_count(fe); + set_nlink(inode, ocfs2_read_links_count(fe)); inode->i_uid = le32_to_cpu(fe->i_uid); inode->i_gid = le32_to_cpu(fe->i_gid); inode->i_mode = le16_to_cpu(fe->i_mode); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 1c508b1..88924a3 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -43,6 +43,9 @@ struct ocfs2_inode_info /* protects extended attribute changes on this inode */ struct rw_semaphore ip_xattr_sem; + /* Number of outstanding AIO's which are not page aligned */ + atomic_t ip_unaligned_aio; + /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index bc91072..a6fda3c 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -122,7 +122,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) - goto bail_unlock; + goto bail_commit; } ocfs2_inode->ip_attr = flags; @@ -132,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if (status < 0) mlog_errno(status); +bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode, 1); @@ -381,7 +382,7 @@ int ocfs2_info_handle_freeinode(struct inode *inode, if (!oifi) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_err; } if (o2info_from_user(*oifi, req)) @@ -431,7 +432,7 @@ bail: o2info_set_request_error(&oifi->ifi_req, req); kfree(oifi); - +out_err: return status; } @@ -666,7 +667,7 @@ int ocfs2_info_handle_freefrag(struct inode *inode, if (!oiff) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_err; } if (o2info_from_user(*oiff, req)) @@ -716,7 +717,7 @@ bail: o2info_set_request_error(&oiff->iff_req, req); kfree(oiff); - +out_err: return status; } @@ -905,12 +906,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; - status = mnt_want_write(filp->f_path.mnt); + status = mnt_want_write_file(filp); if (status) return status; status = ocfs2_set_inode_attr(inode, flags, OCFS2_FL_MODIFIABLE); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return status; case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 295d564..0a42ae9 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1544,9 +1544,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, /* we need to run complete recovery for offline orphan slots */ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); - mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", - node_num, slot_num, - MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\ + "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), + MINOR(osb->sb->s_dev)); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); @@ -1601,6 +1601,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, jbd2_journal_destroy(journal); + printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\ + "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), + MINOR(osb->sb->s_dev)); done: /* drop the lock on this nodes journal */ if (got_lock) @@ -1808,6 +1811,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void) * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This * is done to catch any orphans that are left over in orphan directories. * + * It scans all slots, even ones that are in use. It does so to handle the + * case described below: + * + * Node 1 has an inode it was using. The dentry went away due to memory + * pressure. Node 1 closes the inode, but it's on the free list. The node + * has the open lock. + * Node 2 unlinks the inode. It grabs the dentry lock to notify others, + * but node 1 has no dentry and doesn't get the message. It trylocks the + * open lock, sees that another node has a PR, and does nothing. + * Later node 2 runs its orphan dir. It igets the inode, trylocks the + * open lock, sees the PR still, and does nothing. + * Basically, we have to trigger an orphan iput on node 1. The only way + * for this to happen is if node 1 runs node 2's orphan dir. + * * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT * seconds. It gets an EX lock on os_lockres and checks sequence number * stored in LVB. If the sequence number has changed, it means some other diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 68cf2f6..a3385b6 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -441,10 +441,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota - * update on dir + index leaf + dx root update for free list */ + * update on dir + index leaf + dx root update for free list + + * previous dirblock update in the free list */ static inline int ocfs2_link_credits(struct super_block *sb) { - return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + + return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + ocfs2_quota_trans_credits(sb); } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 3e9393c..9cd4108 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, struct page *page) { - int ret; + int ret = VM_FAULT_NOPAGE; struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); @@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, void *fsdata; loff_t size = i_size_read(inode); - /* - * Another node might have truncated while we were waiting on - * cluster locks. - * We don't check size == 0 before the shift. This is borrowed - * from do_generic_file_read. - */ last_index = (size - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!size || page->index > last_index)) { - ret = -EINVAL; - goto out; - } /* - * The i_size check above doesn't catch the case where nodes - * truncated and then re-extended the file. We'll re-check the - * page mapping after taking the page lock inside of - * ocfs2_write_begin_nolock(). + * There are cases that lead to the page no longer bebongs to the + * mapping. + * 1) pagecache truncates locally due to memory pressure. + * 2) pagecache truncates when another is taking EX lock against + * inode lock. see ocfs2_data_convert_worker. + * + * The i_size check doesn't catch the case where nodes truncated and + * then re-extended the file. We'll re-check the page mapping after + * taking the page lock inside of ocfs2_write_begin_nolock(). + * + * Let VM retry with these cases. */ - if (!PageUptodate(page) || page->mapping != inode->i_mapping) { - /* - * the page has been umapped in ocfs2_data_downconvert_worker. - * So return 0 here and let VFS retry. - */ - ret = 0; + if ((page->mapping != inode->i_mapping) || + (!PageUptodate(page)) || + (page_offset(page) >= size)) goto out; - } /* * Call ocfs2_write_begin() and ocfs2_write_end() to take @@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, if (ret) { if (ret != -ENOSPC) mlog_errno(ret); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, - fsdata); - if (ret < 0) { - mlog_errno(ret); + if (!locked_page) { + ret = VM_FAULT_NOPAGE; goto out; } + ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, + fsdata); BUG_ON(ret != len); - ret = 0; + ret = VM_FAULT_LOCKED; out: return ret; } @@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out: ocfs2_unblock_signals(&oldset); - if (ret) - ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index d53cb70..b1e3fce 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -745,7 +745,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, */ ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, new_phys_cpos); - if (!new_phys_cpos) { + if (!*new_phys_cpos) { ret = -ENOSPC; goto out_commit; } @@ -1059,7 +1059,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) struct ocfs2_move_extents range; struct ocfs2_move_extents_context *context = NULL; - status = mnt_want_write(filp->f_path.mnt); + status = mnt_want_write_file(filp); if (status) return status; @@ -1145,7 +1145,7 @@ out: kfree(context); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return status; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 53aa41e..be24469 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -185,7 +185,7 @@ bail: return ret; } -static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) +static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) { struct inode *inode; @@ -199,9 +199,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) * these are used by the support functions here and in * callers. */ if (S_ISDIR(mode)) - inode->i_nlink = 2; - else - inode->i_nlink = 1; + set_nlink(inode, 2); inode_init_owner(inode, dir, mode); dquot_initialize(inode); return inode; @@ -209,7 +207,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) static int ocfs2_mknod(struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, dev_t dev) { int status = 0; @@ -604,7 +602,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, static int ocfs2_mkdir(struct inode *dir, struct dentry *dentry, - int mode) + umode_t mode) { int ret; @@ -619,7 +617,7 @@ static int ocfs2_mkdir(struct inode *dir, static int ocfs2_create(struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, struct nameidata *nd) { int ret; @@ -1379,7 +1377,7 @@ static int ocfs2_rename(struct inode *old_dir, } if (new_inode) { - new_inode->i_nlink--; + drop_nlink(new_inode); new_inode->i_ctime = CURRENT_TIME; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; @@ -1387,9 +1385,9 @@ static int ocfs2_rename(struct inode *old_dir, if (update_dot_dot) { status = ocfs2_update_entry(old_inode, handle, &old_inode_dot_dot_res, new_dir); - old_dir->i_nlink--; + drop_nlink(old_dir); if (new_inode) { - new_inode->i_nlink--; + drop_nlink(new_inode); } else { inc_nlink(new_dir); mark_inode_dirty(new_dir); @@ -2018,7 +2016,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) ocfs2_add_links_count(orphan_fe, 1); - orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); ocfs2_journal_dirty(handle, orphan_dir_bh); status = __ocfs2_add_entry(handle, orphan_dir_inode, name, @@ -2116,7 +2114,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) ocfs2_add_links_count(orphan_fe, -1); - orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); ocfs2_journal_dirty(handle, orphan_dir_bh); leave: @@ -2282,7 +2280,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - inode->i_nlink = 0; + clear_nlink(inode); /* do the real work now. */ status = __ocfs2_mknod_locked(dir, inode, 0, &new_di_bh, parent_di_bh, handle, @@ -2437,7 +2435,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di = (struct ocfs2_dinode *)di_bh->b_data; le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); di->i_orphaned_slot = 0; - inode->i_nlink = 1; + set_nlink(inode, 1); ocfs2_set_links_count(di, inode->i_nlink); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 4092858..d355e6e 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -836,18 +836,65 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) { - __test_and_set_bit_le(bit, bitmap); + __set_bit_le(bit, bitmap); } #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) { - __test_and_clear_bit_le(bit, bitmap); + __clear_bit_le(bit, bitmap); } #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) #define ocfs2_test_bit test_bit_le #define ocfs2_find_next_zero_bit find_next_zero_bit_le #define ocfs2_find_next_bit find_next_bit_le + +static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr) +{ +#if BITS_PER_LONG == 64 + *bit += ((unsigned long) addr & 7UL) << 3; + addr = (void *) ((unsigned long) addr & ~7UL); +#elif BITS_PER_LONG == 32 + *bit += ((unsigned long) addr & 3UL) << 3; + addr = (void *) ((unsigned long) addr & ~3UL); +#else +#error "how many bits you are?!" +#endif + return addr; +} + +static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap) +{ + bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); + ocfs2_set_bit(bit, bitmap); +} + +static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap) +{ + bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); + ocfs2_clear_bit(bit, bitmap); +} + +static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap) +{ + bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); + return ocfs2_test_bit(bit, bitmap); +} + +static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max, + int start) +{ + int fix = 0, ret, tmpmax; + bitmap = correct_addr_and_bit_unaligned(&fix, bitmap); + tmpmax = max + fix; + start += fix; + + ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + #endif /* OCFS2_H */ diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index dc8007f..f100bf7 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -404,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( int status = 0; struct ocfs2_quota_recovery *rec; - mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); + printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for " + "slot %u\n", osb->dev_str, slot_num); + rec = ocfs2_alloc_quota_recovery(); if (!rec) return ERR_PTR(-ENOMEM); @@ -549,8 +551,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, goto out_commit; } lock_buffer(qbh); - WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); - ocfs2_clear_bit(bit, dchunk->dqc_bitmap); + WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap)); + ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, 1); unlock_buffer(qbh); ocfs2_journal_dirty(handle, qbh); @@ -596,7 +598,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, struct inode *lqinode; unsigned int flags; - mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); + printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " + "slot %u\n", osb->dev_str, slot_num); + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); for (type = 0; type < MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) @@ -612,8 +616,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, /* Someone else is holding the lock? Then he must be * doing the recovery. Just skip the file... */ if (status == -EAGAIN) { - mlog(ML_NOTICE, "skipping quota recovery for slot %d " - "because quota file is locked.\n", slot_num); + printk(KERN_NOTICE "ocfs2: Skipping quota recovery on " + "device (%s) for slot %d because quota file is " + "locked.\n", osb->dev_str, slot_num); status = 0; goto out_put; } else if (status < 0) { @@ -944,7 +949,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, * ol_quota_entries_per_block(sb); } - found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); + found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0); /* We failed? */ if (found == len) { mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" @@ -1208,7 +1213,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private) struct ocfs2_local_disk_chunk *dchunk; dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; - ocfs2_set_bit(*offset, dchunk->dqc_bitmap); + ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, -1); } @@ -1289,7 +1294,7 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) (od->dq_chunk->qc_headerbh->b_data); /* Mark structure as freed */ lock_buffer(od->dq_chunk->qc_headerbh); - ocfs2_clear_bit(offset, dchunk->dqc_bitmap); + ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, 1); unlock_buffer(od->dq_chunk->qc_headerbh); ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 26fc001..1424c15 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -493,8 +493,8 @@ int ocfs2_find_slot(struct ocfs2_super *osb) goto bail; } } else - mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", - slot); + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " + "allocated to this node!\n", slot, osb->dev_str); ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 19965b0..9436801 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -28,6 +28,7 @@ #include "cluster/masklog.h" #include "cluster/nodemanager.h" #include "cluster/heartbeat.h" +#include "cluster/tcp.h" #include "stackglue.h" @@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb) } /* + * Check if this node is heartbeating and is connected to all other + * heartbeating nodes. + */ +static int o2cb_cluster_check(void) +{ + u8 node_num; + int i; + unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + node_num = o2nm_this_node(); + if (node_num == O2NM_MAX_NODES) { + printk(KERN_ERR "o2cb: This node has not been configured.\n"); + return -EINVAL; + } + + /* + * o2dlm expects o2net sockets to be created. If not, then + * dlm_join_domain() fails with a stack of errors which are both cryptic + * and incomplete. The idea here is to detect upfront whether we have + * managed to connect to all nodes or not. If not, then list the nodes + * to allow the user to check the configuration (incorrect IP, firewall, + * etc.) Yes, this is racy. But its not the end of the world. + */ +#define O2CB_MAP_STABILIZE_COUNT 60 + for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { + o2hb_fill_node_map(hbmap, sizeof(hbmap)); + if (!test_bit(node_num, hbmap)) { + printk(KERN_ERR "o2cb: %s heartbeat has not been " + "started.\n", (o2hb_global_heartbeat_active() ? + "Global" : "Local")); + return -EINVAL; + } + o2net_fill_node_map(netmap, sizeof(netmap)); + /* Force set the current node to allow easy compare */ + set_bit(node_num, netmap); + if (!memcmp(hbmap, netmap, sizeof(hbmap))) + return 0; + if (i < O2CB_MAP_STABILIZE_COUNT) + msleep(1000); + } + + printk(KERN_ERR "o2cb: This node could not connect to nodes:"); + i = -1; + while ((i = find_next_bit(hbmap, O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (!test_bit(i, netmap)) + printk(" %u", i); + } + printk(".\n"); + + return -ENOTCONN; +} + +/* * Called from the dlm when it's about to evict a node. This is how the * classic stack signals node death. */ @@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data) { struct ocfs2_cluster_connection *conn = data; - mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", - node_num, conn->cc_namelen, conn->cc_name); + printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n", + node_num, conn->cc_namelen, conn->cc_name); conn->cc_recovery_handler(node_num, conn->cc_recovery_data); } @@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) BUG_ON(conn == NULL); BUG_ON(conn->cc_proto == NULL); - /* for now we only have one cluster/node, make sure we see it - * in the heartbeat universe */ - if (!o2hb_check_local_node_heartbeating()) { - if (o2hb_global_heartbeat_active()) - mlog(ML_ERROR, "Global heartbeat not started\n"); - rc = -EINVAL; + /* Ensure cluster stack is up and all nodes are connected */ + rc = o2cb_cluster_check(); + if (rc) { + printk(KERN_ERR "o2cb: Cluster check failed. Fix errors " + "before retrying.\n"); goto out; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 56f6102..604e12c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -54,6 +54,7 @@ #include "ocfs1_fs_compat.h" #include "alloc.h" +#include "aops.h" #include "blockcheck.h" #include "dlmglue.h" #include "export.h" @@ -107,7 +108,7 @@ static int ocfs2_parse_options(struct super_block *sb, char *options, int is_remount); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options); -static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt); +static int ocfs2_show_options(struct seq_file *s, struct dentry *root); static void ocfs2_put_super(struct super_block *sb); static int ocfs2_mount_volume(struct super_block *sb); static int ocfs2_remount(struct super_block *sb, int *flags, char *data); @@ -568,7 +569,6 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) static void ocfs2_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); } @@ -1107,9 +1107,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) ocfs2_set_ro_flag(osb, 1); - printk(KERN_NOTICE "Readonly device detected. No cluster " - "services will be utilized for this mount. Recovery " - "will be skipped.\n"); + printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. " + "Cluster services will not be used for this mount. " + "Recovery will be skipped.\n", osb->dev_str); } if (!ocfs2_is_hard_readonly(osb)) { @@ -1533,9 +1533,9 @@ bail: return status; } -static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +static int ocfs2_show_options(struct seq_file *s, struct dentry *root) { - struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb); + struct ocfs2_super *osb = OCFS2_SB(root->d_sb); unsigned long opts = osb->s_mount_opt; unsigned int local_alloc_megs; @@ -1567,8 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (osb->preferred_slot != OCFS2_INVALID_SLOT) seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); - if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME)) - seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); + seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); if (osb->osb_commit_interval) seq_printf(s, ",commit=%u", @@ -1616,12 +1615,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) return 0; } +wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; + static int __init ocfs2_init(void) { - int status; + int status, i; ocfs2_print_version(); + for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) + init_waitqueue_head(&ocfs2__ioend_wq[i]); + status = init_ocfs2_uptodate_cache(); if (status < 0) { mlog_errno(status); @@ -1760,7 +1764,7 @@ static void ocfs2_inode_init_once(void *data) ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); oi->ip_dir_start_lookup = 0; - + atomic_set(&oi->ip_unaligned_aio, 0); init_rwsem(&oi->ip_alloc_sem); init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); @@ -1974,7 +1978,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) * If we failed before we got a uuid_str yet, we can't stop * heartbeat. Otherwise, do it. */ - if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) + if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str && + !ocfs2_is_hard_readonly(osb)) hangup_needed = 1; if (osb->cconn) @@ -2353,7 +2358,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs((char *)&uuid_net_key, sb); + cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); bail: return status; @@ -2462,8 +2467,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) goto finally; } } else { - mlog(ML_NOTICE, "File system was not unmounted cleanly, " - "recovering volume.\n"); + printk(KERN_NOTICE "ocfs2: File system on device (%s) was not " + "unmounted cleanly, recovering it.\n", osb->dev_str); } local = ocfs2_mount_local(osb); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 40c7de0..74ff74c 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -31,17 +31,15 @@ extern struct workqueue_struct *ocfs2_wq; int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, int node_num); -void __ocfs2_error(struct super_block *sb, - const char *function, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); +__printf(3, 4) +void __ocfs2_error(struct super_block *sb, const char *function, + const char *fmt, ...); #define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) -void __ocfs2_abort(struct super_block *sb, - const char *function, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); +__printf(3, 4) +void __ocfs2_abort(struct super_block *sb, const char *function, + const char *fmt, ...); #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 81ecf9c..0ba9ea1 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -623,7 +623,7 @@ int ocfs2_calc_security_init(struct inode *dir, int ocfs2_calc_xattr_init(struct inode *dir, struct buffer_head *dir_bh, - int mode, + umode_t mode, struct ocfs2_security_xattr_info *si, int *want_clusters, int *xattr_credits, @@ -2376,16 +2376,18 @@ static int ocfs2_remove_value_outside(struct inode*inode, } ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); - if (ret < 0) { - mlog_errno(ret); - break; - } ocfs2_commit_trans(osb, ctxt.handle); if (ctxt.meta_ac) { ocfs2_free_alloc_context(ctxt.meta_ac); ctxt.meta_ac = NULL; } + + if (ret < 0) { + mlog_errno(ret); + break; + } + } if (ctxt.meta_ac) @@ -7185,20 +7187,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, { int ret = 0; struct buffer_head *dir_bh = NULL; - struct ocfs2_security_xattr_info si = { - .enable = 1, - }; - ret = ocfs2_init_security_get(inode, dir, qstr, &si); + ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (!ret) { - ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, - si.name, si.value, si.value_len, - XATTR_CREATE); - if (ret) { - mlog_errno(ret); - goto leave; - } - } else if (ret != -EOPNOTSUPP) { mlog_errno(ret); goto leave; } @@ -7255,6 +7246,22 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } +int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, XATTR_CREATE); + if (err) + break; + } + return err; +} + int ocfs2_init_security_get(struct inode *inode, struct inode *dir, const struct qstr *qstr, @@ -7263,8 +7270,13 @@ int ocfs2_init_security_get(struct inode *inode, /* check whether ocfs2 support feature xattr */ if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) return -EOPNOTSUPP; - return security_inode_init_security(inode, dir, qstr, &si->name, - &si->value, &si->value_len); + if (si) + return security_old_inode_init_security(inode, dir, qstr, + &si->name, &si->value, + &si->value_len); + + return security_inode_init_security(inode, dir, qstr, + &ocfs2_initxattrs, NULL); } int ocfs2_init_security_set(handle_t *handle, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index d63cfb7..e5c7f15 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *, struct ocfs2_security_xattr_info *, int *, int *, struct ocfs2_alloc_context **); int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, - int, struct ocfs2_security_xattr_info *, + umode_t, struct ocfs2_security_xattr_info *, int *, int *, int *); /* diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 98e5442..f00576e 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -255,7 +255,7 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry) return 0; } -static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) +static int omfs_add_node(struct inode *dir, struct dentry *dentry, umode_t mode) { int err; struct inode *inode = omfs_new_inode(dir, mode); @@ -279,12 +279,12 @@ out_free_inode: return err; } -static int omfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int omfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { return omfs_add_node(dir, dentry, mode | S_IFDIR); } -static int omfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int omfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return omfs_add_node(dir, dentry, mode | S_IFREG); diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index e043c4c..6065bb0 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -28,7 +28,7 @@ struct buffer_head *omfs_bread(struct super_block *sb, sector_t block) return sb_bread(sb, clus_to_blk(sbi, block)); } -struct inode *omfs_new_inode(struct inode *dir, int mode) +struct inode *omfs_new_inode(struct inode *dir, umode_t mode) { struct inode *inode; u64 new_block; diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h index 7d414fe..8941f12 100644 --- a/fs/omfs/omfs.h +++ b/fs/omfs/omfs.h @@ -60,7 +60,7 @@ extern int omfs_shrink_inode(struct inode *inode); /* inode.c */ extern struct buffer_head *omfs_bread(struct super_block *sb, sector_t block); extern struct inode *omfs_iget(struct super_block *sb, ino_t inode); -extern struct inode *omfs_new_inode(struct inode *dir, int mode); +extern struct inode *omfs_new_inode(struct inode *dir, umode_t mode); extern int omfs_reserve_block(struct super_block *sb, sector_t block); extern int omfs_find_empty_block(struct super_block *sb, int mode, ino_t *ino); extern int omfs_sync_inode(struct inode *inode); @@ -456,7 +456,7 @@ static int chmod_common(struct path *path, umode_t mode) if (error) return error; mutex_lock(&inode->i_mutex); - error = security_path_chmod(path->dentry, path->mnt, mode); + error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); @@ -468,7 +468,7 @@ out_unlock: return error; } -SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { struct file * file; int err = -EBADF; @@ -482,7 +482,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) return err; } -SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) +SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { struct path path; int error; @@ -495,7 +495,7 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) return error; } -SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode) +SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { return sys_fchmodat(AT_FDCWD, filename, mode); } @@ -608,7 +608,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) dentry = file->f_path.dentry; audit_inode(NULL, dentry); error = chown_common(&file->f_path, user, group); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); out_fput: fput(file); out: @@ -685,6 +685,10 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, if (error) goto cleanup_all; + error = break_lease(inode, f->f_flags); + if (error) + goto cleanup_all; + if (!open && f->f_op) open = f->f_op->open; if (open) { @@ -873,7 +877,7 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); -static inline int build_open_flags(int flags, int mode, struct open_flags *op) +static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; int acc_mode; @@ -944,7 +948,7 @@ static inline int build_open_flags(int flags, int mode, struct open_flags *op) * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ -struct file *filp_open(const char *filename, int flags, int mode) +struct file *filp_open(const char *filename, int flags, umode_t mode) { struct open_flags op; int lookup = build_open_flags(flags, mode, &op); @@ -966,7 +970,7 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(file_open_root); -long do_sys_open(int dfd, const char __user *filename, int flags, int mode) +long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_flags op; int lookup = build_open_flags(flags, mode, &op); @@ -990,7 +994,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) return fd; } -SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) +SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { long ret; @@ -1004,7 +1008,7 @@ SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, - int, mode) + umode_t, mode) { long ret; @@ -1023,7 +1027,7 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ -SYSCALL_DEFINE2(creat, const char __user *, pathname, int, mode) +SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index a2a5bff..a88c03b 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -242,7 +242,7 @@ found: inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_op = &openprom_inode_operations; inode->i_fop = &openprom_operations; - inode->i_nlink = 2; + set_nlink(inode, 2); break; case op_inode_prop: if (!strcmp(dp->name, "options") && (len == 17) && @@ -251,7 +251,7 @@ found: else inode->i_mode = S_IFREG | S_IRUGO; inode->i_fop = &openpromfs_prop_ops; - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_size = ent_oi->u.prop->length; break; } @@ -346,7 +346,6 @@ static struct inode *openprom_alloc_inode(struct super_block *sb) static void openprom_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(op_inode_cachep, OP_I(inode)); } diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig deleted file mode 100644 index cb5f0a3..0000000 --- a/fs/partitions/Kconfig +++ /dev/null @@ -1,251 +0,0 @@ -# -# Partition configuration -# -config PARTITION_ADVANCED - bool "Advanced partition selection" - help - Say Y here if you would like to use hard disks under Linux which - were partitioned under an operating system running on a different - architecture than your Linux system. - - Note that the answer to this question won't directly affect the - kernel: saying N will just cause the configurator to skip all - the questions about foreign partitioning schemes. - - If unsure, say N. - -config ACORN_PARTITION - bool "Acorn partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - help - Support hard disks partitioned under Acorn operating systems. - -config ACORN_PARTITION_CUMANA - bool "Cumana partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - depends on ACORN_PARTITION - help - Say Y here if you would like to use hard disks under Linux which - were partitioned using the Cumana interface on Acorn machines. - -config ACORN_PARTITION_EESOX - bool "EESOX partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - depends on ACORN_PARTITION - -config ACORN_PARTITION_ICS - bool "ICS partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - depends on ACORN_PARTITION - help - Say Y here if you would like to use hard disks under Linux which - were partitioned using the ICS interface on Acorn machines. - -config ACORN_PARTITION_ADFS - bool "Native filecore partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - depends on ACORN_PARTITION - help - The Acorn Disc Filing System is the standard file system of the - RiscOS operating system which runs on Acorn's ARM-based Risc PC - systems and the Acorn Archimedes range of machines. If you say - `Y' here, Linux will support disk partitions created under ADFS. - -config ACORN_PARTITION_POWERTEC - bool "PowerTec partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - depends on ACORN_PARTITION - help - Support reading partition tables created on Acorn machines using - the PowerTec SCSI drive. - -config ACORN_PARTITION_RISCIX - bool "RISCiX partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - depends on ACORN_PARTITION - help - Once upon a time, there was a native Unix port for the Acorn series - of machines called RISCiX. If you say 'Y' here, Linux will be able - to read disks partitioned under RISCiX. - -config OSF_PARTITION - bool "Alpha OSF partition support" if PARTITION_ADVANCED - default y if ALPHA - help - Say Y here if you would like to use hard disks under Linux which - were partitioned on an Alpha machine. - -config AMIGA_PARTITION - bool "Amiga partition table support" if PARTITION_ADVANCED - default y if (AMIGA || AFFS_FS=y) - help - Say Y here if you would like to use hard disks under Linux which - were partitioned under AmigaOS. - -config ATARI_PARTITION - bool "Atari partition table support" if PARTITION_ADVANCED - default y if ATARI - help - Say Y here if you would like to use hard disks under Linux which - were partitioned under the Atari OS. - -config IBM_PARTITION - bool "IBM disk label and partition support" - depends on PARTITION_ADVANCED && S390 - help - Say Y here if you would like to be able to read the hard disk - partition table format used by IBM DASD disks operating under CMS. - Otherwise, say N. - -config MAC_PARTITION - bool "Macintosh partition map support" if PARTITION_ADVANCED - default y if (MAC || PPC_PMAC) - help - Say Y here if you would like to use hard disks under Linux which - were partitioned on a Macintosh. - -config MSDOS_PARTITION - bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED - default y - help - Say Y here. - -config BSD_DISKLABEL - bool "BSD disklabel (FreeBSD partition tables) support" - depends on PARTITION_ADVANCED && MSDOS_PARTITION - help - FreeBSD uses its own hard disk partition scheme on your PC. It - requires only one entry in the primary partition table of your disk - and manages it similarly to DOS extended partitions, putting in its - first sector a new partition table in BSD disklabel format. Saying Y - here allows you to read these disklabels and further mount FreeBSD - partitions from within Linux if you have also said Y to "UFS - file system support", above. If you don't know what all this is - about, say N. - -config MINIX_SUBPARTITION - bool "Minix subpartition support" - depends on PARTITION_ADVANCED && MSDOS_PARTITION - help - Minix 2.0.0/2.0.2 subpartition table support for Linux. - Say Y here if you want to mount and use Minix 2.0.0/2.0.2 - subpartitions. - -config SOLARIS_X86_PARTITION - bool "Solaris (x86) partition table support" - depends on PARTITION_ADVANCED && MSDOS_PARTITION - help - Like most systems, Solaris x86 uses its own hard disk partition - table format, incompatible with all others. Saying Y here allows you - to read these partition tables and further mount Solaris x86 - partitions from within Linux if you have also said Y to "UFS - file system support", above. - -config UNIXWARE_DISKLABEL - bool "Unixware slices support" - depends on PARTITION_ADVANCED && MSDOS_PARTITION - ---help--- - Like some systems, UnixWare uses its own slice table inside a - partition (VTOC - Virtual Table of Contents). Its format is - incompatible with all other OSes. Saying Y here allows you to read - VTOC and further mount UnixWare partitions read-only from within - Linux if you have also said Y to "UFS file system support" or - "System V and Coherent file system support", above. - - This is mainly used to carry data from a UnixWare box to your - Linux box via a removable medium like magneto-optical, ZIP or - removable IDE drives. Note, however, that a good portable way to - transport files and directories between unixes (and even other - operating systems) is given by the tar program ("man tar" or - preferably "info tar"). - - If you don't know what all this is about, say N. - -config LDM_PARTITION - bool "Windows Logical Disk Manager (Dynamic Disk) support" - depends on PARTITION_ADVANCED - ---help--- - Say Y here if you would like to use hard disks under Linux which - were partitioned using Windows 2000's/XP's or Vista's Logical Disk - Manager. They are also known as "Dynamic Disks". - - Note this driver only supports Dynamic Disks with a protective MBR - label, i.e. DOS partition table. It does not support GPT labelled - Dynamic Disks yet as can be created with Vista. - - Windows 2000 introduced the concept of Dynamic Disks to get around - the limitations of the PC's partitioning scheme. The Logical Disk - Manager allows the user to repartition a disk and create spanned, - mirrored, striped or RAID volumes, all without the need for - rebooting. - - Normal partitions are now called Basic Disks under Windows 2000, XP, - and Vista. - - For a fuller description read <file:Documentation/ldm.txt>. - - If unsure, say N. - -config LDM_DEBUG - bool "Windows LDM extra logging" - depends on LDM_PARTITION - help - Say Y here if you would like LDM to log verbosely. This could be - helpful if the driver doesn't work as expected and you'd like to - report a bug. - - If unsure, say N. - -config SGI_PARTITION - bool "SGI partition support" if PARTITION_ADVANCED - default y if DEFAULT_SGI_PARTITION - help - Say Y here if you would like to be able to read the hard disk - partition table format used by SGI machines. - -config ULTRIX_PARTITION - bool "Ultrix partition table support" if PARTITION_ADVANCED - default y if MACH_DECSTATION - help - Say Y here if you would like to be able to read the hard disk - partition table format used by DEC (now Compaq) Ultrix machines. - Otherwise, say N. - -config SUN_PARTITION - bool "Sun partition tables support" if PARTITION_ADVANCED - default y if (SPARC || SUN3 || SUN3X) - ---help--- - Like most systems, SunOS uses its own hard disk partition table - format, incompatible with all others. Saying Y here allows you to - read these partition tables and further mount SunOS partitions from - within Linux if you have also said Y to "UFS file system support", - above. This is mainly used to carry data from a SPARC under SunOS to - your Linux box via a removable medium like magneto-optical or ZIP - drives; note however that a good portable way to transport files and - directories between unixes (and even other operating systems) is - given by the tar program ("man tar" or preferably "info tar"). If - you don't know what all this is about, say N. - -config KARMA_PARTITION - bool "Karma Partition support" - depends on PARTITION_ADVANCED - help - Say Y here if you would like to mount the Rio Karma MP3 player, as it - uses a proprietary partition table. - -config EFI_PARTITION - bool "EFI GUID Partition support" - depends on PARTITION_ADVANCED - select CRC32 - help - Say Y here if you would like to use hard disks under Linux which - were partitioned using EFI GPT. - -config SYSV68_PARTITION - bool "SYSV68 partition table support" if PARTITION_ADVANCED - default y if VME - help - Say Y here if you would like to be able to read the hard disk - partition table format used by Motorola Delta machines (using - sysv68). - Otherwise, say N. diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile deleted file mode 100644 index 03af8ea..0000000 --- a/fs/partitions/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# -# Makefile for the linux kernel. -# - -obj-$(CONFIG_BLOCK) := check.o - -obj-$(CONFIG_ACORN_PARTITION) += acorn.o -obj-$(CONFIG_AMIGA_PARTITION) += amiga.o -obj-$(CONFIG_ATARI_PARTITION) += atari.o -obj-$(CONFIG_MAC_PARTITION) += mac.o -obj-$(CONFIG_LDM_PARTITION) += ldm.o -obj-$(CONFIG_MSDOS_PARTITION) += msdos.o -obj-$(CONFIG_OSF_PARTITION) += osf.o -obj-$(CONFIG_SGI_PARTITION) += sgi.o -obj-$(CONFIG_SUN_PARTITION) += sun.o -obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o -obj-$(CONFIG_IBM_PARTITION) += ibm.o -obj-$(CONFIG_EFI_PARTITION) += efi.o -obj-$(CONFIG_KARMA_PARTITION) += karma.o -obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c deleted file mode 100644 index fbeb697..0000000 --- a/fs/partitions/acorn.c +++ /dev/null @@ -1,556 +0,0 @@ -/* - * linux/fs/partitions/acorn.c - * - * Copyright (c) 1996-2000 Russell King. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Scan ADFS partitions on hard disk drives. Unfortunately, there - * isn't a standard for partitioning drives on Acorn machines, so - * every single manufacturer of SCSI and IDE cards created their own - * method. - */ -#include <linux/buffer_head.h> -#include <linux/adfs_fs.h> - -#include "check.h" -#include "acorn.h" - -/* - * Partition types. (Oh for reusability) - */ -#define PARTITION_RISCIX_MFM 1 -#define PARTITION_RISCIX_SCSI 2 -#define PARTITION_LINUX 9 - -#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ - defined(CONFIG_ACORN_PARTITION_ADFS) -static struct adfs_discrecord * -adfs_partition(struct parsed_partitions *state, char *name, char *data, - unsigned long first_sector, int slot) -{ - struct adfs_discrecord *dr; - unsigned int nr_sects; - - if (adfs_checkbblk(data)) - return NULL; - - dr = (struct adfs_discrecord *)(data + 0x1c0); - - if (dr->disc_size == 0 && dr->disc_size_high == 0) - return NULL; - - nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) | - (le32_to_cpu(dr->disc_size) >> 9); - - if (name) { - strlcat(state->pp_buf, " [", PAGE_SIZE); - strlcat(state->pp_buf, name, PAGE_SIZE); - strlcat(state->pp_buf, "]", PAGE_SIZE); - } - put_partition(state, slot, first_sector, nr_sects); - return dr; -} -#endif - -#ifdef CONFIG_ACORN_PARTITION_RISCIX - -struct riscix_part { - __le32 start; - __le32 length; - __le32 one; - char name[16]; -}; - -struct riscix_record { - __le32 magic; -#define RISCIX_MAGIC cpu_to_le32(0x4a657320) - __le32 date; - struct riscix_part part[8]; -}; - -#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ - defined(CONFIG_ACORN_PARTITION_ADFS) -static int riscix_partition(struct parsed_partitions *state, - unsigned long first_sect, int slot, - unsigned long nr_sects) -{ - Sector sect; - struct riscix_record *rr; - - rr = read_part_sector(state, first_sect, §); - if (!rr) - return -1; - - strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE); - - - if (rr->magic == RISCIX_MAGIC) { - unsigned long size = nr_sects > 2 ? 2 : nr_sects; - int part; - - strlcat(state->pp_buf, " <", PAGE_SIZE); - - put_partition(state, slot++, first_sect, size); - for (part = 0; part < 8; part++) { - if (rr->part[part].one && - memcmp(rr->part[part].name, "All\0", 4)) { - put_partition(state, slot++, - le32_to_cpu(rr->part[part].start), - le32_to_cpu(rr->part[part].length)); - strlcat(state->pp_buf, "(", PAGE_SIZE); - strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE); - strlcat(state->pp_buf, ")", PAGE_SIZE); - } - } - - strlcat(state->pp_buf, " >\n", PAGE_SIZE); - } else { - put_partition(state, slot++, first_sect, nr_sects); - } - - put_dev_sector(sect); - return slot; -} -#endif -#endif - -#define LINUX_NATIVE_MAGIC 0xdeafa1de -#define LINUX_SWAP_MAGIC 0xdeafab1e - -struct linux_part { - __le32 magic; - __le32 start_sect; - __le32 nr_sects; -}; - -#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ - defined(CONFIG_ACORN_PARTITION_ADFS) -static int linux_partition(struct parsed_partitions *state, - unsigned long first_sect, int slot, - unsigned long nr_sects) -{ - Sector sect; - struct linux_part *linuxp; - unsigned long size = nr_sects > 2 ? 2 : nr_sects; - - strlcat(state->pp_buf, " [Linux]", PAGE_SIZE); - - put_partition(state, slot++, first_sect, size); - - linuxp = read_part_sector(state, first_sect, §); - if (!linuxp) - return -1; - - strlcat(state->pp_buf, " <", PAGE_SIZE); - while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || - linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { - if (slot == state->limit) - break; - put_partition(state, slot++, first_sect + - le32_to_cpu(linuxp->start_sect), - le32_to_cpu(linuxp->nr_sects)); - linuxp ++; - } - strlcat(state->pp_buf, " >", PAGE_SIZE); - - put_dev_sector(sect); - return slot; -} -#endif - -#ifdef CONFIG_ACORN_PARTITION_CUMANA -int adfspart_check_CUMANA(struct parsed_partitions *state) -{ - unsigned long first_sector = 0; - unsigned int start_blk = 0; - Sector sect; - unsigned char *data; - char *name = "CUMANA/ADFS"; - int first = 1; - int slot = 1; - - /* - * Try Cumana style partitions - sector 6 contains ADFS boot block - * with pointer to next 'drive'. - * - * There are unknowns in this code - is the 'cylinder number' of the - * next partition relative to the start of this one - I'm assuming - * it is. - * - * Also, which ID did Cumana use? - * - * This is totally unfinished, and will require more work to get it - * going. Hence it is totally untested. - */ - do { - struct adfs_discrecord *dr; - unsigned int nr_sects; - - data = read_part_sector(state, start_blk * 2 + 6, §); - if (!data) - return -1; - - if (slot == state->limit) - break; - - dr = adfs_partition(state, name, data, first_sector, slot++); - if (!dr) - break; - - name = NULL; - - nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) * - (dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) * - dr->secspertrack; - - if (!nr_sects) - break; - - first = 0; - first_sector += nr_sects; - start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9); - nr_sects = 0; /* hmm - should be partition size */ - - switch (data[0x1fc] & 15) { - case 0: /* No partition / ADFS? */ - break; - -#ifdef CONFIG_ACORN_PARTITION_RISCIX - case PARTITION_RISCIX_SCSI: - /* RISCiX - we don't know how to find the next one. */ - slot = riscix_partition(state, first_sector, slot, - nr_sects); - break; -#endif - - case PARTITION_LINUX: - slot = linux_partition(state, first_sector, slot, - nr_sects); - break; - } - put_dev_sector(sect); - if (slot == -1) - return -1; - } while (1); - put_dev_sector(sect); - return first ? 0 : 1; -} -#endif - -#ifdef CONFIG_ACORN_PARTITION_ADFS -/* - * Purpose: allocate ADFS partitions. - * - * Params : hd - pointer to gendisk structure to store partition info. - * dev - device number to access. - * - * Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok. - * - * Alloc : hda = whole drive - * hda1 = ADFS partition on first drive. - * hda2 = non-ADFS partition. - */ -int adfspart_check_ADFS(struct parsed_partitions *state) -{ - unsigned long start_sect, nr_sects, sectscyl, heads; - Sector sect; - unsigned char *data; - struct adfs_discrecord *dr; - unsigned char id; - int slot = 1; - - data = read_part_sector(state, 6, §); - if (!data) - return -1; - - dr = adfs_partition(state, "ADFS", data, 0, slot++); - if (!dr) { - put_dev_sector(sect); - return 0; - } - - heads = dr->heads + ((dr->lowsector >> 6) & 1); - sectscyl = dr->secspertrack * heads; - start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl; - id = data[0x1fc] & 15; - put_dev_sector(sect); - - /* - * Work out start of non-adfs partition. - */ - nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; - - if (start_sect) { - switch (id) { -#ifdef CONFIG_ACORN_PARTITION_RISCIX - case PARTITION_RISCIX_SCSI: - case PARTITION_RISCIX_MFM: - slot = riscix_partition(state, start_sect, slot, - nr_sects); - break; -#endif - - case PARTITION_LINUX: - slot = linux_partition(state, start_sect, slot, - nr_sects); - break; - } - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - return 1; -} -#endif - -#ifdef CONFIG_ACORN_PARTITION_ICS - -struct ics_part { - __le32 start; - __le32 size; -}; - -static int adfspart_check_ICSLinux(struct parsed_partitions *state, - unsigned long block) -{ - Sector sect; - unsigned char *data = read_part_sector(state, block, §); - int result = 0; - - if (data) { - if (memcmp(data, "LinuxPart", 9) == 0) - result = 1; - put_dev_sector(sect); - } - - return result; -} - -/* - * Check for a valid ICS partition using the checksum. - */ -static inline int valid_ics_sector(const unsigned char *data) -{ - unsigned long sum; - int i; - - for (i = 0, sum = 0x50617274; i < 508; i++) - sum += data[i]; - - sum -= le32_to_cpu(*(__le32 *)(&data[508])); - - return sum == 0; -} - -/* - * Purpose: allocate ICS partitions. - * Params : hd - pointer to gendisk structure to store partition info. - * dev - device number to access. - * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. - * Alloc : hda = whole drive - * hda1 = ADFS partition 0 on first drive. - * hda2 = ADFS partition 1 on first drive. - * ..etc.. - */ -int adfspart_check_ICS(struct parsed_partitions *state) -{ - const unsigned char *data; - const struct ics_part *p; - int slot; - Sector sect; - - /* - * Try ICS style partitions - sector 0 contains partition info. - */ - data = read_part_sector(state, 0, §); - if (!data) - return -1; - - if (!valid_ics_sector(data)) { - put_dev_sector(sect); - return 0; - } - - strlcat(state->pp_buf, " [ICS]", PAGE_SIZE); - - for (slot = 1, p = (const struct ics_part *)data; p->size; p++) { - u32 start = le32_to_cpu(p->start); - s32 size = le32_to_cpu(p->size); /* yes, it's signed. */ - - if (slot == state->limit) - break; - - /* - * Negative sizes tell the RISC OS ICS driver to ignore - * this partition - in effect it says that this does not - * contain an ADFS filesystem. - */ - if (size < 0) { - size = -size; - - /* - * Our own extension - We use the first sector - * of the partition to identify what type this - * partition is. We must not make this visible - * to the filesystem. - */ - if (size > 1 && adfspart_check_ICSLinux(state, start)) { - start += 1; - size -= 1; - } - } - - if (size) - put_partition(state, slot++, start, size); - } - - put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); - return 1; -} -#endif - -#ifdef CONFIG_ACORN_PARTITION_POWERTEC -struct ptec_part { - __le32 unused1; - __le32 unused2; - __le32 start; - __le32 size; - __le32 unused5; - char type[8]; -}; - -static inline int valid_ptec_sector(const unsigned char *data) -{ - unsigned char checksum = 0x2a; - int i; - - /* - * If it looks like a PC/BIOS partition, then it - * probably isn't PowerTec. - */ - if (data[510] == 0x55 && data[511] == 0xaa) - return 0; - - for (i = 0; i < 511; i++) - checksum += data[i]; - - return checksum == data[511]; -} - -/* - * Purpose: allocate ICS partitions. - * Params : hd - pointer to gendisk structure to store partition info. - * dev - device number to access. - * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. - * Alloc : hda = whole drive - * hda1 = ADFS partition 0 on first drive. - * hda2 = ADFS partition 1 on first drive. - * ..etc.. - */ -int adfspart_check_POWERTEC(struct parsed_partitions *state) -{ - Sector sect; - const unsigned char *data; - const struct ptec_part *p; - int slot = 1; - int i; - - data = read_part_sector(state, 0, §); - if (!data) - return -1; - - if (!valid_ptec_sector(data)) { - put_dev_sector(sect); - return 0; - } - - strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE); - - for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { - u32 start = le32_to_cpu(p->start); - u32 size = le32_to_cpu(p->size); - - if (size) - put_partition(state, slot++, start, size); - } - - put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); - return 1; -} -#endif - -#ifdef CONFIG_ACORN_PARTITION_EESOX -struct eesox_part { - char magic[6]; - char name[10]; - __le32 start; - __le32 unused6; - __le32 unused7; - __le32 unused8; -}; - -/* - * Guess who created this format? - */ -static const char eesox_name[] = { - 'N', 'e', 'i', 'l', ' ', - 'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' ' -}; - -/* - * EESOX SCSI partition format. - * - * This is a goddamned awful partition format. We don't seem to store - * the size of the partition in this table, only the start addresses. - * - * There are two possibilities where the size comes from: - * 1. The individual ADFS boot block entries that are placed on the disk. - * 2. The start address of the next entry. - */ -int adfspart_check_EESOX(struct parsed_partitions *state) -{ - Sector sect; - const unsigned char *data; - unsigned char buffer[256]; - struct eesox_part *p; - sector_t start = 0; - int i, slot = 1; - - data = read_part_sector(state, 7, §); - if (!data) - return -1; - - /* - * "Decrypt" the partition table. God knows why... - */ - for (i = 0; i < 256; i++) - buffer[i] = data[i] ^ eesox_name[i & 15]; - - put_dev_sector(sect); - - for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) { - sector_t next; - - if (memcmp(p->magic, "Eesox", 6)) - break; - - next = le32_to_cpu(p->start); - if (i) - put_partition(state, slot++, start, next - start); - start = next; - } - - if (i != 0) { - sector_t size; - - size = get_capacity(state->bdev->bd_disk); - put_partition(state, slot++, start, size - start); - strlcat(state->pp_buf, "\n", PAGE_SIZE); - } - - return i ? 1 : 0; -} -#endif diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h deleted file mode 100644 index ede8285..0000000 --- a/fs/partitions/acorn.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * linux/fs/partitions/acorn.h - * - * Copyright (C) 1996-2001 Russell King. - * - * I _hate_ this partitioning mess - why can't we have one defined - * format, and everyone stick to it? - */ - -int adfspart_check_CUMANA(struct parsed_partitions *state); -int adfspart_check_ADFS(struct parsed_partitions *state); -int adfspart_check_ICS(struct parsed_partitions *state); -int adfspart_check_POWERTEC(struct parsed_partitions *state); -int adfspart_check_EESOX(struct parsed_partitions *state); diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c deleted file mode 100644 index 70cbf44..0000000 --- a/fs/partitions/amiga.c +++ /dev/null @@ -1,139 +0,0 @@ -/* - * fs/partitions/amiga.c - * - * Code extracted from drivers/block/genhd.c - * - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - */ - -#include <linux/types.h> -#include <linux/affs_hardblocks.h> - -#include "check.h" -#include "amiga.h" - -static __inline__ u32 -checksum_block(__be32 *m, int size) -{ - u32 sum = 0; - - while (size--) - sum += be32_to_cpu(*m++); - return sum; -} - -int amiga_partition(struct parsed_partitions *state) -{ - Sector sect; - unsigned char *data; - struct RigidDiskBlock *rdb; - struct PartitionBlock *pb; - int start_sect, nr_sects, blk, part, res = 0; - int blksize = 1; /* Multiplier for disk block size */ - int slot = 1; - char b[BDEVNAME_SIZE]; - - for (blk = 0; ; blk++, put_dev_sector(sect)) { - if (blk == RDB_ALLOCATION_LIMIT) - goto rdb_done; - data = read_part_sector(state, blk, §); - if (!data) { - if (warn_no_part) - printk("Dev %s: unable to read RDB block %d\n", - bdevname(state->bdev, b), blk); - res = -1; - goto rdb_done; - } - if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK)) - continue; - - rdb = (struct RigidDiskBlock *)data; - if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0) - break; - /* Try again with 0xdc..0xdf zeroed, Windows might have - * trashed it. - */ - *(__be32 *)(data+0xdc) = 0; - if (checksum_block((__be32 *)data, - be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { - printk("Warning: Trashed word at 0xd0 in block %d " - "ignored in checksum calculation\n",blk); - break; - } - - printk("Dev %s: RDB in block %d has bad checksum\n", - bdevname(state->bdev, b), blk); - } - - /* blksize is blocks per 512 byte standard block */ - blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; - - { - char tmp[7 + 10 + 1 + 1]; - - /* Be more informative */ - snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - blk = be32_to_cpu(rdb->rdb_PartitionList); - put_dev_sector(sect); - for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { - blk *= blksize; /* Read in terms partition table understands */ - data = read_part_sector(state, blk, §); - if (!data) { - if (warn_no_part) - printk("Dev %s: unable to read partition block %d\n", - bdevname(state->bdev, b), blk); - res = -1; - goto rdb_done; - } - pb = (struct PartitionBlock *)data; - blk = be32_to_cpu(pb->pb_Next); - if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION)) - continue; - if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) - continue; - - /* Tell Kernel about it */ - - nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 - - be32_to_cpu(pb->pb_Environment[9])) * - be32_to_cpu(pb->pb_Environment[3]) * - be32_to_cpu(pb->pb_Environment[5]) * - blksize; - if (!nr_sects) - continue; - start_sect = be32_to_cpu(pb->pb_Environment[9]) * - be32_to_cpu(pb->pb_Environment[3]) * - be32_to_cpu(pb->pb_Environment[5]) * - blksize; - put_partition(state,slot++,start_sect,nr_sects); - { - /* Be even more informative to aid mounting */ - char dostype[4]; - char tmp[42]; - - __be32 *dt = (__be32 *)dostype; - *dt = pb->pb_Environment[16]; - if (dostype[3] < ' ') - snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", - dostype[0], dostype[1], - dostype[2], dostype[3] + '@' ); - else - snprintf(tmp, sizeof(tmp), " (%c%c%c%c)", - dostype[0], dostype[1], - dostype[2], dostype[3]); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - snprintf(tmp, sizeof(tmp), "(res %d spb %d)", - be32_to_cpu(pb->pb_Environment[6]), - be32_to_cpu(pb->pb_Environment[4])); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - res = 1; - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - -rdb_done: - return res; -} diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h deleted file mode 100644 index d094585..0000000 --- a/fs/partitions/amiga.h +++ /dev/null @@ -1,6 +0,0 @@ -/* - * fs/partitions/amiga.h - */ - -int amiga_partition(struct parsed_partitions *state); - diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c deleted file mode 100644 index 9875b05..0000000 --- a/fs/partitions/atari.c +++ /dev/null @@ -1,149 +0,0 @@ -/* - * fs/partitions/atari.c - * - * Code extracted from drivers/block/genhd.c - * - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - */ - -#include <linux/ctype.h> -#include "check.h" -#include "atari.h" - -/* ++guenther: this should be settable by the user ("make config")?. - */ -#define ICD_PARTS - -/* check if a partition entry looks valid -- Atari format is assumed if at - least one of the primary entries is ok this way */ -#define VALID_PARTITION(pi,hdsiz) \ - (((pi)->flg & 1) && \ - isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \ - be32_to_cpu((pi)->st) <= (hdsiz) && \ - be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz)) - -static inline int OK_id(char *s) -{ - return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 || - memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 || - memcmp (s, "RAW", 3) == 0 ; -} - -int atari_partition(struct parsed_partitions *state) -{ - Sector sect; - struct rootsector *rs; - struct partition_info *pi; - u32 extensect; - u32 hd_size; - int slot; -#ifdef ICD_PARTS - int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ -#endif - - rs = read_part_sector(state, 0, §); - if (!rs) - return -1; - - /* Verify this is an Atari rootsector: */ - hd_size = state->bdev->bd_inode->i_size >> 9; - if (!VALID_PARTITION(&rs->part[0], hd_size) && - !VALID_PARTITION(&rs->part[1], hd_size) && - !VALID_PARTITION(&rs->part[2], hd_size) && - !VALID_PARTITION(&rs->part[3], hd_size)) { - /* - * if there's no valid primary partition, assume that no Atari - * format partition table (there's no reliable magic or the like - * :-() - */ - put_dev_sector(sect); - return 0; - } - - pi = &rs->part[0]; - strlcat(state->pp_buf, " AHDI", PAGE_SIZE); - for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { - struct rootsector *xrs; - Sector sect2; - ulong partsect; - - if ( !(pi->flg & 1) ) - continue; - /* active partition */ - if (memcmp (pi->id, "XGM", 3) != 0) { - /* we don't care about other id's */ - put_partition (state, slot, be32_to_cpu(pi->st), - be32_to_cpu(pi->siz)); - continue; - } - /* extension partition */ -#ifdef ICD_PARTS - part_fmt = 1; -#endif - strlcat(state->pp_buf, " XGM<", PAGE_SIZE); - partsect = extensect = be32_to_cpu(pi->st); - while (1) { - xrs = read_part_sector(state, partsect, §2); - if (!xrs) { - printk (" block %ld read failed\n", partsect); - put_dev_sector(sect); - return -1; - } - - /* ++roman: sanity check: bit 0 of flg field must be set */ - if (!(xrs->part[0].flg & 1)) { - printk( "\nFirst sub-partition in extended partition is not valid!\n" ); - put_dev_sector(sect2); - break; - } - - put_partition(state, slot, - partsect + be32_to_cpu(xrs->part[0].st), - be32_to_cpu(xrs->part[0].siz)); - - if (!(xrs->part[1].flg & 1)) { - /* end of linked partition list */ - put_dev_sector(sect2); - break; - } - if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) { - printk("\nID of extended partition is not XGM!\n"); - put_dev_sector(sect2); - break; - } - - partsect = be32_to_cpu(xrs->part[1].st) + extensect; - put_dev_sector(sect2); - if (++slot == state->limit) { - printk( "\nMaximum number of partitions reached!\n" ); - break; - } - } - strlcat(state->pp_buf, " >", PAGE_SIZE); - } -#ifdef ICD_PARTS - if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ - pi = &rs->icdpart[0]; - /* sanity check: no ICD format if first partition invalid */ - if (OK_id(pi->id)) { - strlcat(state->pp_buf, " ICD<", PAGE_SIZE); - for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { - /* accept only GEM,BGM,RAW,LNX,SWP partitions */ - if (!((pi->flg & 1) && OK_id(pi->id))) - continue; - part_fmt = 2; - put_partition (state, slot, - be32_to_cpu(pi->st), - be32_to_cpu(pi->siz)); - } - strlcat(state->pp_buf, " >", PAGE_SIZE); - } - } -#endif - put_dev_sector(sect); - - strlcat(state->pp_buf, "\n", PAGE_SIZE); - - return 1; -} diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h deleted file mode 100644 index fe2d32a..0000000 --- a/fs/partitions/atari.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * fs/partitions/atari.h - * Moved by Russell King from: - * - * linux/include/linux/atari_rootsec.h - * definitions for Atari Rootsector layout - * by Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de) - * - * modified for ICD/Supra partitioning scheme restricted to at most 12 - * partitions - * by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de) - */ - -struct partition_info -{ - u8 flg; /* bit 0: active; bit 7: bootable */ - char id[3]; /* "GEM", "BGM", "XGM", or other */ - __be32 st; /* start of partition */ - __be32 siz; /* length of partition */ -}; - -struct rootsector -{ - char unused[0x156]; /* room for boot code */ - struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */ - char unused2[0xc]; - u32 hd_siz; /* size of disk in blocks */ - struct partition_info part[4]; - u32 bsl_st; /* start of bad sector list */ - u32 bsl_cnt; /* length of bad sector list */ - u16 checksum; /* checksum for bootable disks */ -} __attribute__((__packed__)); - -int atari_partition(struct parsed_partitions *state); diff --git a/fs/partitions/check.c b/fs/partitions/check.c deleted file mode 100644 index e3c63d1..0000000 --- a/fs/partitions/check.c +++ /dev/null @@ -1,687 +0,0 @@ -/* - * fs/partitions/check.c - * - * Code extracted from drivers/block/genhd.c - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - * - * We now have independent partition support from the - * block drivers, which allows all the partition code to - * be grouped in one location, and it to be mostly self - * contained. - * - * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl} - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/kmod.h> -#include <linux/ctype.h> -#include <linux/genhd.h> -#include <linux/blktrace_api.h> - -#include "check.h" - -#include "acorn.h" -#include "amiga.h" -#include "atari.h" -#include "ldm.h" -#include "mac.h" -#include "msdos.h" -#include "osf.h" -#include "sgi.h" -#include "sun.h" -#include "ibm.h" -#include "ultrix.h" -#include "efi.h" -#include "karma.h" -#include "sysv68.h" - -#ifdef CONFIG_BLK_DEV_MD -extern void md_autodetect_dev(dev_t dev); -#endif - -int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ - -static int (*check_part[])(struct parsed_partitions *) = { - /* - * Probe partition formats with tables at disk address 0 - * that also have an ADFS boot block at 0xdc0. - */ -#ifdef CONFIG_ACORN_PARTITION_ICS - adfspart_check_ICS, -#endif -#ifdef CONFIG_ACORN_PARTITION_POWERTEC - adfspart_check_POWERTEC, -#endif -#ifdef CONFIG_ACORN_PARTITION_EESOX - adfspart_check_EESOX, -#endif - - /* - * Now move on to formats that only have partition info at - * disk address 0xdc0. Since these may also have stale - * PC/BIOS partition tables, they need to come before - * the msdos entry. - */ -#ifdef CONFIG_ACORN_PARTITION_CUMANA - adfspart_check_CUMANA, -#endif -#ifdef CONFIG_ACORN_PARTITION_ADFS - adfspart_check_ADFS, -#endif - -#ifdef CONFIG_EFI_PARTITION - efi_partition, /* this must come before msdos */ -#endif -#ifdef CONFIG_SGI_PARTITION - sgi_partition, -#endif -#ifdef CONFIG_LDM_PARTITION - ldm_partition, /* this must come before msdos */ -#endif -#ifdef CONFIG_MSDOS_PARTITION - msdos_partition, -#endif -#ifdef CONFIG_OSF_PARTITION - osf_partition, -#endif -#ifdef CONFIG_SUN_PARTITION - sun_partition, -#endif -#ifdef CONFIG_AMIGA_PARTITION - amiga_partition, -#endif -#ifdef CONFIG_ATARI_PARTITION - atari_partition, -#endif -#ifdef CONFIG_MAC_PARTITION - mac_partition, -#endif -#ifdef CONFIG_ULTRIX_PARTITION - ultrix_partition, -#endif -#ifdef CONFIG_IBM_PARTITION - ibm_partition, -#endif -#ifdef CONFIG_KARMA_PARTITION - karma_partition, -#endif -#ifdef CONFIG_SYSV68_PARTITION - sysv68_partition, -#endif - NULL -}; - -/* - * disk_name() is used by partition check code and the genhd driver. - * It formats the devicename of the indicated disk into - * the supplied buffer (of size at least 32), and returns - * a pointer to that same buffer (for convenience). - */ - -char *disk_name(struct gendisk *hd, int partno, char *buf) -{ - if (!partno) - snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); - else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) - snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); - else - snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); - - return buf; -} - -const char *bdevname(struct block_device *bdev, char *buf) -{ - return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); -} - -EXPORT_SYMBOL(bdevname); - -/* - * There's very little reason to use this, you should really - * have a struct block_device just about everywhere and use - * bdevname() instead. - */ -const char *__bdevname(dev_t dev, char *buffer) -{ - scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)", - MAJOR(dev), MINOR(dev)); - return buffer; -} - -EXPORT_SYMBOL(__bdevname); - -static struct parsed_partitions * -check_partition(struct gendisk *hd, struct block_device *bdev) -{ - struct parsed_partitions *state; - int i, res, err; - - state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); - if (!state) - return NULL; - state->pp_buf = (char *)__get_free_page(GFP_KERNEL); - if (!state->pp_buf) { - kfree(state); - return NULL; - } - state->pp_buf[0] = '\0'; - - state->bdev = bdev; - disk_name(hd, 0, state->name); - snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); - if (isdigit(state->name[strlen(state->name)-1])) - sprintf(state->name, "p"); - - state->limit = disk_max_parts(hd); - i = res = err = 0; - while (!res && check_part[i]) { - memset(&state->parts, 0, sizeof(state->parts)); - res = check_part[i++](state); - if (res < 0) { - /* We have hit an I/O error which we don't report now. - * But record it, and let the others do their job. - */ - err = res; - res = 0; - } - - } - if (res > 0) { - printk(KERN_INFO "%s", state->pp_buf); - - free_page((unsigned long)state->pp_buf); - return state; - } - if (state->access_beyond_eod) - err = -ENOSPC; - if (err) - /* The partition is unrecognized. So report I/O errors if there were any */ - res = err; - if (!res) - strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE); - else if (warn_no_part) - strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); - - printk(KERN_INFO "%s", state->pp_buf); - - free_page((unsigned long)state->pp_buf); - kfree(state); - return ERR_PTR(res); -} - -static ssize_t part_partition_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->partno); -} - -static ssize_t part_start_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); -} - -ssize_t part_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); -} - -static ssize_t part_ro_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->policy ? 1 : 0); -} - -static ssize_t part_alignment_offset_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); -} - -static ssize_t part_discard_alignment_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%u\n", p->discard_alignment); -} - -ssize_t part_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - int cpu; - - cpu = part_stat_lock(); - part_round_stats(cpu, p); - part_stat_unlock(); - return sprintf(buf, - "%8lu %8lu %8llu %8u " - "%8lu %8lu %8llu %8u " - "%8u %8u %8u" - "\n", - part_stat_read(p, ios[READ]), - part_stat_read(p, merges[READ]), - (unsigned long long)part_stat_read(p, sectors[READ]), - jiffies_to_msecs(part_stat_read(p, ticks[READ])), - part_stat_read(p, ios[WRITE]), - part_stat_read(p, merges[WRITE]), - (unsigned long long)part_stat_read(p, sectors[WRITE]), - jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - part_in_flight(p), - jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue))); -} - -ssize_t part_inflight_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]), - atomic_read(&p->in_flight[1])); -} - -#ifdef CONFIG_FAIL_MAKE_REQUEST -ssize_t part_fail_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->make_it_fail); -} - -ssize_t part_fail_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct hd_struct *p = dev_to_part(dev); - int i; - - if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->make_it_fail = (i == 0) ? 0 : 1; - - return count; -} -#endif - -static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); -static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); -static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); -static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL); -static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); -static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, - NULL); -static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); -static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); -#ifdef CONFIG_FAIL_MAKE_REQUEST -static struct device_attribute dev_attr_fail = - __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); -#endif - -static struct attribute *part_attrs[] = { - &dev_attr_partition.attr, - &dev_attr_start.attr, - &dev_attr_size.attr, - &dev_attr_ro.attr, - &dev_attr_alignment_offset.attr, - &dev_attr_discard_alignment.attr, - &dev_attr_stat.attr, - &dev_attr_inflight.attr, -#ifdef CONFIG_FAIL_MAKE_REQUEST - &dev_attr_fail.attr, -#endif - NULL -}; - -static struct attribute_group part_attr_group = { - .attrs = part_attrs, -}; - -static const struct attribute_group *part_attr_groups[] = { - &part_attr_group, -#ifdef CONFIG_BLK_DEV_IO_TRACE - &blk_trace_attr_group, -#endif - NULL -}; - -static void part_release(struct device *dev) -{ - struct hd_struct *p = dev_to_part(dev); - free_part_stats(p); - free_part_info(p); - kfree(p); -} - -struct device_type part_type = { - .name = "partition", - .groups = part_attr_groups, - .release = part_release, -}; - -static void delete_partition_rcu_cb(struct rcu_head *head) -{ - struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); - - part->start_sect = 0; - part->nr_sects = 0; - part_stat_set_all(part, 0); - put_device(part_to_dev(part)); -} - -void __delete_partition(struct hd_struct *part) -{ - call_rcu(&part->rcu_head, delete_partition_rcu_cb); -} - -void delete_partition(struct gendisk *disk, int partno) -{ - struct disk_part_tbl *ptbl = disk->part_tbl; - struct hd_struct *part; - - if (partno >= ptbl->len) - return; - - part = ptbl->part[partno]; - if (!part) - return; - - blk_free_devt(part_devt(part)); - rcu_assign_pointer(ptbl->part[partno], NULL); - rcu_assign_pointer(ptbl->last_lookup, NULL); - kobject_put(part->holder_dir); - device_del(part_to_dev(part)); - - hd_struct_put(part); -} - -static ssize_t whole_disk_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return 0; -} -static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, - whole_disk_show, NULL); - -struct hd_struct *add_partition(struct gendisk *disk, int partno, - sector_t start, sector_t len, int flags, - struct partition_meta_info *info) -{ - struct hd_struct *p; - dev_t devt = MKDEV(0, 0); - struct device *ddev = disk_to_dev(disk); - struct device *pdev; - struct disk_part_tbl *ptbl; - const char *dname; - int err; - - err = disk_expand_part_tbl(disk, partno); - if (err) - return ERR_PTR(err); - ptbl = disk->part_tbl; - - if (ptbl->part[partno]) - return ERR_PTR(-EBUSY); - - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return ERR_PTR(-EBUSY); - - if (!init_part_stats(p)) { - err = -ENOMEM; - goto out_free; - } - pdev = part_to_dev(p); - - p->start_sect = start; - p->alignment_offset = - queue_limit_alignment_offset(&disk->queue->limits, start); - p->discard_alignment = - queue_limit_discard_alignment(&disk->queue->limits, start); - p->nr_sects = len; - p->partno = partno; - p->policy = get_disk_ro(disk); - - if (info) { - struct partition_meta_info *pinfo = alloc_part_info(disk); - if (!pinfo) - goto out_free_stats; - memcpy(pinfo, info, sizeof(*info)); - p->info = pinfo; - } - - dname = dev_name(ddev); - if (isdigit(dname[strlen(dname) - 1])) - dev_set_name(pdev, "%sp%d", dname, partno); - else - dev_set_name(pdev, "%s%d", dname, partno); - - device_initialize(pdev); - pdev->class = &block_class; - pdev->type = &part_type; - pdev->parent = ddev; - - err = blk_alloc_devt(p, &devt); - if (err) - goto out_free_info; - pdev->devt = devt; - - /* delay uevent until 'holders' subdir is created */ - dev_set_uevent_suppress(pdev, 1); - err = device_add(pdev); - if (err) - goto out_put; - - err = -ENOMEM; - p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); - if (!p->holder_dir) - goto out_del; - - dev_set_uevent_suppress(pdev, 0); - if (flags & ADDPART_FLAG_WHOLEDISK) { - err = device_create_file(pdev, &dev_attr_whole_disk); - if (err) - goto out_del; - } - - /* everything is up and running, commence */ - rcu_assign_pointer(ptbl->part[partno], p); - - /* suppress uevent if the disk suppresses it */ - if (!dev_get_uevent_suppress(ddev)) - kobject_uevent(&pdev->kobj, KOBJ_ADD); - - hd_ref_init(p); - return p; - -out_free_info: - free_part_info(p); -out_free_stats: - free_part_stats(p); -out_free: - kfree(p); - return ERR_PTR(err); -out_del: - kobject_put(p->holder_dir); - device_del(pdev); -out_put: - put_device(pdev); - blk_free_devt(devt); - return ERR_PTR(err); -} - -static bool disk_unlock_native_capacity(struct gendisk *disk) -{ - const struct block_device_operations *bdops = disk->fops; - - if (bdops->unlock_native_capacity && - !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { - printk(KERN_CONT "enabling native capacity\n"); - bdops->unlock_native_capacity(disk); - disk->flags |= GENHD_FL_NATIVE_CAPACITY; - return true; - } else { - printk(KERN_CONT "truncated\n"); - return false; - } -} - -int rescan_partitions(struct gendisk *disk, struct block_device *bdev) -{ - struct parsed_partitions *state = NULL; - struct disk_part_iter piter; - struct hd_struct *part; - int p, highest, res; -rescan: - if (state && !IS_ERR(state)) { - kfree(state); - state = NULL; - } - - if (bdev->bd_part_count) - return -EBUSY; - res = invalidate_partition(disk, 0); - if (res) - return res; - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) - delete_partition(disk, part->partno); - disk_part_iter_exit(&piter); - - if (disk->fops->revalidate_disk) - disk->fops->revalidate_disk(disk); - check_disk_size_change(disk, bdev); - bdev->bd_invalidated = 0; - if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) - return 0; - if (IS_ERR(state)) { - /* - * I/O error reading the partition table. If any - * partition code tried to read beyond EOD, retry - * after unlocking native capacity. - */ - if (PTR_ERR(state) == -ENOSPC) { - printk(KERN_WARNING "%s: partition table beyond EOD, ", - disk->disk_name); - if (disk_unlock_native_capacity(disk)) - goto rescan; - } - return -EIO; - } - /* - * If any partition code tried to read beyond EOD, try - * unlocking native capacity even if partition table is - * successfully read as we could be missing some partitions. - */ - if (state->access_beyond_eod) { - printk(KERN_WARNING - "%s: partition table partially beyond EOD, ", - disk->disk_name); - if (disk_unlock_native_capacity(disk)) - goto rescan; - } - - /* tell userspace that the media / partition table may have changed */ - kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - - /* Detect the highest partition number and preallocate - * disk->part_tbl. This is an optimization and not strictly - * necessary. - */ - for (p = 1, highest = 0; p < state->limit; p++) - if (state->parts[p].size) - highest = p; - - disk_expand_part_tbl(disk, highest); - - /* add partitions */ - for (p = 1; p < state->limit; p++) { - sector_t size, from; - struct partition_meta_info *info = NULL; - - size = state->parts[p].size; - if (!size) - continue; - - from = state->parts[p].from; - if (from >= get_capacity(disk)) { - printk(KERN_WARNING - "%s: p%d start %llu is beyond EOD, ", - disk->disk_name, p, (unsigned long long) from); - if (disk_unlock_native_capacity(disk)) - goto rescan; - continue; - } - - if (from + size > get_capacity(disk)) { - printk(KERN_WARNING - "%s: p%d size %llu extends beyond EOD, ", - disk->disk_name, p, (unsigned long long) size); - - if (disk_unlock_native_capacity(disk)) { - /* free state and restart */ - goto rescan; - } else { - /* - * we can not ignore partitions of broken tables - * created by for example camera firmware, but - * we limit them to the end of the disk to avoid - * creating invalid block devices - */ - size = get_capacity(disk) - from; - } - } - - if (state->parts[p].has_info) - info = &state->parts[p].info; - part = add_partition(disk, p, from, size, - state->parts[p].flags, - &state->parts[p].info); - if (IS_ERR(part)) { - printk(KERN_ERR " %s: p%d could not be added: %ld\n", - disk->disk_name, p, -PTR_ERR(part)); - continue; - } -#ifdef CONFIG_BLK_DEV_MD - if (state->parts[p].flags & ADDPART_FLAG_RAID) - md_autodetect_dev(part_to_dev(part)->devt); -#endif - } - kfree(state); - return 0; -} - -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) -{ - struct address_space *mapping = bdev->bd_inode->i_mapping; - struct page *page; - - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), - NULL); - if (!IS_ERR(page)) { - if (PageError(page)) - goto fail; - p->v = page; - return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9); -fail: - page_cache_release(page); - } - p->v = NULL; - return NULL; -} - -EXPORT_SYMBOL(read_dev_sector); diff --git a/fs/partitions/check.h b/fs/partitions/check.h deleted file mode 100644 index d68bf4d..0000000 --- a/fs/partitions/check.h +++ /dev/null @@ -1,49 +0,0 @@ -#include <linux/pagemap.h> -#include <linux/blkdev.h> -#include <linux/genhd.h> - -/* - * add_gd_partition adds a partitions details to the devices partition - * description. - */ -struct parsed_partitions { - struct block_device *bdev; - char name[BDEVNAME_SIZE]; - struct { - sector_t from; - sector_t size; - int flags; - bool has_info; - struct partition_meta_info info; - } parts[DISK_MAX_PARTS]; - int next; - int limit; - bool access_beyond_eod; - char *pp_buf; -}; - -static inline void *read_part_sector(struct parsed_partitions *state, - sector_t n, Sector *p) -{ - if (n >= get_capacity(state->bdev->bd_disk)) { - state->access_beyond_eod = true; - return NULL; - } - return read_dev_sector(state->bdev, n, p); -} - -static inline void -put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) -{ - if (n < p->limit) { - char tmp[1 + BDEVNAME_SIZE + 10 + 1]; - - p->parts[n].from = from; - p->parts[n].size = size; - snprintf(tmp, sizeof(tmp), " %s%d", p->name, n); - strlcat(p->pp_buf, tmp, PAGE_SIZE); - } -} - -extern int warn_no_part; - diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c deleted file mode 100644 index 6296b40..0000000 --- a/fs/partitions/efi.c +++ /dev/null @@ -1,675 +0,0 @@ -/************************************************************ - * EFI GUID Partition Table handling - * - * http://www.uefi.org/specs/ - * http://www.intel.com/technology/efi/ - * - * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com> - * Copyright 2000,2001,2002,2004 Dell Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * - * TODO: - * - * Changelog: - * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com> - * - test for valid PMBR and valid PGPT before ever reading - * AGPT, allow override with 'gpt' kernel command line option. - * - check for first/last_usable_lba outside of size of disk - * - * Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com> - * - Ported to 2.5.7-pre1 and 2.5.7-dj2 - * - Applied patch to avoid fault in alternate header handling - * - cleaned up find_valid_gpt - * - On-disk structure and copy in memory is *always* LE now - - * swab fields as needed - * - remove print_gpt_header() - * - only use first max_p partition entries, to keep the kernel minor number - * and partition numbers tied. - * - * Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com> - * - Removed __PRIPTR_PREFIX - not being used - * - * Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com> - * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied - * - * Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com> - * - Added compare_gpts(). - * - moved le_efi_guid_to_cpus() back into this file. GPT is the only - * thing that keeps EFI GUIDs on disk. - * - Changed gpt structure names and members to be simpler and more Linux-like. - * - * Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com> - * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck - * - * Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com> - * - Changed function comments to DocBook style per Andreas Dilger suggestion. - * - * Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com> - * - Change read_lba() to use the page cache per Al Viro's work. - * - print u64s properly on all architectures - * - fixed debug_printk(), now Dprintk() - * - * Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com> - * - Style cleanups - * - made most functions static - * - Endianness addition - * - remove test for second alternate header, as it's not per spec, - * and is unnecessary. There's now a method to read/write the last - * sector of an odd-sized disk from user space. No tools have ever - * been released which used this code, so it's effectively dead. - * - Per Asit Mallick of Intel, added a test for a valid PMBR. - * - Added kernel command line option 'gpt' to override valid PMBR test. - * - * Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com> - * - added devfs volume UUID support (/dev/volumes/uuids) for - * mounting file systems by the partition GUID. - * - * Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com> - * - Moved crc32() to linux/lib, added efi_crc32(). - * - * Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com> - * - Replaced Intel's CRC32 function with an equivalent - * non-license-restricted version. - * - * Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com> - * - Fixed the last_lba() call to return the proper last block - * - * Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com> - * - Thanks to Andries Brouwer for his debugging assistance. - * - Code works, detects all the partitions. - * - ************************************************************/ -#include <linux/crc32.h> -#include <linux/ctype.h> -#include <linux/math64.h> -#include <linux/slab.h> -#include "check.h" -#include "efi.h" - -/* This allows a kernel command line option 'gpt' to override - * the test for invalid PMBR. Not __initdata because reloading - * the partition tables happens after init too. - */ -static int force_gpt; -static int __init -force_gpt_fn(char *str) -{ - force_gpt = 1; - return 1; -} -__setup("gpt", force_gpt_fn); - - -/** - * efi_crc32() - EFI version of crc32 function - * @buf: buffer to calculate crc32 of - * @len - length of buf - * - * Description: Returns EFI-style CRC32 value for @buf - * - * This function uses the little endian Ethernet polynomial - * but seeds the function with ~0, and xor's with ~0 at the end. - * Note, the EFI Specification, v1.02, has a reference to - * Dr. Dobbs Journal, May 1994 (actually it's in May 1992). - */ -static inline u32 -efi_crc32(const void *buf, unsigned long len) -{ - return (crc32(~0L, buf, len) ^ ~0L); -} - -/** - * last_lba(): return number of last logical block of device - * @bdev: block device - * - * Description: Returns last LBA value on success, 0 on error. - * This is stored (by sd and ide-geometry) in - * the part[0] entry for this disk, and is the number of - * physical sectors available on the disk. - */ -static u64 last_lba(struct block_device *bdev) -{ - if (!bdev || !bdev->bd_inode) - return 0; - return div_u64(bdev->bd_inode->i_size, - bdev_logical_block_size(bdev)) - 1ULL; -} - -static inline int -pmbr_part_valid(struct partition *part) -{ - if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT && - le32_to_cpu(part->start_sect) == 1UL) - return 1; - return 0; -} - -/** - * is_pmbr_valid(): test Protective MBR for validity - * @mbr: pointer to a legacy mbr structure - * - * Description: Returns 1 if PMBR is valid, 0 otherwise. - * Validity depends on two things: - * 1) MSDOS signature is in the last two bytes of the MBR - * 2) One partition of type 0xEE is found - */ -static int -is_pmbr_valid(legacy_mbr *mbr) -{ - int i; - if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) - return 0; - for (i = 0; i < 4; i++) - if (pmbr_part_valid(&mbr->partition_record[i])) - return 1; - return 0; -} - -/** - * read_lba(): Read bytes from disk, starting at given LBA - * @state - * @lba - * @buffer - * @size_t - * - * Description: Reads @count bytes from @state->bdev into @buffer. - * Returns number of bytes read on success, 0 on error. - */ -static size_t read_lba(struct parsed_partitions *state, - u64 lba, u8 *buffer, size_t count) -{ - size_t totalreadcount = 0; - struct block_device *bdev = state->bdev; - sector_t n = lba * (bdev_logical_block_size(bdev) / 512); - - if (!buffer || lba > last_lba(bdev)) - return 0; - - while (count) { - int copied = 512; - Sector sect; - unsigned char *data = read_part_sector(state, n++, §); - if (!data) - break; - if (copied > count) - copied = count; - memcpy(buffer, data, copied); - put_dev_sector(sect); - buffer += copied; - totalreadcount +=copied; - count -= copied; - } - return totalreadcount; -} - -/** - * alloc_read_gpt_entries(): reads partition entries from disk - * @state - * @gpt - GPT header - * - * Description: Returns ptes on success, NULL on error. - * Allocates space for PTEs based on information found in @gpt. - * Notes: remember to free pte when you're done! - */ -static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, - gpt_header *gpt) -{ - size_t count; - gpt_entry *pte; - - if (!gpt) - return NULL; - - count = le32_to_cpu(gpt->num_partition_entries) * - le32_to_cpu(gpt->sizeof_partition_entry); - if (!count) - return NULL; - pte = kzalloc(count, GFP_KERNEL); - if (!pte) - return NULL; - - if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), - (u8 *) pte, - count) < count) { - kfree(pte); - pte=NULL; - return NULL; - } - return pte; -} - -/** - * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk - * @state - * @lba is the Logical Block Address of the partition table - * - * Description: returns GPT header on success, NULL on error. Allocates - * and fills a GPT header starting at @ from @state->bdev. - * Note: remember to free gpt when finished with it. - */ -static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, - u64 lba) -{ - gpt_header *gpt; - unsigned ssz = bdev_logical_block_size(state->bdev); - - gpt = kzalloc(ssz, GFP_KERNEL); - if (!gpt) - return NULL; - - if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { - kfree(gpt); - gpt=NULL; - return NULL; - } - - return gpt; -} - -/** - * is_gpt_valid() - tests one GPT header and PTEs for validity - * @state - * @lba is the logical block address of the GPT header to test - * @gpt is a GPT header ptr, filled on return. - * @ptes is a PTEs ptr, filled on return. - * - * Description: returns 1 if valid, 0 on error. - * If valid, returns pointers to newly allocated GPT header and PTEs. - */ -static int is_gpt_valid(struct parsed_partitions *state, u64 lba, - gpt_header **gpt, gpt_entry **ptes) -{ - u32 crc, origcrc; - u64 lastlba; - - if (!ptes) - return 0; - if (!(*gpt = alloc_read_gpt_header(state, lba))) - return 0; - - /* Check the GUID Partition Table signature */ - if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { - pr_debug("GUID Partition Table Header signature is wrong:" - "%lld != %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->signature), - (unsigned long long)GPT_HEADER_SIGNATURE); - goto fail; - } - - /* Check the GUID Partition Table header size */ - if (le32_to_cpu((*gpt)->header_size) > - bdev_logical_block_size(state->bdev)) { - pr_debug("GUID Partition Table Header size is wrong: %u > %u\n", - le32_to_cpu((*gpt)->header_size), - bdev_logical_block_size(state->bdev)); - goto fail; - } - - /* Check the GUID Partition Table CRC */ - origcrc = le32_to_cpu((*gpt)->header_crc32); - (*gpt)->header_crc32 = 0; - crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); - - if (crc != origcrc) { - pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", - crc, origcrc); - goto fail; - } - (*gpt)->header_crc32 = cpu_to_le32(origcrc); - - /* Check that the my_lba entry points to the LBA that contains - * the GUID Partition Table */ - if (le64_to_cpu((*gpt)->my_lba) != lba) { - pr_debug("GPT my_lba incorrect: %lld != %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->my_lba), - (unsigned long long)lba); - goto fail; - } - - /* Check the first_usable_lba and last_usable_lba are - * within the disk. - */ - lastlba = last_lba(state->bdev); - if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { - pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), - (unsigned long long)lastlba); - goto fail; - } - if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { - pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), - (unsigned long long)lastlba); - goto fail; - } - - /* Check that sizeof_partition_entry has the correct value */ - if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { - pr_debug("GUID Partitition Entry Size check failed.\n"); - goto fail; - } - - if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) - goto fail; - - /* Check the GUID Partition Entry Array CRC */ - crc = efi_crc32((const unsigned char *) (*ptes), - le32_to_cpu((*gpt)->num_partition_entries) * - le32_to_cpu((*gpt)->sizeof_partition_entry)); - - if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { - pr_debug("GUID Partitition Entry Array CRC check failed.\n"); - goto fail_ptes; - } - - /* We're done, all's well */ - return 1; - - fail_ptes: - kfree(*ptes); - *ptes = NULL; - fail: - kfree(*gpt); - *gpt = NULL; - return 0; -} - -/** - * is_pte_valid() - tests one PTE for validity - * @pte is the pte to check - * @lastlba is last lba of the disk - * - * Description: returns 1 if valid, 0 on error. - */ -static inline int -is_pte_valid(const gpt_entry *pte, const u64 lastlba) -{ - if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) || - le64_to_cpu(pte->starting_lba) > lastlba || - le64_to_cpu(pte->ending_lba) > lastlba) - return 0; - return 1; -} - -/** - * compare_gpts() - Search disk for valid GPT headers and PTEs - * @pgpt is the primary GPT header - * @agpt is the alternate GPT header - * @lastlba is the last LBA number - * Description: Returns nothing. Sanity checks pgpt and agpt fields - * and prints warnings on discrepancies. - * - */ -static void -compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) -{ - int error_found = 0; - if (!pgpt || !agpt) - return; - if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { - printk(KERN_WARNING - "GPT:Primary header LBA != Alt. header alternate_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", - (unsigned long long)le64_to_cpu(pgpt->my_lba), - (unsigned long long)le64_to_cpu(agpt->alternate_lba)); - error_found++; - } - if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { - printk(KERN_WARNING - "GPT:Primary header alternate_lba != Alt. header my_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", - (unsigned long long)le64_to_cpu(pgpt->alternate_lba), - (unsigned long long)le64_to_cpu(agpt->my_lba)); - error_found++; - } - if (le64_to_cpu(pgpt->first_usable_lba) != - le64_to_cpu(agpt->first_usable_lba)) { - printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", - (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), - (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); - error_found++; - } - if (le64_to_cpu(pgpt->last_usable_lba) != - le64_to_cpu(agpt->last_usable_lba)) { - printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", - (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), - (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); - error_found++; - } - if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { - printk(KERN_WARNING "GPT:disk_guids don't match.\n"); - error_found++; - } - if (le32_to_cpu(pgpt->num_partition_entries) != - le32_to_cpu(agpt->num_partition_entries)) { - printk(KERN_WARNING "GPT:num_partition_entries don't match: " - "0x%x != 0x%x\n", - le32_to_cpu(pgpt->num_partition_entries), - le32_to_cpu(agpt->num_partition_entries)); - error_found++; - } - if (le32_to_cpu(pgpt->sizeof_partition_entry) != - le32_to_cpu(agpt->sizeof_partition_entry)) { - printk(KERN_WARNING - "GPT:sizeof_partition_entry values don't match: " - "0x%x != 0x%x\n", - le32_to_cpu(pgpt->sizeof_partition_entry), - le32_to_cpu(agpt->sizeof_partition_entry)); - error_found++; - } - if (le32_to_cpu(pgpt->partition_entry_array_crc32) != - le32_to_cpu(agpt->partition_entry_array_crc32)) { - printk(KERN_WARNING - "GPT:partition_entry_array_crc32 values don't match: " - "0x%x != 0x%x\n", - le32_to_cpu(pgpt->partition_entry_array_crc32), - le32_to_cpu(agpt->partition_entry_array_crc32)); - error_found++; - } - if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", - (unsigned long long)le64_to_cpu(pgpt->alternate_lba), - (unsigned long long)lastlba); - error_found++; - } - - if (le64_to_cpu(agpt->my_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Alternate GPT header not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", - (unsigned long long)le64_to_cpu(agpt->my_lba), - (unsigned long long)lastlba); - error_found++; - } - - if (error_found) - printk(KERN_WARNING - "GPT: Use GNU Parted to correct GPT errors.\n"); - return; -} - -/** - * find_valid_gpt() - Search disk for valid GPT headers and PTEs - * @state - * @gpt is a GPT header ptr, filled on return. - * @ptes is a PTEs ptr, filled on return. - * Description: Returns 1 if valid, 0 on error. - * If valid, returns pointers to newly allocated GPT header and PTEs. - * Validity depends on PMBR being valid (or being overridden by the - * 'gpt' kernel command line option) and finding either the Primary - * GPT header and PTEs valid, or the Alternate GPT header and PTEs - * valid. If the Primary GPT header is not valid, the Alternate GPT header - * is not checked unless the 'gpt' kernel command line option is passed. - * This protects against devices which misreport their size, and forces - * the user to decide to use the Alternate GPT. - */ -static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, - gpt_entry **ptes) -{ - int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; - gpt_header *pgpt = NULL, *agpt = NULL; - gpt_entry *pptes = NULL, *aptes = NULL; - legacy_mbr *legacymbr; - u64 lastlba; - - if (!ptes) - return 0; - - lastlba = last_lba(state->bdev); - if (!force_gpt) { - /* This will be added to the EFI Spec. per Intel after v1.02. */ - legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); - if (legacymbr) { - read_lba(state, 0, (u8 *) legacymbr, - sizeof (*legacymbr)); - good_pmbr = is_pmbr_valid(legacymbr); - kfree(legacymbr); - } - if (!good_pmbr) - goto fail; - } - - good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, - &pgpt, &pptes); - if (good_pgpt) - good_agpt = is_gpt_valid(state, - le64_to_cpu(pgpt->alternate_lba), - &agpt, &aptes); - if (!good_agpt && force_gpt) - good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); - - /* The obviously unsuccessful case */ - if (!good_pgpt && !good_agpt) - goto fail; - - compare_gpts(pgpt, agpt, lastlba); - - /* The good cases */ - if (good_pgpt) { - *gpt = pgpt; - *ptes = pptes; - kfree(agpt); - kfree(aptes); - if (!good_agpt) { - printk(KERN_WARNING - "Alternate GPT is invalid, " - "using primary GPT.\n"); - } - return 1; - } - else if (good_agpt) { - *gpt = agpt; - *ptes = aptes; - kfree(pgpt); - kfree(pptes); - printk(KERN_WARNING - "Primary GPT is invalid, using alternate GPT.\n"); - return 1; - } - - fail: - kfree(pgpt); - kfree(agpt); - kfree(pptes); - kfree(aptes); - *gpt = NULL; - *ptes = NULL; - return 0; -} - -/** - * efi_partition(struct parsed_partitions *state) - * @state - * - * Description: called from check.c, if the disk contains GPT - * partitions, sets up partition entries in the kernel. - * - * If the first block on the disk is a legacy MBR, - * it will get handled by msdos_partition(). - * If it's a Protective MBR, we'll handle it here. - * - * We do not create a Linux partition for GPT, but - * only for the actual data partitions. - * Returns: - * -1 if unable to read the partition table - * 0 if this isn't our partition table - * 1 if successful - * - */ -int efi_partition(struct parsed_partitions *state) -{ - gpt_header *gpt = NULL; - gpt_entry *ptes = NULL; - u32 i; - unsigned ssz = bdev_logical_block_size(state->bdev) / 512; - u8 unparsed_guid[37]; - - if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { - kfree(gpt); - kfree(ptes); - return 0; - } - - pr_debug("GUID Partition Table is valid! Yea!\n"); - - for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { - struct partition_meta_info *info; - unsigned label_count = 0; - unsigned label_max; - u64 start = le64_to_cpu(ptes[i].starting_lba); - u64 size = le64_to_cpu(ptes[i].ending_lba) - - le64_to_cpu(ptes[i].starting_lba) + 1ULL; - - if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) - continue; - - put_partition(state, i+1, start * ssz, size * ssz); - - /* If this is a RAID volume, tell md */ - if (!efi_guidcmp(ptes[i].partition_type_guid, - PARTITION_LINUX_RAID_GUID)) - state->parts[i + 1].flags = ADDPART_FLAG_RAID; - - info = &state->parts[i + 1].info; - /* Instead of doing a manual swap to big endian, reuse the - * common ASCII hex format as the interim. - */ - efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid); - part_pack_uuid(unparsed_guid, info->uuid); - - /* Naively convert UTF16-LE to 7 bits. */ - label_max = min(sizeof(info->volname) - 1, - sizeof(ptes[i].partition_name)); - info->volname[label_max] = 0; - while (label_count < label_max) { - u8 c = ptes[i].partition_name[label_count] & 0xff; - if (c && !isprint(c)) - c = '!'; - info->volname[label_count] = c; - label_count++; - } - state->parts[i + 1].has_info = true; - } - kfree(ptes); - kfree(gpt); - strlcat(state->pp_buf, "\n", PAGE_SIZE); - return 1; -} diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h deleted file mode 100644 index b69ab72..0000000 --- a/fs/partitions/efi.h +++ /dev/null @@ -1,134 +0,0 @@ -/************************************************************ - * EFI GUID Partition Table - * Per Intel EFI Specification v1.02 - * http://developer.intel.com/technology/efi/efi.htm - * - * By Matt Domsch <Matt_Domsch@dell.com> Fri Sep 22 22:15:56 CDT 2000 - * Copyright 2000,2001 Dell Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - ************************************************************/ - -#ifndef FS_PART_EFI_H_INCLUDED -#define FS_PART_EFI_H_INCLUDED - -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/kernel.h> -#include <linux/major.h> -#include <linux/string.h> -#include <linux/efi.h> - -#define MSDOS_MBR_SIGNATURE 0xaa55 -#define EFI_PMBR_OSTYPE_EFI 0xEF -#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE - -#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL -#define GPT_HEADER_REVISION_V1 0x00010000 -#define GPT_PRIMARY_PARTITION_TABLE_LBA 1 - -#define PARTITION_SYSTEM_GUID \ - EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \ - 0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B) -#define LEGACY_MBR_PARTITION_GUID \ - EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \ - 0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F) -#define PARTITION_MSFT_RESERVED_GUID \ - EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \ - 0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE) -#define PARTITION_BASIC_DATA_GUID \ - EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \ - 0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7) -#define PARTITION_LINUX_RAID_GUID \ - EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \ - 0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e) -#define PARTITION_LINUX_SWAP_GUID \ - EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \ - 0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f) -#define PARTITION_LINUX_LVM_GUID \ - EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \ - 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28) - -typedef struct _gpt_header { - __le64 signature; - __le32 revision; - __le32 header_size; - __le32 header_crc32; - __le32 reserved1; - __le64 my_lba; - __le64 alternate_lba; - __le64 first_usable_lba; - __le64 last_usable_lba; - efi_guid_t disk_guid; - __le64 partition_entry_lba; - __le32 num_partition_entries; - __le32 sizeof_partition_entry; - __le32 partition_entry_array_crc32; - - /* The rest of the logical block is reserved by UEFI and must be zero. - * EFI standard handles this by: - * - * uint8_t reserved2[ BlockSize - 92 ]; - */ -} __attribute__ ((packed)) gpt_header; - -typedef struct _gpt_entry_attributes { - u64 required_to_function:1; - u64 reserved:47; - u64 type_guid_specific:16; -} __attribute__ ((packed)) gpt_entry_attributes; - -typedef struct _gpt_entry { - efi_guid_t partition_type_guid; - efi_guid_t unique_partition_guid; - __le64 starting_lba; - __le64 ending_lba; - gpt_entry_attributes attributes; - efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; -} __attribute__ ((packed)) gpt_entry; - -typedef struct _legacy_mbr { - u8 boot_code[440]; - __le32 unique_mbr_signature; - __le16 unknown; - struct partition partition_record[4]; - __le16 signature; -} __attribute__ ((packed)) legacy_mbr; - -/* Functions */ -extern int efi_partition(struct parsed_partitions *state); - -#endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * -------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 4 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -4 - * c-argdecl-indent: 4 - * c-label-offset: -4 - * c-continued-statement-offset: 4 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c deleted file mode 100644 index d513a07..0000000 --- a/fs/partitions/ibm.c +++ /dev/null @@ -1,275 +0,0 @@ -/* - * File...........: linux/fs/partitions/ibm.c - * Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com> - * Volker Sameske <sameske@de.ibm.com> - * Bugreports.to..: <Linux390@de.ibm.com> - * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000 - */ - -#include <linux/buffer_head.h> -#include <linux/hdreg.h> -#include <linux/slab.h> -#include <asm/dasd.h> -#include <asm/ebcdic.h> -#include <asm/uaccess.h> -#include <asm/vtoc.h> - -#include "check.h" -#include "ibm.h" - -/* - * compute the block number from a - * cyl-cyl-head-head structure - */ -static sector_t -cchh2blk (struct vtoc_cchh *ptr, struct hd_geometry *geo) { - - sector_t cyl; - __u16 head; - - /*decode cylinder and heads for large volumes */ - cyl = ptr->hh & 0xFFF0; - cyl <<= 12; - cyl |= ptr->cc; - head = ptr->hh & 0x000F; - return cyl * geo->heads * geo->sectors + - head * geo->sectors; -} - -/* - * compute the block number from a - * cyl-cyl-head-head-block structure - */ -static sector_t -cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) { - - sector_t cyl; - __u16 head; - - /*decode cylinder and heads for large volumes */ - cyl = ptr->hh & 0xFFF0; - cyl <<= 12; - cyl |= ptr->cc; - head = ptr->hh & 0x000F; - return cyl * geo->heads * geo->sectors + - head * geo->sectors + - ptr->b; -} - -/* - */ -int ibm_partition(struct parsed_partitions *state) -{ - struct block_device *bdev = state->bdev; - int blocksize, res; - loff_t i_size, offset, size, fmt_size; - dasd_information2_t *info; - struct hd_geometry *geo; - char type[5] = {0,}; - char name[7] = {0,}; - union label_t { - struct vtoc_volume_label_cdl vol; - struct vtoc_volume_label_ldl lnx; - struct vtoc_cms_label cms; - } *label; - unsigned char *data; - Sector sect; - sector_t labelsect; - char tmp[64]; - - res = 0; - blocksize = bdev_logical_block_size(bdev); - if (blocksize <= 0) - goto out_exit; - i_size = i_size_read(bdev->bd_inode); - if (i_size == 0) - goto out_exit; - - info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); - if (info == NULL) - goto out_exit; - geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL); - if (geo == NULL) - goto out_nogeo; - label = kmalloc(sizeof(union label_t), GFP_KERNEL); - if (label == NULL) - goto out_nolab; - - if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0 || - ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) - goto out_freeall; - - /* - * Special case for FBA disks: label sector does not depend on - * blocksize. - */ - if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) || - (info->cu_type == 0x3880 && info->dev_type == 0x3370)) - labelsect = info->label_block; - else - labelsect = info->label_block * (blocksize >> 9); - - /* - * Get volume label, extract name and type. - */ - data = read_part_sector(state, labelsect, §); - if (data == NULL) - goto out_readerr; - - memcpy(label, data, sizeof(union label_t)); - put_dev_sector(sect); - - if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) { - strncpy(type, label->vol.vollbl, 4); - strncpy(name, label->vol.volid, 6); - } else { - strncpy(type, label->lnx.vollbl, 4); - strncpy(name, label->lnx.volid, 6); - } - EBCASC(type, 4); - EBCASC(name, 6); - - res = 1; - - /* - * Three different formats: LDL, CDL and unformated disk - * - * identified by info->format - * - * unformated disks we do not have to care about - */ - if (info->format == DASD_FORMAT_LDL) { - if (strncmp(type, "CMS1", 4) == 0) { - /* - * VM style CMS1 labeled disk - */ - blocksize = label->cms.block_size; - if (label->cms.disk_offset != 0) { - snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - /* disk is reserved minidisk */ - offset = label->cms.disk_offset; - size = (label->cms.block_count - 1) - * (blocksize >> 9); - } else { - snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - offset = (info->label_block + 1); - size = label->cms.block_count - * (blocksize >> 9); - } - put_partition(state, 1, offset*(blocksize >> 9), - size-offset*(blocksize >> 9)); - } else { - if (strncmp(type, "LNX1", 4) == 0) { - snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - if (label->lnx.ldl_version == 0xf2) { - fmt_size = label->lnx.formatted_blocks - * (blocksize >> 9); - } else if (!strcmp(info->type, "ECKD")) { - /* formated w/o large volume support */ - fmt_size = geo->cylinders * geo->heads - * geo->sectors * (blocksize >> 9); - } else { - /* old label and no usable disk geometry - * (e.g. DIAG) */ - fmt_size = i_size >> 9; - } - size = i_size >> 9; - if (fmt_size < size) - size = fmt_size; - offset = (info->label_block + 1); - } else { - /* unlabeled disk */ - strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); - size = i_size >> 9; - offset = (info->label_block + 1); - } - put_partition(state, 1, offset*(blocksize >> 9), - size-offset*(blocksize >> 9)); - } - } else if (info->format == DASD_FORMAT_CDL) { - /* - * New style CDL formatted disk - */ - sector_t blk; - int counter; - - /* - * check if VOL1 label is available - * if not, something is wrong, skipping partition detection - */ - if (strncmp(type, "VOL1", 4) == 0) { - snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - /* - * get block number and read then go through format1 - * labels - */ - blk = cchhb2blk(&label->vol.vtoc, geo) + 1; - counter = 0; - data = read_part_sector(state, blk * (blocksize/512), - §); - while (data != NULL) { - struct vtoc_format1_label f1; - - memcpy(&f1, data, - sizeof(struct vtoc_format1_label)); - put_dev_sector(sect); - - /* skip FMT4 / FMT5 / FMT7 labels */ - if (f1.DS1FMTID == _ascebc['4'] - || f1.DS1FMTID == _ascebc['5'] - || f1.DS1FMTID == _ascebc['7'] - || f1.DS1FMTID == _ascebc['9']) { - blk++; - data = read_part_sector(state, - blk * (blocksize/512), §); - continue; - } - - /* only FMT1 and 8 labels valid at this point */ - if (f1.DS1FMTID != _ascebc['1'] && - f1.DS1FMTID != _ascebc['8']) - break; - - /* OK, we got valid partition data */ - offset = cchh2blk(&f1.DS1EXT1.llimit, geo); - size = cchh2blk(&f1.DS1EXT1.ulimit, geo) - - offset + geo->sectors; - if (counter >= state->limit) - break; - put_partition(state, counter + 1, - offset * (blocksize >> 9), - size * (blocksize >> 9)); - counter++; - blk++; - data = read_part_sector(state, - blk * (blocksize/512), §); - } - - if (!data) - /* Are we not supposed to report this ? */ - goto out_readerr; - } else - printk(KERN_WARNING "Warning, expected Label VOL1 not " - "found, treating as CDL formated Disk"); - - } - - strlcat(state->pp_buf, "\n", PAGE_SIZE); - goto out_freeall; - - -out_readerr: - res = -1; -out_freeall: - kfree(label); -out_nolab: - kfree(geo); -out_nogeo: - kfree(info); -out_exit: - return res; -} diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h deleted file mode 100644 index 08fb080..0000000 --- a/fs/partitions/ibm.h +++ /dev/null @@ -1 +0,0 @@ -int ibm_partition(struct parsed_partitions *); diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c deleted file mode 100644 index 0ea1931..0000000 --- a/fs/partitions/karma.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * fs/partitions/karma.c - * Rio Karma partition info. - * - * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com) - * based on osf.c - */ - -#include "check.h" -#include "karma.h" - -int karma_partition(struct parsed_partitions *state) -{ - int i; - int slot = 1; - Sector sect; - unsigned char *data; - struct disklabel { - u8 d_reserved[270]; - struct d_partition { - __le32 p_res; - u8 p_fstype; - u8 p_res2[3]; - __le32 p_offset; - __le32 p_size; - } d_partitions[2]; - u8 d_blank[208]; - __le16 d_magic; - } __attribute__((packed)) *label; - struct d_partition *p; - - data = read_part_sector(state, 0, §); - if (!data) - return -1; - - label = (struct disklabel *)data; - if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) { - put_dev_sector(sect); - return 0; - } - - p = label->d_partitions; - for (i = 0 ; i < 2; i++, p++) { - if (slot == state->limit) - break; - - if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) { - put_partition(state, slot, le32_to_cpu(p->p_offset), - le32_to_cpu(p->p_size)); - } - slot++; - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - put_dev_sector(sect); - return 1; -} - diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h deleted file mode 100644 index c764b2e..0000000 --- a/fs/partitions/karma.h +++ /dev/null @@ -1,8 +0,0 @@ -/* - * fs/partitions/karma.h - */ - -#define KARMA_LABEL_MAGIC 0xAB56 - -int karma_partition(struct parsed_partitions *state); - diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c deleted file mode 100644 index af9fdf0..0000000 --- a/fs/partitions/ldm.c +++ /dev/null @@ -1,1568 +0,0 @@ -/** - * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) - * - * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> - * - * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads - * - * This program is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free Software - * Foundation; either version 2 of the License, or (at your option) any later - * version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program (in the main directory of the source in the file COPYING); if - * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, - * Boston, MA 02111-1307 USA - */ - -#include <linux/slab.h> -#include <linux/pagemap.h> -#include <linux/stringify.h> -#include <linux/kernel.h> -#include "ldm.h" -#include "check.h" -#include "msdos.h" - -/** - * ldm_debug/info/error/crit - Output an error message - * @f: A printf format string containing the message - * @...: Variables to substitute into @f - * - * ldm_debug() writes a DEBUG level message to the syslog but only if the - * driver was compiled with debug enabled. Otherwise, the call turns into a NOP. - */ -#ifndef CONFIG_LDM_DEBUG -#define ldm_debug(...) do {} while (0) -#else -#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a) -#endif - -#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a) -#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) -#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) - -__attribute__ ((format (printf, 3, 4))) -static void _ldm_printk (const char *level, const char *function, - const char *fmt, ...) -{ - static char buf[128]; - va_list args; - - va_start (args, fmt); - vsnprintf (buf, sizeof (buf), fmt, args); - va_end (args); - - printk ("%s%s(): %s\n", level, function, buf); -} - -/** - * ldm_parse_hexbyte - Convert a ASCII hex number to a byte - * @src: Pointer to at least 2 characters to convert. - * - * Convert a two character ASCII hex string to a number. - * - * Return: 0-255 Success, the byte was parsed correctly - * -1 Error, an invalid character was supplied - */ -static int ldm_parse_hexbyte (const u8 *src) -{ - unsigned int x; /* For correct wrapping */ - int h; - - /* high part */ - x = h = hex_to_bin(src[0]); - if (h < 0) - return -1; - - /* low part */ - h = hex_to_bin(src[1]); - if (h < 0) - return -1; - - return (x << 4) + h; -} - -/** - * ldm_parse_guid - Convert GUID from ASCII to binary - * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba - * @dest: Memory block to hold binary GUID (16 bytes) - * - * N.B. The GUID need not be NULL terminated. - * - * Return: 'true' @dest contains binary GUID - * 'false' @dest contents are undefined - */ -static bool ldm_parse_guid (const u8 *src, u8 *dest) -{ - static const int size[] = { 4, 2, 2, 2, 6 }; - int i, j, v; - - if (src[8] != '-' || src[13] != '-' || - src[18] != '-' || src[23] != '-') - return false; - - for (j = 0; j < 5; j++, src++) - for (i = 0; i < size[j]; i++, src+=2, *dest++ = v) - if ((v = ldm_parse_hexbyte (src)) < 0) - return false; - - return true; -} - -/** - * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure - * @data: Raw database PRIVHEAD structure loaded from the device - * @ph: In-memory privhead structure in which to return parsed information - * - * This parses the LDM database PRIVHEAD structure supplied in @data and - * sets up the in-memory privhead structure @ph with the obtained information. - * - * Return: 'true' @ph contains the PRIVHEAD data - * 'false' @ph contents are undefined - */ -static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) -{ - bool is_vista = false; - - BUG_ON(!data || !ph); - if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { - ldm_error("Cannot find PRIVHEAD structure. LDM database is" - " corrupt. Aborting."); - return false; - } - ph->ver_major = get_unaligned_be16(data + 0x000C); - ph->ver_minor = get_unaligned_be16(data + 0x000E); - ph->logical_disk_start = get_unaligned_be64(data + 0x011B); - ph->logical_disk_size = get_unaligned_be64(data + 0x0123); - ph->config_start = get_unaligned_be64(data + 0x012B); - ph->config_size = get_unaligned_be64(data + 0x0133); - /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ - if (ph->ver_major == 2 && ph->ver_minor == 12) - is_vista = true; - if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) { - ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d." - " Aborting.", ph->ver_major, ph->ver_minor); - return false; - } - ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major, - ph->ver_minor, is_vista ? "Vista" : "2000/XP"); - if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */ - /* Warn the user and continue, carefully. */ - ldm_info("Database is normally %u bytes, it claims to " - "be %llu bytes.", LDM_DB_SIZE, - (unsigned long long)ph->config_size); - } - if ((ph->logical_disk_size == 0) || (ph->logical_disk_start + - ph->logical_disk_size > ph->config_start)) { - ldm_error("PRIVHEAD disk size doesn't match real disk size"); - return false; - } - if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) { - ldm_error("PRIVHEAD contains an invalid GUID."); - return false; - } - ldm_debug("Parsed PRIVHEAD successfully."); - return true; -} - -/** - * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure - * @data: Raw database TOCBLOCK structure loaded from the device - * @toc: In-memory toc structure in which to return parsed information - * - * This parses the LDM Database TOCBLOCK (table of contents) structure supplied - * in @data and sets up the in-memory tocblock structure @toc with the obtained - * information. - * - * N.B. The *_start and *_size values returned in @toc are not range-checked. - * - * Return: 'true' @toc contains the TOCBLOCK data - * 'false' @toc contents are undefined - */ -static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) -{ - BUG_ON (!data || !toc); - - if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { - ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); - return false; - } - strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); - toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; - toc->bitmap1_start = get_unaligned_be64(data + 0x2E); - toc->bitmap1_size = get_unaligned_be64(data + 0x36); - - if (strncmp (toc->bitmap1_name, TOC_BITMAP1, - sizeof (toc->bitmap1_name)) != 0) { - ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.", - TOC_BITMAP1, toc->bitmap1_name); - return false; - } - strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); - toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; - toc->bitmap2_start = get_unaligned_be64(data + 0x50); - toc->bitmap2_size = get_unaligned_be64(data + 0x58); - if (strncmp (toc->bitmap2_name, TOC_BITMAP2, - sizeof (toc->bitmap2_name)) != 0) { - ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", - TOC_BITMAP2, toc->bitmap2_name); - return false; - } - ldm_debug ("Parsed TOCBLOCK successfully."); - return true; -} - -/** - * ldm_parse_vmdb - Read the LDM Database VMDB structure - * @data: Raw database VMDB structure loaded from the device - * @vm: In-memory vmdb structure in which to return parsed information - * - * This parses the LDM Database VMDB structure supplied in @data and sets up - * the in-memory vmdb structure @vm with the obtained information. - * - * N.B. The *_start, *_size and *_seq values will be range-checked later. - * - * Return: 'true' @vm contains VMDB info - * 'false' @vm contents are undefined - */ -static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) -{ - BUG_ON (!data || !vm); - - if (MAGIC_VMDB != get_unaligned_be32(data)) { - ldm_crit ("Cannot find the VMDB, database may be corrupt."); - return false; - } - - vm->ver_major = get_unaligned_be16(data + 0x12); - vm->ver_minor = get_unaligned_be16(data + 0x14); - if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { - ldm_error ("Expected VMDB version %d.%d, got %d.%d. " - "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); - return false; - } - - vm->vblk_size = get_unaligned_be32(data + 0x08); - if (vm->vblk_size == 0) { - ldm_error ("Illegal VBLK size"); - return false; - } - - vm->vblk_offset = get_unaligned_be32(data + 0x0C); - vm->last_vblk_seq = get_unaligned_be32(data + 0x04); - - ldm_debug ("Parsed VMDB successfully."); - return true; -} - -/** - * ldm_compare_privheads - Compare two privhead objects - * @ph1: First privhead - * @ph2: Second privhead - * - * This compares the two privhead structures @ph1 and @ph2. - * - * Return: 'true' Identical - * 'false' Different - */ -static bool ldm_compare_privheads (const struct privhead *ph1, - const struct privhead *ph2) -{ - BUG_ON (!ph1 || !ph2); - - return ((ph1->ver_major == ph2->ver_major) && - (ph1->ver_minor == ph2->ver_minor) && - (ph1->logical_disk_start == ph2->logical_disk_start) && - (ph1->logical_disk_size == ph2->logical_disk_size) && - (ph1->config_start == ph2->config_start) && - (ph1->config_size == ph2->config_size) && - !memcmp (ph1->disk_id, ph2->disk_id, GUID_SIZE)); -} - -/** - * ldm_compare_tocblocks - Compare two tocblock objects - * @toc1: First toc - * @toc2: Second toc - * - * This compares the two tocblock structures @toc1 and @toc2. - * - * Return: 'true' Identical - * 'false' Different - */ -static bool ldm_compare_tocblocks (const struct tocblock *toc1, - const struct tocblock *toc2) -{ - BUG_ON (!toc1 || !toc2); - - return ((toc1->bitmap1_start == toc2->bitmap1_start) && - (toc1->bitmap1_size == toc2->bitmap1_size) && - (toc1->bitmap2_start == toc2->bitmap2_start) && - (toc1->bitmap2_size == toc2->bitmap2_size) && - !strncmp (toc1->bitmap1_name, toc2->bitmap1_name, - sizeof (toc1->bitmap1_name)) && - !strncmp (toc1->bitmap2_name, toc2->bitmap2_name, - sizeof (toc1->bitmap2_name))); -} - -/** - * ldm_validate_privheads - Compare the primary privhead with its backups - * @state: Partition check state including device holding the LDM Database - * @ph1: Memory struct to fill with ph contents - * - * Read and compare all three privheads from disk. - * - * The privheads on disk show the size and location of the main disk area and - * the configuration area (the database). The values are range-checked against - * @hd, which contains the real size of the disk. - * - * Return: 'true' Success - * 'false' Error - */ -static bool ldm_validate_privheads(struct parsed_partitions *state, - struct privhead *ph1) -{ - static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; - struct privhead *ph[3] = { ph1 }; - Sector sect; - u8 *data; - bool result = false; - long num_sects; - int i; - - BUG_ON (!state || !ph1); - - ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); - ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); - if (!ph[1] || !ph[2]) { - ldm_crit ("Out of memory."); - goto out; - } - - /* off[1 & 2] are relative to ph[0]->config_start */ - ph[0]->config_start = 0; - - /* Read and parse privheads */ - for (i = 0; i < 3; i++) { - data = read_part_sector(state, ph[0]->config_start + off[i], - §); - if (!data) { - ldm_crit ("Disk read failed."); - goto out; - } - result = ldm_parse_privhead (data, ph[i]); - put_dev_sector (sect); - if (!result) { - ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */ - if (i < 2) - goto out; /* Already logged */ - else - break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */ - } - } - - num_sects = state->bdev->bd_inode->i_size >> 9; - - if ((ph[0]->config_start > num_sects) || - ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { - ldm_crit ("Database extends beyond the end of the disk."); - goto out; - } - - if ((ph[0]->logical_disk_start > ph[0]->config_start) || - ((ph[0]->logical_disk_start + ph[0]->logical_disk_size) - > ph[0]->config_start)) { - ldm_crit ("Disk and database overlap."); - goto out; - } - - if (!ldm_compare_privheads (ph[0], ph[1])) { - ldm_crit ("Primary and backup PRIVHEADs don't match."); - goto out; - } - /* FIXME ignore this for now - if (!ldm_compare_privheads (ph[0], ph[2])) { - ldm_crit ("Primary and backup PRIVHEADs don't match."); - goto out; - }*/ - ldm_debug ("Validated PRIVHEADs successfully."); - result = true; -out: - kfree (ph[1]); - kfree (ph[2]); - return result; -} - -/** - * ldm_validate_tocblocks - Validate the table of contents and its backups - * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database - * @ldb: Cache of the database structures - * - * Find and compare the four tables of contents of the LDM Database stored on - * @state->bdev and return the parsed information into @toc1. - * - * The offsets and sizes of the configs are range-checked against a privhead. - * - * Return: 'true' @toc1 contains validated TOCBLOCK info - * 'false' @toc1 contents are undefined - */ -static bool ldm_validate_tocblocks(struct parsed_partitions *state, - unsigned long base, struct ldmdb *ldb) -{ - static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; - struct tocblock *tb[4]; - struct privhead *ph; - Sector sect; - u8 *data; - int i, nr_tbs; - bool result = false; - - BUG_ON(!state || !ldb); - ph = &ldb->ph; - tb[0] = &ldb->toc; - tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); - if (!tb[1]) { - ldm_crit("Out of memory."); - goto err; - } - tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1])); - tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2])); - /* - * Try to read and parse all four TOCBLOCKs. - * - * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so - * skip any that fail as long as we get at least one valid TOCBLOCK. - */ - for (nr_tbs = i = 0; i < 4; i++) { - data = read_part_sector(state, base + off[i], §); - if (!data) { - ldm_error("Disk read failed for TOCBLOCK %d.", i); - continue; - } - if (ldm_parse_tocblock(data, tb[nr_tbs])) - nr_tbs++; - put_dev_sector(sect); - } - if (!nr_tbs) { - ldm_crit("Failed to find a valid TOCBLOCK."); - goto err; - } - /* Range check the TOCBLOCK against a privhead. */ - if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) || - ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) > - ph->config_size)) { - ldm_crit("The bitmaps are out of range. Giving up."); - goto err; - } - /* Compare all loaded TOCBLOCKs. */ - for (i = 1; i < nr_tbs; i++) { - if (!ldm_compare_tocblocks(tb[0], tb[i])) { - ldm_crit("TOCBLOCKs 0 and %d do not match.", i); - goto err; - } - } - ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs); - result = true; -err: - kfree(tb[1]); - return result; -} - -/** - * ldm_validate_vmdb - Read the VMDB and validate it - * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @bdev, of the database - * @ldb: Cache of the database structures - * - * Find the vmdb of the LDM Database stored on @bdev and return the parsed - * information in @ldb. - * - * Return: 'true' @ldb contains validated VBDB info - * 'false' @ldb contents are undefined - */ -static bool ldm_validate_vmdb(struct parsed_partitions *state, - unsigned long base, struct ldmdb *ldb) -{ - Sector sect; - u8 *data; - bool result = false; - struct vmdb *vm; - struct tocblock *toc; - - BUG_ON (!state || !ldb); - - vm = &ldb->vm; - toc = &ldb->toc; - - data = read_part_sector(state, base + OFF_VMDB, §); - if (!data) { - ldm_crit ("Disk read failed."); - return false; - } - - if (!ldm_parse_vmdb (data, vm)) - goto out; /* Already logged */ - - /* Are there uncommitted transactions? */ - if (get_unaligned_be16(data + 0x10) != 0x01) { - ldm_crit ("Database is not in a consistent state. Aborting."); - goto out; - } - - if (vm->vblk_offset != 512) - ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset); - - /* - * The last_vblkd_seq can be before the end of the vmdb, just make sure - * it is not out of bounds. - */ - if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) { - ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. " - "Database is corrupt. Aborting."); - goto out; - } - - result = true; -out: - put_dev_sector (sect); - return result; -} - - -/** - * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk - * @state: Partition check state including device holding the LDM Database - * - * This function provides a weak test to decide whether the device is a dynamic - * disk or not. It looks for an MS-DOS-style partition table containing at - * least one partition of type 0x42 (formerly SFS, now used by Windows for - * dynamic disks). - * - * N.B. The only possible error can come from the read_part_sector and that is - * only likely to happen if the underlying device is strange. If that IS - * the case we should return zero to let someone else try. - * - * Return: 'true' @state->bdev is a dynamic disk - * 'false' @state->bdev is not a dynamic disk, or an error occurred - */ -static bool ldm_validate_partition_table(struct parsed_partitions *state) -{ - Sector sect; - u8 *data; - struct partition *p; - int i; - bool result = false; - - BUG_ON(!state); - - data = read_part_sector(state, 0, §); - if (!data) { - ldm_info ("Disk read failed."); - return false; - } - - if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) - goto out; - - p = (struct partition*)(data + 0x01BE); - for (i = 0; i < 4; i++, p++) - if (SYS_IND (p) == LDM_PARTITION) { - result = true; - break; - } - - if (result) - ldm_debug ("Found W2K dynamic disk partition type."); - -out: - put_dev_sector (sect); - return result; -} - -/** - * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id - * @ldb: Cache of the database structures - * - * The LDM Database contains a list of all partitions on all dynamic disks. - * The primary PRIVHEAD, at the beginning of the physical disk, tells us - * the GUID of this disk. This function searches for the GUID in a linked - * list of vblk's. - * - * Return: Pointer, A matching vblk was found - * NULL, No match, or an error - */ -static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb) -{ - struct list_head *item; - - BUG_ON (!ldb); - - list_for_each (item, &ldb->v_disk) { - struct vblk *v = list_entry (item, struct vblk, list); - if (!memcmp (v->vblk.disk.disk_id, ldb->ph.disk_id, GUID_SIZE)) - return v; - } - - return NULL; -} - -/** - * ldm_create_data_partitions - Create data partitions for this device - * @pp: List of the partitions parsed so far - * @ldb: Cache of the database structures - * - * The database contains ALL the partitions for ALL disk groups, so we need to - * filter out this specific disk. Using the disk's object id, we can find all - * the partitions in the database that belong to this disk. - * - * Add each partition in our database, to the parsed_partitions structure. - * - * N.B. This function creates the partitions in the order it finds partition - * objects in the linked list. - * - * Return: 'true' Partition created - * 'false' Error, probably a range checking problem - */ -static bool ldm_create_data_partitions (struct parsed_partitions *pp, - const struct ldmdb *ldb) -{ - struct list_head *item; - struct vblk *vb; - struct vblk *disk; - struct vblk_part *part; - int part_num = 1; - - BUG_ON (!pp || !ldb); - - disk = ldm_get_disk_objid (ldb); - if (!disk) { - ldm_crit ("Can't find the ID of this disk in the database."); - return false; - } - - strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE); - - /* Create the data partitions */ - list_for_each (item, &ldb->v_part) { - vb = list_entry (item, struct vblk, list); - part = &vb->vblk.part; - - if (part->disk_id != disk->obj_id) - continue; - - put_partition (pp, part_num, ldb->ph.logical_disk_start + - part->start, part->size); - part_num++; - } - - strlcat(pp->pp_buf, "\n", PAGE_SIZE); - return true; -} - - -/** - * ldm_relative - Calculate the next relative offset - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @base: Size of the previous fixed width fields - * @offset: Cumulative size of the previous variable-width fields - * - * Because many of the VBLK fields are variable-width, it's necessary - * to calculate each offset based on the previous one and the length - * of the field it pointed to. - * - * Return: -1 Error, the calculated offset exceeded the size of the buffer - * n OK, a range-checked offset into buffer - */ -static int ldm_relative(const u8 *buffer, int buflen, int base, int offset) -{ - - base += offset; - if (!buffer || offset < 0 || base > buflen) { - if (!buffer) - ldm_error("!buffer"); - if (offset < 0) - ldm_error("offset (%d) < 0", offset); - if (base > buflen) - ldm_error("base (%d) > buflen (%d)", base, buflen); - return -1; - } - if (base + buffer[base] >= buflen) { - ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base, - buffer[base], buflen); - return -1; - } - return buffer[base] + offset + 1; -} - -/** - * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order - * @block: Pointer to the variable-width number to convert - * - * Large numbers in the LDM Database are often stored in a packed format. Each - * number is prefixed by a one byte width marker. All numbers in the database - * are stored in big-endian byte order. This function reads one of these - * numbers and returns the result - * - * N.B. This function DOES NOT perform any range checking, though the most - * it will read is eight bytes. - * - * Return: n A number - * 0 Zero, or an error occurred - */ -static u64 ldm_get_vnum (const u8 *block) -{ - u64 tmp = 0; - u8 length; - - BUG_ON (!block); - - length = *block++; - - if (length && length <= 8) - while (length--) - tmp = (tmp << 8) | *block++; - else - ldm_error ("Illegal length %d.", length); - - return tmp; -} - -/** - * ldm_get_vstr - Read a length-prefixed string into a buffer - * @block: Pointer to the length marker - * @buffer: Location to copy string to - * @buflen: Size of the output buffer - * - * Many of the strings in the LDM Database are not NULL terminated. Instead - * they are prefixed by a one byte length marker. This function copies one of - * these strings into a buffer. - * - * N.B. This function DOES NOT perform any range checking on the input. - * If the buffer is too small, the output will be truncated. - * - * Return: 0, Error and @buffer contents are undefined - * n, String length in characters (excluding NULL) - * buflen-1, String was truncated. - */ -static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen) -{ - int length; - - BUG_ON (!block || !buffer); - - length = block[0]; - if (length >= buflen) { - ldm_error ("Truncating string %d -> %d.", length, buflen); - length = buflen - 1; - } - memcpy (buffer, block + 1, length); - buffer[length] = 0; - return length; -} - - -/** - * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Component object (version 3) into a vblk structure. - * - * Return: 'true' @vb contains a Component VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) -{ - int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len; - struct vblk_comp *comp; - - BUG_ON (!buffer || !vb); - - r_objid = ldm_relative (buffer, buflen, 0x18, 0); - r_name = ldm_relative (buffer, buflen, 0x18, r_objid); - r_vstate = ldm_relative (buffer, buflen, 0x18, r_name); - r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate); - r_parent = ldm_relative (buffer, buflen, 0x2D, r_child); - - if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) { - r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent); - r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe); - len = r_cols; - } else { - r_stripe = 0; - r_cols = 0; - len = r_parent; - } - if (len < 0) - return false; - - len += VBLK_SIZE_CMP3; - if (len != get_unaligned_be32(buffer + 0x14)) - return false; - - comp = &vb->vblk.comp; - ldm_get_vstr (buffer + 0x18 + r_name, comp->state, - sizeof (comp->state)); - comp->type = buffer[0x18 + r_vstate]; - comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate); - comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child); - comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0; - - return true; -} - -/** - * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Disk Group object (version 3) into a vblk structure. - * - * Return: 'true' @vb contains a Disk Group VBLK - * 'false' @vb contents are not defined - */ -static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) -{ - int r_objid, r_name, r_diskid, r_id1, r_id2, len; - struct vblk_dgrp *dgrp; - - BUG_ON (!buffer || !vb); - - r_objid = ldm_relative (buffer, buflen, 0x18, 0); - r_name = ldm_relative (buffer, buflen, 0x18, r_objid); - r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); - - if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) { - r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); - r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); - len = r_id2; - } else { - r_id1 = 0; - r_id2 = 0; - len = r_diskid; - } - if (len < 0) - return false; - - len += VBLK_SIZE_DGR3; - if (len != get_unaligned_be32(buffer + 0x14)) - return false; - - dgrp = &vb->vblk.dgrp; - ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id, - sizeof (dgrp->disk_id)); - return true; -} - -/** - * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Disk Group object (version 4) into a vblk structure. - * - * Return: 'true' @vb contains a Disk Group VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) -{ - char buf[64]; - int r_objid, r_name, r_id1, r_id2, len; - struct vblk_dgrp *dgrp; - - BUG_ON (!buffer || !vb); - - r_objid = ldm_relative (buffer, buflen, 0x18, 0); - r_name = ldm_relative (buffer, buflen, 0x18, r_objid); - - if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) { - r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); - r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); - len = r_id2; - } else { - r_id1 = 0; - r_id2 = 0; - len = r_name; - } - if (len < 0) - return false; - - len += VBLK_SIZE_DGR4; - if (len != get_unaligned_be32(buffer + 0x14)) - return false; - - dgrp = &vb->vblk.dgrp; - - ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); - return true; -} - -/** - * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Disk object (version 3) into a vblk structure. - * - * Return: 'true' @vb contains a Disk VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) -{ - int r_objid, r_name, r_diskid, r_altname, len; - struct vblk_disk *disk; - - BUG_ON (!buffer || !vb); - - r_objid = ldm_relative (buffer, buflen, 0x18, 0); - r_name = ldm_relative (buffer, buflen, 0x18, r_objid); - r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); - r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid); - len = r_altname; - if (len < 0) - return false; - - len += VBLK_SIZE_DSK3; - if (len != get_unaligned_be32(buffer + 0x14)) - return false; - - disk = &vb->vblk.disk; - ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name, - sizeof (disk->alt_name)); - if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id)) - return false; - - return true; -} - -/** - * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Disk object (version 4) into a vblk structure. - * - * Return: 'true' @vb contains a Disk VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) -{ - int r_objid, r_name, len; - struct vblk_disk *disk; - - BUG_ON (!buffer || !vb); - - r_objid = ldm_relative (buffer, buflen, 0x18, 0); - r_name = ldm_relative (buffer, buflen, 0x18, r_objid); - len = r_name; - if (len < 0) - return false; - - len += VBLK_SIZE_DSK4; - if (len != get_unaligned_be32(buffer + 0x14)) - return false; - - disk = &vb->vblk.disk; - memcpy (disk->disk_id, buffer + 0x18 + r_name, GUID_SIZE); - return true; -} - -/** - * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Partition object (version 3) into a vblk structure. - * - * Return: 'true' @vb contains a Partition VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) -{ - int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len; - struct vblk_part *part; - - BUG_ON(!buffer || !vb); - r_objid = ldm_relative(buffer, buflen, 0x18, 0); - if (r_objid < 0) { - ldm_error("r_objid %d < 0", r_objid); - return false; - } - r_name = ldm_relative(buffer, buflen, 0x18, r_objid); - if (r_name < 0) { - ldm_error("r_name %d < 0", r_name); - return false; - } - r_size = ldm_relative(buffer, buflen, 0x34, r_name); - if (r_size < 0) { - ldm_error("r_size %d < 0", r_size); - return false; - } - r_parent = ldm_relative(buffer, buflen, 0x34, r_size); - if (r_parent < 0) { - ldm_error("r_parent %d < 0", r_parent); - return false; - } - r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent); - if (r_diskid < 0) { - ldm_error("r_diskid %d < 0", r_diskid); - return false; - } - if (buffer[0x12] & VBLK_FLAG_PART_INDEX) { - r_index = ldm_relative(buffer, buflen, 0x34, r_diskid); - if (r_index < 0) { - ldm_error("r_index %d < 0", r_index); - return false; - } - len = r_index; - } else { - r_index = 0; - len = r_diskid; - } - if (len < 0) { - ldm_error("len %d < 0", len); - return false; - } - len += VBLK_SIZE_PRT3; - if (len > get_unaligned_be32(buffer + 0x14)) { - ldm_error("len %d > BE32(buffer + 0x14) %d", len, - get_unaligned_be32(buffer + 0x14)); - return false; - } - part = &vb->vblk.part; - part->start = get_unaligned_be64(buffer + 0x24 + r_name); - part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); - part->size = ldm_get_vnum(buffer + 0x34 + r_name); - part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); - part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); - if (vb->flags & VBLK_FLAG_PART_INDEX) - part->partnum = buffer[0x35 + r_diskid]; - else - part->partnum = 0; - return true; -} - -/** - * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Volume object (version 5) into a vblk structure. - * - * Return: 'true' @vb contains a Volume VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) -{ - int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size; - int r_id1, r_id2, r_size2, r_drive, len; - struct vblk_volu *volu; - - BUG_ON(!buffer || !vb); - r_objid = ldm_relative(buffer, buflen, 0x18, 0); - if (r_objid < 0) { - ldm_error("r_objid %d < 0", r_objid); - return false; - } - r_name = ldm_relative(buffer, buflen, 0x18, r_objid); - if (r_name < 0) { - ldm_error("r_name %d < 0", r_name); - return false; - } - r_vtype = ldm_relative(buffer, buflen, 0x18, r_name); - if (r_vtype < 0) { - ldm_error("r_vtype %d < 0", r_vtype); - return false; - } - r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype); - if (r_disable_drive_letter < 0) { - ldm_error("r_disable_drive_letter %d < 0", - r_disable_drive_letter); - return false; - } - r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter); - if (r_child < 0) { - ldm_error("r_child %d < 0", r_child); - return false; - } - r_size = ldm_relative(buffer, buflen, 0x3D, r_child); - if (r_size < 0) { - ldm_error("r_size %d < 0", r_size); - return false; - } - if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) { - r_id1 = ldm_relative(buffer, buflen, 0x52, r_size); - if (r_id1 < 0) { - ldm_error("r_id1 %d < 0", r_id1); - return false; - } - } else - r_id1 = r_size; - if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) { - r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1); - if (r_id2 < 0) { - ldm_error("r_id2 %d < 0", r_id2); - return false; - } - } else - r_id2 = r_id1; - if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) { - r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2); - if (r_size2 < 0) { - ldm_error("r_size2 %d < 0", r_size2); - return false; - } - } else - r_size2 = r_id2; - if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { - r_drive = ldm_relative(buffer, buflen, 0x52, r_size2); - if (r_drive < 0) { - ldm_error("r_drive %d < 0", r_drive); - return false; - } - } else - r_drive = r_size2; - len = r_drive; - if (len < 0) { - ldm_error("len %d < 0", len); - return false; - } - len += VBLK_SIZE_VOL5; - if (len > get_unaligned_be32(buffer + 0x14)) { - ldm_error("len %d > BE32(buffer + 0x14) %d", len, - get_unaligned_be32(buffer + 0x14)); - return false; - } - volu = &vb->vblk.volu; - ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type, - sizeof(volu->volume_type)); - memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter, - sizeof(volu->volume_state)); - volu->size = ldm_get_vnum(buffer + 0x3D + r_child); - volu->partition_type = buffer[0x41 + r_size]; - memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid)); - if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { - ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint, - sizeof(volu->drive_hint)); - } - return true; -} - -/** - * ldm_parse_vblk - Read a raw VBLK object into a vblk structure - * @buf: Block of data being worked on - * @len: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK object into a vblk structure. This function just reads the - * information common to all VBLK types, then delegates the rest of the work to - * helper functions: ldm_parse_*. - * - * Return: 'true' @vb contains a VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb) -{ - bool result = false; - int r_objid; - - BUG_ON (!buf || !vb); - - r_objid = ldm_relative (buf, len, 0x18, 0); - if (r_objid < 0) { - ldm_error ("VBLK header is corrupt."); - return false; - } - - vb->flags = buf[0x12]; - vb->type = buf[0x13]; - vb->obj_id = ldm_get_vnum (buf + 0x18); - ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name)); - - switch (vb->type) { - case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break; - case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break; - case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break; - case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break; - case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break; - case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break; - case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break; - } - - if (result) - ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.", - (unsigned long long) vb->obj_id, vb->type); - else - ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).", - (unsigned long long) vb->obj_id, vb->type); - - return result; -} - - -/** - * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database - * @data: Raw VBLK to add to the database - * @len: Size of the raw VBLK - * @ldb: Cache of the database structures - * - * The VBLKs are sorted into categories. Partitions are also sorted by offset. - * - * N.B. This function does not check the validity of the VBLKs. - * - * Return: 'true' The VBLK was added - * 'false' An error occurred - */ -static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb) -{ - struct vblk *vb; - struct list_head *item; - - BUG_ON (!data || !ldb); - - vb = kmalloc (sizeof (*vb), GFP_KERNEL); - if (!vb) { - ldm_crit ("Out of memory."); - return false; - } - - if (!ldm_parse_vblk (data, len, vb)) { - kfree(vb); - return false; /* Already logged */ - } - - /* Put vblk into the correct list. */ - switch (vb->type) { - case VBLK_DGR3: - case VBLK_DGR4: - list_add (&vb->list, &ldb->v_dgrp); - break; - case VBLK_DSK3: - case VBLK_DSK4: - list_add (&vb->list, &ldb->v_disk); - break; - case VBLK_VOL5: - list_add (&vb->list, &ldb->v_volu); - break; - case VBLK_CMP3: - list_add (&vb->list, &ldb->v_comp); - break; - case VBLK_PRT3: - /* Sort by the partition's start sector. */ - list_for_each (item, &ldb->v_part) { - struct vblk *v = list_entry (item, struct vblk, list); - if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) && - (v->vblk.part.start > vb->vblk.part.start)) { - list_add_tail (&vb->list, &v->list); - return true; - } - } - list_add_tail (&vb->list, &ldb->v_part); - break; - } - return true; -} - -/** - * ldm_frag_add - Add a VBLK fragment to a list - * @data: Raw fragment to be added to the list - * @size: Size of the raw fragment - * @frags: Linked list of VBLK fragments - * - * Fragmented VBLKs may not be consecutive in the database, so they are placed - * in a list so they can be pieced together later. - * - * Return: 'true' Success, the VBLK was added to the list - * 'false' Error, a problem occurred - */ -static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) -{ - struct frag *f; - struct list_head *item; - int rec, num, group; - - BUG_ON (!data || !frags); - - if (size < 2 * VBLK_SIZE_HEAD) { - ldm_error("Value of size is to small."); - return false; - } - - group = get_unaligned_be32(data + 0x08); - rec = get_unaligned_be16(data + 0x0C); - num = get_unaligned_be16(data + 0x0E); - if ((num < 1) || (num > 4)) { - ldm_error ("A VBLK claims to have %d parts.", num); - return false; - } - if (rec >= num) { - ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num); - return false; - } - - list_for_each (item, frags) { - f = list_entry (item, struct frag, list); - if (f->group == group) - goto found; - } - - f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL); - if (!f) { - ldm_crit ("Out of memory."); - return false; - } - - f->group = group; - f->num = num; - f->rec = rec; - f->map = 0xFF << num; - - list_add_tail (&f->list, frags); -found: - if (rec >= f->num) { - ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num); - return false; - } - - if (f->map & (1 << rec)) { - ldm_error ("Duplicate VBLK, part %d.", rec); - f->map &= 0x7F; /* Mark the group as broken */ - return false; - } - - f->map |= (1 << rec); - - data += VBLK_SIZE_HEAD; - size -= VBLK_SIZE_HEAD; - - memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size); - - return true; -} - -/** - * ldm_frag_free - Free a linked list of VBLK fragments - * @list: Linked list of fragments - * - * Free a linked list of VBLK fragments - * - * Return: none - */ -static void ldm_frag_free (struct list_head *list) -{ - struct list_head *item, *tmp; - - BUG_ON (!list); - - list_for_each_safe (item, tmp, list) - kfree (list_entry (item, struct frag, list)); -} - -/** - * ldm_frag_commit - Validate fragmented VBLKs and add them to the database - * @frags: Linked list of VBLK fragments - * @ldb: Cache of the database structures - * - * Now that all the fragmented VBLKs have been collected, they must be added to - * the database for later use. - * - * Return: 'true' All the fragments we added successfully - * 'false' One or more of the fragments we invalid - */ -static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) -{ - struct frag *f; - struct list_head *item; - - BUG_ON (!frags || !ldb); - - list_for_each (item, frags) { - f = list_entry (item, struct frag, list); - - if (f->map != 0xFF) { - ldm_error ("VBLK group %d is incomplete (0x%02x).", - f->group, f->map); - return false; - } - - if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb)) - return false; /* Already logged */ - } - return true; -} - -/** - * ldm_get_vblks - Read the on-disk database of VBLKs into memory - * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database - * @ldb: Cache of the database structures - * - * To use the information from the VBLKs, they need to be read from the disk, - * unpacked and validated. We cache them in @ldb according to their type. - * - * Return: 'true' All the VBLKs were read successfully - * 'false' An error occurred - */ -static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, - struct ldmdb *ldb) -{ - int size, perbuf, skip, finish, s, v, recs; - u8 *data = NULL; - Sector sect; - bool result = false; - LIST_HEAD (frags); - - BUG_ON(!state || !ldb); - - size = ldb->vm.vblk_size; - perbuf = 512 / size; - skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */ - finish = (size * ldb->vm.last_vblk_seq) >> 9; - - for (s = skip; s < finish; s++) { /* For each sector */ - data = read_part_sector(state, base + OFF_VMDB + s, §); - if (!data) { - ldm_crit ("Disk read failed."); - goto out; - } - - for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ - if (MAGIC_VBLK != get_unaligned_be32(data)) { - ldm_error ("Expected to find a VBLK."); - goto out; - } - - recs = get_unaligned_be16(data + 0x0E); /* Number of records */ - if (recs == 1) { - if (!ldm_ldmdb_add (data, size, ldb)) - goto out; /* Already logged */ - } else if (recs > 1) { - if (!ldm_frag_add (data, size, &frags)) - goto out; /* Already logged */ - } - /* else Record is not in use, ignore it. */ - } - put_dev_sector (sect); - data = NULL; - } - - result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */ -out: - if (data) - put_dev_sector (sect); - ldm_frag_free (&frags); - - return result; -} - -/** - * ldm_free_vblks - Free a linked list of vblk's - * @lh: Head of a linked list of struct vblk - * - * Free a list of vblk's and free the memory used to maintain the list. - * - * Return: none - */ -static void ldm_free_vblks (struct list_head *lh) -{ - struct list_head *item, *tmp; - - BUG_ON (!lh); - - list_for_each_safe (item, tmp, lh) - kfree (list_entry (item, struct vblk, list)); -} - - -/** - * ldm_partition - Find out whether a device is a dynamic disk and handle it - * @state: Partition check state including device holding the LDM Database - * - * This determines whether the device @bdev is a dynamic disk and if so creates - * the partitions necessary in the gendisk structure pointed to by @hd. - * - * We create a dummy device 1, which contains the LDM database, and then create - * each partition described by the LDM database in sequence as devices 2+. For - * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, - * and so on: the actual data containing partitions. - * - * Return: 1 Success, @state->bdev is a dynamic disk and we handled it - * 0 Success, @state->bdev is not a dynamic disk - * -1 An error occurred before enough information had been read - * Or @state->bdev is a dynamic disk, but it may be corrupted - */ -int ldm_partition(struct parsed_partitions *state) -{ - struct ldmdb *ldb; - unsigned long base; - int result = -1; - - BUG_ON(!state); - - /* Look for signs of a Dynamic Disk */ - if (!ldm_validate_partition_table(state)) - return 0; - - ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); - if (!ldb) { - ldm_crit ("Out of memory."); - goto out; - } - - /* Parse and check privheads. */ - if (!ldm_validate_privheads(state, &ldb->ph)) - goto out; /* Already logged */ - - /* All further references are relative to base (database start). */ - base = ldb->ph.config_start; - - /* Parse and check tocs and vmdb. */ - if (!ldm_validate_tocblocks(state, base, ldb) || - !ldm_validate_vmdb(state, base, ldb)) - goto out; /* Already logged */ - - /* Initialize vblk lists in ldmdb struct */ - INIT_LIST_HEAD (&ldb->v_dgrp); - INIT_LIST_HEAD (&ldb->v_disk); - INIT_LIST_HEAD (&ldb->v_volu); - INIT_LIST_HEAD (&ldb->v_comp); - INIT_LIST_HEAD (&ldb->v_part); - - if (!ldm_get_vblks(state, base, ldb)) { - ldm_crit ("Failed to read the VBLKs from the database."); - goto cleanup; - } - - /* Finally, create the data partition devices. */ - if (ldm_create_data_partitions(state, ldb)) { - ldm_debug ("Parsed LDM database successfully."); - result = 1; - } - /* else Already logged */ - -cleanup: - ldm_free_vblks (&ldb->v_dgrp); - ldm_free_vblks (&ldb->v_disk); - ldm_free_vblks (&ldb->v_volu); - ldm_free_vblks (&ldb->v_comp); - ldm_free_vblks (&ldb->v_part); -out: - kfree (ldb); - return result; -} diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h deleted file mode 100644 index 374242c..0000000 --- a/fs/partitions/ldm.h +++ /dev/null @@ -1,215 +0,0 @@ -/** - * ldm - Part of the Linux-NTFS project. - * - * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> - * - * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program (in the main directory of the Linux-NTFS source - * in the file COPYING); if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef _FS_PT_LDM_H_ -#define _FS_PT_LDM_H_ - -#include <linux/types.h> -#include <linux/list.h> -#include <linux/genhd.h> -#include <linux/fs.h> -#include <asm/unaligned.h> -#include <asm/byteorder.h> - -struct parsed_partitions; - -/* Magic numbers in CPU format. */ -#define MAGIC_VMDB 0x564D4442 /* VMDB */ -#define MAGIC_VBLK 0x56424C4B /* VBLK */ -#define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */ -#define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */ - -/* The defined vblk types. */ -#define VBLK_VOL5 0x51 /* Volume, version 5 */ -#define VBLK_CMP3 0x32 /* Component, version 3 */ -#define VBLK_PRT3 0x33 /* Partition, version 3 */ -#define VBLK_DSK3 0x34 /* Disk, version 3 */ -#define VBLK_DSK4 0x44 /* Disk, version 4 */ -#define VBLK_DGR3 0x35 /* Disk Group, version 3 */ -#define VBLK_DGR4 0x45 /* Disk Group, version 4 */ - -/* vblk flags indicating extra information will be present */ -#define VBLK_FLAG_COMP_STRIPE 0x10 -#define VBLK_FLAG_PART_INDEX 0x08 -#define VBLK_FLAG_DGR3_IDS 0x08 -#define VBLK_FLAG_DGR4_IDS 0x08 -#define VBLK_FLAG_VOLU_ID1 0x08 -#define VBLK_FLAG_VOLU_ID2 0x20 -#define VBLK_FLAG_VOLU_SIZE 0x80 -#define VBLK_FLAG_VOLU_DRIVE 0x02 - -/* size of a vblk's static parts */ -#define VBLK_SIZE_HEAD 16 -#define VBLK_SIZE_CMP3 22 /* Name and version */ -#define VBLK_SIZE_DGR3 12 -#define VBLK_SIZE_DGR4 44 -#define VBLK_SIZE_DSK3 12 -#define VBLK_SIZE_DSK4 45 -#define VBLK_SIZE_PRT3 28 -#define VBLK_SIZE_VOL5 58 - -/* component types */ -#define COMP_STRIPE 0x01 /* Stripe-set */ -#define COMP_BASIC 0x02 /* Basic disk */ -#define COMP_RAID 0x03 /* Raid-set */ - -/* Other constants. */ -#define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). */ - -#define OFF_PRIV1 6 /* Offset of the first privhead - relative to the start of the - device in sectors */ - -/* Offsets to structures within the LDM Database in sectors. */ -#define OFF_PRIV2 1856 /* Backup private headers. */ -#define OFF_PRIV3 2047 - -#define OFF_TOCB1 1 /* Tables of contents. */ -#define OFF_TOCB2 2 -#define OFF_TOCB3 2045 -#define OFF_TOCB4 2046 - -#define OFF_VMDB 17 /* List of partitions. */ - -#define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */ - -#define TOC_BITMAP1 "config" /* Names of the two defined */ -#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */ - -/* Borrowed from msdos.c */ -#define SYS_IND(p) (get_unaligned(&(p)->sys_ind)) - -struct frag { /* VBLK Fragment handling */ - struct list_head list; - u32 group; - u8 num; /* Total number of records */ - u8 rec; /* This is record number n */ - u8 map; /* Which portions are in use */ - u8 data[0]; -}; - -/* In memory LDM database structures. */ - -#define GUID_SIZE 16 - -struct privhead { /* Offsets and sizes are in sectors. */ - u16 ver_major; - u16 ver_minor; - u64 logical_disk_start; - u64 logical_disk_size; - u64 config_start; - u64 config_size; - u8 disk_id[GUID_SIZE]; -}; - -struct tocblock { /* We have exactly two bitmaps. */ - u8 bitmap1_name[16]; - u64 bitmap1_start; - u64 bitmap1_size; - u8 bitmap2_name[16]; - u64 bitmap2_start; - u64 bitmap2_size; -}; - -struct vmdb { /* VMDB: The database header */ - u16 ver_major; - u16 ver_minor; - u32 vblk_size; - u32 vblk_offset; - u32 last_vblk_seq; -}; - -struct vblk_comp { /* VBLK Component */ - u8 state[16]; - u64 parent_id; - u8 type; - u8 children; - u16 chunksize; -}; - -struct vblk_dgrp { /* VBLK Disk Group */ - u8 disk_id[64]; -}; - -struct vblk_disk { /* VBLK Disk */ - u8 disk_id[GUID_SIZE]; - u8 alt_name[128]; -}; - -struct vblk_part { /* VBLK Partition */ - u64 start; - u64 size; /* start, size and vol_off in sectors */ - u64 volume_offset; - u64 parent_id; - u64 disk_id; - u8 partnum; -}; - -struct vblk_volu { /* VBLK Volume */ - u8 volume_type[16]; - u8 volume_state[16]; - u8 guid[16]; - u8 drive_hint[4]; - u64 size; - u8 partition_type; -}; - -struct vblk_head { /* VBLK standard header */ - u32 group; - u16 rec; - u16 nrec; -}; - -struct vblk { /* Generalised VBLK */ - u8 name[64]; - u64 obj_id; - u32 sequence; - u8 flags; - u8 type; - union { - struct vblk_comp comp; - struct vblk_dgrp dgrp; - struct vblk_disk disk; - struct vblk_part part; - struct vblk_volu volu; - } vblk; - struct list_head list; -}; - -struct ldmdb { /* Cache of the database */ - struct privhead ph; - struct tocblock toc; - struct vmdb vm; - struct list_head v_dgrp; - struct list_head v_disk; - struct list_head v_volu; - struct list_head v_comp; - struct list_head v_part; -}; - -int ldm_partition(struct parsed_partitions *state); - -#endif /* _FS_PT_LDM_H_ */ - diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c deleted file mode 100644 index 11f688b..0000000 --- a/fs/partitions/mac.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * fs/partitions/mac.c - * - * Code extracted from drivers/block/genhd.c - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - */ - -#include <linux/ctype.h> -#include "check.h" -#include "mac.h" - -#ifdef CONFIG_PPC_PMAC -#include <asm/machdep.h> -extern void note_bootable_part(dev_t dev, int part, int goodness); -#endif - -/* - * Code to understand MacOS partition tables. - */ - -static inline void mac_fix_string(char *stg, int len) -{ - int i; - - for (i = len - 1; i >= 0 && stg[i] == ' '; i--) - stg[i] = 0; -} - -int mac_partition(struct parsed_partitions *state) -{ - Sector sect; - unsigned char *data; - int slot, blocks_in_map; - unsigned secsize; -#ifdef CONFIG_PPC_PMAC - int found_root = 0; - int found_root_goodness = 0; -#endif - struct mac_partition *part; - struct mac_driver_desc *md; - - /* Get 0th block and look at the first partition map entry. */ - md = read_part_sector(state, 0, §); - if (!md) - return -1; - if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { - put_dev_sector(sect); - return 0; - } - secsize = be16_to_cpu(md->block_size); - put_dev_sector(sect); - data = read_part_sector(state, secsize/512, §); - if (!data) - return -1; - part = (struct mac_partition *) (data + secsize%512); - if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { - put_dev_sector(sect); - return 0; /* not a MacOS disk */ - } - blocks_in_map = be32_to_cpu(part->map_count); - if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { - put_dev_sector(sect); - return 0; - } - strlcat(state->pp_buf, " [mac]", PAGE_SIZE); - for (slot = 1; slot <= blocks_in_map; ++slot) { - int pos = slot * secsize; - put_dev_sector(sect); - data = read_part_sector(state, pos/512, §); - if (!data) - return -1; - part = (struct mac_partition *) (data + pos%512); - if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) - break; - put_partition(state, slot, - be32_to_cpu(part->start_block) * (secsize/512), - be32_to_cpu(part->block_count) * (secsize/512)); - - if (!strnicmp(part->type, "Linux_RAID", 10)) - state->parts[slot].flags = ADDPART_FLAG_RAID; -#ifdef CONFIG_PPC_PMAC - /* - * If this is the first bootable partition, tell the - * setup code, in case it wants to make this the root. - */ - if (machine_is(powermac)) { - int goodness = 0; - - mac_fix_string(part->processor, 16); - mac_fix_string(part->name, 32); - mac_fix_string(part->type, 32); - - if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE) - && strcasecmp(part->processor, "powerpc") == 0) - goodness++; - - if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0 - || (strnicmp(part->type, "Linux", 5) == 0 - && strcasecmp(part->type, "Linux_swap") != 0)) { - int i, l; - - goodness++; - l = strlen(part->name); - if (strcmp(part->name, "/") == 0) - goodness++; - for (i = 0; i <= l - 4; ++i) { - if (strnicmp(part->name + i, "root", - 4) == 0) { - goodness += 2; - break; - } - } - if (strnicmp(part->name, "swap", 4) == 0) - goodness--; - } - - if (goodness > found_root_goodness) { - found_root = slot; - found_root_goodness = goodness; - } - } -#endif /* CONFIG_PPC_PMAC */ - } -#ifdef CONFIG_PPC_PMAC - if (found_root_goodness) - note_bootable_part(state->bdev->bd_dev, found_root, - found_root_goodness); -#endif - - put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); - return 1; -} diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h deleted file mode 100644 index 3c7d984..0000000 --- a/fs/partitions/mac.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * fs/partitions/mac.h - */ - -#define MAC_PARTITION_MAGIC 0x504d - -/* type field value for A/UX or other Unix partitions */ -#define APPLE_AUX_TYPE "Apple_UNIX_SVR2" - -struct mac_partition { - __be16 signature; /* expected to be MAC_PARTITION_MAGIC */ - __be16 res1; - __be32 map_count; /* # blocks in partition map */ - __be32 start_block; /* absolute starting block # of partition */ - __be32 block_count; /* number of blocks in partition */ - char name[32]; /* partition name */ - char type[32]; /* string type description */ - __be32 data_start; /* rel block # of first data block */ - __be32 data_count; /* number of data blocks */ - __be32 status; /* partition status bits */ - __be32 boot_start; - __be32 boot_size; - __be32 boot_load; - __be32 boot_load2; - __be32 boot_entry; - __be32 boot_entry2; - __be32 boot_cksum; - char processor[16]; /* identifies ISA of boot */ - /* there is more stuff after this that we don't need */ -}; - -#define MAC_STATUS_BOOTABLE 8 /* partition is bootable */ - -#define MAC_DRIVER_MAGIC 0x4552 - -/* Driver descriptor structure, in block 0 */ -struct mac_driver_desc { - __be16 signature; /* expected to be MAC_DRIVER_MAGIC */ - __be16 block_size; - __be32 block_count; - /* ... more stuff */ -}; - -int mac_partition(struct parsed_partitions *state); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c deleted file mode 100644 index 5f79a66..0000000 --- a/fs/partitions/msdos.c +++ /dev/null @@ -1,552 +0,0 @@ -/* - * fs/partitions/msdos.c - * - * Code extracted from drivers/block/genhd.c - * Copyright (C) 1991-1998 Linus Torvalds - * - * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug - * in the early extended-partition checks and added DM partitions - * - * Support for DiskManager v6.0x added by Mark Lord, - * with information provided by OnTrack. This now works for linux fdisk - * and LILO, as well as loadlin and bootln. Note that disks other than - * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1). - * - * More flexible handling of extended partitions - aeb, 950831 - * - * Check partition table on IDE disks for common CHS translations - * - * Re-organised Feb 1998 Russell King - */ -#include <linux/msdos_fs.h> - -#include "check.h" -#include "msdos.h" -#include "efi.h" - -/* - * Many architectures don't like unaligned accesses, while - * the nr_sects and start_sect partition table entries are - * at a 2 (mod 4) address. - */ -#include <asm/unaligned.h> - -#define SYS_IND(p) get_unaligned(&p->sys_ind) - -static inline sector_t nr_sects(struct partition *p) -{ - return (sector_t)get_unaligned_le32(&p->nr_sects); -} - -static inline sector_t start_sect(struct partition *p) -{ - return (sector_t)get_unaligned_le32(&p->start_sect); -} - -static inline int is_extended_partition(struct partition *p) -{ - return (SYS_IND(p) == DOS_EXTENDED_PARTITION || - SYS_IND(p) == WIN98_EXTENDED_PARTITION || - SYS_IND(p) == LINUX_EXTENDED_PARTITION); -} - -#define MSDOS_LABEL_MAGIC1 0x55 -#define MSDOS_LABEL_MAGIC2 0xAA - -static inline int -msdos_magic_present(unsigned char *p) -{ - return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2); -} - -/* Value is EBCDIC 'IBMA' */ -#define AIX_LABEL_MAGIC1 0xC9 -#define AIX_LABEL_MAGIC2 0xC2 -#define AIX_LABEL_MAGIC3 0xD4 -#define AIX_LABEL_MAGIC4 0xC1 -static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) -{ - struct partition *pt = (struct partition *) (p + 0x1be); - Sector sect; - unsigned char *d; - int slot, ret = 0; - - if (!(p[0] == AIX_LABEL_MAGIC1 && - p[1] == AIX_LABEL_MAGIC2 && - p[2] == AIX_LABEL_MAGIC3 && - p[3] == AIX_LABEL_MAGIC4)) - return 0; - /* Assume the partition table is valid if Linux partitions exists */ - for (slot = 1; slot <= 4; slot++, pt++) { - if (pt->sys_ind == LINUX_SWAP_PARTITION || - pt->sys_ind == LINUX_RAID_PARTITION || - pt->sys_ind == LINUX_DATA_PARTITION || - pt->sys_ind == LINUX_LVM_PARTITION || - is_extended_partition(pt)) - return 0; - } - d = read_part_sector(state, 7, §); - if (d) { - if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') - ret = 1; - put_dev_sector(sect); - }; - return ret; -} - -/* - * Create devices for each logical partition in an extended partition. - * The logical partitions form a linked list, with each entry being - * a partition table with two entries. The first entry - * is the real data partition (with a start relative to the partition - * table start). The second is a pointer to the next logical partition - * (with a start relative to the entire extended partition). - * We do not create a Linux partition for the partition tables, but - * only for the actual data partitions. - */ - -static void parse_extended(struct parsed_partitions *state, - sector_t first_sector, sector_t first_size) -{ - struct partition *p; - Sector sect; - unsigned char *data; - sector_t this_sector, this_size; - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; - int loopct = 0; /* number of links followed - without finding a data partition */ - int i; - - this_sector = first_sector; - this_size = first_size; - - while (1) { - if (++loopct > 100) - return; - if (state->next == state->limit) - return; - data = read_part_sector(state, this_sector, §); - if (!data) - return; - - if (!msdos_magic_present(data + 510)) - goto done; - - p = (struct partition *) (data + 0x1be); - - /* - * Usually, the first entry is the real data partition, - * the 2nd entry is the next extended partition, or empty, - * and the 3rd and 4th entries are unused. - * However, DRDOS sometimes has the extended partition as - * the first entry (when the data partition is empty), - * and OS/2 seems to use all four entries. - */ - - /* - * First process the data partition(s) - */ - for (i=0; i<4; i++, p++) { - sector_t offs, size, next; - if (!nr_sects(p) || is_extended_partition(p)) - continue; - - /* Check the 3rd and 4th entries - - these sometimes contain random garbage */ - offs = start_sect(p)*sector_size; - size = nr_sects(p)*sector_size; - next = this_sector + offs; - if (i >= 2) { - if (offs + size > this_size) - continue; - if (next < first_sector) - continue; - if (next + size > first_sector + first_size) - continue; - } - - put_partition(state, state->next, next, size); - if (SYS_IND(p) == LINUX_RAID_PARTITION) - state->parts[state->next].flags = ADDPART_FLAG_RAID; - loopct = 0; - if (++state->next == state->limit) - goto done; - } - /* - * Next, process the (first) extended partition, if present. - * (So far, there seems to be no reason to make - * parse_extended() recursive and allow a tree - * of extended partitions.) - * It should be a link to the next logical partition. - */ - p -= 4; - for (i=0; i<4; i++, p++) - if (nr_sects(p) && is_extended_partition(p)) - break; - if (i == 4) - goto done; /* nothing left to do */ - - this_sector = first_sector + start_sect(p) * sector_size; - this_size = nr_sects(p) * sector_size; - put_dev_sector(sect); - } -done: - put_dev_sector(sect); -} - -/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also - indicates linux swap. Be careful before believing this is Solaris. */ - -static void parse_solaris_x86(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin) -{ -#ifdef CONFIG_SOLARIS_X86_PARTITION - Sector sect; - struct solaris_x86_vtoc *v; - int i; - short max_nparts; - - v = read_part_sector(state, offset + 1, §); - if (!v) - return; - if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { - put_dev_sector(sect); - return; - } - { - char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; - - snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - if (le32_to_cpu(v->v_version) != 1) { - char tmp[64]; - - snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", - le32_to_cpu(v->v_version)); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - put_dev_sector(sect); - return; - } - /* Ensure we can handle previous case of VTOC with 8 entries gracefully */ - max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; - for (i=0; i<max_nparts && state->next<state->limit; i++) { - struct solaris_x86_slice *s = &v->v_slice[i]; - char tmp[3 + 10 + 1 + 1]; - - if (s->s_size == 0) - continue; - snprintf(tmp, sizeof(tmp), " [s%d]", i); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - /* solaris partitions are relative to current MS-DOS - * one; must add the offset of the current partition */ - put_partition(state, state->next++, - le32_to_cpu(s->s_start)+offset, - le32_to_cpu(s->s_size)); - } - put_dev_sector(sect); - strlcat(state->pp_buf, " >\n", PAGE_SIZE); -#endif -} - -#if defined(CONFIG_BSD_DISKLABEL) -/* - * Create devices for BSD partitions listed in a disklabel, under a - * dos-like partition. See parse_extended() for more information. - */ -static void parse_bsd(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin, char *flavour, - int max_partitions) -{ - Sector sect; - struct bsd_disklabel *l; - struct bsd_partition *p; - char tmp[64]; - - l = read_part_sector(state, offset + 1, §); - if (!l) - return; - if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { - put_dev_sector(sect); - return; - } - - snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - - if (le16_to_cpu(l->d_npartitions) < max_partitions) - max_partitions = le16_to_cpu(l->d_npartitions); - for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { - sector_t bsd_start, bsd_size; - - if (state->next == state->limit) - break; - if (p->p_fstype == BSD_FS_UNUSED) - continue; - bsd_start = le32_to_cpu(p->p_offset); - bsd_size = le32_to_cpu(p->p_size); - if (offset == bsd_start && size == bsd_size) - /* full parent partition, we have it already */ - continue; - if (offset > bsd_start || offset+size < bsd_start+bsd_size) { - strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE); - continue; - } - put_partition(state, state->next++, bsd_start, bsd_size); - } - put_dev_sector(sect); - if (le16_to_cpu(l->d_npartitions) > max_partitions) { - snprintf(tmp, sizeof(tmp), " (ignored %d more)", - le16_to_cpu(l->d_npartitions) - max_partitions); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - strlcat(state->pp_buf, " >\n", PAGE_SIZE); -} -#endif - -static void parse_freebsd(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin) -{ -#ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); -#endif -} - -static void parse_netbsd(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin) -{ -#ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); -#endif -} - -static void parse_openbsd(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin) -{ -#ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, offset, size, origin, "openbsd", - OPENBSD_MAXPARTITIONS); -#endif -} - -/* - * Create devices for Unixware partitions listed in a disklabel, under a - * dos-like partition. See parse_extended() for more information. - */ -static void parse_unixware(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin) -{ -#ifdef CONFIG_UNIXWARE_DISKLABEL - Sector sect; - struct unixware_disklabel *l; - struct unixware_slice *p; - - l = read_part_sector(state, offset + 29, §); - if (!l) - return; - if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || - le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) { - put_dev_sector(sect); - return; - } - { - char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; - - snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - p = &l->vtoc.v_slice[1]; - /* I omit the 0th slice as it is the same as whole disk. */ - while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { - if (state->next == state->limit) - break; - - if (p->s_label != UNIXWARE_FS_UNUSED) - put_partition(state, state->next++, - le32_to_cpu(p->start_sect), - le32_to_cpu(p->nr_sects)); - p++; - } - put_dev_sector(sect); - strlcat(state->pp_buf, " >\n", PAGE_SIZE); -#endif -} - -/* - * Minix 2.0.0/2.0.2 subpartition support. - * Anand Krishnamurthy <anandk@wiproge.med.ge.com> - * Rajeev V. Pillai <rajeevvp@yahoo.com> - */ -static void parse_minix(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin) -{ -#ifdef CONFIG_MINIX_SUBPARTITION - Sector sect; - unsigned char *data; - struct partition *p; - int i; - - data = read_part_sector(state, offset, §); - if (!data) - return; - - p = (struct partition *)(data + 0x1be); - - /* The first sector of a Minix partition can have either - * a secondary MBR describing its subpartitions, or - * the normal boot sector. */ - if (msdos_magic_present (data + 510) && - SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ - char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; - - snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { - if (state->next == state->limit) - break; - /* add each partition in use */ - if (SYS_IND(p) == MINIX_PARTITION) - put_partition(state, state->next++, - start_sect(p), nr_sects(p)); - } - strlcat(state->pp_buf, " >\n", PAGE_SIZE); - } - put_dev_sector(sect); -#endif /* CONFIG_MINIX_SUBPARTITION */ -} - -static struct { - unsigned char id; - void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); -} subtypes[] = { - {FREEBSD_PARTITION, parse_freebsd}, - {NETBSD_PARTITION, parse_netbsd}, - {OPENBSD_PARTITION, parse_openbsd}, - {MINIX_PARTITION, parse_minix}, - {UNIXWARE_PARTITION, parse_unixware}, - {SOLARIS_X86_PARTITION, parse_solaris_x86}, - {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, - {0, NULL}, -}; - -int msdos_partition(struct parsed_partitions *state) -{ - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; - Sector sect; - unsigned char *data; - struct partition *p; - struct fat_boot_sector *fb; - int slot; - - data = read_part_sector(state, 0, §); - if (!data) - return -1; - if (!msdos_magic_present(data + 510)) { - put_dev_sector(sect); - return 0; - } - - if (aix_magic_present(state, data)) { - put_dev_sector(sect); - strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); - return 0; - } - - /* - * Now that the 55aa signature is present, this is probably - * either the boot sector of a FAT filesystem or a DOS-type - * partition table. Reject this in case the boot indicator - * is not 0 or 0x80. - */ - p = (struct partition *) (data + 0x1be); - for (slot = 1; slot <= 4; slot++, p++) { - if (p->boot_ind != 0 && p->boot_ind != 0x80) { - /* - * Even without a valid boot inidicator value - * its still possible this is valid FAT filesystem - * without a partition table. - */ - fb = (struct fat_boot_sector *) data; - if (slot == 1 && fb->reserved && fb->fats - && fat_valid_media(fb->media)) { - strlcat(state->pp_buf, "\n", PAGE_SIZE); - put_dev_sector(sect); - return 1; - } else { - put_dev_sector(sect); - return 0; - } - } - } - -#ifdef CONFIG_EFI_PARTITION - p = (struct partition *) (data + 0x1be); - for (slot = 1 ; slot <= 4 ; slot++, p++) { - /* If this is an EFI GPT disk, msdos should ignore it. */ - if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) { - put_dev_sector(sect); - return 0; - } - } -#endif - p = (struct partition *) (data + 0x1be); - - /* - * Look for partitions in two passes: - * First find the primary and DOS-type extended partitions. - * On the second pass look inside *BSD, Unixware and Solaris partitions. - */ - - state->next = 5; - for (slot = 1 ; slot <= 4 ; slot++, p++) { - sector_t start = start_sect(p)*sector_size; - sector_t size = nr_sects(p)*sector_size; - if (!size) - continue; - if (is_extended_partition(p)) { - /* - * prevent someone doing mkfs or mkswap on an - * extended partition, but leave room for LILO - * FIXME: this uses one logical sector for > 512b - * sector, although it may not be enough/proper. - */ - sector_t n = 2; - n = min(size, max(sector_size, n)); - put_partition(state, slot, start, n); - - strlcat(state->pp_buf, " <", PAGE_SIZE); - parse_extended(state, start, size); - strlcat(state->pp_buf, " >", PAGE_SIZE); - continue; - } - put_partition(state, slot, start, size); - if (SYS_IND(p) == LINUX_RAID_PARTITION) - state->parts[slot].flags = ADDPART_FLAG_RAID; - if (SYS_IND(p) == DM6_PARTITION) - strlcat(state->pp_buf, "[DM]", PAGE_SIZE); - if (SYS_IND(p) == EZD_PARTITION) - strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); - } - - strlcat(state->pp_buf, "\n", PAGE_SIZE); - - /* second pass - output for each on a separate line */ - p = (struct partition *) (0x1be + data); - for (slot = 1 ; slot <= 4 ; slot++, p++) { - unsigned char id = SYS_IND(p); - int n; - - if (!nr_sects(p)) - continue; - - for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) - ; - - if (!subtypes[n].parse) - continue; - subtypes[n].parse(state, start_sect(p) * sector_size, - nr_sects(p) * sector_size, slot); - } - put_dev_sector(sect); - return 1; -} diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h deleted file mode 100644 index 38c781c..0000000 --- a/fs/partitions/msdos.h +++ /dev/null @@ -1,8 +0,0 @@ -/* - * fs/partitions/msdos.h - */ - -#define MSDOS_LABEL_MAGIC 0xAA55 - -int msdos_partition(struct parsed_partitions *state); - diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c deleted file mode 100644 index 764b86a..0000000 --- a/fs/partitions/osf.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * fs/partitions/osf.c - * - * Code extracted from drivers/block/genhd.c - * - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - */ - -#include "check.h" -#include "osf.h" - -#define MAX_OSF_PARTITIONS 18 - -int osf_partition(struct parsed_partitions *state) -{ - int i; - int slot = 1; - unsigned int npartitions; - Sector sect; - unsigned char *data; - struct disklabel { - __le32 d_magic; - __le16 d_type,d_subtype; - u8 d_typename[16]; - u8 d_packname[16]; - __le32 d_secsize; - __le32 d_nsectors; - __le32 d_ntracks; - __le32 d_ncylinders; - __le32 d_secpercyl; - __le32 d_secprtunit; - __le16 d_sparespertrack; - __le16 d_sparespercyl; - __le32 d_acylinders; - __le16 d_rpm, d_interleave, d_trackskew, d_cylskew; - __le32 d_headswitch, d_trkseek, d_flags; - __le32 d_drivedata[5]; - __le32 d_spare[5]; - __le32 d_magic2; - __le16 d_checksum; - __le16 d_npartitions; - __le32 d_bbsize, d_sbsize; - struct d_partition { - __le32 p_size; - __le32 p_offset; - __le32 p_fsize; - u8 p_fstype; - u8 p_frag; - __le16 p_cpg; - } d_partitions[MAX_OSF_PARTITIONS]; - } * label; - struct d_partition * partition; - - data = read_part_sector(state, 0, §); - if (!data) - return -1; - - label = (struct disklabel *) (data+64); - partition = label->d_partitions; - if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) { - put_dev_sector(sect); - return 0; - } - if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) { - put_dev_sector(sect); - return 0; - } - npartitions = le16_to_cpu(label->d_npartitions); - if (npartitions > MAX_OSF_PARTITIONS) { - put_dev_sector(sect); - return 0; - } - for (i = 0 ; i < npartitions; i++, partition++) { - if (slot == state->limit) - break; - if (le32_to_cpu(partition->p_size)) - put_partition(state, slot, - le32_to_cpu(partition->p_offset), - le32_to_cpu(partition->p_size)); - slot++; - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - put_dev_sector(sect); - return 1; -} diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h deleted file mode 100644 index 20ed231..0000000 --- a/fs/partitions/osf.h +++ /dev/null @@ -1,7 +0,0 @@ -/* - * fs/partitions/osf.h - */ - -#define DISKLABELMAGIC (0x82564557UL) - -int osf_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c deleted file mode 100644 index ea8a86d..0000000 --- a/fs/partitions/sgi.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * fs/partitions/sgi.c - * - * Code extracted from drivers/block/genhd.c - */ - -#include "check.h" -#include "sgi.h" - -struct sgi_disklabel { - __be32 magic_mushroom; /* Big fat spliff... */ - __be16 root_part_num; /* Root partition number */ - __be16 swap_part_num; /* Swap partition number */ - s8 boot_file[16]; /* Name of boot file for ARCS */ - u8 _unused0[48]; /* Device parameter useless crapola.. */ - struct sgi_volume { - s8 name[8]; /* Name of volume */ - __be32 block_num; /* Logical block number */ - __be32 num_bytes; /* How big, in bytes */ - } volume[15]; - struct sgi_partition { - __be32 num_blocks; /* Size in logical blocks */ - __be32 first_block; /* First logical block */ - __be32 type; /* Type of this partition */ - } partitions[16]; - __be32 csum; /* Disk label checksum */ - __be32 _unused1; /* Padding */ -}; - -int sgi_partition(struct parsed_partitions *state) -{ - int i, csum; - __be32 magic; - int slot = 1; - unsigned int start, blocks; - __be32 *ui, cs; - Sector sect; - struct sgi_disklabel *label; - struct sgi_partition *p; - char b[BDEVNAME_SIZE]; - - label = read_part_sector(state, 0, §); - if (!label) - return -1; - p = &label->partitions[0]; - magic = label->magic_mushroom; - if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { - /*printk("Dev %s SGI disklabel: bad magic %08x\n", - bdevname(bdev, b), be32_to_cpu(magic));*/ - put_dev_sector(sect); - return 0; - } - ui = ((__be32 *) (label + 1)) - 1; - for(csum = 0; ui >= ((__be32 *) label);) { - cs = *ui--; - csum += be32_to_cpu(cs); - } - if(csum) { - printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", - bdevname(state->bdev, b)); - put_dev_sector(sect); - return 0; - } - /* All SGI disk labels have 16 partitions, disks under Linux only - * have 15 minor's. Luckily there are always a few zero length - * partitions which we don't care about so we never overflow the - * current_minor. - */ - for(i = 0; i < 16; i++, p++) { - blocks = be32_to_cpu(p->num_blocks); - start = be32_to_cpu(p->first_block); - if (blocks) { - put_partition(state, slot, start, blocks); - if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION) - state->parts[slot].flags = ADDPART_FLAG_RAID; - } - slot++; - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - put_dev_sector(sect); - return 1; -} diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h deleted file mode 100644 index b9553eb..0000000 --- a/fs/partitions/sgi.h +++ /dev/null @@ -1,8 +0,0 @@ -/* - * fs/partitions/sgi.h - */ - -extern int sgi_partition(struct parsed_partitions *state); - -#define SGI_LABEL_MAGIC 0x0be5a941 - diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c deleted file mode 100644 index b5b6fcf..0000000 --- a/fs/partitions/sun.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * fs/partitions/sun.c - * - * Code extracted from drivers/block/genhd.c - * - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - */ - -#include "check.h" -#include "sun.h" - -int sun_partition(struct parsed_partitions *state) -{ - int i; - __be16 csum; - int slot = 1; - __be16 *ush; - Sector sect; - struct sun_disklabel { - unsigned char info[128]; /* Informative text string */ - struct sun_vtoc { - __be32 version; /* Layout version */ - char volume[8]; /* Volume name */ - __be16 nparts; /* Number of partitions */ - struct sun_info { /* Partition hdrs, sec 2 */ - __be16 id; - __be16 flags; - } infos[8]; - __be16 padding; /* Alignment padding */ - __be32 bootinfo[3]; /* Info needed by mboot */ - __be32 sanity; /* To verify vtoc sanity */ - __be32 reserved[10]; /* Free space */ - __be32 timestamp[8]; /* Partition timestamp */ - } vtoc; - __be32 write_reinstruct; /* sectors to skip, writes */ - __be32 read_reinstruct; /* sectors to skip, reads */ - unsigned char spare[148]; /* Padding */ - __be16 rspeed; /* Disk rotational speed */ - __be16 pcylcount; /* Physical cylinder count */ - __be16 sparecyl; /* extra sects per cylinder */ - __be16 obs1; /* gap1 */ - __be16 obs2; /* gap2 */ - __be16 ilfact; /* Interleave factor */ - __be16 ncyl; /* Data cylinder count */ - __be16 nacyl; /* Alt. cylinder count */ - __be16 ntrks; /* Tracks per cylinder */ - __be16 nsect; /* Sectors per track */ - __be16 obs3; /* bhead - Label head offset */ - __be16 obs4; /* ppart - Physical Partition */ - struct sun_partition { - __be32 start_cylinder; - __be32 num_sectors; - } partitions[8]; - __be16 magic; /* Magic number */ - __be16 csum; /* Label xor'd checksum */ - } * label; - struct sun_partition *p; - unsigned long spc; - char b[BDEVNAME_SIZE]; - int use_vtoc; - int nparts; - - label = read_part_sector(state, 0, §); - if (!label) - return -1; - - p = label->partitions; - if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { -/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", - bdevname(bdev, b), be16_to_cpu(label->magic)); */ - put_dev_sector(sect); - return 0; - } - /* Look at the checksum */ - ush = ((__be16 *) (label+1)) - 1; - for (csum = 0; ush >= ((__be16 *) label);) - csum ^= *ush--; - if (csum) { - printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", - bdevname(state->bdev, b)); - put_dev_sector(sect); - return 0; - } - - /* Check to see if we can use the VTOC table */ - use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) && - (be32_to_cpu(label->vtoc.version) == 1) && - (be16_to_cpu(label->vtoc.nparts) <= 8)); - - /* Use 8 partition entries if not specified in validated VTOC */ - nparts = (use_vtoc) ? be16_to_cpu(label->vtoc.nparts) : 8; - - /* - * So that old Linux-Sun partitions continue to work, - * alow the VTOC to be used under the additional condition ... - */ - use_vtoc = use_vtoc || !(label->vtoc.sanity || - label->vtoc.version || label->vtoc.nparts); - spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect); - for (i = 0; i < nparts; i++, p++) { - unsigned long st_sector; - unsigned int num_sectors; - - st_sector = be32_to_cpu(p->start_cylinder) * spc; - num_sectors = be32_to_cpu(p->num_sectors); - if (num_sectors) { - put_partition(state, slot, st_sector, num_sectors); - state->parts[slot].flags = 0; - if (use_vtoc) { - if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION) - state->parts[slot].flags |= ADDPART_FLAG_RAID; - else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK) - state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK; - } - } - slot++; - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - put_dev_sector(sect); - return 1; -} diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h deleted file mode 100644 index 2424baa..0000000 --- a/fs/partitions/sun.h +++ /dev/null @@ -1,8 +0,0 @@ -/* - * fs/partitions/sun.h - */ - -#define SUN_LABEL_MAGIC 0xDABE -#define SUN_VTOC_SANITY 0x600DDEEE - -int sun_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c deleted file mode 100644 index 9627ccf..0000000 --- a/fs/partitions/sysv68.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * fs/partitions/sysv68.c - * - * Copyright (C) 2007 Philippe De Muyter <phdm@macqel.be> - */ - -#include "check.h" -#include "sysv68.h" - -/* - * Volume ID structure: on first 256-bytes sector of disk - */ - -struct volumeid { - u8 vid_unused[248]; - u8 vid_mac[8]; /* ASCII string "MOTOROLA" */ -}; - -/* - * config block: second 256-bytes sector on disk - */ - -struct dkconfig { - u8 ios_unused0[128]; - __be32 ios_slcblk; /* Slice table block number */ - __be16 ios_slccnt; /* Number of entries in slice table */ - u8 ios_unused1[122]; -}; - -/* - * combined volumeid and dkconfig block - */ - -struct dkblk0 { - struct volumeid dk_vid; - struct dkconfig dk_ios; -}; - -/* - * Slice Table Structure - */ - -struct slice { - __be32 nblocks; /* slice size (in blocks) */ - __be32 blkoff; /* block offset of slice */ -}; - - -int sysv68_partition(struct parsed_partitions *state) -{ - int i, slices; - int slot = 1; - Sector sect; - unsigned char *data; - struct dkblk0 *b; - struct slice *slice; - char tmp[64]; - - data = read_part_sector(state, 0, §); - if (!data) - return -1; - - b = (struct dkblk0 *)data; - if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) { - put_dev_sector(sect); - return 0; - } - slices = be16_to_cpu(b->dk_ios.ios_slccnt); - i = be32_to_cpu(b->dk_ios.ios_slcblk); - put_dev_sector(sect); - - data = read_part_sector(state, i, §); - if (!data) - return -1; - - slices -= 1; /* last slice is the whole disk */ - snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - slice = (struct slice *)data; - for (i = 0; i < slices; i++, slice++) { - if (slot == state->limit) - break; - if (be32_to_cpu(slice->nblocks)) { - put_partition(state, slot, - be32_to_cpu(slice->blkoff), - be32_to_cpu(slice->nblocks)); - snprintf(tmp, sizeof(tmp), "(s%u)", i); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - slot++; - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - put_dev_sector(sect); - return 1; -} diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h deleted file mode 100644 index bf2f5ff..0000000 --- a/fs/partitions/sysv68.h +++ /dev/null @@ -1 +0,0 @@ -extern int sysv68_partition(struct parsed_partitions *state); diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c deleted file mode 100644 index 8dbaf9f..0000000 --- a/fs/partitions/ultrix.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * fs/partitions/ultrix.c - * - * Code extracted from drivers/block/genhd.c - * - * Re-organised Jul 1999 Russell King - */ - -#include "check.h" -#include "ultrix.h" - -int ultrix_partition(struct parsed_partitions *state) -{ - int i; - Sector sect; - unsigned char *data; - struct ultrix_disklabel { - s32 pt_magic; /* magic no. indicating part. info exits */ - s32 pt_valid; /* set by driver if pt is current */ - struct pt_info { - s32 pi_nblocks; /* no. of sectors */ - u32 pi_blkoff; /* block offset for start */ - } pt_part[8]; - } *label; - -#define PT_MAGIC 0x032957 /* Partition magic number */ -#define PT_VALID 1 /* Indicates if struct is valid */ - - data = read_part_sector(state, (16384 - sizeof(*label))/512, §); - if (!data) - return -1; - - label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label)); - - if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) { - for (i=0; i<8; i++) - if (label->pt_part[i].pi_nblocks) - put_partition(state, i+1, - label->pt_part[i].pi_blkoff, - label->pt_part[i].pi_nblocks); - put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); - return 1; - } else { - put_dev_sector(sect); - return 0; - } -} diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h deleted file mode 100644 index a3cc00b..0000000 --- a/fs/partitions/ultrix.h +++ /dev/null @@ -1,5 +0,0 @@ -/* - * fs/partitions/ultrix.h - */ - -int ultrix_partition(struct parsed_partitions *state); @@ -1254,6 +1254,7 @@ out: static const struct super_operations pipefs_ops = { .destroy_inode = free_inode_nonrcu, + .statfs = simple_statfs, }; /* @@ -1289,11 +1290,4 @@ static int __init init_pipe_fs(void) return err; } -static void __exit exit_pipe_fs(void) -{ - kern_unmount(pipe_mnt); - unregister_filesystem(&pipe_fs_type); -} - fs_initcall(init_pipe_fs); -module_exit(exit_pipe_fs); @@ -13,45 +13,30 @@ #include "pnode.h" /* return the next shared peer mount of @p */ -static inline struct vfsmount *next_peer(struct vfsmount *p) +static inline struct mount *next_peer(struct mount *p) { - return list_entry(p->mnt_share.next, struct vfsmount, mnt_share); + return list_entry(p->mnt_share.next, struct mount, mnt_share); } -static inline struct vfsmount *first_slave(struct vfsmount *p) +static inline struct mount *first_slave(struct mount *p) { - return list_entry(p->mnt_slave_list.next, struct vfsmount, mnt_slave); + return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); } -static inline struct vfsmount *next_slave(struct vfsmount *p) +static inline struct mount *next_slave(struct mount *p) { - return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave); + return list_entry(p->mnt_slave.next, struct mount, mnt_slave); } -/* - * Return true if path is reachable from root - * - * namespace_sem is held, and mnt is attached - */ -static bool is_path_reachable(struct vfsmount *mnt, struct dentry *dentry, - const struct path *root) -{ - while (mnt != root->mnt && mnt->mnt_parent != mnt) { - dentry = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; - } - return mnt == root->mnt && is_subdir(dentry, root->dentry); -} - -static struct vfsmount *get_peer_under_root(struct vfsmount *mnt, - struct mnt_namespace *ns, - const struct path *root) +static struct mount *get_peer_under_root(struct mount *mnt, + struct mnt_namespace *ns, + const struct path *root) { - struct vfsmount *m = mnt; + struct mount *m = mnt; do { /* Check the namespace first for optimization */ - if (m->mnt_ns == ns && is_path_reachable(m, m->mnt_root, root)) + if (m->mnt_ns == ns && is_path_reachable(m, m->mnt.mnt_root, root)) return m; m = next_peer(m); @@ -66,12 +51,12 @@ static struct vfsmount *get_peer_under_root(struct vfsmount *mnt, * * Caller must hold namespace_sem */ -int get_dominating_id(struct vfsmount *mnt, const struct path *root) +int get_dominating_id(struct mount *mnt, const struct path *root) { - struct vfsmount *m; + struct mount *m; for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) { - struct vfsmount *d = get_peer_under_root(m, mnt->mnt_ns, root); + struct mount *d = get_peer_under_root(m, mnt->mnt_ns, root); if (d) return d->mnt_group_id; } @@ -79,10 +64,10 @@ int get_dominating_id(struct vfsmount *mnt, const struct path *root) return 0; } -static int do_make_slave(struct vfsmount *mnt) +static int do_make_slave(struct mount *mnt) { - struct vfsmount *peer_mnt = mnt, *master = mnt->mnt_master; - struct vfsmount *slave_mnt; + struct mount *peer_mnt = mnt, *master = mnt->mnt_master; + struct mount *slave_mnt; /* * slave 'mnt' to a peer mount that has the @@ -90,7 +75,7 @@ static int do_make_slave(struct vfsmount *mnt) * slave it to anything that is available. */ while ((peer_mnt = next_peer(peer_mnt)) != mnt && - peer_mnt->mnt_root != mnt->mnt_root) ; + peer_mnt->mnt.mnt_root != mnt->mnt.mnt_root) ; if (peer_mnt == mnt) { peer_mnt = next_peer(mnt); @@ -116,7 +101,7 @@ static int do_make_slave(struct vfsmount *mnt) struct list_head *p = &mnt->mnt_slave_list; while (!list_empty(p)) { slave_mnt = list_first_entry(p, - struct vfsmount, mnt_slave); + struct mount, mnt_slave); list_del_init(&slave_mnt->mnt_slave); slave_mnt->mnt_master = NULL; } @@ -129,7 +114,7 @@ static int do_make_slave(struct vfsmount *mnt) /* * vfsmount lock must be held for write */ -void change_mnt_propagation(struct vfsmount *mnt, int type) +void change_mnt_propagation(struct mount *mnt, int type) { if (type == MS_SHARED) { set_mnt_shared(mnt); @@ -140,9 +125,9 @@ void change_mnt_propagation(struct vfsmount *mnt, int type) list_del_init(&mnt->mnt_slave); mnt->mnt_master = NULL; if (type == MS_UNBINDABLE) - mnt->mnt_flags |= MNT_UNBINDABLE; + mnt->mnt.mnt_flags |= MNT_UNBINDABLE; else - mnt->mnt_flags &= ~MNT_UNBINDABLE; + mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE; } } @@ -156,20 +141,19 @@ void change_mnt_propagation(struct vfsmount *mnt, int type) * vfsmount found while iterating with propagation_next() is * a peer of one we'd found earlier. */ -static struct vfsmount *propagation_next(struct vfsmount *m, - struct vfsmount *origin) +static struct mount *propagation_next(struct mount *m, + struct mount *origin) { /* are there any slaves of this mount? */ if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); while (1) { - struct vfsmount *next; - struct vfsmount *master = m->mnt_master; + struct mount *master = m->mnt_master; if (master == origin->mnt_master) { - next = next_peer(m); - return ((next == origin) ? NULL : next); + struct mount *next = next_peer(m); + return (next == origin) ? NULL : next; } else if (m->mnt_slave.next != &master->mnt_slave_list) return next_slave(m); @@ -187,13 +171,13 @@ static struct vfsmount *propagation_next(struct vfsmount *m, * @type return CL_SLAVE if the new mount has to be * cloned as a slave. */ -static struct vfsmount *get_source(struct vfsmount *dest, - struct vfsmount *last_dest, - struct vfsmount *last_src, - int *type) +static struct mount *get_source(struct mount *dest, + struct mount *last_dest, + struct mount *last_src, + int *type) { - struct vfsmount *p_last_src = NULL; - struct vfsmount *p_last_dest = NULL; + struct mount *p_last_src = NULL; + struct mount *p_last_dest = NULL; while (last_dest != dest->mnt_master) { p_last_dest = last_dest; @@ -233,33 +217,33 @@ static struct vfsmount *get_source(struct vfsmount *dest, * @source_mnt: source mount. * @tree_list : list of heads of trees to be attached. */ -int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry, - struct vfsmount *source_mnt, struct list_head *tree_list) +int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, + struct mount *source_mnt, struct list_head *tree_list) { - struct vfsmount *m, *child; + struct mount *m, *child; int ret = 0; - struct vfsmount *prev_dest_mnt = dest_mnt; - struct vfsmount *prev_src_mnt = source_mnt; + struct mount *prev_dest_mnt = dest_mnt; + struct mount *prev_src_mnt = source_mnt; LIST_HEAD(tmp_list); LIST_HEAD(umount_list); for (m = propagation_next(dest_mnt, dest_mnt); m; m = propagation_next(m, dest_mnt)) { int type; - struct vfsmount *source; + struct mount *source; if (IS_MNT_NEW(m)) continue; source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); - if (!(child = copy_tree(source, source->mnt_root, type))) { + if (!(child = copy_tree(source, source->mnt.mnt_root, type))) { ret = -ENOMEM; list_splice(tree_list, tmp_list.prev); goto out; } - if (is_subdir(dest_dentry, m->mnt_root)) { + if (is_subdir(dest_dentry, m->mnt.mnt_root)) { mnt_set_mountpoint(m, dest_dentry, child); list_add_tail(&child->mnt_hash, tree_list); } else { @@ -275,7 +259,7 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry, out: br_write_lock(vfsmount_lock); while (!list_empty(&tmp_list)) { - child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash); + child = list_first_entry(&tmp_list, struct mount, mnt_hash); umount_tree(child, 0, &umount_list); } br_write_unlock(vfsmount_lock); @@ -286,7 +270,7 @@ out: /* * return true if the refcount is greater than count */ -static inline int do_refcount_check(struct vfsmount *mnt, int count) +static inline int do_refcount_check(struct mount *mnt, int count) { int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts; return (mycount > count); @@ -302,10 +286,10 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count) * * vfsmount lock must be held for write */ -int propagate_mount_busy(struct vfsmount *mnt, int refcnt) +int propagate_mount_busy(struct mount *mnt, int refcnt) { - struct vfsmount *m, *child; - struct vfsmount *parent = mnt->mnt_parent; + struct mount *m, *child; + struct mount *parent = mnt->mnt_parent; int ret = 0; if (mnt == parent) @@ -321,7 +305,7 @@ int propagate_mount_busy(struct vfsmount *mnt, int refcnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt(m, mnt->mnt_mountpoint, 0); + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint, 0); if (child && list_empty(&child->mnt_mounts) && (ret = do_refcount_check(child, 1))) break; @@ -333,17 +317,17 @@ int propagate_mount_busy(struct vfsmount *mnt, int refcnt) * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ -static void __propagate_umount(struct vfsmount *mnt) +static void __propagate_umount(struct mount *mnt) { - struct vfsmount *parent = mnt->mnt_parent; - struct vfsmount *m; + struct mount *parent = mnt->mnt_parent; + struct mount *m; BUG_ON(parent == mnt); for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - struct vfsmount *child = __lookup_mnt(m, + struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint, 0); /* * umount the child only if the child has no @@ -363,7 +347,7 @@ static void __propagate_umount(struct vfsmount *mnt) */ int propagate_umount(struct list_head *list) { - struct vfsmount *mnt; + struct mount *mnt; list_for_each_entry(mnt, list, mnt_hash) __propagate_umount(mnt); @@ -9,13 +9,13 @@ #define _LINUX_PNODE_H #include <linux/list.h> -#include <linux/mount.h> +#include "mount.h" -#define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED) -#define IS_MNT_SLAVE(mnt) (mnt->mnt_master) -#define IS_MNT_NEW(mnt) (!mnt->mnt_ns) -#define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~MNT_SHARED) -#define IS_MNT_UNBINDABLE(mnt) (mnt->mnt_flags & MNT_UNBINDABLE) +#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED) +#define IS_MNT_SLAVE(m) ((m)->mnt_master) +#define IS_MNT_NEW(m) (!(m)->mnt_ns) +#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) +#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 @@ -23,17 +23,25 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 -static inline void set_mnt_shared(struct vfsmount *mnt) +static inline void set_mnt_shared(struct mount *mnt) { - mnt->mnt_flags &= ~MNT_SHARED_MASK; - mnt->mnt_flags |= MNT_SHARED; + mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK; + mnt->mnt.mnt_flags |= MNT_SHARED; } -void change_mnt_propagation(struct vfsmount *, int); -int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *, +void change_mnt_propagation(struct mount *, int); +int propagate_mnt(struct mount *, struct dentry *, struct mount *, struct list_head *); int propagate_umount(struct list_head *); -int propagate_mount_busy(struct vfsmount *, int); -void mnt_release_group_id(struct vfsmount *); -int get_dominating_id(struct vfsmount *mnt, const struct path *root); +int propagate_mount_busy(struct mount *, int); +void mnt_release_group_id(struct mount *); +int get_dominating_id(struct mount *mnt, const struct path *root); +unsigned int mnt_get_count(struct mount *mnt); +void mnt_set_mountpoint(struct mount *, struct dentry *, + struct mount *); +void release_mounts(struct list_head *); +void umount_tree(struct mount *, int, struct list_head *); +struct mount *copy_tree(struct mount *, struct dentry *, int); +bool is_path_reachable(struct mount *, struct dentry *, + const struct path *root); #endif /* _LINUX_PNODE_H */ diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 10027b4..cea4623 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -218,6 +218,8 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; + want &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; + FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: diff --git a/fs/proc/array.c b/fs/proc/array.c index 3a1dafd..8c344f0 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -394,8 +394,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = cputime_zero; - cgtime = gtime = cputime_zero; + cutime = cstime = utime = stime = 0; + cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { struct signal_struct *sig = task->signal; @@ -423,14 +423,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, do { min_flt += t->min_flt; maj_flt += t->maj_flt; - gtime = cputime_add(gtime, t->gtime); + gtime += t->gtime; t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; thread_group_times(task, &utime, &stime); - gtime = cputime_add(gtime, sig->gtime); + gtime += sig->gtime; } sid = task_session_nr_ns(task, ns); diff --git a/fs/proc/base.c b/fs/proc/base.c index 5eb0206..a1dddda 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -101,7 +101,7 @@ struct pid_entry { char *name; int len; - mode_t mode; + umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; union proc_op op; @@ -631,120 +631,6 @@ static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; -static int mounts_open_common(struct inode *inode, struct file *file, - const struct seq_operations *op) -{ - struct task_struct *task = get_proc_task(inode); - struct nsproxy *nsp; - struct mnt_namespace *ns = NULL; - struct path root; - struct proc_mounts *p; - int ret = -EINVAL; - - if (task) { - rcu_read_lock(); - nsp = task_nsproxy(task); - if (nsp) { - ns = nsp->mnt_ns; - if (ns) - get_mnt_ns(ns); - } - rcu_read_unlock(); - if (ns && get_task_root(task, &root) == 0) - ret = 0; - put_task_struct(task); - } - - if (!ns) - goto err; - if (ret) - goto err_put_ns; - - ret = -ENOMEM; - p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); - if (!p) - goto err_put_path; - - file->private_data = &p->m; - ret = seq_open(file, op); - if (ret) - goto err_free; - - p->m.private = p; - p->ns = ns; - p->root = root; - p->m.poll_event = ns->event; - - return 0; - - err_free: - kfree(p); - err_put_path: - path_put(&root); - err_put_ns: - put_mnt_ns(ns); - err: - return ret; -} - -static int mounts_release(struct inode *inode, struct file *file) -{ - struct proc_mounts *p = file->private_data; - path_put(&p->root); - put_mnt_ns(p->ns); - return seq_release(inode, file); -} - -static unsigned mounts_poll(struct file *file, poll_table *wait) -{ - struct proc_mounts *p = file->private_data; - unsigned res = POLLIN | POLLRDNORM; - - poll_wait(file, &p->ns->poll, wait); - if (mnt_had_events(p)) - res |= POLLERR | POLLPRI; - - return res; -} - -static int mounts_open(struct inode *inode, struct file *file) -{ - return mounts_open_common(inode, file, &mounts_op); -} - -static const struct file_operations proc_mounts_operations = { - .open = mounts_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mounts_release, - .poll = mounts_poll, -}; - -static int mountinfo_open(struct inode *inode, struct file *file) -{ - return mounts_open_common(inode, file, &mountinfo_op); -} - -static const struct file_operations proc_mountinfo_operations = { - .open = mountinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mounts_release, - .poll = mounts_poll, -}; - -static int mountstats_open(struct inode *inode, struct file *file) -{ - return mounts_open_common(inode, file, &mountstats_op); -} - -static const struct file_operations proc_mountstats_operations = { - .open = mountstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mounts_release, -}; - #define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ static ssize_t proc_info_read(struct file * file, char __user * buf, @@ -1107,13 +993,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, goto err_sighand; } - if (oom_adjust != task->signal->oom_adj) { - if (oom_adjust == OOM_DISABLE) - atomic_inc(&task->mm->oom_disable_count); - if (task->signal->oom_adj == OOM_DISABLE) - atomic_dec(&task->mm->oom_disable_count); - } - /* * Warn that /proc/pid/oom_adj is deprecated, see * Documentation/feature-removal-schedule.txt. @@ -1215,12 +1094,6 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_sighand; } - if (oom_score_adj != task->signal->oom_score_adj) { - if (oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_inc(&task->mm->oom_disable_count); - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - atomic_dec(&task->mm->oom_disable_count); - } task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; @@ -2261,7 +2134,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, ei = PROC_I(inode); inode->i_mode = p->mode; if (S_ISDIR(inode->i_mode)) - inode->i_nlink = 2; /* Use getattr to fix if necessary */ + set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) inode->i_op = p->iop; if (p->fop) @@ -2655,7 +2528,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir, inode->i_mode = p->mode; if (S_ISDIR(inode->i_mode)) - inode->i_nlink = 2; + set_nlink(inode, 2); if (S_ISLNK(inode->i_mode)) inode->i_size = 64; if (p->iop) @@ -2994,8 +2867,8 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, - ARRAY_SIZE(tgid_base_stuff)); + set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, + ARRAY_SIZE(tgid_base_stuff))); d_set_d_op(dentry, &pid_dentry_operations); @@ -3246,8 +3119,8 @@ static struct dentry *proc_task_instantiate(struct inode *dir, inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, - ARRAY_SIZE(tid_base_stuff)); + set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, + ARRAY_SIZE(tid_base_stuff))); d_set_d_op(dentry, &pid_dentry_operations); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 9d99131..2edf34f 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -283,7 +283,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode = dentry->d_inode; struct proc_dir_entry *de = PROC_I(inode)->pde; if (de && de->nlink) - inode->i_nlink = de->nlink; + set_nlink(inode, de->nlink); generic_fillattr(inode, stat); return 0; @@ -597,7 +597,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, const char *name, - mode_t mode, + umode_t mode, nlink_t nlink) { struct proc_dir_entry *ent = NULL; @@ -659,7 +659,7 @@ struct proc_dir_entry *proc_symlink(const char *name, } EXPORT_SYMBOL(proc_symlink); -struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, +struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, struct proc_dir_entry *parent) { struct proc_dir_entry *ent; @@ -699,7 +699,7 @@ struct proc_dir_entry *proc_mkdir(const char *name, } EXPORT_SYMBOL(proc_mkdir); -struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, +struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode, struct proc_dir_entry *parent) { struct proc_dir_entry *ent; @@ -728,7 +728,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, } EXPORT_SYMBOL(create_proc_entry); -struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, +struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops, void *data) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 7ed72d6..51a17662 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -77,7 +77,6 @@ static struct inode *proc_alloc_inode(struct super_block *sb) static void proc_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } @@ -445,7 +444,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) if (de->size) inode->i_size = de->size; if (de->nlink) - inode->i_nlink = de->nlink; + set_nlink(inode, de->nlink); if (de->proc_iops) inode->i_op = de->proc_iops; if (de->proc_fops) { diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 5861741..80e4645 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -131,12 +131,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES) #ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(global_page_state(NR_ANON_PAGES) + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * - HPAGE_PMD_NR + HPAGE_PMD_NR), +#else + K(global_page_state(NR_ANON_PAGES)), #endif - ), K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index be177f7..27da860 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -9,7 +9,6 @@ #include <linux/file.h> #include <linux/utsname.h> #include <net/net_namespace.h> -#include <linux/mnt_namespace.h> #include <linux/ipc_namespace.h> #include <linux/pid_namespace.h> #include "internal.h" diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index f738024..06e1cc1 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -179,7 +179,7 @@ const struct file_operations proc_net_operations = { struct proc_dir_entry *proc_net_fops_create(struct net *net, - const char *name, mode_t mode, const struct file_operations *fops) + const char *name, umode_t mode, const struct file_operations *fops) { return proc_create(name, mode, net->proc_net, fops); } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1a77dbe..a6b6217 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -3,6 +3,7 @@ */ #include <linux/init.h> #include <linux/sysctl.h> +#include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/namei.h> @@ -14,6 +15,15 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; +void proc_sys_poll_notify(struct ctl_table_poll *poll) +{ + if (!poll) + return; + + atomic_inc(&poll->event); + wake_up_interruptible(&poll->wait); +} + static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { @@ -39,7 +49,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_fop = &proc_sys_file_operations; } else { inode->i_mode |= S_IFDIR; - inode->i_nlink = 0; + clear_nlink(inode); inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; } @@ -176,6 +186,39 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf, return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); } +static int proc_sys_open(struct inode *inode, struct file *filp) +{ + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + if (table->poll) + filp->private_data = proc_sys_poll_event(table->poll); + + return 0; +} + +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + unsigned long event = (unsigned long)filp->private_data; + unsigned int ret = DEFAULT_POLLMASK; + + if (!table->proc_handler) + goto out; + + if (!table->poll) + goto out; + + poll_wait(filp, &table->poll->wait, wait); + + if (event != atomic_read(&table->poll->event)) { + filp->private_data = proc_sys_poll_event(table->poll); + ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + +out: + return ret; +} static int proc_sys_fill_cache(struct file *filp, void *dirent, filldir_t filldir, @@ -364,12 +407,15 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct } static const struct file_operations proc_sys_file_operations = { + .open = proc_sys_open, + .poll = proc_sys_poll, .read = proc_sys_read, .write = proc_sys_write, .llseek = default_llseek, }; static const struct file_operations proc_sys_dir_file_operations = { + .read = generic_read_dir, .readdir = proc_sys_readdir, .llseek = generic_file_llseek, }; diff --git a/fs/proc/root.c b/fs/proc/root.c index 9a8a2b7..03102d9 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -91,20 +91,18 @@ static struct file_system_type proc_fs_type = { void __init proc_root_init(void) { - struct vfsmount *mnt; int err; proc_init_inodecache(); err = register_filesystem(&proc_fs_type); if (err) return; - mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); - if (IS_ERR(mnt)) { + err = pid_ns_prepare_proc(&init_pid_ns); + if (err) { unregister_filesystem(&proc_fs_type); return; } - init_pid_ns.proc_mnt = mnt; proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); @@ -209,5 +207,5 @@ int pid_ns_prepare_proc(struct pid_namespace *ns) void pid_ns_release_proc(struct pid_namespace *ns) { - mntput(ns->proc_mnt); + kern_unmount(ns->proc_mnt); } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9758b65..d76ca6a 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -10,6 +10,7 @@ #include <linux/time.h> #include <linux/irqnr.h> #include <asm/cputime.h> +#include <linux/tick.h> #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 @@ -21,38 +22,61 @@ #define arch_idle_time(cpu) 0 #endif +static u64 get_idle_time(int cpu) +{ + u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL); + + if (idle_time == -1ULL) { + /* !NO_HZ so we can rely on cpustat.idle */ + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle += arch_idle_time(cpu); + } else + idle = usecs_to_cputime64(idle_time); + + return idle; +} + +static u64 get_iowait_time(int cpu) +{ + u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL); + + if (iowait_time == -1ULL) + /* !NO_HZ so we can rely on cpustat.iowait */ + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + else + iowait = usecs_to_cputime64(iowait_time); + + return iowait; +} + static int show_stat(struct seq_file *p, void *v) { int i, j; unsigned long jif; - cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; - cputime64_t guest, guest_nice; + u64 user, nice, system, idle, iowait, irq, softirq, steal; + u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec boottime; user = nice = system = idle = iowait = - irq = softirq = steal = cputime64_zero; - guest = guest_nice = cputime64_zero; + irq = softirq = steal = 0; + guest = guest_nice = 0; getboottime(&boottime); jif = boottime.tv_sec; for_each_possible_cpu(i) { - user = cputime64_add(user, kstat_cpu(i).cpustat.user); - nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); - system = cputime64_add(system, kstat_cpu(i).cpustat.system); - idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); - irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); - softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); - steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); - guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - guest_nice = cputime64_add(guest_nice, - kstat_cpu(i).cpustat.guest_nice); - sum += kstat_cpu_irqs_sum(i); - sum += arch_irq_stat_cpu(i); + user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(i); + iowait += get_iowait_time(i); + irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); @@ -76,19 +100,17 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(guest), (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { - /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kstat_cpu(i).cpustat.user; - nice = kstat_cpu(i).cpustat.nice; - system = kstat_cpu(i).cpustat.system; - idle = kstat_cpu(i).cpustat.idle; - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = kstat_cpu(i).cpustat.iowait; - irq = kstat_cpu(i).cpustat.irq; - softirq = kstat_cpu(i).cpustat.softirq; - steal = kstat_cpu(i).cpustat.steal; - guest = kstat_cpu(i).cpustat.guest; - guest_nice = kstat_cpu(i).cpustat.guest_nice; + user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(i); + iowait = get_iowait_time(i); + irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " "%llu\n", diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5afaa58..e418c5a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -44,6 +44,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" + "VmPin:\t%8lu kB\n" "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" "VmData:\t%8lu kB\n" @@ -55,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) hiwater_vm << (PAGE_SHIFT-10), (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), + mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), @@ -1039,6 +1041,9 @@ static int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " stack"); } + if (is_vm_hugetlb_page(vma)) + seq_printf(m, " huge"); + walk_page_range(vma->vm_start, vma->vm_end, &walk); if (!md->pages) diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 766b1d4..9610ac7 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -11,15 +11,20 @@ static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec uptime; struct timespec idle; + u64 idletime; + u64 nsec; + u32 rem; int i; - cputime_t idletime = cputime_zero; + idletime = 0; for_each_possible_cpu(i) - idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle); + idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; do_posix_clock_monotonic_gettime(&uptime); monotonic_to_bootbased(&uptime); - cputime_to_timespec(idletime, &idle); + nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, (uptime.tv_nsec / (NSEC_PER_SEC / 100)), diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cd99bf5..b0f450a 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -12,6 +12,7 @@ #include <linux/user.h> #include <linux/elf.h> #include <linux/elfcore.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/highmem.h> #include <linux/bootmem.h> diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c new file mode 100644 index 0000000..1241285 --- /dev/null +++ b/fs/proc_namespace.c @@ -0,0 +1,333 @@ +/* + * fs/proc_namespace.c - handling of /proc/<pid>/{mounts,mountinfo,mountstats} + * + * In fact, that's a piece of procfs; it's *almost* isolated from + * the rest of fs/proc, but has rather close relationships with + * fs/namespace.c, thus here instead of fs/proc + * + */ +#include <linux/mnt_namespace.h> +#include <linux/nsproxy.h> +#include <linux/security.h> +#include <linux/fs_struct.h> +#include "proc/internal.h" /* only for get_proc_task() in ->open() */ + +#include "pnode.h" +#include "internal.h" + +static unsigned mounts_poll(struct file *file, poll_table *wait) +{ + struct proc_mounts *p = file->private_data; + struct mnt_namespace *ns = p->ns; + unsigned res = POLLIN | POLLRDNORM; + + poll_wait(file, &p->ns->poll, wait); + + br_read_lock(vfsmount_lock); + if (p->m.poll_event != ns->event) { + p->m.poll_event = ns->event; + res |= POLLERR | POLLPRI; + } + br_read_unlock(vfsmount_lock); + + return res; +} + +struct proc_fs_info { + int flag; + const char *str; +}; + +static int show_sb_opts(struct seq_file *m, struct super_block *sb) +{ + static const struct proc_fs_info fs_info[] = { + { MS_SYNCHRONOUS, ",sync" }, + { MS_DIRSYNC, ",dirsync" }, + { MS_MANDLOCK, ",mand" }, + { 0, NULL } + }; + const struct proc_fs_info *fs_infop; + + for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { + if (sb->s_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } + + return security_sb_show_options(m, sb); +} + +static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) +{ + static const struct proc_fs_info mnt_info[] = { + { MNT_NOSUID, ",nosuid" }, + { MNT_NODEV, ",nodev" }, + { MNT_NOEXEC, ",noexec" }, + { MNT_NOATIME, ",noatime" }, + { MNT_NODIRATIME, ",nodiratime" }, + { MNT_RELATIME, ",relatime" }, + { 0, NULL } + }; + const struct proc_fs_info *fs_infop; + + for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { + if (mnt->mnt_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } +} + +static inline void mangle(struct seq_file *m, const char *s) +{ + seq_escape(m, s, " \t\n\\"); +} + +static void show_type(struct seq_file *m, struct super_block *sb) +{ + mangle(m, sb->s_type->name); + if (sb->s_subtype && sb->s_subtype[0]) { + seq_putc(m, '.'); + mangle(m, sb->s_subtype); + } +} + +static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) +{ + struct mount *r = real_mount(mnt); + int err = 0; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct super_block *sb = mnt_path.dentry->d_sb; + + if (sb->s_op->show_devname) { + err = sb->s_op->show_devname(m, mnt_path.dentry); + if (err) + goto out; + } else { + mangle(m, r->mnt_devname ? r->mnt_devname : "none"); + } + seq_putc(m, ' '); + seq_path(m, &mnt_path, " \t\n\\"); + seq_putc(m, ' '); + show_type(m, sb); + seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); + err = show_sb_opts(m, sb); + if (err) + goto out; + show_mnt_opts(m, mnt); + if (sb->s_op->show_options) + err = sb->s_op->show_options(m, mnt_path.dentry); + seq_puts(m, " 0 0\n"); +out: + return err; +} + +static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) +{ + struct proc_mounts *p = m->private; + struct mount *r = real_mount(mnt); + struct super_block *sb = mnt->mnt_sb; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct path root = p->root; + int err = 0; + + seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, + MAJOR(sb->s_dev), MINOR(sb->s_dev)); + if (sb->s_op->show_path) + err = sb->s_op->show_path(m, mnt->mnt_root); + else + seq_dentry(m, mnt->mnt_root, " \t\n\\"); + if (err) + goto out; + seq_putc(m, ' '); + + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); + if (err) + goto out; + + seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); + show_mnt_opts(m, mnt); + + /* Tagged fields ("foo:X" or "bar") */ + if (IS_MNT_SHARED(r)) + seq_printf(m, " shared:%i", r->mnt_group_id); + if (IS_MNT_SLAVE(r)) { + int master = r->mnt_master->mnt_group_id; + int dom = get_dominating_id(r, &p->root); + seq_printf(m, " master:%i", master); + if (dom && dom != master) + seq_printf(m, " propagate_from:%i", dom); + } + if (IS_MNT_UNBINDABLE(r)) + seq_puts(m, " unbindable"); + + /* Filesystem specific data */ + seq_puts(m, " - "); + show_type(m, sb); + seq_putc(m, ' '); + if (sb->s_op->show_devname) + err = sb->s_op->show_devname(m, mnt->mnt_root); + else + mangle(m, r->mnt_devname ? r->mnt_devname : "none"); + if (err) + goto out; + seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); + err = show_sb_opts(m, sb); + if (err) + goto out; + if (sb->s_op->show_options) + err = sb->s_op->show_options(m, mnt->mnt_root); + seq_putc(m, '\n'); +out: + return err; +} + +static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) +{ + struct mount *r = real_mount(mnt); + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct super_block *sb = mnt_path.dentry->d_sb; + int err = 0; + + /* device */ + if (sb->s_op->show_devname) { + seq_puts(m, "device "); + err = sb->s_op->show_devname(m, mnt_path.dentry); + } else { + if (r->mnt_devname) { + seq_puts(m, "device "); + mangle(m, r->mnt_devname); + } else + seq_puts(m, "no device"); + } + + /* mount point */ + seq_puts(m, " mounted on "); + seq_path(m, &mnt_path, " \t\n\\"); + seq_putc(m, ' '); + + /* file system type */ + seq_puts(m, "with fstype "); + show_type(m, sb); + + /* optional statistics */ + if (sb->s_op->show_stats) { + seq_putc(m, ' '); + if (!err) + err = sb->s_op->show_stats(m, mnt_path.dentry); + } + + seq_putc(m, '\n'); + return err; +} + +static int mounts_open_common(struct inode *inode, struct file *file, + int (*show)(struct seq_file *, struct vfsmount *)) +{ + struct task_struct *task = get_proc_task(inode); + struct nsproxy *nsp; + struct mnt_namespace *ns = NULL; + struct path root; + struct proc_mounts *p; + int ret = -EINVAL; + + if (!task) + goto err; + + rcu_read_lock(); + nsp = task_nsproxy(task); + if (!nsp) { + rcu_read_unlock(); + put_task_struct(task); + goto err; + } + ns = nsp->mnt_ns; + if (!ns) { + rcu_read_unlock(); + put_task_struct(task); + goto err; + } + get_mnt_ns(ns); + rcu_read_unlock(); + task_lock(task); + if (!task->fs) { + task_unlock(task); + put_task_struct(task); + ret = -ENOENT; + goto err_put_ns; + } + get_fs_root(task->fs, &root); + task_unlock(task); + put_task_struct(task); + + ret = -ENOMEM; + p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); + if (!p) + goto err_put_path; + + file->private_data = &p->m; + ret = seq_open(file, &mounts_op); + if (ret) + goto err_free; + + p->m.private = p; + p->ns = ns; + p->root = root; + p->m.poll_event = ns->event; + p->show = show; + + return 0; + + err_free: + kfree(p); + err_put_path: + path_put(&root); + err_put_ns: + put_mnt_ns(ns); + err: + return ret; +} + +static int mounts_release(struct inode *inode, struct file *file) +{ + struct proc_mounts *p = file->private_data; + path_put(&p->root); + put_mnt_ns(p->ns); + return seq_release(inode, file); +} + +static int mounts_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, show_vfsmnt); +} + +static int mountinfo_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, show_mountinfo); +} + +static int mountstats_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, show_vfsstat); +} + +const struct file_operations proc_mounts_operations = { + .open = mounts_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; + +const struct file_operations proc_mountinfo_operations = { + .open = mountinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; + +const struct file_operations proc_mountstats_operations = { + .open = mountstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, +}; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 893b961..b3b426e 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -24,6 +24,7 @@ #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> +#include <linux/list.h> #include <linux/string.h> #include <linux/mount.h> #include <linux/ramfs.h> @@ -32,13 +33,18 @@ #include <linux/magic.h> #include <linux/pstore.h> #include <linux/slab.h> +#include <linux/spinlock.h> #include <linux/uaccess.h> #include "internal.h" #define PSTORE_NAMELEN 64 +static DEFINE_SPINLOCK(allpstore_lock); +static LIST_HEAD(allpstore); + struct pstore_private { + struct list_head list; struct pstore_info *psi; enum pstore_type_id type; u64 id; @@ -74,15 +80,24 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = dentry->d_inode->i_private; - p->psi->erase(p->type, p->id, p->psi); + if (p->psi->erase) + p->psi->erase(p->type, p->id, p->psi); return simple_unlink(dir, dentry); } static void pstore_evict_inode(struct inode *inode) { + struct pstore_private *p = inode->i_private; + unsigned long flags; + end_writeback(inode); - kfree(inode->i_private); + if (p) { + spin_lock_irqsave(&allpstore_lock, flags); + list_del(&p->list); + spin_unlock_irqrestore(&allpstore_lock, flags); + kfree(p); + } } static const struct inode_operations pstore_dir_inode_operations = { @@ -182,9 +197,23 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, struct dentry *root = pstore_sb->s_root; struct dentry *dentry; struct inode *inode; - int rc; + int rc = 0; char name[PSTORE_NAMELEN]; - struct pstore_private *private; + struct pstore_private *private, *pos; + unsigned long flags; + + spin_lock_irqsave(&allpstore_lock, flags); + list_for_each_entry(pos, &allpstore, list) { + if (pos->type == type && + pos->id == id && + pos->psi == psi) { + rc = -EEXIST; + break; + } + } + spin_unlock_irqrestore(&allpstore_lock, flags); + if (rc) + return rc; rc = -ENOMEM; inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0); @@ -229,6 +258,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, d_add(dentry, inode); + spin_lock_irqsave(&allpstore_lock, flags); + list_add(&private->list, &allpstore); + spin_unlock_irqrestore(&allpstore_lock, flags); + mutex_unlock(&root->d_inode->i_mutex); return 0; @@ -277,7 +310,7 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent) goto fail; } - pstore_get_records(); + pstore_get_records(0); return 0; fail: diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 611c1b3..3bde461 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -1,5 +1,5 @@ extern void pstore_set_kmsg_bytes(int); -extern void pstore_get_records(void); +extern void pstore_get_records(int); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, char *data, size_t size, struct timespec time, struct pstore_info *psi); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index c5300ec..9ec22d3 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -25,12 +25,30 @@ #include <linux/module.h> #include <linux/pstore.h> #include <linux/string.h> +#include <linux/timer.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/hardirq.h> +#include <linux/workqueue.h> #include "internal.h" /* + * We defer making "oops" entries appear in pstore - see + * whether the system is actually still running well enough + * to let someone see the entry + */ +#define PSTORE_INTERVAL (60 * HZ) + +static int pstore_new_entry; + +static void pstore_timefunc(unsigned long); +static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0); + +static void pstore_dowork(struct work_struct *); +static DECLARE_WORK(pstore_work, pstore_dowork); + +/* * pstore_lock just protects "psinfo" during * calls to pstore_register() */ @@ -69,15 +87,22 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long size, total = 0; char *dst, *why; u64 id; - int hsize; + int hsize, ret; unsigned int part = 1; + unsigned long flags = 0; + int is_locked = 0; if (reason < ARRAY_SIZE(reason_str)) why = reason_str[reason]; else why = "Unknown"; - mutex_lock(&psinfo->buf_mutex); + if (in_nmi()) { + is_locked = spin_trylock(&psinfo->buf_lock); + if (!is_locked) + pr_err("pstore dump routine blocked in NMI, may corrupt error record\n"); + } else + spin_lock_irqsave(&psinfo->buf_lock, flags); oopscount++; while (total < kmsg_bytes) { dst = psinfo->buf; @@ -97,18 +122,20 @@ static void pstore_dump(struct kmsg_dumper *dumper, memcpy(dst, s1 + s1_start, l1_cpy); memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); - id = psinfo->write(PSTORE_TYPE_DMESG, part, + ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, hsize + l1_cpy + l2_cpy, psinfo); - if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) - pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, - psinfo->buf, hsize + l1_cpy + l2_cpy, - CURRENT_TIME, psinfo); + if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) + pstore_new_entry = 1; l1 -= l1_cpy; l2 -= l2_cpy; total += l1_cpy + l2_cpy; part++; } - mutex_unlock(&psinfo->buf_mutex); + if (in_nmi()) { + if (is_locked) + spin_unlock(&psinfo->buf_lock); + } else + spin_unlock_irqrestore(&psinfo->buf_lock, flags); } static struct kmsg_dumper pstore_dumper = { @@ -140,6 +167,7 @@ int pstore_register(struct pstore_info *psi) } psinfo = psi; + mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); if (owner && !try_module_get(owner)) { @@ -148,21 +176,27 @@ int pstore_register(struct pstore_info *psi) } if (pstore_is_mounted()) - pstore_get_records(); + pstore_get_records(0); kmsg_dump_register(&pstore_dumper); + pstore_timer.expires = jiffies + PSTORE_INTERVAL; + add_timer(&pstore_timer); + return 0; } EXPORT_SYMBOL_GPL(pstore_register); /* - * Read all the records from the persistent store. Create and - * file files in our filesystem. + * Read all the records from the persistent store. Create + * files in our filesystem. Don't warn about -EEXIST errors + * when we are re-scanning the backing store looking to add new + * error records. */ -void pstore_get_records(void) +void pstore_get_records(int quiet) { struct pstore_info *psi = psinfo; + char *buf = NULL; ssize_t size; u64 id; enum pstore_type_id type; @@ -172,50 +206,42 @@ void pstore_get_records(void) if (!psi) return; - mutex_lock(&psinfo->buf_mutex); - rc = psi->open(psi); - if (rc) + mutex_lock(&psi->read_mutex); + if (psi->open && psi->open(psi)) goto out; - while ((size = psi->read(&id, &type, &time, psi)) > 0) { - if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, - time, psi)) + while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) { + rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size, + time, psi); + kfree(buf); + buf = NULL; + if (rc && (rc != -EEXIST || !quiet)) failed++; } - psi->close(psi); + if (psi->close) + psi->close(psi); out: - mutex_unlock(&psinfo->buf_mutex); + mutex_unlock(&psi->read_mutex); if (failed) printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", failed, psi->name); } -/* - * Call platform driver to write a record to the - * persistent store. - */ -int pstore_write(enum pstore_type_id type, char *buf, size_t size) +static void pstore_dowork(struct work_struct *work) { - u64 id; - - if (!psinfo) - return -ENODEV; - - if (size > psinfo->bufsize) - return -EFBIG; + pstore_get_records(1); +} - mutex_lock(&psinfo->buf_mutex); - memcpy(psinfo->buf, buf, size); - id = psinfo->write(type, 0, size, psinfo); - if (pstore_is_mounted()) - pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, - size, CURRENT_TIME, psinfo); - mutex_unlock(&psinfo->buf_mutex); +static void pstore_timefunc(unsigned long dummy) +{ + if (pstore_new_entry) { + pstore_new_entry = 0; + schedule_work(&pstore_work); + } - return 0; + mod_timer(&pstore_timer, jiffies + PSTORE_INTERVAL); } -EXPORT_SYMBOL_GPL(pstore_write); module_param(backend, charp, 0444); MODULE_PARM_DESC(backend, "Pstore backend to use"); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 2b06466..2bfd987 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -199,12 +199,13 @@ static const char *qnx4_checkroot(struct super_block *sb) if (!strcmp(rootdir->di_fname, QNX4_BMNAME)) { found = 1; - qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL ); + qnx4_sb(sb)->BitMap = kmemdup(rootdir, + sizeof(struct qnx4_inode_entry), + GFP_KERNEL); if (!qnx4_sb(sb)->BitMap) { brelse (bh); return "not enough memory for bitmap inode"; - } - memcpy( qnx4_sb(sb)->BitMap, rootdir, sizeof( struct qnx4_inode_entry ) ); /* keep bitmap inode known */ + }/* keep bitmap inode known */ break; } } @@ -379,7 +380,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->di_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->di_uid); inode->i_gid = (gid_t)le16_to_cpu(raw_inode->di_gid); - inode->i_nlink = le16_to_cpu(raw_inode->di_nlink); + set_nlink(inode, le16_to_cpu(raw_inode->di_nlink)); inode->i_size = le32_to_cpu(raw_inode->di_size); inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); inode->i_mtime.tv_nsec = 0; @@ -427,7 +428,6 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb) static void qnx4_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 5b572c8..5ec59b2 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -73,7 +73,6 @@ #include <linux/security.h> #include <linux/kmod.h> #include <linux/namei.h> -#include <linux/buffer_head.h> #include <linux/capability.h> #include <linux/quotaops.h> #include "../internal.h" /* ugh */ @@ -2199,7 +2198,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id, if (error) return error; /* Quota file not on the same filesystem? */ - if (path->mnt->mnt_sb != sb) + if (path->dentry->d_sb != sb) error = -EXDEV; else error = vfs_load_quota_inode(path->dentry->d_inode, type, diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 10b6be3..7898cd6 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -13,7 +13,6 @@ #include <linux/kernel.h> #include <linux/security.h> #include <linux/syscalls.h> -#include <linux/buffer_head.h> #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/types.h> @@ -286,7 +285,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, /* caller already holds s_umount */ if (sb->s_flags & MS_RDONLY) return -EROFS; - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, WB_REASON_SYNC); return 0; default: return -EINVAL; @@ -363,12 +362,15 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, } sb = quotactl_block(special); - if (IS_ERR(sb)) - return PTR_ERR(sb); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto out; + } ret = do_quotactl(sb, type, cmds, id, addr, pathp); drop_super(sb); +out: if (pathp && !IS_ERR(pathp)) path_put(pathp); return ret; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index eacb166..aec766a 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -23,7 +23,6 @@ * caches is sufficient. */ -#include <linux/module.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> @@ -53,7 +52,7 @@ static struct backing_dev_info ramfs_backing_dev_info = { }; struct inode *ramfs_get_inode(struct super_block *sb, - const struct inode *dir, int mode, dev_t dev) + const struct inode *dir, umode_t mode, dev_t dev) { struct inode * inode = new_inode(sb); @@ -93,7 +92,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, */ /* SMP-safe */ static int -ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); int error = -ENOSPC; @@ -107,7 +106,7 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) return error; } -static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) { int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0); if (!retval) @@ -115,7 +114,7 @@ static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode) return retval; } -static int ramfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return ramfs_mknod(dir, dentry, mode | S_IFREG, 0); } @@ -288,14 +287,7 @@ static int __init init_ramfs_fs(void) { return register_filesystem(&ramfs_fs_type); } - -static void __exit exit_ramfs_fs(void) -{ - unregister_filesystem(&ramfs_fs_type); -} - module_init(init_ramfs_fs) -module_exit(exit_ramfs_fs) int __init init_rootfs(void) { @@ -311,5 +303,3 @@ int __init init_rootfs(void) return err; } - -MODULE_LICENSE("GPL"); diff --git a/fs/read_write.c b/fs/read_write.c index 179f1c3..5ad4248 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -35,23 +35,45 @@ static inline int unsigned_offsets(struct file *file) return file->f_mode & FMODE_UNSIGNED_OFFSET; } +static loff_t lseek_execute(struct file *file, struct inode *inode, + loff_t offset, loff_t maxsize) +{ + if (offset < 0 && !unsigned_offsets(file)) + return -EINVAL; + if (offset > maxsize) + return -EINVAL; + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + return offset; +} + /** - * generic_file_llseek_unlocked - lockless generic llseek implementation + * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @origin: type of seek + * @size: max size of file system + * + * This is a variant of generic_file_llseek that allows passing in a custom + * file size. * - * Updates the file offset to the value specified by @offset and @origin. - * Locking must be provided by the caller. + * Synchronization: + * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) + * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. + * read/writes behave like SEEK_SET against seeks. */ loff_t -generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) +generic_file_llseek_size(struct file *file, loff_t offset, int origin, + loff_t maxsize) { struct inode *inode = file->f_mapping->host; switch (origin) { case SEEK_END: - offset += inode->i_size; + offset += i_size_read(inode); break; case SEEK_CUR: /* @@ -62,14 +84,22 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) */ if (offset == 0) return file->f_pos; - offset += file->f_pos; - break; + /* + * f_lock protects against read/modify/write race with other + * SEEK_CURs. Note that parallel writes and reads behave + * like SEEK_SET. + */ + spin_lock(&file->f_lock); + offset = lseek_execute(file, inode, file->f_pos + offset, + maxsize); + spin_unlock(&file->f_lock); + return offset; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ - if (offset >= inode->i_size) + if (offset >= i_size_read(inode)) return -ENXIO; break; case SEEK_HOLE: @@ -77,26 +107,15 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ - if (offset >= inode->i_size) + if (offset >= i_size_read(inode)) return -ENXIO; - offset = inode->i_size; + offset = i_size_read(inode); break; } - if (offset < 0 && !unsigned_offsets(file)) - return -EINVAL; - if (offset > inode->i_sb->s_maxbytes) - return -EINVAL; - - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - - return offset; + return lseek_execute(file, inode, offset, maxsize); } -EXPORT_SYMBOL(generic_file_llseek_unlocked); +EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_file_llseek - generic llseek implementation for regular files @@ -110,13 +129,10 @@ EXPORT_SYMBOL(generic_file_llseek_unlocked); */ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) { - loff_t rval; - - mutex_lock(&file->f_dentry->d_inode->i_mutex); - rval = generic_file_llseek_unlocked(file, offset, origin); - mutex_unlock(&file->f_dentry->d_inode->i_mutex); + struct inode *inode = file->f_mapping->host; - return rval; + return generic_file_llseek_size(file, offset, origin, + inode->i_sb->s_maxbytes); } EXPORT_SYMBOL(generic_file_llseek); @@ -617,7 +633,8 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) + struct iovec **ret_pointer, + int check_access) { unsigned long seg; ssize_t ret; @@ -673,7 +690,8 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, ret = -EINVAL; goto out; } - if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { + if (check_access + && unlikely(!access_ok(vrfy_dir(type), buf, len))) { ret = -EFAULT; goto out; } @@ -705,7 +723,7 @@ static ssize_t do_readv_writev(int type, struct file *file, } ret = rw_copy_check_uvector(type, uvector, nr_segs, - ARRAY_SIZE(iovstack), iovstack, &iov); + ARRAY_SIZE(iovstack), iovstack, &iov, 1); if (ret <= 0) goto out; diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index d1aca1df..a945cd2 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -13,6 +13,7 @@ #include <linux/reiserfs_fs_sb.h> #include <linux/reiserfs_fs_i.h> #include <linux/quotaops.h> +#include <linux/seq_file.h> #define PREALLOCATION_SIZE 9 @@ -634,6 +635,96 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options) return 0; } +static void print_sep(struct seq_file *seq, int *first) +{ + if (!*first) + seq_puts(seq, ":"); + else + *first = 0; +} + +void show_alloc_options(struct seq_file *seq, struct super_block *s) +{ + int first = 1; + + if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) | + (1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups))) + return; + + seq_puts(seq, ",alloc="); + + if (TEST_OPTION(concentrating_formatted_nodes, s)) { + print_sep(seq, &first); + if (REISERFS_SB(s)->s_alloc_options.border != 10) { + seq_printf(seq, "concentrating_formatted_nodes=%d", + 100 / REISERFS_SB(s)->s_alloc_options.border); + } else + seq_puts(seq, "concentrating_formatted_nodes"); + } + if (TEST_OPTION(displacing_large_files, s)) { + print_sep(seq, &first); + if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) { + seq_printf(seq, "displacing_large_files=%lu", + REISERFS_SB(s)->s_alloc_options.large_file_size); + } else + seq_puts(seq, "displacing_large_files"); + } + if (TEST_OPTION(displacing_new_packing_localities, s)) { + print_sep(seq, &first); + seq_puts(seq, "displacing_new_packing_localities"); + } + if (TEST_OPTION(old_hashed_relocation, s)) { + print_sep(seq, &first); + seq_puts(seq, "old_hashed_relocation"); + } + if (TEST_OPTION(new_hashed_relocation, s)) { + print_sep(seq, &first); + seq_puts(seq, "new_hashed_relocation"); + } + if (TEST_OPTION(dirid_groups, s)) { + print_sep(seq, &first); + seq_puts(seq, "dirid_groups"); + } + if (TEST_OPTION(oid_groups, s)) { + print_sep(seq, &first); + seq_puts(seq, "oid_groups"); + } + if (TEST_OPTION(packing_groups, s)) { + print_sep(seq, &first); + seq_puts(seq, "packing_groups"); + } + if (TEST_OPTION(hashed_formatted_nodes, s)) { + print_sep(seq, &first); + seq_puts(seq, "hashed_formatted_nodes"); + } + if (TEST_OPTION(skip_busy, s)) { + print_sep(seq, &first); + seq_puts(seq, "skip_busy"); + } + if (TEST_OPTION(hundredth_slices, s)) { + print_sep(seq, &first); + seq_puts(seq, "hundredth_slices"); + } + if (TEST_OPTION(old_way, s)) { + print_sep(seq, &first); + seq_puts(seq, "old_way"); + } + if (TEST_OPTION(displace_based_on_dirid, s)) { + print_sep(seq, &first); + seq_puts(seq, "displace_based_on_dirid"); + } + if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) { + print_sep(seq, &first); + seq_printf(seq, "preallocmin=%d", + REISERFS_SB(s)->s_alloc_options.preallocmin); + } + if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) { + print_sep(seq, &first); + seq_printf(seq, "preallocsize=%d", + REISERFS_SB(s)->s_alloc_options.preallocsize); + } +} + static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint) { char *hash_in; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 9b0d4b7..9e8cd5a 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1154,7 +1154,7 @@ static void init_inode(struct inode *inode, struct treepath *path) set_inode_item_key_version(inode, KEY_FORMAT_3_5); set_inode_sd_version(inode, STAT_DATA_V1); inode->i_mode = sd_v1_mode(sd); - inode->i_nlink = sd_v1_nlink(sd); + set_nlink(inode, sd_v1_nlink(sd)); inode->i_uid = sd_v1_uid(sd); inode->i_gid = sd_v1_gid(sd); inode->i_size = sd_v1_size(sd); @@ -1199,7 +1199,7 @@ static void init_inode(struct inode *inode, struct treepath *path) struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); inode->i_mode = sd_v2_mode(sd); - inode->i_nlink = sd_v2_nlink(sd); + set_nlink(inode, sd_v2_nlink(sd)); inode->i_uid = sd_v2_uid(sd); inode->i_size = sd_v2_size(sd); inode->i_gid = sd_v2_gid(sd); @@ -1444,7 +1444,7 @@ void reiserfs_read_locked_inode(struct inode *inode, /* a stale NFS handle can trigger this without it being an error */ pathrelse(&path_to_sd); reiserfs_make_bad_inode(inode); - inode->i_nlink = 0; + clear_nlink(inode); return; } @@ -1766,7 +1766,7 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i for the fresh inode. This can only be done outside a transaction, so if we return non-zero, we also end the transaction. */ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, - struct inode *dir, int mode, const char *symname, + struct inode *dir, umode_t mode, const char *symname, /* 0 for regular, EMTRY_DIR_SIZE for dirs, strlen (symname) for symlinks) */ loff_t i_size, struct dentry *dentry, @@ -1832,7 +1832,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, #endif /* fill stat data */ - inode->i_nlink = (S_ISDIR(mode) ? 2 : 1); + set_nlink(inode, (S_ISDIR(mode) ? 2 : 1)); /* uid and gid must already be set by the caller for quota init */ @@ -1987,7 +1987,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, make_bad_inode(inode); out_inserted_sd: - inode->i_nlink = 0; + clear_nlink(inode); th->t_trans_id = 0; /* so the caller can't use this handle later */ unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ iput(inode); diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 4e15305..950e3d1 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -55,7 +55,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) break; } - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) break; @@ -96,7 +96,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setflags_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); break; } case REISERFS_IOC_GETVERSION: @@ -107,7 +107,7 @@ setflags_out: err = -EPERM; break; } - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) break; if (get_user(inode->i_generation, (int __user *)arg)) { @@ -117,7 +117,7 @@ setflags_out: inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setversion_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); break; default: err = -ENOTTY; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index a159ba5..eb71106 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -291,14 +291,13 @@ int reiserfs_allocate_list_bitmaps(struct super_block *sb, for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { jb = jb_array + i; jb->journal_list = NULL; - jb->bitmaps = vmalloc(mem); + jb->bitmaps = vzalloc(mem); if (!jb->bitmaps) { reiserfs_warning(sb, "clm-2000", "unable to " "allocate bitmaps for journal lists"); failed = 1; break; } - memset(jb->bitmaps, 0, mem); } if (failed) { free_list_bitmaps(sb, jb_array); @@ -353,11 +352,10 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) if (num_cnodes <= 0) { return NULL; } - head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); + head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); if (!head) { return NULL; } - memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)); head[0].prev = NULL; head[0].next = head + 1; for (i = 1; i < num_cnodes; i++) { @@ -2685,14 +2683,13 @@ int journal_init(struct super_block *sb, const char *j_dev_name, * dependency inversion warnings. */ reiserfs_write_unlock(sb); - journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); + journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); if (!journal) { reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); reiserfs_write_lock(sb); return 1; } - memset(journal, 0, sizeof(struct reiserfs_journal)); INIT_LIST_HEAD(&journal->j_bitmap_nodes); INIT_LIST_HEAD(&journal->j_prealloc_list); INIT_LIST_HEAD(&journal->j_working_list); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index ef39232..1463788 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -19,7 +19,7 @@ #include <linux/reiserfs_xattr.h> #include <linux/quotaops.h> -#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } +#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); } #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); // directory item contains array of entry headers. This performs @@ -559,7 +559,7 @@ static int drop_new_inode(struct inode *inode) ** outside of a transaction, so we had to pull some bits of ** reiserfs_new_inode out into this func. */ -static int new_inode_init(struct inode *inode, struct inode *dir, int mode) +static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) { /* Make inode invalid - just in case we are going to drop it before * the initialization happens */ @@ -572,7 +572,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode) return 0; } -static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int retval; @@ -622,7 +622,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); reiserfs_update_sd(&th, inode); err = journal_end(&th, dir->i_sb, jbegin_count); if (err) @@ -643,7 +643,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, return retval; } -static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { int retval; @@ -702,7 +702,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); reiserfs_update_sd(&th, inode); err = journal_end(&th, dir->i_sb, jbegin_count); if (err) @@ -721,7 +721,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, return retval; } -static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int retval; struct inode *inode; @@ -787,7 +787,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink = 0; + clear_nlink(inode); DEC_DIR_INODE_NLINK(dir); reiserfs_update_sd(&th, inode); err = journal_end(&th, dir->i_sb, jbegin_count); @@ -964,7 +964,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) reiserfs_warning(inode->i_sb, "reiserfs-7042", "deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } drop_nlink(inode); @@ -1086,7 +1086,7 @@ static int reiserfs_symlink(struct inode *parent_dir, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); reiserfs_update_sd(&th, inode); err = journal_end(&th, parent_dir->i_sb, jbegin_count); if (err) @@ -1129,7 +1129,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) { - inode->i_nlink--; + drop_nlink(inode); reiserfs_write_unlock(dir->i_sb); return retval; } @@ -1144,7 +1144,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, if (retval) { int err; - inode->i_nlink--; + drop_nlink(inode); err = journal_end(&th, dir->i_sb, jbegin_count); reiserfs_write_unlock(dir->i_sb); return err ? err : retval; diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index b6b9b1f..7483279 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -111,15 +111,13 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) /* allocate additional bitmap blocks, reallocate array of bitmap * block pointers */ bitmap = - vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); + vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); if (!bitmap) { /* Journal bitmaps are still supersized, but the memory isn't * leaked, so I guess it's ok */ printk("reiserfs_resize: unable to allocate memory.\n"); return -ENOMEM; } - memset(bitmap, 0, - sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); for (i = 0; i < bmap_nr; i++) bitmap[i] = old_bitmap[i]; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 14363b9..1d42e70 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -28,6 +28,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/crc32.h> +#include <linux/seq_file.h> struct file_system_type reiserfs_fs_type; @@ -61,6 +62,7 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) static int reiserfs_remount(struct super_block *s, int *flags, char *data); static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); +void show_alloc_options(struct seq_file *seq, struct super_block *s); static int reiserfs_sync_fs(struct super_block *s, int wait) { @@ -453,16 +455,20 @@ int remove_save_link(struct inode *inode, int truncate) static void reiserfs_kill_sb(struct super_block *s) { if (REISERFS_SB(s)) { - if (REISERFS_SB(s)->xattr_root) { - d_invalidate(REISERFS_SB(s)->xattr_root); - dput(REISERFS_SB(s)->xattr_root); - REISERFS_SB(s)->xattr_root = NULL; - } - if (REISERFS_SB(s)->priv_root) { - d_invalidate(REISERFS_SB(s)->priv_root); - dput(REISERFS_SB(s)->priv_root); - REISERFS_SB(s)->priv_root = NULL; - } + /* + * Force any pending inode evictions to occur now. Any + * inodes to be removed that have extended attributes + * associated with them need to clean them up before + * we can release the extended attribute root dentries. + * shrink_dcache_for_umount will BUG if we don't release + * those before it's called so ->put_super is too late. + */ + shrink_dcache_sb(s); + + dput(REISERFS_SB(s)->xattr_root); + REISERFS_SB(s)->xattr_root = NULL; + dput(REISERFS_SB(s)->priv_root); + REISERFS_SB(s)->priv_root = NULL; } kill_block_super(s); @@ -532,7 +538,6 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb) static void reiserfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); } @@ -597,6 +602,82 @@ out: reiserfs_write_unlock_once(inode->i_sb, lock_depth); } +static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *s = root->d_sb; + struct reiserfs_journal *journal = SB_JOURNAL(s); + long opts = REISERFS_SB(s)->s_mount_opt; + + if (opts & (1 << REISERFS_LARGETAIL)) + seq_puts(seq, ",tails=on"); + else if (!(opts & (1 << REISERFS_SMALLTAIL))) + seq_puts(seq, ",notail"); + /* tails=small is default so we don't show it */ + + if (!(opts & (1 << REISERFS_BARRIER_FLUSH))) + seq_puts(seq, ",barrier=none"); + /* barrier=flush is default so we don't show it */ + + if (opts & (1 << REISERFS_ERROR_CONTINUE)) + seq_puts(seq, ",errors=continue"); + else if (opts & (1 << REISERFS_ERROR_PANIC)) + seq_puts(seq, ",errors=panic"); + /* errors=ro is default so we don't show it */ + + if (opts & (1 << REISERFS_DATA_LOG)) + seq_puts(seq, ",data=journal"); + else if (opts & (1 << REISERFS_DATA_WRITEBACK)) + seq_puts(seq, ",data=writeback"); + /* data=ordered is default so we don't show it */ + + if (opts & (1 << REISERFS_ATTRS)) + seq_puts(seq, ",attrs"); + + if (opts & (1 << REISERFS_XATTRS_USER)) + seq_puts(seq, ",user_xattr"); + + if (opts & (1 << REISERFS_EXPOSE_PRIVROOT)) + seq_puts(seq, ",expose_privroot"); + + if (opts & (1 << REISERFS_POSIXACL)) + seq_puts(seq, ",acl"); + + if (REISERFS_SB(s)->s_jdev) + seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); + + if (journal->j_max_commit_age != journal->j_default_max_commit_age) + seq_printf(seq, ",commit=%d", journal->j_max_commit_age); + +#ifdef CONFIG_QUOTA + if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); + else if (opts & (1 << REISERFS_USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); + else if (opts & (1 << REISERFS_GRPQUOTA)) + seq_puts(seq, ",grpquota"); + if (REISERFS_SB(s)->s_jquota_fmt) { + if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_OLD) + seq_puts(seq, ",jqfmt=vfsold"); + else if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_V0) + seq_puts(seq, ",jqfmt=vfsv0"); + } +#endif + + /* Block allocator options */ + if (opts & (1 << REISERFS_NO_BORDER)) + seq_puts(seq, ",block-allocator=noborder"); + if (opts & (1 << REISERFS_NO_UNHASHED_RELOCATION)) + seq_puts(seq, ",block-allocator=no_unhashed_relocation"); + if (opts & (1 << REISERFS_HASHED_RELOCATION)) + seq_puts(seq, ",block-allocator=hashed_relocation"); + if (opts & (1 << REISERFS_TEST4)) + seq_puts(seq, ",block-allocator=test4"); + show_alloc_options(seq, s); + return 0; +} + #ifdef CONFIG_QUOTA static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t); @@ -617,7 +698,7 @@ static const struct super_operations reiserfs_sops = { .unfreeze_fs = reiserfs_unfreeze, .statfs = reiserfs_statfs, .remount_fs = reiserfs_remount, - .show_options = generic_show_options, + .show_options = reiserfs_show_options, #ifdef CONFIG_QUOTA .quota_read = reiserfs_quota_read, .quota_write = reiserfs_quota_write, @@ -915,9 +996,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin {"jdev",.arg_required = 'j',.values = NULL}, {"nolargeio",.arg_required = 'w',.values = NULL}, {"commit",.arg_required = 'c',.values = NULL}, - {"usrquota",.setmask = 1 << REISERFS_QUOTA}, - {"grpquota",.setmask = 1 << REISERFS_QUOTA}, - {"noquota",.clrmask = 1 << REISERFS_QUOTA}, + {"usrquota",.setmask = 1 << REISERFS_USRQUOTA}, + {"grpquota",.setmask = 1 << REISERFS_GRPQUOTA}, + {"noquota",.clrmask = 1 << REISERFS_USRQUOTA | 1 << REISERFS_GRPQUOTA}, {"errors",.arg_required = 'e',.values = error_actions}, {"usrjquota",.arg_required = 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, @@ -1031,12 +1112,19 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin return 0; } strcpy(qf_names[qtype], arg); - *mount_options |= 1 << REISERFS_QUOTA; + if (qtype == USRQUOTA) + *mount_options |= 1 << REISERFS_USRQUOTA; + else + *mount_options |= 1 << REISERFS_GRPQUOTA; } else { if (qf_names[qtype] != REISERFS_SB(s)->s_qf_names[qtype]) kfree(qf_names[qtype]); qf_names[qtype] = NULL; + if (qtype == USRQUOTA) + *mount_options &= ~(1 << REISERFS_USRQUOTA); + else + *mount_options &= ~(1 << REISERFS_GRPQUOTA); } } if (c == 'f') { @@ -1075,9 +1163,10 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "journaled quota format not specified."); return 0; } - /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ - if (!(*mount_options & (1 << REISERFS_QUOTA)) - && sb_any_quota_loaded(s)) { + if ((!(*mount_options & (1 << REISERFS_USRQUOTA)) && + sb_has_quota_loaded(s, USRQUOTA)) || + (!(*mount_options & (1 << REISERFS_GRPQUOTA)) && + sb_has_quota_loaded(s, GRPQUOTA))) { reiserfs_warning(s, "super-6516", "quota options must " "be present when quota is turned on."); return 0; @@ -1164,7 +1253,8 @@ static void handle_quota_files(struct super_block *s, char **qf_names, kfree(REISERFS_SB(s)->s_qf_names[i]); REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; } - REISERFS_SB(s)->s_jquota_fmt = *qfmt; + if (*qfmt) + REISERFS_SB(s)->s_jquota_fmt = *qfmt; } #endif @@ -1225,7 +1315,8 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) safe_mask |= 1 << REISERFS_ERROR_RO; safe_mask |= 1 << REISERFS_ERROR_CONTINUE; safe_mask |= 1 << REISERFS_ERROR_PANIC; - safe_mask |= 1 << REISERFS_QUOTA; + safe_mask |= 1 << REISERFS_USRQUOTA; + safe_mask |= 1 << REISERFS_GRPQUOTA; /* Update the bitmask, taking care to keep * the bits we're not allowed to change here */ @@ -1672,6 +1763,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) &commit_max_age, qf_names, &qfmt) == 0) { goto error; } + if (jdev_name && jdev_name[0]) { + REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL); + if (!REISERFS_SB(s)->s_jdev) { + SWARN(silent, s, "", "Cannot allocate memory for " + "journal device name"); + goto error; + } + } #ifdef CONFIG_QUOTA handle_quota_files(s, qf_names, &qfmt); #endif @@ -2054,12 +2153,13 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, int err; struct inode *inode; struct reiserfs_transaction_handle th; + int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA; - if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) + if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) return -EINVAL; /* Quotafile not on the same filesystem? */ - if (path->mnt->mnt_sb != sb) { + if (path->dentry->d_sb != sb) { err = -EXDEV; goto out; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 6bc346c..c24deda 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -66,7 +66,7 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) } #endif -static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { BUG_ON(!mutex_is_locked(&dir->i_mutex)); return dir->i_op->mkdir(dir, dentry, mode); diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index ef66c18..534668f 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -66,8 +66,8 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, if (IS_PRIVATE(dir)) return 0; - error = security_inode_init_security(inode, dir, qstr, &sec->name, - &sec->value, &sec->length); + error = security_old_inode_init_security(inode, dir, qstr, &sec->name, + &sec->value, &sec->length); if (error) { if (error == -EOPNOTSUPP) error = 0; diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index eed9942..e1a7779 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -28,9 +28,10 @@ static unsigned long romfs_get_unmapped_area(struct file *file, struct inode *inode = file->f_mapping->host; struct mtd_info *mtd = inode->i_sb->s_mtd; unsigned long isize, offset, maxpages, lpages; + int ret; if (!mtd) - goto cant_map_directly; + return (unsigned long) -ENOSYS; /* the mapping mustn't extend beyond the EOF */ lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -41,23 +42,20 @@ static unsigned long romfs_get_unmapped_area(struct file *file, if ((pgoff >= maxpages) || (maxpages - pgoff < lpages)) return (unsigned long) -EINVAL; - /* we need to call down to the MTD layer to do the actual mapping */ - if (mtd->get_unmapped_area) { - if (addr != 0) - return (unsigned long) -EINVAL; - - if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) - return (unsigned long) -EINVAL; + if (addr != 0) + return (unsigned long) -EINVAL; - offset += ROMFS_I(inode)->i_dataoffset; - if (offset > mtd->size - len) - return (unsigned long) -EINVAL; + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; - return mtd->get_unmapped_area(mtd, len, offset, flags); - } + offset += ROMFS_I(inode)->i_dataoffset; + if (offset > mtd->size - len) + return (unsigned long) -EINVAL; -cant_map_directly: - return (unsigned long) -ENOSYS; + ret = mtd_get_unmapped_area(mtd, len, offset, flags); + if (ret == -EOPNOTSUPP) + ret = -ENOSYS; + return (unsigned long) ret; } /* diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 2305e31..bb36ab7 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -337,7 +337,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; inode->i_dataoffset = pos + inode->i_metasize; - i->i_nlink = 1; /* Hard to decide.. */ + set_nlink(i, 1); /* Hard to decide.. */ i->i_size = be32_to_cpu(ri.size); i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; @@ -403,7 +403,6 @@ static struct inode *romfs_alloc_inode(struct super_block *sb) static void romfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } diff --git a/fs/seq_file.c b/fs/seq_file.c index 05d6b0e..4023d6b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -397,7 +397,7 @@ EXPORT_SYMBOL(seq_printf); * Returns pointer past last written character in @s, or NULL in case of * failure. */ -char *mangle_path(char *s, char *p, char *esc) +char *mangle_path(char *s, const char *p, const char *esc) { while (s <= p) { char c = *p++; @@ -427,7 +427,7 @@ EXPORT_SYMBOL(mangle_path); * return the absolute path of 'path', as represented by the * dentry / mnt pair in the path parameter. */ -int seq_path(struct seq_file *m, struct path *path, char *esc) +int seq_path(struct seq_file *m, const struct path *path, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); @@ -449,11 +449,9 @@ EXPORT_SYMBOL(seq_path); /* * Same as seq_path, but relative to supplied root. - * - * root may be changed, see __d_path(). */ -int seq_path_root(struct seq_file *m, struct path *path, struct path *root, - char *esc) +int seq_path_root(struct seq_file *m, const struct path *path, + const struct path *root, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); @@ -463,6 +461,8 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, char *p; p = __d_path(path, root, buf, size); + if (!p) + return SEQ_SKIP; res = PTR_ERR(p); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); @@ -474,13 +474,13 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, } seq_commit(m, res); - return res < 0 ? res : 0; + return res < 0 && res != -ENAMETOOLONG ? res : 0; } /* * returns the path of the 'dentry' from the root of its filesystem. */ -int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) +int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); diff --git a/fs/splice.c b/fs/splice.c index fa2defa..1ec0493 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -25,7 +25,6 @@ #include <linux/mm_inline.h> #include <linux/swap.h> #include <linux/writeback.h> -#include <linux/buffer_head.h> #include <linux/module.h> #include <linux/syscalls.h> #include <linux/uio.h> diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 1360d4f..c70111e 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -19,9 +19,9 @@ config SQUASHFS If you want to compile this as a module ( = code which can be inserted in and removed from the running kernel whenever you want), - say M here and read <file:Documentation/modules.txt>. The module - will be called squashfs. Note that the root file system (the one - containing the directory /) cannot be compiled as a module. + say M here. The module will be called squashfs. Note that the root + file system (the one containing the directory /) cannot be compiled + as a module. If unsure, say N. @@ -78,6 +78,28 @@ config SQUASHFS_XZ If unsure, say N. +config SQUASHFS_4K_DEVBLK_SIZE + bool "Use 4K device block size?" + depends on SQUASHFS + help + By default Squashfs sets the dev block size (sb_min_blocksize) + to 1K or the smallest block size supported by the block device + (if larger). This, because blocks are packed together and + unaligned in Squashfs, should reduce latency. + + This, however, gives poor performance on MTD NAND devices where + the optimal I/O size is 4K (even though the devices can support + smaller block sizes). + + Using a 4K device block size may also improve overall I/O + performance for some file access patterns (e.g. sequential + accesses of files in filesystem order) on all media. + + Setting this option will force Squashfs to use a 4K device block + size by default. + + If unsure, say N. + config SQUASHFS_EMBEDDED bool "Additional option for memory-constrained systems" depends on SQUASHFS diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 04bebca..fd7b3b3 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -159,7 +159,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) frag_offset = 0; } - inode->i_nlink = 1; + set_nlink(inode, 1); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; @@ -203,7 +203,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) } xattr_id = le32_to_cpu(sqsh_ino->xattr); - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le64_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; @@ -232,7 +232,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le16_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; @@ -257,7 +257,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) goto failed_read; xattr_id = le32_to_cpu(sqsh_ino->xattr); - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; @@ -284,7 +284,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); inode->i_op = &squashfs_symlink_inode_ops; inode->i_data.a_ops = &squashfs_symlink_aops; @@ -325,7 +325,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFCHR; else inode->i_mode |= S_IFBLK; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); @@ -349,7 +349,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFBLK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); @@ -370,7 +370,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFIFO; else inode->i_mode |= S_IFSOCK; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } @@ -389,7 +389,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFSOCK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; - inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index b4a4e53..e8e1464 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -36,6 +36,13 @@ #define SQUASHFS_FILE_SIZE 131072 #define SQUASHFS_FILE_LOG 17 +/* default size of block device I/O */ +#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE +#define SQUASHFS_DEVBLK_SIZE 4096 +#else +#define SQUASHFS_DEVBLK_SIZE 1024 +#endif + #define SQUASHFS_FILE_MAX_SIZE 1048576 #define SQUASHFS_FILE_MAX_LOG 20 diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 7438850..d0858c2 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) } msblk = sb->s_fs_info; - msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); + msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->read_data_mutex); @@ -464,7 +464,6 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb) static void squashfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); } @@ -71,6 +71,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) dest->i_ctime = src->i_ctime; dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; - dest->i_nlink = src->i_nlink; + set_nlink(dest, src->i_nlink); } EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); @@ -294,15 +294,16 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, { struct path path; int error; + int empty = 0; if (bufsiz <= 0) return -EINVAL; - error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path); + error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty); if (!error) { struct inode *inode = path.dentry->d_inode; - error = -EINVAL; + error = empty ? -ENOENT : -EINVAL; if (inode->i_op->readlink) { error = security_inode_readlink(path.dentry); if (!error) { diff --git a/fs/statfs.c b/fs/statfs.c index 8244924..2aa6a22 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -7,6 +7,7 @@ #include <linux/statfs.h> #include <linux/security.h> #include <linux/uaccess.h> +#include "internal.h" static int flags_by_mnt(int mnt_flags) { @@ -45,7 +46,7 @@ static int calculate_f_flags(struct vfsmount *mnt) flags_by_sb(mnt->mnt_sb->s_flags); } -int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) +static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) { int retval; @@ -76,7 +77,7 @@ EXPORT_SYMBOL(vfs_statfs); int user_statfs(const char __user *pathname, struct kstatfs *st) { struct path path; - int error = user_path(pathname, &path); + int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); if (!error) { error = vfs_statfs(&path, st); path_put(&path); @@ -205,19 +206,23 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user return error; } -SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) +int vfs_ustat(dev_t dev, struct kstatfs *sbuf) { - struct super_block *s; - struct ustat tmp; - struct kstatfs sbuf; + struct super_block *s = user_get_super(dev); int err; - - s = user_get_super(new_decode_dev(dev)); if (!s) return -EINVAL; - err = statfs_by_dentry(s->s_root, &sbuf); + err = statfs_by_dentry(s->s_root, sbuf); drop_super(s); + return err; +} + +SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) +{ + struct ustat tmp; + struct kstatfs sbuf; + int err = vfs_ustat(new_decode_dev(dev), &sbuf); if (err) return err; @@ -61,7 +61,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) return -1; if (!grab_super_passive(sb)) - return -1; + return !sc->nr_to_scan ? 0 : -1; if (sb->s_op && sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb); @@ -136,12 +136,13 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_LIST_HEAD(&s->s_files); #endif s->s_bdi = &default_backing_dev_info; - INIT_LIST_HEAD(&s->s_instances); + INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); INIT_LIST_HEAD(&s->s_inode_lru); spin_lock_init(&s->s_inode_lru_lock); + INIT_LIST_HEAD(&s->s_mounts); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -200,6 +201,7 @@ static inline void destroy_super(struct super_block *s) free_percpu(s->s_files); #endif security_sb_free(s); + WARN_ON(!list_empty(&s->s_mounts)); kfree(s->s_subtype); kfree(s->s_options); kfree(s); @@ -210,7 +212,7 @@ static inline void destroy_super(struct super_block *s) /* * Drop a superblock's refcount. The caller must hold sb_lock. */ -void __put_super(struct super_block *sb) +static void __put_super(struct super_block *sb) { if (!--sb->s_count) { list_del_init(&sb->s_list); @@ -225,7 +227,7 @@ void __put_super(struct super_block *sb) * Drops a temporary reference, frees superblock if there's no * references left. */ -void put_super(struct super_block *sb) +static void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); @@ -328,7 +330,7 @@ static int grab_super(struct super_block *s) __releases(sb_lock) bool grab_super_passive(struct super_block *sb) { spin_lock(&sb_lock); - if (list_empty(&sb->s_instances)) { + if (hlist_unhashed(&sb->s_instances)) { spin_unlock(&sb_lock); return false; } @@ -337,7 +339,7 @@ bool grab_super_passive(struct super_block *sb) spin_unlock(&sb_lock); if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) return true; up_read(&sb->s_umount); } @@ -400,7 +402,7 @@ void generic_shutdown_super(struct super_block *sb) } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ - list_del_init(&sb->s_instances); + hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); } @@ -420,13 +422,14 @@ struct super_block *sget(struct file_system_type *type, void *data) { struct super_block *s = NULL; + struct hlist_node *node; struct super_block *old; int err; retry: spin_lock(&sb_lock); if (test) { - list_for_each_entry(old, &type->fs_supers, s_instances) { + hlist_for_each_entry(old, node, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (!grab_super(old)) @@ -462,7 +465,7 @@ retry: s->s_type = type; strlcpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); - list_add(&s->s_instances, &type->fs_supers); + hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); register_shrinker(&s->s_shrink); @@ -497,14 +500,14 @@ void sync_supers(void) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_op->write_super && sb->s_dirt) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root && sb->s_dirt) + if (sb->s_root && sb->s_dirt && (sb->s_flags & MS_BORN)) sb->s_op->write_super(sb); up_read(&sb->s_umount); @@ -533,13 +536,13 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) f(sb, arg); up_read(&sb->s_umount); @@ -566,14 +569,15 @@ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; + struct hlist_node *node; spin_lock(&sb_lock); - list_for_each_entry(sb, &type->fs_supers, s_instances) { + hlist_for_each_entry(sb, node, &type->fs_supers, s_instances) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) f(sb, arg); up_read(&sb->s_umount); @@ -607,14 +611,14 @@ struct super_block *get_super(struct block_device *bdev) spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); /* still alive? */ - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) return sb; up_read(&sb->s_umount); /* nope, got unmounted */ @@ -647,7 +651,7 @@ struct super_block *get_active_super(struct block_device *bdev) restart: spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_bdev == bdev) { if (grab_super(sb)) /* drops sb_lock */ @@ -667,14 +671,14 @@ struct super_block *user_get_super(dev_t dev) spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_dev == dev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); /* still alive? */ - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) return sb; up_read(&sb->s_umount); /* nope, got unmounted */ @@ -719,18 +723,29 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ if (remount_ro) { - if (force) + if (force) { mark_files_ro(sb); - else if (!fs_may_remount_ro(sb)) - return -EBUSY; + } else { + retval = sb_prepare_remount_readonly(sb); + if (retval) + return retval; + } } if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); - if (retval) - return retval; + if (retval) { + if (!force) + goto cancel_readonly; + /* If forced remount, go ahead despite any errors */ + WARN(1, "forced remount of a %s fs returned %i\n", + sb->s_type->name, retval); + } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); + /* Needs to be ordered wrt mnt_is_readonly() */ + smp_wmb(); + sb->s_readonly_remount = 0; /* * Some filesystems modify their metadata via some other path than the @@ -743,6 +758,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; + +cancel_readonly: + sb->s_readonly_remount = 0; + return retval; } static void do_emergency_remount(struct work_struct *work) @@ -751,12 +770,13 @@ static void do_emergency_remount(struct work_struct *work) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; sb->s_count++; spin_unlock(&sb_lock); down_write(&sb->s_umount); - if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { + if (sb->s_root && sb->s_bdev && (sb->s_flags & MS_BORN) && + !(sb->s_flags & MS_RDONLY)) { /* * What lock protects sb->s_flags?? */ @@ -1139,6 +1159,11 @@ int freeze_super(struct super_block *sb) return -EBUSY; } + if (!(sb->s_flags & MS_BORN)) { + up_write(&sb->s_umount); + return 0; /* sic - it's "nothing to do" */ + } + if (sb->s_flags & MS_RDONLY) { sb->s_frozen = SB_FREEZE_TRANS; smp_wmb(); @@ -14,7 +14,6 @@ #include <linux/linkage.h> #include <linux/pagemap.h> #include <linux/quotaops.h> -#include <linux/buffer_head.h> #include <linux/backing-dev.h> #include "internal.h" @@ -43,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (wait) sync_inodes_sb(sb); else - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, WB_REASON_SYNC); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); @@ -98,7 +97,7 @@ static void sync_filesystems(int wait) */ SYSCALL_DEFINE0(sync) { - wakeup_flusher_threads(0); + wakeup_flusher_threads(0, WB_REASON_SYNC); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index ea9120a..7fdf6a7 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,20 +43,48 @@ static DEFINE_IDA(sysfs_ino_ida); static void sysfs_link_sibling(struct sysfs_dirent *sd) { struct sysfs_dirent *parent_sd = sd->s_parent; - struct sysfs_dirent **pos; - BUG_ON(sd->s_sibling); - - /* Store directory entries in order by ino. This allows - * readdir to properly restart without having to add a - * cursor into the s_dir.children list. - */ - for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) { - if (sd->s_ino < (*pos)->s_ino) - break; + struct rb_node **p; + struct rb_node *parent; + + if (sysfs_type(sd) == SYSFS_DIR) + parent_sd->s_dir.subdirs++; + + p = &parent_sd->s_dir.inode_tree.rb_node; + parent = NULL; + while (*p) { + parent = *p; +#define node rb_entry(parent, struct sysfs_dirent, inode_node) + if (sd->s_ino < node->s_ino) { + p = &node->inode_node.rb_left; + } else if (sd->s_ino > node->s_ino) { + p = &node->inode_node.rb_right; + } else { + printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", + (unsigned long) sd->s_ino); + BUG(); + } +#undef node } - sd->s_sibling = *pos; - *pos = sd; + rb_link_node(&sd->inode_node, parent, p); + rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree); + + p = &parent_sd->s_dir.name_tree.rb_node; + parent = NULL; + while (*p) { + int c; + parent = *p; +#define node rb_entry(parent, struct sysfs_dirent, name_node) + c = strcmp(sd->s_name, node->s_name); + if (c < 0) { + p = &node->name_node.rb_left; + } else { + p = &node->name_node.rb_right; + } +#undef node + } + rb_link_node(&sd->name_node, parent, p); + rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree); } /** @@ -71,16 +99,11 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) */ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) { - struct sysfs_dirent **pos; + if (sysfs_type(sd) == SYSFS_DIR) + sd->s_parent->s_dir.subdirs--; - for (pos = &sd->s_parent->s_dir.children; *pos; - pos = &(*pos)->s_sibling) { - if (*pos == sd) { - *pos = sd->s_sibling; - sd->s_sibling = NULL; - break; - } - } + rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree); + rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree); } /** @@ -126,7 +149,6 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) */ void sysfs_put_active(struct sysfs_dirent *sd) { - struct completion *cmpl; int v; if (unlikely(!sd)) @@ -138,10 +160,9 @@ void sysfs_put_active(struct sysfs_dirent *sd) return; /* atomic_dec_return() is a mb(), we'll always see the updated - * sd->s_sibling. + * sd->u.completion. */ - cmpl = (void *)sd->s_sibling; - complete(cmpl); + complete(sd->u.completion); } /** @@ -155,16 +176,16 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) DECLARE_COMPLETION_ONSTACK(wait); int v; - BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); + BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) return; - sd->s_sibling = (void *)&wait; + sd->u.completion = (void *)&wait; rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); /* atomic_add_return() is a mb(), put_active() will always see - * the updated sd->s_sibling. + * the updated sd->u.completion. */ v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); @@ -173,8 +194,6 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) wait_for_completion(&wait); } - sd->s_sibling = NULL; - lock_acquired(&sd->dep_map, _RET_IP_); rwsem_release(&sd->dep_map, 1, _RET_IP_); } @@ -384,6 +403,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { struct sysfs_inode_attrs *ps_iattr; + if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + sysfs_ns_type(acxt->parent_sd)? "required": "invalid", + acxt->parent_sd->s_name, sd->s_name); + return -EINVAL; + } + if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) return -EEXIST; @@ -490,7 +516,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) } sd->s_flags |= SYSFS_FLAG_REMOVED; - sd->s_sibling = acxt->removed; + sd->u.removed_list = acxt->removed; acxt->removed = sd; } @@ -514,8 +540,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) while (acxt->removed) { struct sysfs_dirent *sd = acxt->removed; - acxt->removed = sd->s_sibling; - sd->s_sibling = NULL; + acxt->removed = sd->u.removed_list; sysfs_deactivate(sd); unmap_bin_file(sd); @@ -540,15 +565,43 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, const void *ns, const unsigned char *name) { - struct sysfs_dirent *sd; + struct rb_node *p = parent_sd->s_dir.name_tree.rb_node; + struct sysfs_dirent *found = NULL; - for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { - if (ns && sd->s_ns && (sd->s_ns != ns)) - continue; - if (!strcmp(sd->s_name, name)) - return sd; + if (!!sysfs_ns_type(parent_sd) != !!ns) { + WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", + sysfs_ns_type(parent_sd)? "required": "invalid", + parent_sd->s_name, name); + return NULL; } - return NULL; + + while (p) { + int c; +#define node rb_entry(p, struct sysfs_dirent, name_node) + c = strcmp(name, node->s_name); + if (c < 0) { + p = node->name_node.rb_left; + } else if (c > 0) { + p = node->name_node.rb_right; + } else { + found = node; + p = node->name_node.rb_left; + } +#undef node + } + + if (found) { + while (found->s_ns != ns) { + p = rb_next(&found->name_node); + if (!p) + return NULL; + found = rb_entry(p, struct sysfs_dirent, name_node); + if (strcmp(name, found->s_name)) + return NULL; + } + } + + return found; } /** @@ -744,21 +797,19 @@ void sysfs_remove_subdir(struct sysfs_dirent *sd) static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) { struct sysfs_addrm_cxt acxt; - struct sysfs_dirent **pos; + struct rb_node *pos; if (!dir_sd) return; pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); sysfs_addrm_start(&acxt, dir_sd); - pos = &dir_sd->s_dir.children; - while (*pos) { - struct sysfs_dirent *sd = *pos; - + pos = rb_first(&dir_sd->s_dir.inode_tree); + while (pos) { + struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node); + pos = rb_next(pos); if (sysfs_type(sd) != SYSFS_DIR) sysfs_remove_one(&acxt, sd); - else - pos = &(*pos)->s_sibling; } sysfs_addrm_finish(&acxt); @@ -814,15 +865,13 @@ int sysfs_rename(struct sysfs_dirent *sd, sd->s_name = new_name; } - /* Remove from old parent's list and insert into new parent's list. */ - if (sd->s_parent != new_parent_sd) { - sysfs_unlink_sibling(sd); - sysfs_get(new_parent_sd); - sysfs_put(sd->s_parent); - sd->s_parent = new_parent_sd; - sysfs_link_sibling(sd); - } + /* Move to the appropriate place in the appropriate directories rbtree. */ + sysfs_unlink_sibling(sd); + sysfs_get(new_parent_sd); + sysfs_put(sd->s_parent); sd->s_ns = new_ns; + sd->s_parent = new_parent_sd; + sysfs_link_sibling(sd); error = 0; out: @@ -881,12 +930,28 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns, pos = NULL; } if (!pos && (ino > 1) && (ino < INT_MAX)) { - pos = parent_sd->s_dir.children; - while (pos && (ino > pos->s_ino)) - pos = pos->s_sibling; + struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node; + while (p) { +#define node rb_entry(p, struct sysfs_dirent, inode_node) + if (ino < node->s_ino) { + pos = node; + p = node->inode_node.rb_left; + } else if (ino > node->s_ino) { + p = node->inode_node.rb_right; + } else { + pos = node; + break; + } +#undef node + } + } + while (pos && pos->s_ns != ns) { + struct rb_node *p = rb_next(&pos->inode_node); + if (!p) + pos = NULL; + else + pos = rb_entry(p, struct sysfs_dirent, inode_node); } - while (pos && pos->s_ns && pos->s_ns != ns) - pos = pos->s_sibling; return pos; } @@ -894,10 +959,13 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) { pos = sysfs_dir_pos(ns, parent_sd, ino, pos); - if (pos) - pos = pos->s_sibling; - while (pos && pos->s_ns && pos->s_ns != ns) - pos = pos->s_sibling; + if (pos) do { + struct rb_node *p = rb_next(&pos->inode_node); + if (!p) + pos = NULL; + else + pos = rb_entry(p, struct sysfs_dirent, inode_node); + } while (pos && pos->s_ns != ns); return pos; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1ad8c93..62f4fb3 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -466,9 +466,6 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) mutex_lock(&sysfs_mutex); if (sd && dir) - /* Only directories are tagged, so no need to pass - * a tag explicitly. - */ sd = sysfs_find_dirent(sd, NULL, dir); if (sd && attr) sd = sysfs_find_dirent(sd, NULL, attr); @@ -488,17 +485,56 @@ const struct file_operations sysfs_file_operations = { .poll = sysfs_poll, }; +int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, + const void **pns) +{ + struct sysfs_dirent *dir_sd = kobj->sd; + const struct sysfs_ops *ops; + const void *ns = NULL; + int err; + + err = 0; + if (!sysfs_ns_type(dir_sd)) + goto out; + + err = -EINVAL; + if (!kobj->ktype) + goto out; + ops = kobj->ktype->sysfs_ops; + if (!ops) + goto out; + if (!ops->namespace) + goto out; + + err = 0; + ns = ops->namespace(kobj, attr); +out: + if (err) { + WARN(1, KERN_ERR "missing sysfs namespace attribute operation for " + "kobject: %s\n", kobject_name(kobj)); + } + *pns = ns; + return err; +} + int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, mode_t amode) + const struct attribute *attr, int type, umode_t amode) { umode_t mode = (amode & S_IALLUGO) | S_IFREG; struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; + const void *ns; int rc; + rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns); + if (rc) + return rc; + sd = sysfs_new_dirent(attr->name, mode, type); if (!sd) return -ENOMEM; + + sd->s_ns = ns; sd->s_attr.attr = (void *)attr; sysfs_dirent_init_lockdep(sd); @@ -582,16 +618,21 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); * */ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, - mode_t mode) + umode_t mode) { struct sysfs_dirent *sd; struct iattr newattrs; + const void *ns; int rc; + rc = sysfs_attr_ns(kobj, attr, &ns); + if (rc) + return rc; + mutex_lock(&sysfs_mutex); rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, NULL, attr->name); + sd = sysfs_find_dirent(kobj->sd, ns, attr->name); if (!sd) goto out; @@ -616,7 +657,12 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { - sysfs_hash_and_remove(kobj->sd, NULL, attr->name); + const void *ns; + + if (sysfs_attr_ns(kobj, attr, &ns)) + return; + + sysfs_hash_and_remove(kobj->sd, ns, attr->name); } void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 194414f..dd1701c 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -33,7 +33,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, int error = 0, i; for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { - mode_t mode = 0; + umode_t mode = 0; /* in update mode, we're changing the permissions or * visibility. Do this by first removing then diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e3f091a..4a802b4 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -187,7 +187,7 @@ out: return error; } -static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +static inline void set_default_inode_attr(struct inode * inode, umode_t mode) { inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -202,18 +202,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_ctime = iattr->ia_ctime; } -static int sysfs_count_nlink(struct sysfs_dirent *sd) -{ - struct sysfs_dirent *child; - int nr = 0; - - for (child = sd->s_dir.children; child; child = child->s_sibling) - if (sysfs_type(child) == SYSFS_DIR) - nr++; - - return nr + 2; -} - static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) { struct sysfs_inode_attrs *iattrs = sd->s_iattr; @@ -230,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) } if (sysfs_type(sd) == SYSFS_DIR) - inode->i_nlink = sysfs_count_nlink(sd); + set_nlink(inode, sd->s_dir.subdirs + 2); } int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) @@ -336,8 +324,6 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha sysfs_addrm_start(&acxt, dir_sd); sd = sysfs_find_dirent(dir_sd, ns, name); - if (sd && (sd->s_ns != ns)) - sd = NULL; if (sd) sysfs_remove_one(&acxt, sd); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 845ab3a..7484a36 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,14 +11,18 @@ #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/fs.h> +#include <linux/rbtree.h> struct sysfs_open_dirent; /* type-specific structures for sysfs_dirent->s_* union members */ struct sysfs_elem_dir { struct kobject *kobj; - /* children list starts here and goes through sd->s_sibling */ - struct sysfs_dirent *children; + + unsigned long subdirs; + + struct rb_root inode_tree; + struct rb_root name_tree; }; struct sysfs_elem_symlink { @@ -56,9 +60,16 @@ struct sysfs_dirent { struct lockdep_map dep_map; #endif struct sysfs_dirent *s_parent; - struct sysfs_dirent *s_sibling; const char *s_name; + struct rb_node inode_node; + struct rb_node name_node; + + union { + struct completion *completion; + struct sysfs_dirent *removed_list; + } u; + const void *s_ns; /* namespace tag */ union { struct sysfs_elem_dir s_dir; @@ -68,7 +79,7 @@ struct sysfs_dirent { }; unsigned int s_flags; - unsigned short s_mode; + umode_t s_mode; ino_t s_ino; struct sysfs_inode_attrs *s_iattr; }; @@ -218,7 +229,7 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type); int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, mode_t amode); + const struct attribute *attr, int type, umode_t amode); /* * bin.c */ diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 0c96c98..8233b02 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -132,7 +132,7 @@ void sysv_free_inode(struct inode * inode) brelse(bh); } -struct inode * sysv_new_inode(const struct inode * dir, mode_t mode) +struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct sysv_sb_info *sbi = SYSV_SB(sb); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 0630eb96..3da5ce2 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -219,7 +219,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino) inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); inode->i_uid = (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid); inode->i_gid = (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid); - inode->i_nlink = fs16_to_cpu(sbi, raw_inode->i_nlink); + set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink)); inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); @@ -336,7 +336,6 @@ static struct inode *sysv_alloc_inode(struct super_block *sb) static void sysv_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); } diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index fa8d43c..90b54b4 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -442,7 +442,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct super_block *s = mnt->mnt_sb; + struct super_block *s = dentry->d_sb; generic_fillattr(dentry->d_inode, stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index e474fbc..b217797 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -61,7 +61,7 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st return NULL; } -static int sysv_mknod(struct inode * dir, struct dentry * dentry, int mode, dev_t rdev) +static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev) { struct inode * inode; int err; @@ -80,7 +80,7 @@ static int sysv_mknod(struct inode * dir, struct dentry * dentry, int mode, dev_ return err; } -static int sysv_create(struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) +static int sysv_create(struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd) { return sysv_mknod(dir, dentry, mode, 0); } @@ -131,7 +131,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int sysv_mkdir(struct inode * dir, struct dentry *dentry, int mode) +static int sysv_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode) { struct inode * inode; int err = -EMLINK; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index bb55cdb..0e4b821 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -125,7 +125,7 @@ static inline void dirty_sb(struct super_block *sb) /* ialloc.c */ extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned, struct buffer_head **); -extern struct inode * sysv_new_inode(const struct inode *, mode_t); +extern struct inode * sysv_new_inode(const struct inode *, umode_t); extern void sysv_free_inode(struct inode *); extern unsigned long sysv_count_free_inodes(struct super_block *); diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 315de66..bc4f94b 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -63,7 +63,7 @@ static void shrink_liability(struct ubifs_info *c, int nr_to_write) { down_read(&c->vfs_sb->s_umount); - writeback_inodes_sb(c->vfs_sb); + writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE); up_read(&c->vfs_sb->s_umount); } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index eef109a..b09ba2d 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c) spin_unlock(&dbg_lock); } +void dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) +{ + struct ubifs_scan_node *snod; + + printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n", + current->pid, sleb->lnum, offs); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum, + snod->offs, snod->len); + dbg_dump_node(c, snod->node); + } +} + void dbg_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index feb361e..8d9c468 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); void dbg_dump_lprops(struct ubifs_info *c); void dbg_dump_lpt_info(struct ubifs_info *c); void dbg_dump_leb(const struct ubifs_info *c, int lnum); +void dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs); void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode); void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); @@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; } static inline void dbg_dump_leb(const struct ubifs_info *c, int lnum) { return; } static inline void +dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) { return; } +static inline void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode) { return; } static inline void dbg_dump_heap(struct ubifs_info *c, diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 6834920..d6fe1c7 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -56,7 +56,7 @@ * * This function returns the inherited flags. */ -static int inherit_flags(const struct inode *dir, int mode) +static int inherit_flags(const struct inode *dir, umode_t mode) { int flags; const struct ubifs_inode *ui = ubifs_inode(dir); @@ -86,7 +86,7 @@ static int inherit_flags(const struct inode *dir, int mode) * case of failure. */ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, - int mode) + umode_t mode) { struct inode *inode; struct ubifs_inode *ui; @@ -253,7 +253,7 @@ out: return ERR_PTR(err); } -static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode, +static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -268,7 +268,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode, * parent directory inode. */ - dbg_gen("dent '%.*s', mode %#x in dir ino %lu", + dbg_gen("dent '%.*s', mode %#hx in dir ino %lu", dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); err = ubifs_budget_space(c, &req); @@ -712,7 +712,7 @@ out_cancel: return err; } -static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; struct ubifs_inode *dir_ui = ubifs_inode(dir); @@ -725,7 +725,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode) * directory inode. */ - dbg_gen("dent '%.*s', mode %#x in dir ino %lu", + dbg_gen("dent '%.*s', mode %#hx in dir ino %lu", dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); err = ubifs_budget_space(c, &req); @@ -769,7 +769,7 @@ out_budg: } static int ubifs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct inode *inode; struct ubifs_inode *ui; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 548acf4..1a7e2d8 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -173,12 +173,12 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) * Make sure the file-system is read-write and make sure it * will not become read-only while we are changing the flags. */ - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) return err; dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags); err = setflags(inode, flags); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return err; } diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index af02790..ee4f43f 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf) } /** - * clean_an_unclean_leb - read and write a LEB to remove corruption. + * clean_an_unclean_leb - read and write a LEB to remove corruption. * @c: UBIFS file-system description object * @ucleb: unclean LEB information * @sbuf: LEB-sized buffer to use diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 93d938a..6094c5a 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c) mst->total_dirty = cpu_to_le64(tmp64); /* The indexing LEB does not contribute to dark space */ - tmp64 = (c->main_lebs - 1) * c->dark_wm; + tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm); mst->total_dark = cpu_to_le64(tmp64); mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b281212..63765d5 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -129,7 +129,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) goto out_ino; inode->i_flags |= (S_NOCMTIME | S_NOATIME); - inode->i_nlink = le32_to_cpu(ino->nlink); + set_nlink(inode, le32_to_cpu(ino->nlink)); inode->i_uid = le32_to_cpu(ino->uid); inode->i_gid = le32_to_cpu(ino->gid); inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); @@ -276,7 +276,6 @@ static void ubifs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); struct ubifs_inode *ui = ubifs_inode(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ubifs_inode_slab, ui); } @@ -420,9 +419,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) +static int ubifs_show_options(struct seq_file *s, struct dentry *root) { - struct ubifs_info *c = mnt->mnt_sb->s_fs_info; + struct ubifs_info *c = root->d_sb->s_fs_info; if (c->mount_opts.unmount_mode == 2) seq_printf(s, ",fast_unmount"); @@ -2264,19 +2263,12 @@ static int __init ubifs_init(void) return -EINVAL; } - err = register_filesystem(&ubifs_fs_type); - if (err) { - ubifs_err("cannot register file system, error %d", err); - return err; - } - - err = -ENOMEM; ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", sizeof(struct ubifs_inode), 0, SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, &inode_slab_ctor); if (!ubifs_inode_slab) - goto out_reg; + return -ENOMEM; register_shrinker(&ubifs_shrinker_info); @@ -2288,15 +2280,20 @@ static int __init ubifs_init(void) if (err) goto out_compr; + err = register_filesystem(&ubifs_fs_type); + if (err) { + ubifs_err("cannot register file system, error %d", err); + goto out_dbg; + } return 0; +out_dbg: + dbg_debugfs_exit(); out_compr: ubifs_compressors_exit(); out_shrinker: unregister_shrinker(&ubifs_shrinker_info); kmem_cache_destroy(ubifs_inode_slab); -out_reg: - unregister_filesystem(&ubifs_fs_type); return err; } /* late_initcall to let compressors initialize first */ diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 27f2255..12e9477 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1734,7 +1734,7 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, - int mode); + umode_t mode); int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 16f19f5..bf18f7a 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -558,10 +558,10 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) } ubifs_assert(inode->i_nlink == 1); - inode->i_nlink = 0; + clear_nlink(inode); err = remove_xattr(c, host, inode, &nm); if (err) - inode->i_nlink = 1; + set_nlink(inode, 1); /* If @i_nlink is 0, 'iput()' will delete the inode */ iput(inode); diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 95518a9..987585b 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -59,8 +59,8 @@ static int __load_block_bitmap(struct super_block *sb, int nr_groups = bitmap->s_nr_groups; if (block_group >= nr_groups) { - udf_debug("block_group (%d) > nr_groups (%d)\n", block_group, - nr_groups); + udf_debug("block_group (%d) > nr_groups (%d)\n", + block_group, nr_groups); } if (bitmap->s_block_bitmap[block_group]) { @@ -126,8 +126,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb, if (bloc->logicalBlockNum + count < count || (bloc->logicalBlockNum + count) > partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", - bloc->logicalBlockNum, 0, bloc->logicalBlockNum, - count, partmap->s_partition_len); + bloc->logicalBlockNum, 0, + bloc->logicalBlockNum, count, + partmap->s_partition_len); goto error_return; } @@ -155,7 +156,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb, if (udf_set_bit(bit + i, bh->b_data)) { udf_debug("bit %ld already set\n", bit + i); udf_debug("byte=%2x\n", - ((char *)bh->b_data)[(bit + i) >> 3]); + ((char *)bh->b_data)[(bit + i) >> 3]); } } udf_add_free_space(sb, sbi->s_partition, count); @@ -369,7 +370,8 @@ static void udf_table_free_blocks(struct super_block *sb, if (bloc->logicalBlockNum + count < count || (bloc->logicalBlockNum + count) > partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", - bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, + bloc->logicalBlockNum, 0, + bloc->logicalBlockNum, count, partmap->s_partition_len); goto error_return; } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 2ffdb67..3e44f57 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -162,8 +162,8 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) int padlen; if ((!buffer) || (!offset)) { - udf_debug("invalidparms\n, buffer=%p, offset=%p\n", buffer, - offset); + udf_debug("invalidparms, buffer=%p, offset=%p\n", + buffer, offset); return NULL; } @@ -201,7 +201,7 @@ struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offs struct short_ad *sa; if ((!ptr) || (!offset)) { - printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n"); + pr_err("%s: invalidparms\n", __func__); return NULL; } @@ -223,7 +223,7 @@ struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset struct long_ad *la; if ((!ptr) || (!offset)) { - printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n"); + pr_err("%s: invalidparms\n", __func__); return NULL; } diff --git a/fs/udf/file.c b/fs/udf/file.c index d8ffa7c..dca0c38 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -125,7 +125,6 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, err = udf_expand_file_adinicb(inode); if (err) { udf_debug("udf_expand_adinicb: err=%d\n", err); - up_write(&iinfo->i_data_sem); return err; } } else { @@ -133,9 +132,10 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, iinfo->i_lenAlloc = pos + count; else iinfo->i_lenAlloc = inode->i_size; + up_write(&iinfo->i_data_sem); } - } - up_write(&iinfo->i_data_sem); + } else + up_write(&iinfo->i_data_sem); retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); if (retval > 0) diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 6fb7e0a..05ab481 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -46,7 +46,7 @@ void udf_free_inode(struct inode *inode) udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); } -struct inode *udf_new_inode(struct inode *dir, int mode, int *err) +struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) { struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1d1358e..7699df7 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -37,6 +37,7 @@ #include <linux/writeback.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> +#include <linux/mpage.h> #include "udf_i.h" #include "udf_sb.h" @@ -47,13 +48,12 @@ MODULE_LICENSE("GPL"); #define EXTENT_MERGE_SIZE 5 -static mode_t udf_convert_permissions(struct fileEntry *); +static umode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); static void udf_fill_inode(struct inode *, struct buffer_head *); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); -static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, - sector_t *, int *); +static sector_t inode_getblk(struct inode *, sector_t, int *, int *); static int8_t udf_insert_aext(struct inode *, struct extent_position, struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, int, @@ -83,12 +83,10 @@ void udf_evict_inode(struct inode *inode) end_writeback(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { - printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " - "inode size %llu different from extent length %llu. " - "Filesystem need not be standards compliant.\n", - inode->i_sb->s_id, inode->i_ino, inode->i_mode, - (unsigned long long)inode->i_size, - (unsigned long long)iinfo->i_lenExtents); + udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", + inode->i_ino, inode->i_mode, + (unsigned long long)inode->i_size, + (unsigned long long)iinfo->i_lenExtents); } kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; @@ -104,7 +102,13 @@ static int udf_writepage(struct page *page, struct writeback_control *wbc) static int udf_readpage(struct file *file, struct page *page) { - return block_read_full_page(page, udf_get_block); + return mpage_readpage(page, udf_get_block); +} + +static int udf_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, @@ -139,12 +143,19 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block) const struct address_space_operations udf_aops = { .readpage = udf_readpage, + .readpages = udf_readpages, .writepage = udf_writepage, .write_begin = udf_write_begin, .write_end = generic_write_end, .bmap = udf_bmap, }; +/* + * Expand file stored in ICB to a normal one-block-file + * + * This function requires i_data_sem for writing and releases it. + * This function requires i_mutex held + */ int udf_expand_file_adinicb(struct inode *inode) { struct page *page; @@ -163,9 +174,15 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); mark_inode_dirty(inode); return 0; } + /* + * Release i_data_sem so that we can lock a page - page lock ranks + * above i_data_sem. i_mutex still protects us against file changes. + */ + up_write(&iinfo->i_data_sem); page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); if (!page) @@ -181,6 +198,7 @@ int udf_expand_file_adinicb(struct inode *inode) SetPageUptodate(page); kunmap(page); } + down_write(&iinfo->i_data_sem); memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; @@ -190,17 +208,20 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); err = inode->i_data.a_ops->writepage(page, &udf_wbc); if (err) { /* Restore everything back so that we don't lose data... */ lock_page(page); kaddr = kmap(page); + down_write(&iinfo->i_data_sem); memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); kunmap(page); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; inode->i_data.a_ops = &udf_adinicb_aops; + up_write(&iinfo->i_data_sem); } page_cache_release(page); mark_inode_dirty(inode); @@ -304,7 +325,6 @@ static int udf_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { int err, new; - struct buffer_head *bh; sector_t phys = 0; struct udf_inode_info *iinfo; @@ -317,7 +337,6 @@ static int udf_get_block(struct inode *inode, sector_t block, err = -EIO; new = 0; - bh = NULL; iinfo = UDF_I(inode); down_write(&iinfo->i_data_sem); @@ -326,13 +345,10 @@ static int udf_get_block(struct inode *inode, sector_t block, iinfo->i_next_alloc_goal++; } - err = 0; - bh = inode_getblk(inode, block, &err, &phys, &new); - BUG_ON(bh); - if (err) + phys = inode_getblk(inode, block, &err, &new); + if (!phys) goto abort; - BUG_ON(!phys); if (new) set_buffer_new(bh_result); @@ -541,11 +557,10 @@ out: return err; } -static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, - int *err, sector_t *phys, int *new) +static sector_t inode_getblk(struct inode *inode, sector_t block, + int *err, int *new) { static sector_t last_block; - struct buffer_head *result = NULL; struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; int count = 0, startnum = 0, endnum = 0; @@ -560,6 +575,8 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; int lastblock = 0; + *err = 0; + *new = 0; prev_epos.offset = udf_file_entry_alloc_offset(inode); prev_epos.block = iinfo->i_location; prev_epos.bh = NULL; @@ -629,8 +646,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, brelse(cur_epos.bh); brelse(next_epos.bh); newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); - *phys = newblock; - return NULL; + return newblock; } last_block = block; @@ -658,7 +674,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, brelse(cur_epos.bh); brelse(next_epos.bh); *err = ret; - return NULL; + return 0; } c = 0; offset = 0; @@ -723,7 +739,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, if (!newblocknum) { brelse(prev_epos.bh); *err = -ENOSPC; - return NULL; + return 0; } iinfo->i_lenExtents += inode->i_sb->s_blocksize; } @@ -755,10 +771,10 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, newblock = udf_get_pblock(inode->i_sb, newblocknum, iinfo->i_location.partitionReferenceNum, 0); - if (!newblock) - return NULL; - *phys = newblock; - *err = 0; + if (!newblock) { + *err = -EIO; + return 0; + } *new = 1; iinfo->i_next_alloc_block = block; iinfo->i_next_alloc_goal = newblocknum; @@ -769,7 +785,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, else mark_inode_dirty(inode); - return result; + return newblock; } static void udf_split_extents(struct inode *inode, int *c, int offset, @@ -1105,10 +1121,9 @@ int udf_setsize(struct inode *inode, loff_t newsize) if (bsize < (udf_file_entry_alloc_offset(inode) + newsize)) { err = udf_expand_file_adinicb(inode); - if (err) { - up_write(&iinfo->i_data_sem); + if (err) return err; - } + down_write(&iinfo->i_data_sem); } else iinfo->i_lenAlloc = newsize; } @@ -1169,16 +1184,15 @@ static void __udf_read_inode(struct inode *inode) */ bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); if (!bh) { - printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", - inode->i_ino); + udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino); make_bad_inode(inode); return; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { - printk(KERN_ERR "udf: udf_read_inode(ino %ld) " - "failed ident=%d\n", inode->i_ino, ident); + udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n", + inode->i_ino, ident); brelse(bh); make_bad_inode(inode); return; @@ -1218,8 +1232,8 @@ static void __udf_read_inode(struct inode *inode) } brelse(ibh); } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { - printk(KERN_ERR "udf: unsupported strategy type: %d\n", - le16_to_cpu(fe->icbTag.strategyType)); + udf_err(inode->i_sb, "unsupported strategy type: %d\n", + le16_to_cpu(fe->icbTag.strategyType)); brelse(bh); make_bad_inode(inode); return; @@ -1236,6 +1250,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) int offset; struct udf_sb_info *sbi = UDF_SB(inode->i_sb); struct udf_inode_info *iinfo = UDF_I(inode); + unsigned int link_count; fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; @@ -1318,9 +1333,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) inode->i_mode &= ~sbi->s_umask; read_unlock(&sbi->s_cred_lock); - inode->i_nlink = le16_to_cpu(fe->fileLinkCount); - if (!inode->i_nlink) - inode->i_nlink = 1; + link_count = le16_to_cpu(fe->fileLinkCount); + if (!link_count) + link_count = 1; + set_nlink(inode, link_count); inode->i_size = le64_to_cpu(fe->informationLength); iinfo->i_lenExtents = inode->i_size; @@ -1413,9 +1429,8 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) udf_debug("METADATA BITMAP FILE-----\n"); break; default: - printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown " - "file type=%d\n", inode->i_ino, - fe->icbTag.fileType); + udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n", + inode->i_ino, fe->icbTag.fileType); make_bad_inode(inode); return; } @@ -1438,17 +1453,17 @@ static int udf_alloc_i_data(struct inode *inode, size_t size) iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); if (!iinfo->i_ext.i_data) { - printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) " - "no free memory\n", inode->i_ino); + udf_err(inode->i_sb, "(ino %ld) no free memory\n", + inode->i_ino); return -ENOMEM; } return 0; } -static mode_t udf_convert_permissions(struct fileEntry *fe) +static umode_t udf_convert_permissions(struct fileEntry *fe) { - mode_t mode; + umode_t mode; uint32_t permissions; uint32_t flags; @@ -1689,9 +1704,8 @@ out: if (do_sync) { sync_dirty_buffer(bh); if (buffer_write_io_error(bh)) { - printk(KERN_WARNING "IO error syncing udf inode " - "[%s:%08lx]\n", inode->i_sb->s_id, - inode->i_ino); + udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n", + inode->i_ino); err = -EIO; } } @@ -1982,8 +1996,7 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; break; default: - udf_debug("alloc_type = %d unsupported\n", - iinfo->i_alloc_type); + udf_debug("alloc_type = %d unsupported\n", iinfo->i_alloc_type); return -1; } diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 43e24a3..6583fe9 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -38,7 +38,7 @@ unsigned int udf_get_last_session(struct super_block *sb) if (i == 0) { udf_debug("XA disk: %s, vol_desc_start=%d\n", - (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); + ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba); if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ vol_desc_start = ms_info.addr.lba; } else { diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 9215700..c175b4d 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -204,6 +204,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, { struct tag *tag_p; struct buffer_head *bh = NULL; + u8 checksum; /* Read the block */ if (block == 0xFFFFFFFF) @@ -211,8 +212,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, bh = udf_tread(sb, block); if (!bh) { - udf_debug("block=%d, location=%d: read failed\n", - block, location); + udf_err(sb, "read failed, block=%u, location=%d\n", + block, location); return NULL; } @@ -227,16 +228,18 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, } /* Verify the tag checksum */ - if (udf_tag_checksum(tag_p) != tag_p->tagChecksum) { - printk(KERN_ERR "udf: tag checksum failed block %d\n", block); + checksum = udf_tag_checksum(tag_p); + if (checksum != tag_p->tagChecksum) { + udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n", + block, checksum, tag_p->tagChecksum); goto error_out; } /* Verify the tag version */ if (tag_p->descVersion != cpu_to_le16(0x0002U) && tag_p->descVersion != cpu_to_le16(0x0003U)) { - udf_debug("tag version 0x%04x != 0x0002 || 0x0003 block %d\n", - le16_to_cpu(tag_p->descVersion), block); + udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n", + le16_to_cpu(tag_p->descVersion), block); goto error_out; } @@ -248,8 +251,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, return bh; udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block, - le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); - + le16_to_cpu(tag_p->descCRC), + le16_to_cpu(tag_p->descCRCLength)); error_out: brelse(bh); return NULL; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index f1dce84..08bf46e 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -552,7 +552,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); } -static int udf_create(struct inode *dir, struct dentry *dentry, int mode, +static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct udf_fileident_bh fibh; @@ -577,8 +577,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink--; - mark_inode_dirty(inode); + inode_dec_link_count(inode); iput(inode); return err; } @@ -597,7 +596,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, return 0; } -static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -618,8 +617,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, init_special_inode(inode, mode, rdev); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink--; - mark_inode_dirty(inode); + inode_dec_link_count(inode); iput(inode); return err; } @@ -642,7 +640,7 @@ out: return err; } -static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; struct udf_fileident_bh fibh; @@ -665,12 +663,11 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) inode->i_fop = &udf_dir_operations; fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink--; - mark_inode_dirty(inode); + inode_dec_link_count(inode); iput(inode); goto out; } - inode->i_nlink = 2; + set_nlink(inode, 2); cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); cfi.icb.extLocation = cpu_to_lelb(dinfo->i_location); *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = @@ -683,7 +680,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { - inode->i_nlink = 0; + clear_nlink(inode); mark_inode_dirty(inode); iput(inode); goto out; @@ -799,9 +796,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) if (retval) goto end_rmdir; if (inode->i_nlink != 2) - udf_warning(inode->i_sb, "udf_rmdir", - "empty directory has nlink != 2 (%d)", - inode->i_nlink); + udf_warn(inode->i_sb, "empty directory has nlink != 2 (%d)\n", + inode->i_nlink); clear_nlink(inode); inode->i_size = 0; inode_dec_link_count(dir); @@ -840,7 +836,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) if (!inode->i_nlink) { udf_debug("Deleting nonexistent file (%lu), %d\n", inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + set_nlink(inode, 1); } retval = udf_delete_entry(dir, fi, &fibh, &cfi); if (retval) diff --git a/fs/udf/partition.c b/fs/udf/partition.c index a71090e..d6caf01 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -33,8 +33,8 @@ uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; if (partition >= sbi->s_partitions) { - udf_debug("block=%d, partition=%d, offset=%d: " - "invalid partition\n", block, partition, offset); + udf_debug("block=%d, partition=%d, offset=%d: invalid partition\n", + block, partition, offset); return 0xFFFFFFFF; } map = &sbi->s_partmaps[partition]; @@ -60,8 +60,8 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, vdata = &map->s_type_specific.s_virtual; if (block > vdata->s_num_entries) { - udf_debug("Trying to access block beyond end of VAT " - "(%d max %d)\n", block, vdata->s_num_entries); + udf_debug("Trying to access block beyond end of VAT (%d max %d)\n", + block, vdata->s_num_entries); return 0xFFFFFFFF; } @@ -321,9 +321,14 @@ uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, /* We shouldn't mount such media... */ BUG_ON(!inode); retblk = udf_try_read_meta(inode, block, partition, offset); - if (retblk == 0xFFFFFFFF) { - udf_warning(sb, __func__, "error reading from METADATA, " - "trying to read from MIRROR"); + if (retblk == 0xFFFFFFFF && mdata->s_metadata_fe) { + udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n"); + if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) { + mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, + mdata->s_mirror_file_loc, map->s_partition_num); + mdata->s_flags |= MF_MIRROR_FE_LOADED; + } + inode = mdata->s_mirror_fe; if (!inode) return 0xFFFFFFFF; diff --git a/fs/udf/super.c b/fs/udf/super.c index 7b27b06..c09a84da 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -75,8 +75,6 @@ #define UDF_DEFAULT_BLOCKSIZE 2048 -static char error_buf[1024]; - /* These are the "meat" - everything else is stuffing */ static int udf_fill_super(struct super_block *, void *, int); static void udf_put_super(struct super_block *); @@ -91,9 +89,7 @@ static void udf_open_lvid(struct super_block *); static void udf_close_lvid(struct super_block *); static unsigned int udf_count_free(struct super_block *); static int udf_statfs(struct dentry *, struct kstatfs *); -static int udf_show_options(struct seq_file *, struct vfsmount *); -static void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...); +static int udf_show_options(struct seq_file *, struct dentry *); struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) { @@ -142,7 +138,6 @@ static struct inode *udf_alloc_inode(struct super_block *sb) static void udf_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(udf_inode_cachep, UDF_I(inode)); } @@ -200,11 +195,11 @@ struct udf_options { unsigned int fileset; unsigned int rootdir; unsigned int flags; - mode_t umask; + umode_t umask; gid_t gid; uid_t uid; - mode_t fmode; - mode_t dmode; + umode_t fmode; + umode_t dmode; struct nls_table *nls_map; }; @@ -244,9 +239,8 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), GFP_KERNEL); if (!sbi->s_partmaps) { - udf_error(sb, __func__, - "Unable to allocate space for %d partition maps", - count); + udf_err(sb, "Unable to allocate space for %d partition maps\n", + count); sbi->s_partitions = 0; return -ENOMEM; } @@ -255,9 +249,9 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) return 0; } -static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) +static int udf_show_options(struct seq_file *seq, struct dentry *root) { - struct super_block *sb = mnt->mnt_sb; + struct super_block *sb = root->d_sb; struct udf_sb_info *sbi = UDF_SB(sb); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) @@ -285,11 +279,11 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) seq_printf(seq, ",gid=%u", sbi->s_gid); if (sbi->s_umask != 0) - seq_printf(seq, ",umask=%o", sbi->s_umask); + seq_printf(seq, ",umask=%ho", sbi->s_umask); if (sbi->s_fmode != UDF_INVALID_MODE) - seq_printf(seq, ",mode=%o", sbi->s_fmode); + seq_printf(seq, ",mode=%ho", sbi->s_fmode); if (sbi->s_dmode != UDF_INVALID_MODE) - seq_printf(seq, ",dmode=%o", sbi->s_dmode); + seq_printf(seq, ",dmode=%ho", sbi->s_dmode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) seq_printf(seq, ",session=%u", sbi->s_session); if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) @@ -550,8 +544,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt, uopt->dmode = option & 0777; break; default: - printk(KERN_ERR "udf: bad mount option \"%s\" " - "or missing value\n", p); + pr_err("bad mount option \"%s\" or missing value\n", p); return 0; } } @@ -645,20 +638,16 @@ static loff_t udf_check_vsd(struct super_block *sb) udf_debug("ISO9660 Boot Record found\n"); break; case 1: - udf_debug("ISO9660 Primary Volume Descriptor " - "found\n"); + udf_debug("ISO9660 Primary Volume Descriptor found\n"); break; case 2: - udf_debug("ISO9660 Supplementary Volume " - "Descriptor found\n"); + udf_debug("ISO9660 Supplementary Volume Descriptor found\n"); break; case 3: - udf_debug("ISO9660 Volume Partition Descriptor " - "found\n"); + udf_debug("ISO9660 Volume Partition Descriptor found\n"); break; case 255: - udf_debug("ISO9660 Volume Descriptor Set " - "Terminator found\n"); + udf_debug("ISO9660 Volume Descriptor Set Terminator found\n"); break; default: udf_debug("ISO9660 VRS (%u) found\n", @@ -809,8 +798,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) pvoldesc->recordingDateAndTime)) { #ifdef UDFFS_DEBUG struct timestamp *ts = &pvoldesc->recordingDateAndTime; - udf_debug("recording time %04u/%02u/%02u" - " %02u:%02u (%x)\n", + udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n", le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, ts->minute, le16_to_cpu(ts->typeAndTimezone)); #endif @@ -821,7 +809,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, outstr->u_len > 31 ? 31 : outstr->u_len); udf_debug("volIdent[] = '%s'\n", - UDF_SB(sb)->s_volume_ident); + UDF_SB(sb)->s_volume_ident); } if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) @@ -837,64 +825,57 @@ out1: return ret; } +struct inode *udf_find_metadata_inode_efe(struct super_block *sb, + u32 meta_file_loc, u32 partition_num) +{ + struct kernel_lb_addr addr; + struct inode *metadata_fe; + + addr.logicalBlockNum = meta_file_loc; + addr.partitionReferenceNum = partition_num; + + metadata_fe = udf_iget(sb, &addr); + + if (metadata_fe == NULL) + udf_warn(sb, "metadata inode efe not found\n"); + else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { + udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); + iput(metadata_fe); + metadata_fe = NULL; + } + + return metadata_fe; +} + static int udf_load_metadata_files(struct super_block *sb, int partition) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; struct udf_meta_data *mdata; struct kernel_lb_addr addr; - int fe_error = 0; map = &sbi->s_partmaps[partition]; mdata = &map->s_type_specific.s_metadata; /* metadata address */ - addr.logicalBlockNum = mdata->s_meta_file_loc; - addr.partitionReferenceNum = map->s_partition_num; - udf_debug("Metadata file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + mdata->s_meta_file_loc, map->s_partition_num); - mdata->s_metadata_fe = udf_iget(sb, &addr); + mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb, + mdata->s_meta_file_loc, map->s_partition_num); if (mdata->s_metadata_fe == NULL) { - udf_warning(sb, __func__, "metadata inode efe not found, " - "will try mirror inode."); - fe_error = 1; - } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type != - ICBTAG_FLAG_AD_SHORT) { - udf_warning(sb, __func__, "metadata inode efe does not have " - "short allocation descriptors!"); - fe_error = 1; - iput(mdata->s_metadata_fe); - mdata->s_metadata_fe = NULL; - } - - /* mirror file entry */ - addr.logicalBlockNum = mdata->s_mirror_file_loc; - addr.partitionReferenceNum = map->s_partition_num; - - udf_debug("Mirror metadata file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + /* mirror file entry */ + udf_debug("Mirror metadata file location: block = %d part = %d\n", + mdata->s_mirror_file_loc, map->s_partition_num); - mdata->s_mirror_fe = udf_iget(sb, &addr); + mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, + mdata->s_mirror_file_loc, map->s_partition_num); - if (mdata->s_mirror_fe == NULL) { - if (fe_error) { - udf_error(sb, __func__, "mirror inode efe not found " - "and metadata inode is missing too, exiting..."); - goto error_exit; - } else - udf_warning(sb, __func__, "mirror inode efe not found," - " but metadata inode is OK"); - } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type != - ICBTAG_FLAG_AD_SHORT) { - udf_warning(sb, __func__, "mirror inode efe does not have " - "short allocation descriptors!"); - iput(mdata->s_mirror_fe); - mdata->s_mirror_fe = NULL; - if (fe_error) + if (mdata->s_mirror_fe == NULL) { + udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); goto error_exit; + } } /* @@ -907,18 +888,15 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) addr.partitionReferenceNum = map->s_partition_num; udf_debug("Bitmap file location: block = %d part = %d\n", - addr.logicalBlockNum, addr.partitionReferenceNum); + addr.logicalBlockNum, addr.partitionReferenceNum); mdata->s_bitmap_fe = udf_iget(sb, &addr); if (mdata->s_bitmap_fe == NULL) { if (sb->s_flags & MS_RDONLY) - udf_warning(sb, __func__, "bitmap inode efe " - "not found but it's ok since the disc" - " is mounted read-only"); + udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); else { - udf_error(sb, __func__, "bitmap inode efe not " - "found and attempted read-write mount"); + udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); goto error_exit; } } @@ -971,9 +949,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ if (bitmap == NULL) { - udf_error(sb, __func__, - "Unable to allocate space for bitmap " - "and %d buffer_head pointers", nr_groups); + udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n", + nr_groups); return NULL; } @@ -1003,10 +980,9 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE; - udf_debug("Partition (%d type %x) starts at physical %d, " - "block length %d\n", p_index, - map->s_partition_type, map->s_partition_root, - map->s_partition_len); + udf_debug("Partition (%d type %x) starts at physical %d, block length %d\n", + p_index, map->s_partition_type, + map->s_partition_root, map->s_partition_len); if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) @@ -1023,12 +999,12 @@ static int udf_fill_partdesc_info(struct super_block *sb, map->s_uspace.s_table = udf_iget(sb, &loc); if (!map->s_uspace.s_table) { udf_debug("cannot load unallocSpaceTable (part %d)\n", - p_index); + p_index); return 1; } map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; udf_debug("unallocSpaceTable (part %d) @ %ld\n", - p_index, map->s_uspace.s_table->i_ino); + p_index, map->s_uspace.s_table->i_ino); } if (phd->unallocSpaceBitmap.extLength) { @@ -1041,8 +1017,8 @@ static int udf_fill_partdesc_info(struct super_block *sb, bitmap->s_extPosition = le32_to_cpu( phd->unallocSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; - udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index, - bitmap->s_extPosition); + udf_debug("unallocSpaceBitmap (part %d) @ %d\n", + p_index, bitmap->s_extPosition); } if (phd->partitionIntegrityTable.extLength) @@ -1058,13 +1034,13 @@ static int udf_fill_partdesc_info(struct super_block *sb, map->s_fspace.s_table = udf_iget(sb, &loc); if (!map->s_fspace.s_table) { udf_debug("cannot load freedSpaceTable (part %d)\n", - p_index); + p_index); return 1; } map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; udf_debug("freedSpaceTable (part %d) @ %ld\n", - p_index, map->s_fspace.s_table->i_ino); + p_index, map->s_fspace.s_table->i_ino); } if (phd->freedSpaceBitmap.extLength) { @@ -1077,8 +1053,8 @@ static int udf_fill_partdesc_info(struct super_block *sb, bitmap->s_extPosition = le32_to_cpu( phd->freedSpaceBitmap.extPosition); map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; - udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index, - bitmap->s_extPosition); + udf_debug("freedSpaceBitmap (part %d) @ %d\n", + p_index, bitmap->s_extPosition); } return 0; } @@ -1118,11 +1094,9 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); if (!sbi->s_vat_inode && sbi->s_last_block != blocks - 1) { - printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" - " last recorded block (%lu), retrying with the last " - "block of the device (%lu).\n", - (unsigned long)sbi->s_last_block, - (unsigned long)blocks - 1); + pr_notice("Failed to read VAT inode from the last recorded block (%lu), retrying with the last block of the device (%lu).\n", + (unsigned long)sbi->s_last_block, + (unsigned long)blocks - 1); udf_find_vat_block(sb, p_index, type1_index, blocks - 1); } if (!sbi->s_vat_inode) @@ -1220,8 +1194,8 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) if (map->s_partition_type == UDF_METADATA_MAP25) { ret = udf_load_metadata_files(sb, i); if (ret) { - printk(KERN_ERR "UDF-fs: error loading MetaData " - "partition map %d\n", i); + udf_err(sb, "error loading MetaData partition map %d\n", + i); goto out_bh; } } else { @@ -1234,9 +1208,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) * overwrite blocks instead of relocating them). */ sb->s_flags |= MS_RDONLY; - printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only " - "because writing to pseudooverwrite partition is " - "not implemented.\n"); + pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n"); } out_bh: /* In case loading failed, we handle cleanup in udf_fill_super */ @@ -1344,9 +1316,8 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, struct metadataPartitionMap *mdm = (struct metadataPartitionMap *) &(lvd->partitionMaps[offset]); - udf_debug("Parsing Logical vol part %d " - "type %d id=%s\n", i, type, - UDF_ID_METADATA); + udf_debug("Parsing Logical vol part %d type %d id=%s\n", + i, type, UDF_ID_METADATA); map->s_partition_type = UDF_METADATA_MAP25; map->s_partition_func = udf_get_pblock_meta25; @@ -1361,25 +1332,24 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, le32_to_cpu(mdm->allocUnitSize); mdata->s_align_unit_size = le16_to_cpu(mdm->alignUnitSize); - mdata->s_dup_md_flag = - mdm->flags & 0x01; + if (mdm->flags & 0x01) + mdata->s_flags |= MF_DUPLICATE_MD; udf_debug("Metadata Ident suffix=0x%x\n", - (le16_to_cpu( - ((__le16 *) - mdm->partIdent.identSuffix)[0]))); + le16_to_cpu(*(__le16 *) + mdm->partIdent.identSuffix)); udf_debug("Metadata part num=%d\n", - le16_to_cpu(mdm->partitionNum)); + le16_to_cpu(mdm->partitionNum)); udf_debug("Metadata part alloc unit size=%d\n", - le32_to_cpu(mdm->allocUnitSize)); + le32_to_cpu(mdm->allocUnitSize)); udf_debug("Metadata file loc=%d\n", - le32_to_cpu(mdm->metadataFileLoc)); + le32_to_cpu(mdm->metadataFileLoc)); udf_debug("Mirror file loc=%d\n", - le32_to_cpu(mdm->metadataMirrorFileLoc)); + le32_to_cpu(mdm->metadataMirrorFileLoc)); udf_debug("Bitmap file loc=%d\n", - le32_to_cpu(mdm->metadataBitmapFileLoc)); - udf_debug("Duplicate Flag: %d %d\n", - mdata->s_dup_md_flag, mdm->flags); + le32_to_cpu(mdm->metadataBitmapFileLoc)); + udf_debug("Flags: %d %d\n", + mdata->s_flags, mdm->flags); } else { udf_debug("Unknown ident: %s\n", upm2->partIdent.ident); @@ -1389,16 +1359,15 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, map->s_partition_num = le16_to_cpu(upm2->partitionNum); } udf_debug("Partition (%d:%d) type %d on volume %d\n", - i, map->s_partition_num, type, - map->s_volumeseqnum); + i, map->s_partition_num, type, map->s_volumeseqnum); } if (fileset) { struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]); *fileset = lelb_to_cpu(la->extLocation); - udf_debug("FileSet found in LogicalVolDesc at block=%d, " - "partition=%d\n", fileset->logicalBlockNum, + udf_debug("FileSet found in LogicalVolDesc at block=%d, partition=%d\n", + fileset->logicalBlockNum, fileset->partitionReferenceNum); } if (lvd->integritySeqExt.extLength) @@ -1478,9 +1447,9 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, bh = udf_read_tagged(sb, block, block, &ident); if (!bh) { - printk(KERN_ERR "udf: Block %Lu of volume descriptor " - "sequence is corrupted or we could not read " - "it.\n", (unsigned long long)block); + udf_err(sb, + "Block %llu of volume descriptor sequence is corrupted or we could not read it\n", + (unsigned long long)block); return 1; } @@ -1553,7 +1522,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, * in a suitable order */ if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { - printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n"); + udf_err(sb, "Primary Volume Descriptor not found!\n"); return 1; } if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block)) @@ -1740,7 +1709,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, if (!sb_set_blocksize(sb, uopt->blocksize)) { if (!silent) - printk(KERN_WARNING "UDF-fs: Bad block size\n"); + udf_warn(sb, "Bad block size\n"); return 0; } sbi->s_last_block = uopt->lastblock; @@ -1749,12 +1718,11 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, nsr_off = udf_check_vsd(sb); if (!nsr_off) { if (!silent) - printk(KERN_WARNING "UDF-fs: No VRS found\n"); + udf_warn(sb, "No VRS found\n"); return 0; } if (nsr_off == -1) - udf_debug("Failed to read byte 32768. Assuming open " - "disc. Skipping validity check\n"); + udf_debug("Failed to read byte 32768. Assuming open disc. Skipping validity check\n"); if (!sbi->s_last_block) sbi->s_last_block = udf_get_last_block(sb); } else { @@ -1765,7 +1733,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, sbi->s_anchor = uopt->anchor; if (!udf_find_anchor(sb, fileset)) { if (!silent) - printk(KERN_WARNING "UDF-fs: No anchor found\n"); + udf_warn(sb, "No anchor found\n"); return 0; } return 1; @@ -1830,6 +1798,12 @@ static void udf_close_lvid(struct super_block *sb) le16_to_cpu(lvid->descTag.descCRCLength))); lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + /* + * We set buffer uptodate unconditionally here to avoid spurious + * warnings from mark_buffer_dirty() when previous EIO has marked + * the buffer as !uptodate + */ + set_buffer_uptodate(bh); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); @@ -1937,8 +1911,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (uopt.flags & (1 << UDF_FLAG_UTF8) && uopt.flags & (1 << UDF_FLAG_NLS_MAP)) { - udf_error(sb, "udf_read_super", - "utf8 cannot be combined with iocharset\n"); + udf_err(sb, "utf8 cannot be combined with iocharset\n"); goto error_out; } #ifdef CONFIG_UDF_NLS @@ -1987,15 +1960,14 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) ret = udf_load_vrs(sb, &uopt, silent, &fileset); if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { if (!silent) - printk(KERN_NOTICE - "UDF-fs: Rescanning with blocksize " - "%d\n", UDF_DEFAULT_BLOCKSIZE); + pr_notice("Rescanning with blocksize %d\n", + UDF_DEFAULT_BLOCKSIZE); uopt.blocksize = UDF_DEFAULT_BLOCKSIZE; ret = udf_load_vrs(sb, &uopt, silent, &fileset); } } if (!ret) { - printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); + udf_warn(sb, "No partition found (1)\n"); goto error_out; } @@ -2010,10 +1982,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) le16_to_cpu(lvidiu->maxUDFWriteRev); */ if (minUDFReadRev > UDF_MAX_READ_VERSION) { - printk(KERN_ERR "UDF-fs: minUDFReadRev=%x " - "(max is %x)\n", - le16_to_cpu(lvidiu->minUDFReadRev), - UDF_MAX_READ_VERSION); + udf_err(sb, "minUDFReadRev=%x (max is %x)\n", + le16_to_cpu(lvidiu->minUDFReadRev), + UDF_MAX_READ_VERSION); goto error_out; } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) sb->s_flags |= MS_RDONLY; @@ -2027,28 +1998,27 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) } if (!sbi->s_partitions) { - printk(KERN_WARNING "UDF-fs: No partition found (2)\n"); + udf_warn(sb, "No partition found (2)\n"); goto error_out; } if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & UDF_PART_FLAG_READ_ONLY) { - printk(KERN_NOTICE "UDF-fs: Partition marked readonly; " - "forcing readonly mount\n"); + pr_notice("Partition marked readonly; forcing readonly mount\n"); sb->s_flags |= MS_RDONLY; } if (udf_find_fileset(sb, &fileset, &rootdir)) { - printk(KERN_WARNING "UDF-fs: No fileset found\n"); + udf_warn(sb, "No fileset found\n"); goto error_out; } if (!silent) { struct timestamp ts; udf_time_to_disk_stamp(&ts, sbi->s_record_time); - udf_info("UDF: Mounting volume '%s', " - "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", - sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day, + udf_info("Mounting volume '%s', timestamp %04u/%02u/%02u %02u:%02u (%x)\n", + sbi->s_volume_ident, + le16_to_cpu(ts.year), ts.month, ts.day, ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); } if (!(sb->s_flags & MS_RDONLY)) @@ -2059,8 +2029,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* perhaps it's not extensible enough, but for now ... */ inode = udf_iget(sb, &rootdir); if (!inode) { - printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " - "partition=%d\n", + udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", rootdir.logicalBlockNum, rootdir.partitionReferenceNum); goto error_out; } @@ -2068,7 +2037,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* Allocate a dentry for the root inode */ sb->s_root = d_alloc_root(inode); if (!sb->s_root) { - printk(KERN_ERR "UDF-fs: Couldn't allocate root dentry\n"); + udf_err(sb, "Couldn't allocate root dentry\n"); iput(inode); goto error_out; } @@ -2096,32 +2065,40 @@ error_out: return -EINVAL; } -static void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...) +void _udf_err(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; - if (!(sb->s_flags & MS_RDONLY)) { - /* mark sb error */ + /* mark sb error */ + if (!(sb->s_flags & MS_RDONLY)) sb->s_dirt = 1; - } + va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("error (device %s): %s: %pV", sb->s_id, function, &vaf); + va_end(args); - printk(KERN_CRIT "UDF-fs error (device %s): %s: %s\n", - sb->s_id, function, error_buf); } -void udf_warning(struct super_block *sb, const char *function, - const char *fmt, ...) +void _udf_warn(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_warn("warning (device %s): %s: %pV", sb->s_id, function, &vaf); + va_end(args); - printk(KERN_WARNING "UDF-fs warning (device %s): %s: %s\n", - sb->s_id, function, error_buf); } static void udf_put_super(struct super_block *sb) @@ -2213,11 +2190,11 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, bh = udf_read_ptagged(sb, &loc, 0, &ident); if (!bh) { - printk(KERN_ERR "udf: udf_count_free failed\n"); + udf_err(sb, "udf_count_free failed\n"); goto out; } else if (ident != TAG_IDENT_SBD) { brelse(bh); - printk(KERN_ERR "udf: udf_count_free failed\n"); + udf_err(sb, "udf_count_free failed\n"); goto out; } diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index b1d4488..d7c6dbe 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -41,10 +41,16 @@ static void udf_pc_to_char(struct super_block *sb, unsigned char *from, pc = (struct pathComponent *)(from + elen); switch (pc->componentType) { case 1: - if (pc->lengthComponentIdent == 0) { - p = to; - *p++ = '/'; - } + /* + * Symlink points to some place which should be agreed + * upon between originator and receiver of the media. Ignore. + */ + if (pc->lengthComponentIdent > 0) + break; + /* Fall through */ + case 2: + p = to; + *p++ = '/'; break; case 3: memcpy(p, "../", 3); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 8424308..4b98fee 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -95,23 +95,21 @@ void udf_truncate_tail_extent(struct inode *inode) lbcount += elen; if (lbcount > inode->i_size) { if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) - printk(KERN_WARNING - "udf_truncate_tail_extent(): Too long " - "extent after EOF in inode %u: i_size: " - "%Ld lbcount: %Ld extent %u+%u\n", - (unsigned)inode->i_ino, - (long long)inode->i_size, - (long long)lbcount, - (unsigned)eloc.logicalBlockNum, - (unsigned)elen); + udf_warn(inode->i_sb, + "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld extent %u+%u\n", + (unsigned)inode->i_ino, + (long long)inode->i_size, + (long long)lbcount, + (unsigned)eloc.logicalBlockNum, + (unsigned)elen); nelen = elen - (lbcount - inode->i_size); epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, nelen); epos.offset += adsize; if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) - printk(KERN_ERR "udf_truncate_tail_extent(): " - "Extent after EOF in inode %u.\n", - (unsigned)inode->i_ino); + udf_err(inode->i_sb, + "Extent after EOF in inode %u\n", + (unsigned)inode->i_ino); break; } } diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 4858c19..42ad69a 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -50,17 +50,20 @@ #define UDF_SPARABLE_MAP15 0x1522U #define UDF_METADATA_MAP25 0x2511U -#define UDF_INVALID_MODE ((mode_t)-1) +#define UDF_INVALID_MODE ((umode_t)-1) #pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ +#define MF_DUPLICATE_MD 0x01 +#define MF_MIRROR_FE_LOADED 0x02 + struct udf_meta_data { __u32 s_meta_file_loc; __u32 s_mirror_file_loc; __u32 s_bitmap_file_loc; __u32 s_alloc_unit_size; __u16 s_align_unit_size; - __u8 s_dup_md_flag; + int s_flags; struct inode *s_metadata_fe; struct inode *s_mirror_fe; struct inode *s_bitmap_fe; @@ -124,11 +127,11 @@ struct udf_sb_info { struct buffer_head *s_lvid_bh; /* Default permissions */ - mode_t s_umask; + umode_t s_umask; gid_t s_gid; uid_t s_uid; - mode_t s_fmode; - mode_t s_dmode; + umode_t s_fmode; + umode_t s_dmode; /* Lock protecting consistency of above permission settings */ rwlock_t s_cred_lock; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index dbd52d4b..ebe1031 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -1,6 +1,8 @@ #ifndef __UDF_DECL_H #define __UDF_DECL_H +#define pr_fmt(fmt) "UDF-fs: " fmt + #include "ecma_167.h" #include "osta_udf.h" @@ -16,23 +18,30 @@ #define UDF_PREALLOCATE #define UDF_DEFAULT_PREALLOC_BLOCKS 8 +extern __printf(3, 4) void _udf_err(struct super_block *sb, + const char *function, const char *fmt, ...); +#define udf_err(sb, fmt, ...) \ + _udf_err(sb, __func__, fmt, ##__VA_ARGS__) + +extern __printf(3, 4) void _udf_warn(struct super_block *sb, + const char *function, const char *fmt, ...); +#define udf_warn(sb, fmt, ...) \ + _udf_warn(sb, __func__, fmt, ##__VA_ARGS__) + +#define udf_info(fmt, ...) \ + pr_info("INFO " fmt, ##__VA_ARGS__) + #undef UDFFS_DEBUG #ifdef UDFFS_DEBUG -#define udf_debug(f, a...) \ -do { \ - printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \ - __FILE__, __LINE__, __func__); \ - printk(f, ##a); \ -} while (0) +#define udf_debug(fmt, ...) \ + printk(KERN_DEBUG pr_fmt("%s:%d:%s: " fmt), \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__) #else -#define udf_debug(f, a...) /**/ +#define udf_debug(fmt, ...) \ + no_printk(fmt, ##__VA_ARGS__) #endif -#define udf_info(f, a...) \ - printk(KERN_INFO "UDF-fs INFO " f, ##a); - - #define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) #define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) @@ -112,8 +121,6 @@ struct extent_position { /* super.c */ -__attribute__((format(printf, 3, 4))) -extern void udf_warning(struct super_block *, const char *, const char *, ...); static inline void udf_updated_lvid(struct super_block *sb) { struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; @@ -126,6 +133,8 @@ static inline void udf_updated_lvid(struct super_block *sb) UDF_SB(sb)->s_lvid_dirty = 1; } extern u64 lvid_get_unique_id(struct super_block *sb); +struct inode *udf_find_metadata_inode_efe(struct super_block *sb, + u32 meta_file_loc, u32 partition_num); /* namei.c */ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, @@ -206,7 +215,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *); /* ialloc.c */ extern void udf_free_inode(struct inode *); -extern struct inode *udf_new_inode(struct inode *, int, int *); +extern struct inode *udf_new_inode(struct inode *, umode_t, int *); /* truncate.c */ extern void udf_truncate_tail_extent(struct inode *); diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index b8c828c..1f11483 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -34,9 +34,10 @@ * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm */ +#include "udfdecl.h" + #include <linux/types.h> #include <linux/kernel.h> -#include "udfdecl.h" #define EPOCH_YEAR 1970 diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index d03a90b..44b815e 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -114,7 +114,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) cmp_id = ocu_i->u_cmpID; if (cmp_id != 8 && cmp_id != 16) { memset(utf_o, 0, sizeof(struct ustr)); - printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", + pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); return 0; } @@ -242,7 +242,7 @@ try_again: if (utf_cnt) { error_out: ocu[++u_len] = '?'; - printk(KERN_DEBUG "udf: bad UTF-8 character\n"); + printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n")); } ocu[length - 1] = (uint8_t)u_len + 1; @@ -267,7 +267,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, cmp_id = ocu_i->u_cmpID; if (cmp_id != 8 && cmp_id != 16) { memset(utf_o, 0, sizeof(struct ustr)); - printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", + pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); return 0; } diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 2eabf04..4ec5c10 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -170,7 +170,7 @@ static void ufs2_init_inodes_chunk(struct super_block *sb, * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode * ufs_new_inode(struct inode * dir, int mode) +struct inode *ufs_new_inode(struct inode *dir, umode_t mode) { struct super_block * sb; struct ufs_sb_info * sbi; @@ -341,7 +341,7 @@ cg_found: fail_remove_inode: unlock_super(sb); - inode->i_nlink = 0; + clear_nlink(inode); iput(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index b4d791a..9094e1d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -583,13 +583,13 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; - mode_t mode; + umode_t mode; /* * Copy data to the in-core inode. */ inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); - inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink); + set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink)); if (inode->i_nlink == 0) { ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); return -1; @@ -630,14 +630,14 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; - mode_t mode; + umode_t mode; UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino); /* * Copy data to the in-core inode. */ inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); - inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink); + set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink)); if (inode->i_nlink == 0) { ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); return -1; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 639d491..38cac19 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -70,7 +70,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, +static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -94,7 +94,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, return err; } -static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; int err; @@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return error; } -static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) { struct inode * inode; int err = -EMLINK; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 3915ade..5246ee3 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1351,9 +1351,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) return 0; } -static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int ufs_show_options(struct seq_file *seq, struct dentry *root) { - struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); + struct ufs_sb_info *sbi = UFS_SB(root->d_sb); unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; const struct match_token *tp = tokens; @@ -1425,7 +1425,6 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) static void ufs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 5be2755..528750b 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -104,7 +104,7 @@ extern const struct address_space_operations ufs_aops; /* ialloc.c */ extern void ufs_free_inode (struct inode *inode); -extern struct inode * ufs_new_inode (struct inode *, int); +extern struct inode * ufs_new_inode (struct inode *, umode_t); /* inode.c */ extern struct inode *ufs_iget(struct super_block *, unsigned long); @@ -117,9 +117,12 @@ extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buf extern const struct file_operations ufs_dir_operations; /* super.c */ -extern void ufs_warning (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ufs_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ufs_panic (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern __printf(3, 4) +void ufs_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ufs_error(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ufs_panic(struct super_block *, const char *, const char *, ...); /* symlink.c */ extern const struct inode_operations ufs_fast_symlink_inode_operations; @@ -14,6 +14,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> +#include <linux/evm.h> #include <linux/syscalls.h> #include <linux/module.h> #include <linux/fsnotify.h> @@ -166,6 +167,64 @@ out_noalloc: } EXPORT_SYMBOL_GPL(xattr_getsecurity); +/* + * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr + * + * Allocate memory, if not already allocated, or re-allocate correct size, + * before retrieving the extended attribute. + * + * Returns the result of alloc, if failed, or the getxattr operation. + */ +ssize_t +vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, + size_t xattr_size, gfp_t flags) +{ + struct inode *inode = dentry->d_inode; + char *value = *xattr_value; + int error; + + error = xattr_permission(inode, name, MAY_READ); + if (error) + return error; + + if (!inode->i_op->getxattr) + return -EOPNOTSUPP; + + error = inode->i_op->getxattr(dentry, name, NULL, 0); + if (error < 0) + return error; + + if (!value || (error > xattr_size)) { + value = krealloc(*xattr_value, error + 1, flags); + if (!value) + return -ENOMEM; + memset(value, 0, error + 1); + } + + error = inode->i_op->getxattr(dentry, name, value, error); + *xattr_value = value; + return error; +} + +/* Compare an extended attribute value with the given value */ +int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, + const char *value, size_t size, gfp_t flags) +{ + char *xattr_value = NULL; + int rc; + + rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags); + if (rc < 0) + return rc; + + if ((rc != size) || (memcmp(xattr_value, value, rc) != 0)) + rc = -EINVAL; + else + rc = 0; + kfree(xattr_value); + return rc; +} + ssize_t vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { @@ -243,8 +302,10 @@ vfs_removexattr(struct dentry *dentry, const char *name) error = inode->i_op->removexattr(dentry, name); mutex_unlock(&inode->i_mutex); - if (!error) + if (!error) { fsnotify_xattr(dentry); + evm_inode_post_removexattr(dentry, name); + } return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); @@ -336,7 +397,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, error = mnt_want_write_file(f); if (!error) { error = setxattr(dentry, name, value, size, flags); - mnt_drop_write(f->f_path.mnt); + mnt_drop_write_file(f); } fput(f); return error; @@ -563,7 +624,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) error = mnt_want_write_file(f); if (!error) { error = removexattr(dentry, name); - mnt_drop_write(f->f_path.mnt); + mnt_drop_write_file(f); } fput(f); return error; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index f7c8f7a..292eff1 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -61,12 +61,7 @@ extern void kmem_free(const void *); static inline void *kmem_zalloc_large(size_t size) { - void *ptr; - - ptr = vmalloc(size); - if (ptr) - memset(ptr, 0, size); - return ptr; + return vzalloc(size); } static inline void kmem_free_large(void *ptr) { diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b6c4b37..ac702a6 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -39,9 +39,11 @@ xfs_acl_from_disk(struct xfs_acl *aclp) struct posix_acl_entry *acl_e; struct posix_acl *acl; struct xfs_acl_entry *ace; - int count, i; + unsigned int count, i; count = be32_to_cpu(aclp->acl_cnt); + if (count > XFS_ACL_MAX_ENTRIES) + return ERR_PTR(-EFSCORRUPTED); acl = posix_acl_alloc(count, GFP_KERNEL); if (!acl) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index bdd9cb5..ce84ffd 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -452,7 +452,7 @@ xfs_alloc_read_agfl( if (error) return error; ASSERT(!xfs_buf_geterror(bp)); - XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); + xfs_buf_set_ref(bp, XFS_AGFL_REF); *bpp = bp; return 0; } @@ -2139,7 +2139,7 @@ xfs_read_agf( xfs_trans_brelse(tp, *bpp); return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); + xfs_buf_set_ref(*bpp, XFS_AGF_REF); return 0; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8c37dde..574d4ee 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -38,40 +38,6 @@ #include <linux/pagevec.h> #include <linux/writeback.h> - -/* - * Prime number of hash buckets since address is used as the key. - */ -#define NVSYNC 37 -#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC]) -static wait_queue_head_t xfs_ioend_wq[NVSYNC]; - -void __init -xfs_ioend_init(void) -{ - int i; - - for (i = 0; i < NVSYNC; i++) - init_waitqueue_head(&xfs_ioend_wq[i]); -} - -void -xfs_ioend_wait( - xfs_inode_t *ip) -{ - wait_queue_head_t *wq = to_ioend_wq(ip); - - wait_event(*wq, (atomic_read(&ip->i_iocount) == 0)); -} - -STATIC void -xfs_ioend_wake( - xfs_inode_t *ip) -{ - if (atomic_dec_and_test(&ip->i_iocount)) - wake_up(to_ioend_wq(ip)); -} - void xfs_count_page_state( struct page *page, @@ -115,25 +81,20 @@ xfs_destroy_ioend( xfs_ioend_t *ioend) { struct buffer_head *bh, *next; - struct xfs_inode *ip = XFS_I(ioend->io_inode); for (bh = ioend->io_buffer_head; bh; bh = next) { next = bh->b_private; bh->b_end_io(bh, !ioend->io_error); } - /* - * Volume managers supporting multiple paths can send back ENODEV - * when the final path disappears. In this case continuing to fill - * the page cache with dirty data which cannot be written out is - * evil, so prevent that. - */ - if (unlikely(ioend->io_error == -ENODEV)) { - xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, - __FILE__, __LINE__); + if (ioend->io_iocb) { + if (ioend->io_isasync) { + aio_complete(ioend->io_iocb, ioend->io_error ? + ioend->io_error : ioend->io_result, 0); + } + inode_dio_done(ioend->io_inode); } - xfs_ioend_wake(ip); mempool_free(ioend, xfs_ioend_pool); } @@ -156,6 +117,15 @@ xfs_ioend_new_eof( } /* + * Fast and loose check if this write could update the on-disk inode size. + */ +static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) +{ + return ioend->io_offset + ioend->io_size > + XFS_I(ioend->io_inode)->i_d.di_size; +} + +/* * Update on-disk file size now that data has been written to disk. The * current in-memory file size is i_size. If a write is beyond eof i_new_size * will be the intended file size until i_size is updated. If this write does @@ -173,9 +143,6 @@ xfs_setfilesize( xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - if (unlikely(ioend->io_error)) - return 0; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) return EAGAIN; @@ -192,6 +159,9 @@ xfs_setfilesize( /* * Schedule IO completion handling on the final put of an ioend. + * + * If there is no work to do we might as well call it a day and free the + * ioend right now. */ STATIC void xfs_finish_ioend( @@ -200,8 +170,10 @@ xfs_finish_ioend( if (atomic_dec_and_test(&ioend->io_remaining)) { if (ioend->io_type == IO_UNWRITTEN) queue_work(xfsconvertd_workqueue, &ioend->io_work); - else + else if (xfs_ioend_is_append(ioend)) queue_work(xfsdatad_workqueue, &ioend->io_work); + else + xfs_destroy_ioend(ioend); } } @@ -216,17 +188,24 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ioend->io_error = -EIO; + goto done; + } + if (ioend->io_error) + goto done; + /* * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IO_UNWRITTEN && - likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { - + if (ioend->io_type == IO_UNWRITTEN) { error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); - if (error) - ioend->io_error = error; + if (error) { + ioend->io_error = -error; + goto done; + } } /* @@ -236,6 +215,7 @@ xfs_end_io( error = xfs_setfilesize(ioend); ASSERT(!error || error == EAGAIN); +done: /* * If we didn't complete processing of the ioend, requeue it to the * tail of the workqueue for another attempt later. Otherwise destroy @@ -247,8 +227,6 @@ xfs_end_io( /* ensure we don't spin on blocked ioends */ delay(1); } else { - if (ioend->io_iocb) - aio_complete(ioend->io_iocb, ioend->io_result, 0); xfs_destroy_ioend(ioend); } } @@ -285,13 +263,13 @@ xfs_alloc_ioend( * all the I/O from calling the completion routine too early. */ atomic_set(&ioend->io_remaining, 1); + ioend->io_isasync = 0; ioend->io_error = 0; ioend->io_list = NULL; ioend->io_type = type; ioend->io_inode = inode; ioend->io_buffer_head = NULL; ioend->io_buffer_tail = NULL; - atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); ioend->io_offset = 0; ioend->io_size = 0; ioend->io_iocb = NULL; @@ -337,8 +315,8 @@ xfs_map_blocks( count = mp->m_maxioffset - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, - bmapi_flags, NULL, 0, imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + imap, &nimaps, bmapi_flags); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) @@ -551,7 +529,6 @@ xfs_cancel_ioend( unlock_buffer(bh); } while ((bh = next_bh) != NULL); - xfs_ioend_wake(XFS_I(ioend->io_inode)); mempool_free(ioend, xfs_ioend_pool); } while ((ioend = next) != NULL); } @@ -925,11 +902,11 @@ xfs_vm_writepage( * random callers for direct reclaim or memcg reclaim. We explicitly * allow reclaim from kswapd as the stack usage there is relatively low. * - * This should really be done by the core VM, but until that happens - * filesystems like XFS, btrfs and ext4 have to take care of this - * by themselves. + * This should never happen except in the case of a VM regression so + * warn about it. */ - if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == + PF_MEMALLOC)) goto redirty; /* @@ -1161,8 +1138,8 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) goto out_unlock; @@ -1300,7 +1277,6 @@ xfs_end_io_direct_write( bool is_async) { struct xfs_ioend *ioend = iocb->private; - struct inode *inode = ioend->io_inode; /* * blockdev_direct_IO can return an error even after the I/O @@ -1311,28 +1287,17 @@ xfs_end_io_direct_write( ioend->io_offset = offset; ioend->io_size = size; + ioend->io_iocb = iocb; + ioend->io_result = ret; if (private && size > 0) ioend->io_type = IO_UNWRITTEN; if (is_async) { - /* - * If we are converting an unwritten extent we need to delay - * the AIO completion until after the unwrittent extent - * conversion has completed, otherwise do it ASAP. - */ - if (ioend->io_type == IO_UNWRITTEN) { - ioend->io_iocb = iocb; - ioend->io_result = ret; - } else { - aio_complete(iocb, ret, 0); - } + ioend->io_isasync = 1; xfs_finish_ioend(ioend); } else { xfs_finish_ioend_sync(ioend); } - - /* XXX: probably should move into the real I/O completion handler */ - inode_dio_done(inode); } STATIC ssize_t diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 71f721e..116dd5c 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -47,6 +47,7 @@ typedef struct xfs_ioend { unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ + unsigned int io_isasync : 1; /* needs aio_complete */ struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ struct buffer_head *io_buffer_tail;/* buffer linked list tail */ @@ -60,9 +61,6 @@ typedef struct xfs_ioend { extern const struct address_space_operations xfs_address_space_operations; extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); -extern void xfs_ioend_init(void); -extern void xfs_ioend_wait(struct xfs_inode *); - extern void xfs_count_page_state(struct page *, int *, int *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 160bcdc..1e5d97f 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -319,7 +319,7 @@ xfs_attr_set_int( return (error); } - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * If the attribute list is non-existent or a shortform list, @@ -389,7 +389,7 @@ xfs_attr_set_int( * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * Commit the leaf transformation. We'll need another (linked) @@ -537,7 +537,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) * No need to make quota reservations here. We expect to release some * blocks not allocate in the common case. */ - xfs_trans_ijoin(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * Decide on what work routines to call based on the inode size. @@ -809,7 +809,7 @@ xfs_attr_inactive(xfs_inode_t *dp) * No need to make quota reservations here. We expect to release some * blocks, not allocate, in the common case. */ - xfs_trans_ijoin(trans, dp); + xfs_trans_ijoin(trans, dp, 0); /* * Decide on what work routines to call based on the inode size. @@ -823,18 +823,6 @@ xfs_attr_inactive(xfs_inode_t *dp) if (error) goto out; - /* - * Signal synchronous inactive transactions unless this is a - * synchronous mount filesystem in which case we know that we're here - * because we've been called out of xfs_inactive which means that the - * last reference is gone and the unlink transaction has already hit - * the disk so async inactive transactions are safe. - */ - if (!(mp->m_flags & XFS_MOUNT_WSYNC)) { - if (dp->i_d.di_anextents > 0) - xfs_trans_set_sync(trans); - } - error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); if (error) goto out; @@ -973,7 +961,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the current trans (including the inode) and start @@ -1075,7 +1063,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_buf_done(bp); @@ -1149,7 +1137,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_buf_done(bp); return(0); @@ -1303,7 +1291,7 @@ restart: * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the node conversion and start the next @@ -1340,7 +1328,7 @@ restart: * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else { /* * Addition succeeded, update Btree hashvals. @@ -1452,7 +1440,7 @@ restart: * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -1584,7 +1572,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the Btree join operation and start a new trans. @@ -1635,7 +1623,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); } else xfs_da_brelse(args->trans, bp); } @@ -1975,10 +1963,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) lblkno = args->rmtblkno; while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; - error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, map, &nmap, NULL); + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, map, &nmap, + XFS_BMAPI_ATTRFORK); if (error) return(error); ASSERT(nmap >= 1); @@ -2052,10 +2039,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | - XFS_BMAPI_WRITE, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, args->firstblock, args->total, &map, &nmap, args->flist); if (!error) { @@ -2074,7 +2060,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, dp); + xfs_trans_ijoin(args->trans, dp, 0); ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -2104,14 +2090,11 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, 0, &map, &nmap, - NULL); - if (error) { + error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) return(error); - } ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -2121,16 +2104,17 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, XBF_LOCK | XBF_DONT_BLOCK); - ASSERT(!xfs_buf_geterror(bp)); - + if (!bp) + return ENOMEM; tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : XFS_BUF_SIZE(bp); xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); if (tmp < XFS_BUF_SIZE(bp)) xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); - if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ - return (error); - } + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ + xfs_buf_relse(bp); + if (error) + return error; src += tmp; valuelen -= tmp; @@ -2166,16 +2150,12 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) /* * Try to remember where we decided to put the value. */ - xfs_bmap_init(args->flist, args->firstblock); nmap = 1; - error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, 0, &map, &nmap, - args->flist); - if (error) { + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) return(error); - } ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -2188,8 +2168,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) */ bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); if (bp) { - XFS_BUF_STALE(bp); - XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); bp = NULL; } @@ -2227,7 +2206,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) * a new one. We need the inode to be in all transactions. */ if (committed) - xfs_trans_ijoin(args->trans, args->dp); + xfs_trans_ijoin(args->trans, args->dp, 0); /* * Close out trans and start the next one in the chain. diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 8fad960..c1b55e5 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -110,6 +110,7 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags) /* * Query whether the requested number of additional bytes of extended * attribute space will be able to fit inline. + * * Returns zero if not, else the di_forkoff fork offset to be used in the * literal area for attribute data once the new bytes have been added. * @@ -122,7 +123,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) int offset; int minforkoff; /* lower limit on valid forkoff locations */ int maxforkoff; /* upper limit on valid forkoff locations */ - int dsize; + int dsize; xfs_mount_t *mp = dp->i_mount; offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ @@ -136,47 +137,60 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) return (offset >= minforkoff) ? minforkoff : 0; } - if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { - if (bytes <= XFS_IFORK_ASIZE(dp)) - return dp->i_d.di_forkoff; + /* + * If the requested numbers of bytes is smaller or equal to the + * current attribute fork size we can always proceed. + * + * Note that if_bytes in the data fork might actually be larger than + * the current data fork size is due to delalloc extents. In that + * case either the extent count will go down when they are converted + * to real extents, or the delalloc conversion will take care of the + * literal area rebalancing. + */ + if (bytes <= XFS_IFORK_ASIZE(dp)) + return dp->i_d.di_forkoff; + + /* + * For attr2 we can try to move the forkoff if there is space in the + * literal area, but for the old format we are done if there is no + * space in the fixed attribute fork. + */ + if (!(mp->m_flags & XFS_MOUNT_ATTR2)) return 0; - } dsize = dp->i_df.if_bytes; - + switch (dp->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* + /* * If there is no attr fork and the data fork is extents, - * determine if creating the default attr fork will result - * in the extents form migrating to btree. If so, the - * minimum offset only needs to be the space required for + * determine if creating the default attr fork will result + * in the extents form migrating to btree. If so, the + * minimum offset only needs to be the space required for * the btree root. - */ + */ if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > xfs_default_attroffset(dp)) dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); break; - case XFS_DINODE_FMT_BTREE: /* - * If have data btree then keep forkoff if we have one, - * otherwise we are adding a new attr, so then we set - * minforkoff to where the btree root can finish so we have + * If we have a data btree then keep forkoff if we have one, + * otherwise we are adding a new attr, so then we set + * minforkoff to where the btree root can finish so we have * plenty of room for attrs */ if (dp->i_d.di_forkoff) { - if (offset < dp->i_d.di_forkoff) + if (offset < dp->i_d.di_forkoff) return 0; - else - return dp->i_d.di_forkoff; - } else - dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); + return dp->i_d.di_forkoff; + } + dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); break; } - - /* - * A data fork btree root must have space for at least + + /* + * A data fork btree root must have space for at least * MINDBTPTRS key/ptr pairs if the data fork is small or empty. */ minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); @@ -186,10 +200,10 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ - if (offset >= minforkoff && offset < maxforkoff) - return offset; if (offset >= maxforkoff) return maxforkoff; + if (offset >= minforkoff) + return offset; return 0; } @@ -2926,9 +2940,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, * Try to remember where we decided to put the value. */ nmap = 1; - error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, &map, &nmap, NULL); + error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) { return(error); } @@ -2948,6 +2961,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, bp = xfs_trans_get_buf(*trans, dp->i_mount->m_ddev_targp, dblkno, dblkcnt, XBF_LOCK); + if (!bp) + return ENOMEM; xfs_trans_binval(*trans, bp); /* * Roll to next transaction. diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 452a291..d0ab788 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -50,17 +50,22 @@ #include "xfs_trace.h" -#ifdef DEBUG -STATIC void -xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork); -#endif - kmem_zone_t *xfs_bmap_free_item_zone; /* * Prototypes for internal bmap routines. */ +#ifdef DEBUG +STATIC void +xfs_bmap_check_leaf_extents( + struct xfs_btree_cur *cur, + struct xfs_inode *ip, + int whichfork); +#else +#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) +#endif + /* * Called from xfs_bmap_add_attrfork to handle extents format files. @@ -85,58 +90,6 @@ xfs_bmap_add_attrfork_local( int *flags); /* inode logging flags */ /* - * Called by xfs_bmap_add_extent to handle cases converting a delayed - * allocation to a real allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_delay_real( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp); /* inode logging flags */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a delayed allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp); /* inode logging flags */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a real allocation. - */ -STATIC int /* error */ -xfs_bmap_add_extent_hole_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Called by xfs_bmap_add_extent to handle cases converting an unwritten - * allocation to a real allocation or vice versa. - */ -STATIC int /* error */ -xfs_bmap_add_extent_unwritten_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp); /* inode logging flags */ - -/* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. */ @@ -215,19 +168,6 @@ xfs_bmap_search_extents( xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ /* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -STATIC int /* error */ -xfs_bmap_isaeof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t off, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - char *aeof); /* return value */ - -/* * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". */ @@ -431,188 +371,13 @@ xfs_bmap_add_attrfork_local( } /* - * Called by xfs_bmapi to update file extent records and the btree - * after allocating space (or doing a delayed allocation). - */ -STATIC int /* error */ -xfs_bmap_add_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - xfs_btree_cur_t *cur; /* btree cursor or null */ - xfs_filblks_t da_new; /* new count del alloc blocks used */ - xfs_filblks_t da_old; /* old count del alloc blocks used */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork ptr */ - int logflags; /* returned value */ - xfs_extnum_t nextents; /* number of extents in file now */ - - XFS_STATS_INC(xs_add_exlist); - - cur = *curp; - ifp = XFS_IFORK_PTR(ip, whichfork); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - da_old = da_new = 0; - error = 0; - - ASSERT(*idx >= 0); - ASSERT(*idx <= nextents); - - /* - * This is the first extent added to a new/empty file. - * Special case this one, so other routines get to assume there are - * already extents in the list. - */ - if (nextents == 0) { - xfs_iext_insert(ip, *idx, 1, new, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - - ASSERT(cur == NULL); - - if (!isnullstartblock(new->br_startblock)) { - XFS_IFORK_NEXT_SET(ip, whichfork, 1); - logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); - } else - logflags = 0; - } - /* - * Any kind of new delayed allocation goes here. - */ - else if (isnullstartblock(new->br_startblock)) { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_delay(ip, idx, new, - &logflags); - } - /* - * Real allocation off the end of the file. - */ - else if (*idx == nextents) { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, - &logflags, whichfork); - } else { - xfs_bmbt_irec_t prev; /* old extent at offset idx */ - - /* - * Get the record referred to by idx. - */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev); - /* - * If it's a real allocation record, and the new allocation ends - * after the start of the referred to record, then we're filling - * in a delayed or unwritten allocation with a real one, or - * converting real back to unwritten. - */ - if (!isnullstartblock(new->br_startblock) && - new->br_startoff + new->br_blockcount > prev.br_startoff) { - if (prev.br_state != XFS_EXT_UNWRITTEN && - isnullstartblock(prev.br_startblock)) { - da_old = startblockval(prev.br_startblock); - if (cur) - ASSERT(cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL); - error = xfs_bmap_add_extent_delay_real(tp, ip, - idx, &cur, new, &da_new, - first, flist, &logflags); - } else { - ASSERT(new->br_state == XFS_EXT_NORM || - new->br_state == XFS_EXT_UNWRITTEN); - - error = xfs_bmap_add_extent_unwritten_real(ip, - idx, &cur, new, &logflags); - if (error) - goto done; - } - } - /* - * Otherwise we're filling in a hole with an allocation. - */ - else { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - error = xfs_bmap_add_extent_hole_real(ip, idx, cur, - new, &logflags, whichfork); - } - } - - if (error) - goto done; - ASSERT(*curp == cur || *curp == NULL); - - /* - * Convert to a btree if necessary. - */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { - int tmp_logflags; /* partial log flag return val */ - - ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, first, - flist, &cur, da_old > 0, &tmp_logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto done; - } - /* - * Adjust for changes in reserved delayed indirect blocks. - * Nothing to do for disk quotas here. - */ - if (da_old || da_new) { - xfs_filblks_t nblks; - - nblks = da_new; - if (cur) - nblks += cur->bc_private.b.allocated; - ASSERT(nblks <= da_old); - if (nblks < da_old) - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - nblks), 0); - } - /* - * Clear out the allocated field, done with it now in any case. - */ - if (cur) { - cur->bc_private.b.allocated = 0; - *curp = cur; - } -done: -#ifdef DEBUG - if (!error) - xfs_bmap_check_leaf_extents(*curp, ip, whichfork); -#endif - *logflagsp = logflags; - return error; -} - -/* - * Called by xfs_bmap_add_extent to handle cases converting a delayed - * allocation to a real allocation. + * Convert a delayed allocation to a real allocation. */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( - struct xfs_trans *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp) /* inode logging flags */ + struct xfs_bmalloca *bma) { - xfs_btree_cur_t *cur; /* btree cursor */ + struct xfs_bmbt_irec *new = &bma->got; int diff; /* temp value */ xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ @@ -623,10 +388,22 @@ xfs_bmap_add_extent_delay_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - xfs_filblks_t temp=0; /* value for dnew calculations */ - xfs_filblks_t temp2=0;/* value for dnew calculations */ + xfs_filblks_t da_new; /* new count del alloc blocks used */ + xfs_filblks_t da_old; /* old count del alloc blocks used */ + xfs_filblks_t temp=0; /* value for da_new calculations */ + xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); + + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!bma->cur || + (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] @@ -634,14 +411,15 @@ xfs_bmap_add_extent_delay_real( /* * Set up a bunch of variables to make the tests simpler. */ - cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, *idx); + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_get_all(ep, &PREV); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + da_old = startblockval(PREV.br_startblock); + da_new = 0; + /* * Set flags determining what part of the previous delayed allocation * extent is being replaced by a real allocation. @@ -655,9 +433,9 @@ xfs_bmap_add_extent_delay_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (*idx > 0) { + if (bma->idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT); if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -675,9 +453,9 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; @@ -708,38 +486,41 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + bma->idx--; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 2, state); - ip->i_d.di_nextents--; - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); + bma->ip->i_d.di_nextents--; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_delete(cur, &i))) + error = xfs_btree_delete(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_decrement(cur, 0, &i))) + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + - RIGHT.br_blockcount, LEFT.br_state))) + RIGHT.br_blockcount, LEFT.br_state); + if (error) goto done; } - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -747,30 +528,31 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ - --*idx; + bma->idx--; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + - PREV.br_blockcount, LEFT.br_state))) + PREV.br_blockcount, LEFT.br_state); + if (error) goto done; } - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -778,30 +560,30 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); - if (cur == NULL) + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + error = xfs_bmbt_update(bma->cur, PREV.br_startoff, new->br_startblock, PREV.br_blockcount + - RIGHT.br_blockcount, PREV.br_state))) + RIGHT.br_blockcount, PREV.br_state); + if (error) goto done; } - - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -810,27 +592,27 @@ xfs_bmap_add_extent_delay_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ip->i_d.di_nextents++; - if (cur == NULL) + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - - *dnew = 0; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -838,39 +620,40 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_); temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - if (cur == NULL) + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state))) + LEFT.br_state); + if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - --*idx; - *dnew = temp; + bma->idx--; break; case BMAP_LEFT_FILLING: @@ -878,43 +661,43 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, new_endoff); temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, + &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx + 1); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); - - *dnew = temp; + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx + 1); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -923,38 +706,39 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is contiguous with the new allocation. */ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, RIGHT.br_state); - trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); - if (cur == NULL) + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, + error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, - RIGHT.br_state))) + RIGHT.br_state); + if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ++*idx; - *dnew = temp; + bma->idx++; break; case BMAP_RIGHT_FILLING: @@ -963,42 +747,43 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. */ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, *idx + 1, 1, new, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, 1, + &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - ++*idx; - *dnew = temp; + bma->idx++; break; case 0: @@ -1024,82 +809,65 @@ xfs_bmap_add_extent_delay_real( */ temp = new->br_startoff - PREV.br_startoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ LEFT = *new; RIGHT.br_state = PREV.br_state; RIGHT.br_startblock = nullstartblock( - (int)xfs_bmap_worst_indlen(ip, temp2)); + (int)xfs_bmap_worst_indlen(bma->ip, temp2)); RIGHT.br_startoff = new_endoff; RIGHT.br_blockcount = temp2; /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ - xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(tp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = xfs_bmap_worst_indlen(ip, temp); - temp2 = xfs_bmap_worst_indlen(ip, temp2); + temp = xfs_bmap_worst_indlen(bma->ip, temp); + temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - if (diff > 0 && - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) { - /* - * Ick gross gag me with a spoon. - */ - ASSERT(0); /* want to see if this ever happens! */ - while (diff > 0) { - if (temp) { - temp--; - diff--; - if (!diff || - !xfs_icsb_modify_counters(ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) - break; - } - if (temp2) { - temp2--; - diff--; - if (!diff || - !xfs_icsb_modify_counters(ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0)) - break; - } - } + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + if (diff > 0) { + error = xfs_icsb_modify_counters(bma->ip->i_mount, + XFS_SBS_FDBLOCKS, + -((int64_t)diff), 0); + ASSERT(!error); + if (error) + goto done; } - ep = xfs_iext_get_ext(ifp, *idx); + + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2), nullstartblock((int)temp2)); - trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_); - ++*idx; - *dnew = temp + temp2; + bma->idx++; + da_new = temp + temp2; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: @@ -1114,9 +882,40 @@ xfs_bmap_add_extent_delay_real( */ ASSERT(0); } - *curp = cur; + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + da_old > 0, &tmp_logflags, XFS_DATA_FORK); + bma->logflags |= tmp_logflags; + if (error) + goto done; + } + + /* adjust for changes in reserved delayed indirect blocks */ + if (da_old || da_new) { + temp = da_new; + if (bma->cur) + temp += bma->cur->bc_private.b.allocated; + ASSERT(temp <= da_old); + if (temp < da_old) + xfs_icsb_modify_counters(bma->ip->i_mount, + XFS_SBS_FDBLOCKS, + (int64_t)(da_old - temp), 0); + } + + /* clear out the allocated field, done with it now in any case. */ + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); done: - *logflagsp = rval; + bma->logflags |= rval; return error; #undef LEFT #undef RIGHT @@ -1124,15 +923,17 @@ done: } /* - * Called by xfs_bmap_add_extent to handle cases converting an unwritten - * allocation to a real allocation or vice versa. + * Convert an unwritten allocation to a real allocation or vice versa. */ STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( + struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ @@ -1148,15 +949,25 @@ xfs_bmap_add_extent_unwritten_real( int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ + *logflagsp = 0; + + cur = *curp; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + + ASSERT(*idx >= 0); + ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + + XFS_STATS_INC(xs_add_exlist); + #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] + /* * Set up a bunch of variables to make the tests simpler. */ error = 0; - cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &PREV); newext = new->br_state; @@ -1406,10 +1217,11 @@ xfs_bmap_add_extent_unwritten_real( goto done; if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - if (xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state)) + LEFT.br_state); + if (error) goto done; } break; @@ -1607,9 +1419,29 @@ xfs_bmap_add_extent_unwritten_real( */ ASSERT(0); } - *curp = cur; + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + 0, &tmp_logflags, XFS_DATA_FORK); + *logflagsp |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } + + xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); done: - *logflagsp = rval; + *logflagsp |= rval; return error; #undef LEFT #undef RIGHT @@ -1617,16 +1449,13 @@ done: } /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a delayed allocation. + * Convert a hole to a delayed allocation. */ -/*ARGSUSED*/ -STATIC int /* error */ +STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp) /* inode logging flags */ + xfs_bmbt_irec_t *new) /* new data to add to file extents */ { xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ @@ -1761,23 +1590,17 @@ xfs_bmap_add_extent_hole_delay( * Nothing to do for disk quota accounting here. */ } - *logflagsp = 0; - return 0; } /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a real allocation. + * Convert a hole to a real allocation. */ STATIC int /* error */ xfs_bmap_add_extent_hole_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t *idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + struct xfs_bmalloca *bma, + int whichfork) { + struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -1786,19 +1609,26 @@ xfs_bmap_add_extent_hole_real( int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); - state = 0; + ifp = XFS_IFORK_PTR(bma->ip, whichfork); + + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!bma->cur || + !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + state = 0; if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; /* * Check and set flags if this segment has a left neighbor. */ - if (*idx > 0) { + if (bma->idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -1807,9 +1637,9 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1846,39 +1676,42 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), left.br_blockcount + new->br_blockcount + right.br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_iext_remove(ip, *idx + 1, 1, state); + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - if (cur == NULL) { + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1); + if (bma->cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, - right.br_startoff, - right.br_startblock, - right.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, + right.br_startblock, right.br_blockcount, + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_delete(cur, &i))) + error = xfs_btree_delete(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_btree_decrement(cur, 0, &i))) + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, left.br_startoff, + error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount + right.br_blockcount, - left.br_state))) + left.br_state); + if (error) goto done; } break; @@ -1889,27 +1722,28 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - --*idx; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - if (cur == NULL) { + if (bma->cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, - left.br_startoff, - left.br_startblock, - left.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff, + left.br_startblock, left.br_blockcount, + &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, left.br_startoff, + error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount, - left.br_state))) + left.br_state); + if (error) goto done; } break; @@ -1920,28 +1754,30 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx), new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - if (cur == NULL) { + if (bma->cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, right.br_startblock, - right.br_blockcount, &i))) + right.br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_update(cur, new->br_startoff, + error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, - right.br_state))) + right.br_state); + if (error) goto done; } break; @@ -1952,28 +1788,50 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_iext_insert(ip, *idx, 1, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); - if (cur == NULL) { + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1); + if (bma->cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, - new->br_blockcount, &i))) + new->br_blockcount, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); - cur->bc_rec.b.br_state = new->br_state; - if ((error = xfs_btree_insert(cur, &i))) + bma->cur->bc_rec.b.br_state = new->br_state; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } break; } + + /* convert to a btree if necessary */ + if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 0, &tmp_logflags, whichfork); + bma->logflags |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: - *logflagsp = rval; + bma->logflags |= rval; return error; } @@ -2160,26 +2018,26 @@ xfs_bmap_adjacent( XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) mp = ap->ip->i_mount; - nullfb = ap->firstblock == NULLFSBLOCK; + nullfb = *ap->firstblock == NULLFSBLOCK; rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); /* * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. */ - if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && - !isnullstartblock(ap->prevp->br_startblock) && - ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, - ap->prevp->br_startblock)) { - ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; + if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, + ap->prev.br_startblock)) { + ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; /* * Adjust for the gap between prevp and us. */ - adjust = ap->off - - (ap->prevp->br_startoff + ap->prevp->br_blockcount); + adjust = ap->offset - + (ap->prev.br_startoff + ap->prev.br_blockcount); if (adjust && - ISVALID(ap->rval + adjust, ap->prevp->br_startblock)) - ap->rval += adjust; + ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) + ap->blkno += adjust; } /* * If not at eof, then compare the two neighbor blocks. @@ -2196,17 +2054,17 @@ xfs_bmap_adjacent( * If there's a previous (left) block, select a requested * start block based on it. */ - if (ap->prevp->br_startoff != NULLFILEOFF && - !isnullstartblock(ap->prevp->br_startblock) && - (prevbno = ap->prevp->br_startblock + - ap->prevp->br_blockcount) && - ISVALID(prevbno, ap->prevp->br_startblock)) { + if (ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + (prevbno = ap->prev.br_startblock + + ap->prev.br_blockcount) && + ISVALID(prevbno, ap->prev.br_startblock)) { /* * Calculate gap to end of previous block. */ - adjust = prevdiff = ap->off - - (ap->prevp->br_startoff + - ap->prevp->br_blockcount); + adjust = prevdiff = ap->offset - + (ap->prev.br_startoff + + ap->prev.br_blockcount); /* * Figure the startblock based on the previous block's * end and the gap size. @@ -2215,9 +2073,9 @@ xfs_bmap_adjacent( * allocating, or using it gives us an invalid block * number, then just use the end of the previous block. */ - if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && ISVALID(prevbno + prevdiff, - ap->prevp->br_startblock)) + ap->prev.br_startblock)) prevbno += adjust; else prevdiff += adjust; @@ -2238,16 +2096,16 @@ xfs_bmap_adjacent( * If there's a following (right) block, select a requested * start block based on it. */ - if (!isnullstartblock(ap->gotp->br_startblock)) { + if (!isnullstartblock(ap->got.br_startblock)) { /* * Calculate gap to start of next block. */ - adjust = gotdiff = ap->gotp->br_startoff - ap->off; + adjust = gotdiff = ap->got.br_startoff - ap->offset; /* * Figure the startblock based on the next block's * start and the gap size. */ - gotbno = ap->gotp->br_startblock; + gotbno = ap->got.br_startblock; /* * Heuristic! * If the gap is large relative to the piece we're @@ -2255,12 +2113,12 @@ xfs_bmap_adjacent( * number, then just use the start of the next block * offset by our length. */ - if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && ISVALID(gotbno - gotdiff, gotbno)) gotbno -= adjust; - else if (ISVALID(gotbno - ap->alen, gotbno)) { - gotbno -= ap->alen; - gotdiff += adjust - ap->alen; + else if (ISVALID(gotbno - ap->length, gotbno)) { + gotbno -= ap->length; + gotdiff += adjust - ap->length; } else gotdiff += adjust; /* @@ -2278,14 +2136,14 @@ xfs_bmap_adjacent( gotbno = NULLFSBLOCK; /* * If both valid, pick the better one, else the only good - * one, else ap->rval is already set (to 0 or the inode block). + * one, else ap->blkno is already set (to 0 or the inode block). */ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) - ap->rval = prevdiff <= gotdiff ? prevbno : gotbno; + ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno; else if (prevbno != NULLFSBLOCK) - ap->rval = prevbno; + ap->blkno = prevbno; else if (gotbno != NULLFSBLOCK) - ap->rval = gotbno; + ap->blkno = gotbno; } #undef ISVALID } @@ -2305,24 +2163,24 @@ xfs_bmap_rtalloc( mp = ap->ip->i_mount; align = xfs_get_extsz_hint(ap->ip); prod = align / mp->m_sb.sb_rextsize; - error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, - ap->conv, &ap->off, &ap->alen); + ap->conv, &ap->offset, &ap->length); if (error) return error; - ASSERT(ap->alen); - ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); + ASSERT(ap->length); + ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); /* * If the offset & length are not perfectly aligned * then kill prod, it will just get us in trouble. */ - if (do_mod(ap->off, align) || ap->alen % align) + if (do_mod(ap->offset, align) || ap->length % align) prod = 1; /* * Set ralen to be the actual requested length in rtextents. */ - ralen = ap->alen / mp->m_sb.sb_rextsize; + ralen = ap->length / mp->m_sb.sb_rextsize; /* * If the old value was close enough to MAXEXTLEN that * we rounded up to it, cut it back so it's valid again. @@ -2337,21 +2195,21 @@ xfs_bmap_rtalloc( * Lock out other modifications to the RT bitmap inode. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* * If it's an allocation to an empty file at offset 0, * pick an extent that will space things out in the rt area. */ - if (ap->eof && ap->off == 0) { + if (ap->eof && ap->offset == 0) { xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); if (error) return error; - ap->rval = rtx * mp->m_sb.sb_rextsize; + ap->blkno = rtx * mp->m_sb.sb_rextsize; } else { - ap->rval = 0; + ap->blkno = 0; } xfs_bmap_adjacent(ap); @@ -2359,23 +2217,23 @@ xfs_bmap_rtalloc( /* * Realtime allocation, done through xfs_rtallocate_extent. */ - atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; - do_div(ap->rval, mp->m_sb.sb_rextsize); - rtb = ap->rval; - ap->alen = ralen; - if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, + atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->blkno, mp->m_sb.sb_rextsize); + rtb = ap->blkno; + ap->length = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, &ralen, atype, ap->wasdel, prod, &rtb))) return error; if (rtb == NULLFSBLOCK && prod > 1 && - (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, - ap->alen, &ralen, atype, + (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, + ap->length, &ralen, atype, ap->wasdel, 1, &rtb))) return error; - ap->rval = rtb; - if (ap->rval != NULLFSBLOCK) { - ap->rval *= mp->m_sb.sb_rextsize; + ap->blkno = rtb; + if (ap->blkno != NULLFSBLOCK) { + ap->blkno *= mp->m_sb.sb_rextsize; ralen *= mp->m_sb.sb_rextsize; - ap->alen = ralen; + ap->length = ralen; ap->ip->i_d.di_nblocks += ralen; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -2388,7 +2246,7 @@ xfs_bmap_rtalloc( ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_RTBCOUNT, (long) ralen); } else { - ap->alen = 0; + ap->length = 0; } return 0; } @@ -2503,7 +2361,7 @@ xfs_bmap_btalloc_nullfb( * AG as the stream may have moved. */ if (xfs_inode_is_filestream(ap->ip)) - ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); return 0; } @@ -2525,55 +2383,57 @@ xfs_bmap_btalloc( int tryagain; int error; + ASSERT(ap->length); + mp = ap->ip->i_mount; align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { - error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, - &ap->off, &ap->alen); + &ap->offset, &ap->length); ASSERT(!error); - ASSERT(ap->alen); + ASSERT(ap->length); } - nullfb = ap->firstblock == NULLFSBLOCK; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + nullfb = *ap->firstblock == NULLFSBLOCK; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); if (nullfb) { if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { ag = xfs_filestream_lookup_ag(ap->ip); ag = (ag != NULLAGNUMBER) ? ag : 0; - ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); + ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); } else { - ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino); } } else - ap->rval = ap->firstblock; + ap->blkno = *ap->firstblock; xfs_bmap_adjacent(ap); /* - * If allowed, use ap->rval; otherwise must use firstblock since + * If allowed, use ap->blkno; otherwise must use firstblock since * it's in the right allocation group. */ - if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) + if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno) ; else - ap->rval = ap->firstblock; + ap->blkno = *ap->firstblock; /* * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; args.tp = ap->tp; args.mp = mp; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; /* Trim the allocation back to the maximum an AG can fit. */ - args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); - args.firstblock = ap->firstblock; + args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); + args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); if (error) return error; - } else if (ap->low) { + } else if (ap->flist->xbf_low) { if (xfs_inode_is_filestream(ap->ip)) args.type = XFS_ALLOCTYPE_FIRST_AG; else @@ -2587,14 +2447,14 @@ xfs_bmap_btalloc( /* apply extent size hints if obtained earlier */ if (unlikely(align)) { args.prod = align; - if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) + if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { args.prod = 1; args.mod = 0; } else { args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; - if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) + if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod)))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } /* @@ -2606,8 +2466,8 @@ xfs_bmap_btalloc( * is >= the stripe unit and the allocation offset is * at the end of file. */ - if (!ap->low && ap->aeof) { - if (!ap->off) { + if (!ap->flist->xbf_low && ap->aeof) { + if (!ap->offset) { args.alignment = mp->m_dalign; atype = args.type; isaligned = 1; @@ -2660,7 +2520,7 @@ xfs_bmap_btalloc( * turned on. */ args.type = atype; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; args.alignment = mp->m_dalign; args.minlen = nextminlen; args.minalignslop = 0; @@ -2674,7 +2534,7 @@ xfs_bmap_btalloc( * try again. */ args.type = atype; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; args.alignment = 0; if ((error = xfs_alloc_vextent(&args))) return error; @@ -2683,7 +2543,7 @@ xfs_bmap_btalloc( args.minlen > ap->minlen) { args.minlen = ap->minlen; args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -2694,13 +2554,26 @@ xfs_bmap_btalloc( args.minleft = 0; if ((error = xfs_alloc_vextent(&args))) return error; - ap->low = 1; + ap->flist->xbf_low = 1; } if (args.fsbno != NULLFSBLOCK) { - ap->firstblock = ap->rval = args.fsbno; + /* + * check the allocation happened at the same or higher AG than + * the first block that was allocated. + */ + ASSERT(*ap->firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *ap->firstblock) == + XFS_FSB_TO_AGNO(mp, args.fsbno) || + (ap->flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *ap->firstblock) < + XFS_FSB_TO_AGNO(mp, args.fsbno))); + + ap->blkno = args.fsbno; + if (*ap->firstblock == NULLFSBLOCK) + *ap->firstblock = args.fsbno; ASSERT(nullfb || fb_agno == args.agno || - (ap->low && fb_agno < args.agno)); - ap->alen = args.len; + (ap->flist->xbf_low && fb_agno < args.agno)); + ap->length = args.len; ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -2714,8 +2587,8 @@ xfs_bmap_btalloc( XFS_TRANS_DQ_BCOUNT, (long) args.len); } else { - ap->rval = NULLFSBLOCK; - ap->alen = 0; + ap->blkno = NULLFSBLOCK; + ap->length = 0; } return 0; } @@ -3589,7 +3462,7 @@ xfs_bmap_add_attrfork( } ASSERT(ip->i_d.di_anextents == 0); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); switch (ip->i_d.di_format) { @@ -3782,19 +3655,11 @@ xfs_bmap_compute_maxlevels( * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi * caller. Frees all the extents that need freeing, which must be done * last due to locking considerations. We never free any extents in - * the first transaction. This is to allow the caller to make the first - * transaction a synchronous one so that the pointers to the data being - * broken in this transaction will be permanent before the data is actually - * freed. This is necessary to prevent blocks from being reallocated - * and written to before the free and reallocation are actually permanent. - * We do not just make the first transaction synchronous here, because - * there are more efficient ways to gain the same protection in some cases - * (see the file truncation code). + * the first transaction. * * Return 1 if the given transaction was committed and a new one * started, and 0 otherwise in the committed parameter. */ -/*ARGSUSED*/ int /* error */ xfs_bmap_finish( xfs_trans_t **tp, /* transaction pointer addr */ @@ -3994,42 +3859,122 @@ xfs_bmap_last_before( return 0; } +STATIC int +xfs_bmap_last_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *rec, + int *is_empty) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int error; + int nextents; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *is_empty = 1; + return 0; + } + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); + *is_empty = 0; + return 0; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + * + * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be + * at, or past the EOF. + */ +STATIC int +xfs_bmap_isaeof( + struct xfs_bmalloca *bma, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + bma->aeof = 0; + error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, + &is_empty); + if (error || is_empty) + return error; + + /* + * Check if we are allocation or past the last extent, or at least into + * the last delayed allocated extent. + */ + bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || + (bma->offset >= rec.br_startoff && + isnullstartblock(rec.br_startblock)); + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. All offsets are considered outside + * the end of file for an empty fork, so 1 is returned in *eof in that case. + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof) +{ + struct xfs_bmbt_irec rec; + int error; + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); + if (error || *eof) + return error; + + *eof = endoff >= rec.br_startoff + rec.br_blockcount; + return 0; +} + /* * Returns the file-relative block number of the first block past eof in * the file. This is not based on i_size, it is based on the extent records. * Returns 0 for local files, as they do not have extent records. */ -int /* error */ +int xfs_bmap_last_offset( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t *last_block, + int whichfork) { - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extent entries */ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + *last_block = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + return 0; if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) return XFS_ERROR(EIO); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *last_block = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + if (error || is_empty) return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (!nextents) { - *last_block = 0; - return 0; - } - ep = xfs_iext_get_ext(ifp, nextents - 1); - *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep); + + *last_block = rec.br_startoff + rec.br_blockcount; return 0; } @@ -4159,7 +4104,6 @@ xfs_bmap_read_extents( xfs_extnum_t num_recs; xfs_extnum_t start; - num_recs = xfs_btree_get_numrecs(block); if (unlikely(i + num_recs > room)) { ASSERT(i + num_recs <= room); @@ -4282,9 +4226,8 @@ xfs_bmap_validate_ret( ASSERT(i == 0 || mval[i - 1].br_startoff + mval[i - 1].br_blockcount == mval[i].br_startoff); - if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) - ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && - mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); ASSERT(mval[i].br_state == XFS_EXT_NORM || mval[i].br_state == XFS_EXT_UNWRITTEN); } @@ -4293,66 +4236,611 @@ xfs_bmap_validate_ret( /* - * Map file blocks to filesystem blocks. - * File range is given by the bno/len pair. - * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) - * into a hole or past eof. - * Only allocates blocks from a single allocation group, - * to avoid locking problems. + * Trim the returned map to the required bounds + */ +STATIC void +xfs_bmapi_trim_map( + struct xfs_bmbt_irec *mval, + struct xfs_bmbt_irec *got, + xfs_fileoff_t *bno, + xfs_filblks_t len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int n, + int flags) +{ + if ((flags & XFS_BMAPI_ENTIRE) || + got->br_startoff + got->br_blockcount <= obno) { + *mval = *got; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + return; + } + + if (obno > *bno) + *bno = obno; + ASSERT((*bno >= obno) || (n == 0)); + ASSERT(*bno < end); + mval->br_startoff = *bno; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + else + mval->br_startblock = got->br_startblock + + (*bno - got->br_startoff); + /* + * Return the minimum of what we got and what we asked for for + * the length. We can use the len variable here because it is + * modified below and we could have been there before coming + * here if the first part of the allocation didn't overlap what + * was asked for. + */ + mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno, + got->br_blockcount - (*bno - got->br_startoff)); + mval->br_state = got->br_state; + ASSERT(mval->br_blockcount <= len); + return; +} + +/* + * Update and validate the extent map to return + */ +STATIC void +xfs_bmapi_update_map( + struct xfs_bmbt_irec **map, + xfs_fileoff_t *bno, + xfs_filblks_t *len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int *n, + int flags) +{ + xfs_bmbt_irec_t *mval = *map; + + ASSERT((flags & XFS_BMAPI_ENTIRE) || + ((mval->br_startoff + mval->br_blockcount) <= end)); + ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) || + (mval->br_startoff < obno)); + + *bno = mval->br_startoff + mval->br_blockcount; + *len = end - *bno; + if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) { + /* update previous map with new information */ + ASSERT(mval->br_startblock == mval[-1].br_startblock); + ASSERT(mval->br_blockcount > mval[-1].br_blockcount); + ASSERT(mval->br_state == mval[-1].br_state); + mval[-1].br_blockcount = mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != HOLESTARTBLOCK && + mval->br_startblock == mval[-1].br_startblock + + mval[-1].br_blockcount && + ((flags & XFS_BMAPI_IGSTATE) || + mval[-1].br_state == mval->br_state)) { + ASSERT(mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount); + mval[-1].br_blockcount += mval->br_blockcount; + } else if (*n > 0 && + mval->br_startblock == DELAYSTARTBLOCK && + mval[-1].br_startblock == DELAYSTARTBLOCK && + mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount) { + mval[-1].br_blockcount += mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (!((*n == 0) && + ((mval->br_startoff + mval->br_blockcount) <= + obno))) { + mval++; + (*n)++; + } + *map = mval; +} + +/* + * Map file blocks to filesystem blocks without allocation. + */ +int +xfs_bmapi_read( + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + struct xfs_bmbt_irec *mval, + int *nmap, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec prev; + xfs_fileoff_t obno; + xfs_fileoff_t end; + xfs_extnum_t lastx; + int error; + int eof; + int n = 0; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + + ASSERT(*nmap >= 1); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| + XFS_BMAPI_IGSTATE))); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_blk_mapr); + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + /* Reading past eof, act as though there's a hole up to end. */ + if (eof) + got.br_startoff = end; + if (got.br_startoff > bno) { + /* Reading in a hole. */ + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = + XFS_FILBLKS_MIN(len, got.br_startoff - bno); + mval->br_state = XFS_EXT_NORM; + bno += mval->br_blockcount; + len -= mval->br_blockcount; + mval++; + n++; + continue; + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + *nmap = n; + return 0; +} + +STATIC int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + xfs_fileoff_t aoff, + xfs_filblks_t len, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *prev, + xfs_extnum_t *lastx, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_extlen_t alen; + xfs_extlen_t indlen; + char rt = XFS_IS_REALTIME_INODE(ip); + xfs_extlen_t extsz; + int error; + + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + + /* Figure out the extent size, adjust alen */ + extsz = xfs_get_extsz_hint(ip); + if (extsz) { + /* + * Make sure we don't exceed a single extent length when we + * align the extent by reducing length we are going to + * allocate by the maximum amount extent size aligment may + * require. + */ + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1)); + error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + if (rt) + extsz = alen / mp->m_sb.sb_rextsize; + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, + rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. + */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + if (rt) { + error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, + -((int64_t)extsz), 0); + } else { + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + -((int64_t)alen), 0); + } + + if (error) + goto out_unreserve_quota; + + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + -((int64_t)indlen), 0); + if (error) + goto out_unreserve_blocks; + + + ip->i_delayed_blks += alen; + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + xfs_bmap_add_extent_hole_delay(ip, lastx, got); + + /* + * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay + * might have merged it into one of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + + ASSERT(got->br_startoff <= aoff); + ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); + ASSERT(isnullstartblock(got->br_startblock)); + ASSERT(got->br_state == XFS_EXT_NORM); + return 0; + +out_unreserve_blocks: + if (rt) + xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); + else + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + return error; +} + +/* + * Map file blocks to filesystem blocks, adding delayed allocations as needed. + */ +int +xfs_bmapi_delay( + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + int flags) /* XFS_BMAPI_... */ +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_bmbt_irec got; /* current file extent record */ + struct xfs_bmbt_irec prev; /* previous file extent record */ + xfs_fileoff_t obno; /* old block number (offset) */ + xfs_fileoff_t end; /* end of mapped file region */ + xfs_extnum_t lastx; /* last useful extent number */ + int eof; /* we've hit the end of extents */ + int n = 0; /* current extent index */ + int error = 0; + + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_blk_mapw); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + if (eof || got.br_startoff > bno) { + error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, + &prev, &lastx, eof); + if (error) { + if (n == 0) { + *nmap = 0; + return error; + } + break; + } + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + prev = got; + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + + *nmap = n; + return 0; +} + + +STATIC int +xfs_bmapi_allocate( + struct xfs_bmalloca *bma, + int flags) +{ + struct xfs_mount *mp = bma->ip->i_mount; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; + int error; + int rt; + + ASSERT(bma->length > 0); + + rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); + + /* + * For the wasdelay case, we could also just allocate the stuff asked + * for in this bmap call but that wouldn't be as good. + */ + if (bma->wasdel) { + bma->length = (xfs_extlen_t)bma->got.br_blockcount; + bma->offset = bma->got.br_startoff; + if (bma->idx != NULLEXTNUM && bma->idx) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), + &bma->prev); + } + } else { + bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + if (!bma->eof) + bma->length = XFS_FILBLKS_MIN(bma->length, + bma->got.br_startoff - bma->offset); + } + + /* + * Indicate if this is the first user data in the file, or just any + * user data. + */ + if (!(flags & XFS_BMAPI_METADATA)) { + bma->userdata = (bma->offset == 0) ? + XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + } + + bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + + /* + * Only want to do the alignment at the eof if it is userdata and + * allocation length is larger than a stripe unit. + */ + if (mp->m_dalign && bma->length >= mp->m_dalign && + !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + error = xfs_bmap_isaeof(bma, whichfork); + if (error) + return error; + } + + error = xfs_bmap_alloc(bma); + if (error) + return error; + + if (bma->flist->xbf_low) + bma->minleft = 0; + if (bma->cur) + bma->cur->bc_private.b.firstblock = *bma->firstblock; + if (bma->blkno == NULLFSBLOCK) + return 0; + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; + } + /* + * Bump the number of extents we've allocated + * in this call. + */ + bma->nallocs++; + + if (bma->cur) + bma->cur->bc_private.b.flags = + bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + + bma->got.br_startoff = bma->offset; + bma->got.br_startblock = bma->blkno; + bma->got.br_blockcount = bma->length; + bma->got.br_state = XFS_EXT_NORM; + + /* + * A wasdelay extent has been initialized, so shouldn't be flagged + * as unwritten. + */ + if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + bma->got.br_state = XFS_EXT_UNWRITTEN; + + if (bma->wasdel) + error = xfs_bmap_add_extent_delay_real(bma); + else + error = xfs_bmap_add_extent_hole_real(bma, whichfork); + + bma->logflags |= tmp_logflags; + if (error) + return error; + + /* + * Update our extent pointer, given that xfs_bmap_add_extent_delay_real + * or xfs_bmap_add_extent_hole_real might have merged it into one of + * the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + + ASSERT(bma->got.br_startoff <= bma->offset); + ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= + bma->offset + bma->length); + ASSERT(bma->got.br_state == XFS_EXT_NORM || + bma->got.br_state == XFS_EXT_UNWRITTEN); + return 0; +} + +STATIC int +xfs_bmapi_convert_unwritten( + struct xfs_bmalloca *bma, + struct xfs_bmbt_irec *mval, + xfs_filblks_t len, + int flags) +{ + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; + int error; + + /* check if we need to do unwritten->real conversion */ + if (mval->br_state == XFS_EXT_UNWRITTEN && + (flags & XFS_BMAPI_PREALLOC)) + return 0; + + /* check if we need to do real->unwritten conversion */ + if (mval->br_state == XFS_EXT_NORM && + (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) != + (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) + return 0; + + /* + * Modify (by adding) the state flag, if writing. + */ + ASSERT(mval->br_blockcount <= len); + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, + bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; + } + mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) + ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, + &bma->cur, mval, bma->firstblock, bma->flist, + &tmp_logflags); + bma->logflags |= tmp_logflags; + if (error) + return error; + + /* + * Update our extent pointer, given that + * xfs_bmap_add_extent_unwritten_real might have merged it into one + * of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + + /* + * We may have combined previously unwritten space with written space, + * so generate another request. + */ + if (mval->br_blockcount < len) + return EAGAIN; + return 0; +} + +/* + * Map file blocks to filesystem blocks, and allocate blocks or convert the + * extent state if necessary. Details behaviour is controlled by the flags + * parameter. Only allocates blocks from a single allocation group, to avoid + * locking problems. + * * The returned value in "firstblock" from the first call in a transaction * must be remembered and presented to subsequent calls in "firstblock". * An upper bound for the number of blocks to be allocated is supplied to * the first call in "total"; if no allocation group has that many free * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). */ -int /* error */ -xfs_bmapi( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_extlen_t total, /* total blocks needed */ - xfs_bmbt_irec_t *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist) /* i/o: list extents to free */ +int +xfs_bmapi_write( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + struct xfs_bmap_free *flist) /* i/o: list extents to free */ { - xfs_fsblock_t abno; /* allocated block number */ - xfs_extlen_t alen; /* allocated extent length */ - xfs_fileoff_t aoff; /* allocated file offset */ - xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ - xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_fileoff_t end; /* end of mapped file region */ - int eof; /* we've hit the end of extents */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - int error; /* error return */ - xfs_bmbt_irec_t got; /* current file extent record */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extlen_t indlen; /* indirect blocks length */ - xfs_extnum_t lastx; /* last useful extent number */ - int logflags; /* flags for transaction logging */ - xfs_extlen_t minleft; /* min blocks left after allocation */ - xfs_extlen_t minlen; /* min allocation size */ - xfs_mount_t *mp; /* xfs mount structure */ - int n; /* current extent index */ - int nallocs; /* number of extents alloc'd */ - xfs_extnum_t nextents; /* number of extents in file */ - xfs_fileoff_t obno; /* old block number (offset) */ - xfs_bmbt_irec_t prev; /* previous file extent record */ - int tmp_logflags; /* temp flags holder */ - int whichfork; /* data or attr fork */ - char inhole; /* current location is hole in file */ - char wasdelay; /* old extent was delayed */ - char wr; /* this is a write request */ - char rt; /* this is a realtime file */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */ + xfs_fileoff_t end; /* end of mapped file region */ + int eof; /* after the end of extents */ + int error; /* error return */ + int n; /* current extent index */ + xfs_fileoff_t obno; /* old block number (offset) */ + int whichfork; /* data or attr fork */ + char inhole; /* current location is hole in file */ + char wasdelay; /* old extent was delayed */ + #ifdef DEBUG - xfs_fileoff_t orig_bno; /* original block number value */ - int orig_flags; /* original flags arg value */ - xfs_filblks_t orig_len; /* original value of len arg */ - xfs_bmbt_irec_t *orig_mval; /* original value of mval */ - int orig_nmap; /* original value of *nmap */ + xfs_fileoff_t orig_bno; /* original block number value */ + int orig_flags; /* original flags arg value */ + xfs_filblks_t orig_len; /* original value of len arg */ + struct xfs_bmbt_irec *orig_mval; /* original value of mval */ + int orig_nmap; /* original value of *nmap */ orig_bno = bno; orig_len = len; @@ -4360,488 +4848,147 @@ xfs_bmapi( orig_mval = mval; orig_nmap = *nmap; #endif + ASSERT(*nmap >= 1); - ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & XFS_BMAPI_IGSTATE)); + ASSERT(tp != NULL); + ASSERT(len > 0); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; - mp = ip->i_mount; + if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp); + XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } + if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); - if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) - XFS_STATS_INC(xs_blk_mapw); - else - XFS_STATS_INC(xs_blk_mapr); - /* - * IGSTATE flag is used to combine extents which - * differ only due to the state of the extents. - * This technique is used from xfs_getbmap() - * when the caller does not wish to see the - * separation (which is the default). - * - * This technique is also used when writing a - * buffer which has been partially written, - * (usually by being flushed during a chunkread), - * to ensure one write takes place. This also - * prevents a change in the xfs inode extents at - * this time, intentionally. This change occurs - * on completion of the write operation, in - * xfs_strat_comp(), where the xfs_bmapi() call - * is transactioned, and the extents combined. - */ - if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */ - wr = 0; /* no allocations are allowed */ - ASSERT(wr || !(flags & XFS_BMAPI_DELAY)); - logflags = 0; - nallocs = 0; - cur = NULL; + + XFS_STATS_INC(xs_blk_mapw); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - ASSERT(wr && tp); - if ((error = xfs_bmap_local_to_extents(tp, ip, - firstblock, total, &logflags, whichfork))) + error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, + &bma.logflags, whichfork); + if (error) goto error0; } - if (wr && *firstblock == NULLFSBLOCK) { + + if (*firstblock == NULLFSBLOCK) { if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) - minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; + bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; else - minleft = 1; - } else - minleft = 0; - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - goto error0; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + bma.minleft = 1; + } else { + bma.minleft = 0; + } + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + goto error0; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, + &bma.prev); n = 0; end = bno + len; obno = bno; - bma.ip = NULL; + + bma.tp = tp; + bma.ip = ip; + bma.total = total; + bma.userdata = 0; + bma.flist = flist; + bma.firstblock = firstblock; while (bno < end && n < *nmap) { - /* - * Reading past eof, act as though there's a hole - * up to end. - */ - if (eof && !wr) - got.br_startoff = end; - inhole = eof || got.br_startoff > bno; - wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && - isnullstartblock(got.br_startblock); + inhole = eof || bma.got.br_startoff > bno; + wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); + /* * First, deal with the hole before the allocated space * that we found, if any. */ - if (wr && (inhole || wasdelay)) { - /* - * For the wasdelay case, we could also just - * allocate the stuff asked for in this bmap call - * but that wouldn't be as good. - */ - if (wasdelay) { - alen = (xfs_extlen_t)got.br_blockcount; - aoff = got.br_startoff; - if (lastx != NULLEXTNUM && lastx) { - ep = xfs_iext_get_ext(ifp, lastx - 1); - xfs_bmbt_get_all(ep, &prev); - } - } else { - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(len, MAXEXTLEN); - if (!eof) - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(alen, - got.br_startoff - bno); - aoff = bno; - } - minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; - if (flags & XFS_BMAPI_DELAY) { - xfs_extlen_t extsz; - - /* Figure out the extent size, adjust alen */ - extsz = xfs_get_extsz_hint(ip); - if (extsz) { - /* - * make sure we don't exceed a single - * extent length when we align the - * extent by reducing length we are - * going to allocate by the maximum - * amount extent size aligment may - * require. - */ - alen = XFS_FILBLKS_MIN(len, - MAXEXTLEN - (2 * extsz - 1)); - error = xfs_bmap_extsize_align(mp, - &got, &prev, extsz, - rt, eof, - flags&XFS_BMAPI_DELAY, - flags&XFS_BMAPI_CONVERT, - &aoff, &alen); - ASSERT(!error); - } - - if (rt) - extsz = alen / mp->m_sb.sb_rextsize; - - /* - * Make a transaction-less quota reservation for - * delayed allocation blocks. This number gets - * adjusted later. We return if we haven't - * allocated blocks already inside this loop. - */ - error = xfs_trans_reserve_quota_nblks( - NULL, ip, (long)alen, 0, - rt ? XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS); - if (error) { - if (n == 0) { - *nmap = 0; - ASSERT(cur == NULL); - return error; - } - break; - } - - /* - * Split changing sb for alen and indlen since - * they could be coming from different places. - */ - indlen = (xfs_extlen_t) - xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - if (rt) { - error = xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - -((int64_t)extsz), 0); - } else { - error = xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - -((int64_t)alen), 0); - } - if (!error) { - error = xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - -((int64_t)indlen), 0); - if (error && rt) - xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - (int64_t)extsz, 0); - else if (error) - xfs_icsb_modify_counters(mp, - XFS_SBS_FDBLOCKS, - (int64_t)alen, 0); - } + if (inhole || wasdelay) { + bma.eof = eof; + bma.conv = !!(flags & XFS_BMAPI_CONVERT); + bma.wasdel = wasdelay; + bma.offset = bno; - if (error) { - if (XFS_IS_QUOTA_ON(mp)) - /* unreserve the blocks now */ - (void) - xfs_trans_unreserve_quota_nblks( - NULL, ip, - (long)alen, 0, rt ? - XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS); - break; - } - - ip->i_delayed_blks += alen; - abno = nullstartblock(indlen); - } else { - /* - * If first time, allocate and fill in - * once-only bma fields. - */ - if (bma.ip == NULL) { - bma.tp = tp; - bma.ip = ip; - bma.prevp = &prev; - bma.gotp = &got; - bma.total = total; - bma.userdata = 0; - } - /* Indicate if this is the first user data - * in the file, or just any user data. - */ - if (!(flags & XFS_BMAPI_METADATA)) { - bma.userdata = (aoff == 0) ? - XFS_ALLOC_INITIAL_USER_DATA : - XFS_ALLOC_USERDATA; - } - /* - * Fill in changeable bma fields. - */ - bma.eof = eof; - bma.firstblock = *firstblock; - bma.alen = alen; - bma.off = aoff; - bma.conv = !!(flags & XFS_BMAPI_CONVERT); - bma.wasdel = wasdelay; - bma.minlen = minlen; - bma.low = flist->xbf_low; - bma.minleft = minleft; - /* - * Only want to do the alignment at the - * eof if it is userdata and allocation length - * is larger than a stripe unit. - */ - if (mp->m_dalign && alen >= mp->m_dalign && - (!(flags & XFS_BMAPI_METADATA)) && - (whichfork == XFS_DATA_FORK)) { - if ((error = xfs_bmap_isaeof(ip, aoff, - whichfork, &bma.aeof))) - goto error0; - } else - bma.aeof = 0; - /* - * Call allocator. - */ - if ((error = xfs_bmap_alloc(&bma))) - goto error0; - /* - * Copy out result fields. - */ - abno = bma.rval; - if ((flist->xbf_low = bma.low)) - minleft = 0; - alen = bma.alen; - aoff = bma.off; - ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == - XFS_FSB_TO_AGNO(mp, bma.firstblock) || - (flist->xbf_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, bma.firstblock))); - *firstblock = bma.firstblock; - if (cur) - cur->bc_private.b.firstblock = - *firstblock; - if (abno == NULLFSBLOCK) - break; - if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_bmbt_init_cursor(mp, tp, - ip, whichfork); - cur->bc_private.b.firstblock = - *firstblock; - cur->bc_private.b.flist = flist; - } - /* - * Bump the number of extents we've allocated - * in this call. - */ - nallocs++; - } - if (cur) - cur->bc_private.b.flags = - wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0; - got.br_startoff = aoff; - got.br_startblock = abno; - got.br_blockcount = alen; - got.br_state = XFS_EXT_NORM; /* assume normal */ - /* - * Determine state of extent, and the filesystem. - * A wasdelay extent has been initialized, so - * shouldn't be flagged as unwritten. - */ - if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) { - if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) - got.br_state = XFS_EXT_UNWRITTEN; - } - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &got, - firstblock, flist, &tmp_logflags, - whichfork); - logflags |= tmp_logflags; - if (error) - goto error0; - ep = xfs_iext_get_ext(ifp, lastx); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - xfs_bmbt_get_all(ep, &got); - ASSERT(got.br_startoff <= aoff); - ASSERT(got.br_startoff + got.br_blockcount >= - aoff + alen); -#ifdef DEBUG - if (flags & XFS_BMAPI_DELAY) { - ASSERT(isnullstartblock(got.br_startblock)); - ASSERT(startblockval(got.br_startblock) > 0); - } - ASSERT(got.br_state == XFS_EXT_NORM || - got.br_state == XFS_EXT_UNWRITTEN); -#endif - /* - * Fall down into the found allocated space case. - */ - } else if (inhole) { - /* - * Reading in a hole. - */ - mval->br_startoff = bno; - mval->br_startblock = HOLESTARTBLOCK; - mval->br_blockcount = - XFS_FILBLKS_MIN(len, got.br_startoff - bno); - mval->br_state = XFS_EXT_NORM; - bno += mval->br_blockcount; - len -= mval->br_blockcount; - mval++; - n++; - continue; - } - /* - * Then deal with the allocated space we found. - */ - ASSERT(ep != NULL); - if (!(flags & XFS_BMAPI_ENTIRE) && - (got.br_startoff + got.br_blockcount > obno)) { - if (obno > bno) - bno = obno; - ASSERT((bno >= obno) || (n == 0)); - ASSERT(bno < end); - mval->br_startoff = bno; - if (isnullstartblock(got.br_startblock)) { - ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); - mval->br_startblock = DELAYSTARTBLOCK; - } else - mval->br_startblock = - got.br_startblock + - (bno - got.br_startoff); /* - * Return the minimum of what we got and what we - * asked for for the length. We can use the len - * variable here because it is modified below - * and we could have been there before coming - * here if the first part of the allocation - * didn't overlap what was asked for. + * There's a 32/64 bit type mismatch between the + * allocation length request (which can be 64 bits in + * length) and the bma length request, which is + * xfs_extlen_t and therefore 32 bits. Hence we have to + * check for 32-bit overflows and handle them here. */ - mval->br_blockcount = - XFS_FILBLKS_MIN(end - bno, got.br_blockcount - - (bno - got.br_startoff)); - mval->br_state = got.br_state; - ASSERT(mval->br_blockcount <= len); - } else { - *mval = got; - if (isnullstartblock(mval->br_startblock)) { - ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); - mval->br_startblock = DELAYSTARTBLOCK; - } - } + if (len > (xfs_filblks_t)MAXEXTLEN) + bma.length = MAXEXTLEN; + else + bma.length = len; - /* - * Check if writing previously allocated but - * unwritten extents. - */ - if (wr && - ((mval->br_state == XFS_EXT_UNWRITTEN && - ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) || - (mval->br_state == XFS_EXT_NORM && - ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == - (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { - /* - * Modify (by adding) the state flag, if writing. - */ - ASSERT(mval->br_blockcount <= len); - if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_bmbt_init_cursor(mp, - tp, ip, whichfork); - cur->bc_private.b.firstblock = - *firstblock; - cur->bc_private.b.flist = flist; - } - mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) - ? XFS_EXT_NORM - : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, mval, - firstblock, flist, &tmp_logflags, - whichfork); - logflags |= tmp_logflags; + ASSERT(len > 0); + ASSERT(bma.length > 0); + error = xfs_bmapi_allocate(&bma, flags); if (error) goto error0; - ep = xfs_iext_get_ext(ifp, lastx); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - xfs_bmbt_get_all(ep, &got); - /* - * We may have combined previously unwritten - * space with written space, so generate - * another request. - */ - if (mval->br_blockcount < len) - continue; + if (bma.blkno == NULLFSBLOCK) + break; } - ASSERT((flags & XFS_BMAPI_ENTIRE) || - ((mval->br_startoff + mval->br_blockcount) <= end)); - ASSERT((flags & XFS_BMAPI_ENTIRE) || - (mval->br_blockcount <= len) || - (mval->br_startoff < obno)); - bno = mval->br_startoff + mval->br_blockcount; - len = end - bno; - if (n > 0 && mval->br_startoff == mval[-1].br_startoff) { - ASSERT(mval->br_startblock == mval[-1].br_startblock); - ASSERT(mval->br_blockcount > mval[-1].br_blockcount); - ASSERT(mval->br_state == mval[-1].br_state); - mval[-1].br_blockcount = mval->br_blockcount; - mval[-1].br_state = mval->br_state; - } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK && - mval[-1].br_startblock != DELAYSTARTBLOCK && - mval[-1].br_startblock != HOLESTARTBLOCK && - mval->br_startblock == - mval[-1].br_startblock + mval[-1].br_blockcount && - ((flags & XFS_BMAPI_IGSTATE) || - mval[-1].br_state == mval->br_state)) { - ASSERT(mval->br_startoff == - mval[-1].br_startoff + mval[-1].br_blockcount); - mval[-1].br_blockcount += mval->br_blockcount; - } else if (n > 0 && - mval->br_startblock == DELAYSTARTBLOCK && - mval[-1].br_startblock == DELAYSTARTBLOCK && - mval->br_startoff == - mval[-1].br_startoff + mval[-1].br_blockcount) { - mval[-1].br_blockcount += mval->br_blockcount; - mval[-1].br_state = mval->br_state; - } else if (!((n == 0) && - ((mval->br_startoff + mval->br_blockcount) <= - obno))) { - mval++; - n++; - } + /* Deal with the allocated space we found. */ + xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno, + end, n, flags); + + /* Execute unwritten extent conversion if necessary */ + error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); + if (error == EAGAIN) + continue; + if (error) + goto error0; + + /* update the extent map to return */ + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + /* * If we're done, stop now. Stop when we've allocated * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise * the transaction may get too big. */ - if (bno >= end || n >= *nmap || nallocs >= *nmap) + if (bno >= end || n >= *nmap || bma.nallocs >= *nmap) break; - /* - * Else go on to the next record. - */ - prev = got; - if (++lastx < nextents) { - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); - } else { + + /* Else go on to the next record. */ + bma.prev = bma.got; + if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), + &bma.got); + } else eof = 1; - } } *nmap = n; + /* * Transform from btree to extents, give it cur. */ - if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { - ASSERT(wr && cur); - error = xfs_bmap_btree_to_extents(tp, ip, cur, + int tmp_logflags = 0; + + ASSERT(bma.cur); + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &tmp_logflags, whichfork); - logflags |= tmp_logflags; + bma.logflags |= tmp_logflags; if (error) goto error0; } @@ -4855,34 +5002,33 @@ error0: * Log everything. Do this after conversion, there's no point in * logging the extent records if we've converted to btree format. */ - if ((logflags & xfs_ilog_fext(whichfork)) && + if ((bma.logflags & xfs_ilog_fext(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - logflags &= ~xfs_ilog_fext(whichfork); - else if ((logflags & xfs_ilog_fbroot(whichfork)) && + bma.logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - logflags &= ~xfs_ilog_fbroot(whichfork); + bma.logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log whatever the flags say, even if error. Otherwise we might miss * detecting a case where the data is changed, there's an error, * and it's not logged so we don't shutdown when we should. */ - if (logflags) { - ASSERT(tp && wr); - xfs_trans_log_inode(tp, ip, logflags); - } - if (cur) { + if (bma.logflags) + xfs_trans_log_inode(tp, ip, bma.logflags); + + if (bma.cur) { if (!error) { ASSERT(*firstblock == NULLFSBLOCK || XFS_FSB_TO_AGNO(mp, *firstblock) == XFS_FSB_TO_AGNO(mp, - cur->bc_private.b.firstblock) || + bma.cur->bc_private.b.firstblock) || (flist->xbf_low && XFS_FSB_TO_AGNO(mp, *firstblock) < XFS_FSB_TO_AGNO(mp, - cur->bc_private.b.firstblock))); - *firstblock = cur->bc_private.b.firstblock; + bma.cur->bc_private.b.firstblock))); + *firstblock = bma.cur->bc_private.b.firstblock; } - xfs_btree_del_cursor(cur, + xfs_btree_del_cursor(bma.cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); } if (!error) @@ -4892,58 +5038,6 @@ error0: } /* - * Map file blocks to filesystem blocks, simple version. - * One block (extent) only, read-only. - * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. - * For the other flag values, the effect is as if XFS_BMAPI_METADATA - * was set and all the others were clear. - */ -int /* error */ -xfs_bmapi_single( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - xfs_fsblock_t *fsb, /* output: mapped block */ - xfs_fileoff_t bno) /* starting file offs. mapped */ -{ - int eof; /* we've hit the end of extents */ - int error; /* error return */ - xfs_bmbt_irec_t got; /* current file extent record */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last useful extent number */ - xfs_bmbt_irec_t prev; /* previous file extent record */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - if (unlikely( - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) { - XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - XFS_STATS_INC(xs_blk_mapr); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - /* - * Reading past eof, act as though there's a hole - * up to end. - */ - if (eof || got.br_startoff > bno) { - *fsb = NULLFSBLOCK; - return 0; - } - ASSERT(!isnullstartblock(got.br_startblock)); - ASSERT(bno < got.br_startoff + got.br_blockcount); - *fsb = got.br_startblock + (bno - got.br_startoff); - return 0; -} - -/* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to * that value. If not all extents in the block range can be removed then @@ -5114,9 +5208,9 @@ xfs_bunmapi( del.br_blockcount = mod; } del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &del, - firstblock, flist, &logflags, - XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, ip, + &lastx, &cur, &del, firstblock, flist, + &logflags); if (error) goto error0; goto nodelete; @@ -5172,18 +5266,18 @@ xfs_bunmapi( } prev.br_state = XFS_EXT_UNWRITTEN; lastx--; - error = xfs_bmap_add_extent(tp, ip, &lastx, - &cur, &prev, firstblock, flist, - &logflags, XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &prev, + firstblock, flist, &logflags); if (error) goto error0; goto nodelete; } else { ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(tp, ip, &lastx, - &cur, &del, firstblock, flist, - &logflags, XFS_DATA_FORK); + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &del, + firstblock, flist, &logflags); if (error) goto error0; goto nodelete; @@ -5505,10 +5599,9 @@ xfs_getbmap( do { nmap = (nexleft > subnex) ? subnex : nexleft; - error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), - XFS_BB_TO_FSB(mp, bmv->bmv_length), - bmapi_flags, NULL, 0, map, &nmap, - NULL); + error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), + XFS_BB_TO_FSB(mp, bmv->bmv_length), + map, &nmap, bmapi_flags); if (error) goto out_free_map; ASSERT(nmap <= subnex); @@ -5582,89 +5675,6 @@ xfs_getbmap( return error; } -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -STATIC int /* error */ -xfs_bmap_isaeof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t off, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - char *aeof) /* return value */ -{ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_irec_t s; /* expanded extent record */ - - ASSERT(whichfork == XFS_DATA_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(NULL, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *aeof = 1; - return 0; - } - /* - * Go to the last extent - */ - lastrec = xfs_iext_get_ext(ifp, nextents - 1); - xfs_bmbt_get_all(lastrec, &s); - /* - * Check we are allocating in the last extent (for delayed allocations) - * or past the last extent for non-delayed allocations. - */ - *aeof = (off >= s.br_startoff && - off < s.br_startoff + s.br_blockcount && - isnullstartblock(s.br_startblock)) || - off >= s.br_startoff + s.br_blockcount; - return 0; -} - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. - */ -int /* error */ -xfs_bmap_eof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t endoff, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - int *eof) /* result value */ -{ - xfs_fsblock_t blockcount; /* extent block count */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_fileoff_t startoff; /* extent starting file offset */ - - ASSERT(whichfork == XFS_DATA_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(NULL, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *eof = 1; - return 0; - } - /* - * Go to the last extent - */ - lastrec = xfs_iext_get_ext(ifp, nextents - 1); - startoff = xfs_bmbt_get_startoff(lastrec); - blockcount = xfs_bmbt_get_blockcount(lastrec); - *eof = endoff >= startoff + blockcount; - return 0; -} - #ifdef DEBUG STATIC struct xfs_buf * xfs_bmap_get_bp( @@ -6099,9 +6109,8 @@ xfs_bmap_punch_delalloc_range( * trying to remove a real extent (which requires a * transaction) or a hole, which is probably a bad idea... */ - error = xfs_bmapi(NULL, ip, start_fsb, 1, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, - &nimaps, NULL); + error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, + XFS_BMAPI_ENTIRE); if (error) { /* something screwed, just bail */ diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index c62234b..89ee672 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -62,27 +62,23 @@ typedef struct xfs_bmap_free #define XFS_BMAP_MAX_NMAP 4 /* - * Flags for xfs_bmapi + * Flags for xfs_bmapi_* */ -#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ -#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ -#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ -#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ -#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ -#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ -#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ +#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ +#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x010 /* Ignore state - */ /* combine contig. space */ -#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ +#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. */ -#define XFS_BMAPI_CONVERT 0x200 +#define XFS_BMAPI_CONVERT 0x040 #define XFS_BMAPI_FLAGS \ - { XFS_BMAPI_WRITE, "WRITE" }, \ - { XFS_BMAPI_DELAY, "DELAY" }, \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ @@ -113,21 +109,28 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) * Argument structure for xfs_bmap_alloc. */ typedef struct xfs_bmalloca { - xfs_fsblock_t firstblock; /* i/o first block allocated */ - xfs_fsblock_t rval; /* starting block of new extent */ - xfs_fileoff_t off; /* offset in file filling in */ + xfs_fsblock_t *firstblock; /* i/o first block allocated */ + struct xfs_bmap_free *flist; /* bmap freelist */ struct xfs_trans *tp; /* transaction pointer */ struct xfs_inode *ip; /* incore inode pointer */ - struct xfs_bmbt_irec *prevp; /* extent before the new one */ - struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ - xfs_extlen_t alen; /* i/o length asked/allocated */ + struct xfs_bmbt_irec prev; /* extent before the new one */ + struct xfs_bmbt_irec got; /* extent after, or delayed */ + + xfs_fileoff_t offset; /* offset in file filling in */ + xfs_extlen_t length; /* i/o length asked/allocated */ + xfs_fsblock_t blkno; /* starting block of new extent */ + + struct xfs_btree_cur *cur; /* btree cursor */ + xfs_extnum_t idx; /* current extent index */ + int nallocs;/* number of extents alloc'd */ + int logflags;/* flags for transaction logging */ + xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ char eof; /* set if allocating past last extent */ char wasdel; /* replacing a delayed allocation */ char userdata;/* set if is user data */ - char low; /* low on space, using seq'l ags */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ } xfs_bmalloca_t; @@ -152,251 +155,62 @@ typedef struct xfs_bmalloca { { BMAP_RIGHT_FILLING, "RF" }, \ { BMAP_ATTRFORK, "ATTR" } -/* - * Add bmap trace insert entries for all the contents of the extent list. - * - * Quite excessive tracing. Only do this for debug builds. - */ #if defined(__KERNEL) && defined(DEBUG) -void -xfs_bmap_trace_exlist( - struct xfs_inode *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in list */ - int whichfork, - unsigned long caller_ip); /* data or attr fork */ +void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, + int whichfork, unsigned long caller_ip); #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) #else #define XFS_BMAP_TRACE_EXLIST(ip,c,w) #endif -/* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. - */ -int /* error code */ -xfs_bmap_add_attrfork( - struct xfs_inode *ip, /* incore inode pointer */ - int size, /* space needed for new attribute */ - int rsvd); /* flag for reserved block allocation */ - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -void -xfs_bmap_add_free( - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - struct xfs_mount *mp); /* mount point structure */ - -/* - * Routine to clean up the free list data structure when - * an error occurs during a transaction. - */ -void -xfs_bmap_cancel( - xfs_bmap_free_t *flist); /* free list to clean up */ - -/* - * Compute and fill in the value of the maximum depth of a bmap btree - * in this filesystem. Done once, during mount. - */ -void -xfs_bmap_compute_maxlevels( - struct xfs_mount *mp, /* file system mount structure */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the first unused block in the file. - * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - */ -int /* error */ -xfs_bmap_first_unused( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *unused, /* unused block num */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the last block + 1 before - * last_block (input value) in the file. - * This is not based on i_size, it is based on the extent list. - * Returns 0 for local files, as they do not have an extent list. - */ -int /* error */ -xfs_bmap_last_before( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the first block past eof in - * the file. This is not based on i_size, it is based on the extent list. - * Returns 0 for local files, as they do not have an extent list. - */ -int /* error */ -xfs_bmap_last_offset( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t *unused, /* last block num */ - int whichfork); /* data or attr fork */ - -/* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. - */ -int -xfs_bmap_one_block( - struct xfs_inode *ip, /* incore inode */ - int whichfork); /* data or attr fork */ - -/* - * Read in the extents to iu_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. - */ -int /* error */ -xfs_bmap_read_extents( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - int whichfork); /* data or attr fork */ - -/* - * Map file blocks to filesystem blocks. - * File range is given by the bno/len pair. - * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) - * into a hole or past eof. - * Only allocates blocks from a single allocation group, - * to avoid locking problems. - * The returned value in "firstblock" from the first call in a transaction - * must be remembered and presented to subsequent calls in "firstblock". - * An upper bound for the number of blocks to be allocated is supplied to - * the first call in "total"; if no allocation group has that many free - * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). - */ -int /* error */ -xfs_bmapi( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_extlen_t total, /* total blocks needed */ - struct xfs_bmbt_irec *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist); /* i/o: list extents to free */ - -/* - * Map file blocks to filesystem blocks, simple version. - * One block only, read-only. - * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. - * For the other flag values, the effect is as if XFS_BMAPI_METADATA - * was set and all the others were clear. - */ -int /* error */ -xfs_bmapi_single( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - xfs_fsblock_t *fsb, /* output: mapped block */ - xfs_fileoff_t bno); /* starting file offs. mapped */ - -/* - * Unmap (remove) blocks from a file. - * If nexts is nonzero then the number of extents to remove is limited to - * that value. If not all extents in the block range can be removed then - * *done is set. - */ -int /* error */ -xfs_bunmapi( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting offset to unmap */ - xfs_filblks_t len, /* length to unmap in file */ - int flags, /* XFS_BMAPI_... */ - xfs_extnum_t nexts, /* number of extents max */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *done); /* set if not done yet */ - -/* - * Check an extent list, which has just been read, for - * any bit in the extent flag field. - */ -int -xfs_check_nostate_extents( - struct xfs_ifork *ifp, - xfs_extnum_t idx, - xfs_extnum_t num); - -uint -xfs_default_attroffset( - struct xfs_inode *ip); +int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, + struct xfs_bmap_free *flist, struct xfs_mount *mp); +void xfs_bmap_cancel(struct xfs_bmap_free *flist); +void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); +int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *last_block, int whichfork); +int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); +int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork); +int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); +int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); +int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fsblock_t *firstblock, xfs_extlen_t total, + struct xfs_bmbt_irec *mval, int *nmap, + struct xfs_bmap_free *flist); +int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_extnum_t nexts, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, int *done); +int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, + xfs_extnum_t num); +uint xfs_default_attroffset(struct xfs_inode *ip); #ifdef __KERNEL__ - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. - * - * Return 1 if the given transaction was committed and a new one allocated, - * and 0 otherwise. - */ -int /* error */ -xfs_bmap_finish( - struct xfs_trans **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed); /* xact committed or not */ - /* bmap to userspace formatter - copy to user & advance pointer */ typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); -/* - * Get inode's extents as described in bmv, and format for output. - */ -int /* error code */ -xfs_getbmap( - xfs_inode_t *ip, - struct getbmapx *bmv, /* user bmap structure */ - xfs_bmap_format_t formatter, /* format to user */ - void *arg); /* formatter arg */ - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary - */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof); - -/* - * Count fsblocks of the given fork. - */ -int -xfs_bmap_count_blocks( - xfs_trans_t *tp, - struct xfs_inode *ip, - int whichfork, - int *count); - -int -xfs_bmap_punch_delalloc_range( - struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length); +int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, + int *committed); +int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, + xfs_bmap_format_t formatter, void *arg); +int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, + int whichfork, int *eof); +int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, int *count); +int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, + xfs_fileoff_t start_fsb, xfs_fileoff_t length); #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 2b9fd38..1f19f03 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -631,7 +631,7 @@ xfs_btree_read_bufl( } ASSERT(!xfs_buf_geterror(bp)); if (bp) - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); + xfs_buf_set_ref(bp, refval); *bpp = bp; return 0; } @@ -939,13 +939,13 @@ xfs_btree_set_refs( switch (cur->bc_btnum) { case XFS_BTNUM_BNO: case XFS_BTNUM_CNT: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); + xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); break; case XFS_BTNUM_INO: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); + xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); break; case XFS_BTNUM_BMAP: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); + xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); break; default: ASSERT(0); @@ -970,7 +970,8 @@ xfs_btree_get_buf_block( *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags); - ASSERT(!xfs_buf_geterror(*bpp)); + if (!*bpp) + return ENOMEM; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c57836d..4dff85c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -43,7 +43,6 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; @@ -66,10 +65,6 @@ struct workqueue_struct *xfsconvertd_workqueue; #define xb_to_km(flags) \ (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) -#define xfs_buf_allocate(flags) \ - kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) -#define xfs_buf_deallocate(bp) \ - kmem_zone_free(xfs_buf_zone, (bp)); static inline int xfs_buf_is_vmapped( @@ -152,6 +147,7 @@ xfs_buf_stale( struct xfs_buf *bp) { bp->b_flags |= XBF_STALE; + xfs_buf_delwri_dequeue(bp); atomic_set(&(bp)->b_lru_ref, 0); if (!list_empty(&bp->b_lru)) { struct xfs_buftarg *btp = bp->b_target; @@ -167,14 +163,19 @@ xfs_buf_stale( ASSERT(atomic_read(&bp->b_hold) >= 1); } -STATIC void -_xfs_buf_initialize( - xfs_buf_t *bp, - xfs_buftarg_t *target, +struct xfs_buf * +xfs_buf_alloc( + struct xfs_buftarg *target, xfs_off_t range_base, size_t range_length, xfs_buf_flags_t flags) { + struct xfs_buf *bp; + + bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)); + if (unlikely(!bp)) + return NULL; + /* * We don't want certain flags to appear in b_flags. */ @@ -203,8 +204,9 @@ _xfs_buf_initialize( init_waitqueue_head(&bp->b_waiters); XFS_STATS_INC(xb_create); - trace_xfs_buf_init(bp, _RET_IP_); + + return bp; } /* @@ -277,7 +279,7 @@ xfs_buf_free( } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); - xfs_buf_deallocate(bp); + kmem_zone_free(xfs_buf_zone, bp); } /* @@ -416,10 +418,7 @@ _xfs_buf_map_pages( /* * Look up, and creates if absent, a lockable buffer for * a given range of an inode. The buffer is returned - * locked. If other overlapping buffers exist, they are - * released before the new buffer is created and locked, - * which may imply that this call will block until those buffers - * are unlocked. No I/O is implied by this call. + * locked. No I/O is implied by this call. */ xfs_buf_t * _xfs_buf_find( @@ -481,8 +480,6 @@ _xfs_buf_find( /* No match found */ if (new_bp) { - _xfs_buf_initialize(new_bp, btp, range_base, - range_length, flags); rb_link_node(&new_bp->b_rbnode, parent, rbp); rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); /* the buffer keeps the perag reference until it is freed */ @@ -525,35 +522,51 @@ found: } /* - * Assembles a buffer covering the specified range. - * Storage in memory for all portions of the buffer will be allocated, - * although backing storage may not be. + * Assembles a buffer covering the specified range. The code is optimised for + * cache hits, as metadata intensive workloads will see 3 orders of magnitude + * more hits than misses. */ -xfs_buf_t * +struct xfs_buf * xfs_buf_get( xfs_buftarg_t *target,/* target for buffer */ xfs_off_t ioff, /* starting offset of range */ size_t isize, /* length of range */ xfs_buf_flags_t flags) { - xfs_buf_t *bp, *new_bp; + struct xfs_buf *bp; + struct xfs_buf *new_bp; int error = 0; - new_bp = xfs_buf_allocate(flags); + bp = _xfs_buf_find(target, ioff, isize, flags, NULL); + if (likely(bp)) + goto found; + + new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, + flags); if (unlikely(!new_bp)) return NULL; bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); + if (!bp) { + kmem_zone_free(xfs_buf_zone, new_bp); + return NULL; + } + if (bp == new_bp) { error = xfs_buf_allocate_memory(bp, flags); if (error) goto no_buffer; - } else { - xfs_buf_deallocate(new_bp); - if (unlikely(bp == NULL)) - return NULL; - } + } else + kmem_zone_free(xfs_buf_zone, new_bp); + /* + * Now we have a workable buffer, fill in the block number so + * that we can do IO on it. + */ + bp->b_bn = ioff; + bp->b_count_desired = bp->b_buffer_length; + +found: if (!(bp->b_flags & XBF_MAPPED)) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { @@ -564,18 +577,10 @@ xfs_buf_get( } XFS_STATS_INC(xb_get); - - /* - * Always fill in the block number now, the mapped cases can do - * their own overlay of this later. - */ - bp->b_bn = ioff; - bp->b_count_desired = bp->b_buffer_length; - trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; - no_buffer: +no_buffer: if (flags & (XBF_LOCK | XBF_TRYLOCK)) xfs_buf_unlock(bp); xfs_buf_rele(bp); @@ -689,19 +694,6 @@ xfs_buf_read_uncached( return bp; } -xfs_buf_t * -xfs_buf_get_empty( - size_t len, - xfs_buftarg_t *target) -{ - xfs_buf_t *bp; - - bp = xfs_buf_allocate(0); - if (bp) - _xfs_buf_initialize(bp, target, 0, len, 0); - return bp; -} - /* * Return a buffer allocated as an empty buffer and associated to external * memory via xfs_buf_associate_memory() back to it's empty state. @@ -787,10 +779,9 @@ xfs_buf_get_uncached( int error, i; xfs_buf_t *bp; - bp = xfs_buf_allocate(0); + bp = xfs_buf_alloc(target, 0, len, 0); if (unlikely(bp == NULL)) goto fail; - _xfs_buf_initialize(bp, target, 0, len, 0); error = _xfs_buf_get_pages(bp, page_count, 0); if (error) @@ -818,7 +809,7 @@ xfs_buf_get_uncached( __free_page(bp->b_pages[i]); _xfs_buf_free_pages(bp); fail_free_buf: - xfs_buf_deallocate(bp); + kmem_zone_free(xfs_buf_zone, bp); fail: return NULL; } @@ -937,12 +928,6 @@ void xfs_buf_unlock( struct xfs_buf *bp) { - if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { - atomic_inc(&bp->b_hold); - bp->b_flags |= XBF_ASYNC; - xfs_buf_delwri_queue(bp, 0); - } - XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1019,9 +1004,19 @@ xfs_buf_ioerror( trace_xfs_buf_ioerror(bp, error, _RET_IP_); } +void +xfs_buf_ioerror_alert( + struct xfs_buf *bp, + const char *func) +{ + xfs_alert(bp->b_target->bt_mount, +"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd", + (__uint64_t)XFS_BUF_ADDR(bp), func, + bp->b_error, XFS_BUF_COUNT(bp)); +} + int xfs_bwrite( - struct xfs_mount *mp, struct xfs_buf *bp) { int error; @@ -1033,25 +1028,13 @@ xfs_bwrite( xfs_bdstrat_cb(bp); error = xfs_buf_iowait(bp); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); + if (error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } return error; } -void -xfs_bdwrite( - void *mp, - struct xfs_buf *bp) -{ - trace_xfs_buf_bdwrite(bp, _RET_IP_); - - bp->b_flags &= ~XBF_READ; - bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); - - xfs_buf_delwri_queue(bp, 1); -} - /* * Called when we want to stop a buffer from getting written or read. * We attach the EIO error, muck with its flags, and call xfs_buf_ioend @@ -1074,9 +1057,8 @@ xfs_bioerror( * We're calling xfs_buf_ioend, so delete XBF_DONE flag. */ XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); @@ -1103,9 +1085,8 @@ xfs_bioerror_relse( * change that interface. */ XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_DONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); bp->b_iodone = NULL; if (!(fl & XBF_ASYNC)) { /* @@ -1115,7 +1096,7 @@ xfs_bioerror_relse( * ASYNC buffers. */ xfs_buf_ioerror(bp, EIO); - XFS_BUF_FINISH_IOWAIT(bp); + complete(&bp->b_iowait); } else { xfs_buf_relse(bp); } @@ -1275,15 +1256,10 @@ xfs_buf_iorequest( { trace_xfs_buf_iorequest(bp, _RET_IP_); - if (bp->b_flags & XBF_DELWRI) { - xfs_buf_delwri_queue(bp, 1); - return 0; - } + ASSERT(!(bp->b_flags & XBF_DELWRI)); - if (bp->b_flags & XBF_WRITE) { + if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); - } - xfs_buf_hold(bp); /* Set the count to 1 initially, this will stop an I/O @@ -1394,7 +1370,7 @@ restart: goto restart; } /* - * clear the LRU reference count so the bufer doesn't get + * clear the LRU reference count so the buffer doesn't get * ignored in xfs_buf_rele(). */ atomic_set(&bp->b_lru_ref, 0); @@ -1481,9 +1457,13 @@ xfs_setsize_buftarg_flags( btp->bt_smask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { + char name[BDEVNAME_SIZE]; + + bdevname(btp->bt_bdev, name); + xfs_warn(btp->bt_mount, "Cannot set_blocksize to %u on device %s\n", - sectorsize, xfs_buf_target_name(btp)); + sectorsize, name); return EINVAL; } @@ -1514,12 +1494,12 @@ xfs_setsize_buftarg( } STATIC int -xfs_alloc_delwrite_queue( +xfs_alloc_delwri_queue( xfs_buftarg_t *btp, const char *fsname) { - INIT_LIST_HEAD(&btp->bt_delwrite_queue); - spin_lock_init(&btp->bt_delwrite_lock); + INIT_LIST_HEAD(&btp->bt_delwri_queue); + spin_lock_init(&btp->bt_delwri_lock); btp->bt_flags = 0; btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); if (IS_ERR(btp->bt_task)) @@ -1549,7 +1529,7 @@ xfs_alloc_buftarg( spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - if (xfs_alloc_delwrite_queue(btp, fsname)) + if (xfs_alloc_delwri_queue(btp, fsname)) goto error; btp->bt_shrinker.shrink = xfs_buftarg_shrink; btp->bt_shrinker.seeks = DEFAULT_SEEKS; @@ -1565,56 +1545,48 @@ error: /* * Delayed write buffer handling */ -STATIC void +void xfs_buf_delwri_queue( - xfs_buf_t *bp, - int unlock) + xfs_buf_t *bp) { - struct list_head *dwq = &bp->b_target->bt_delwrite_queue; - spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; + struct xfs_buftarg *btp = bp->b_target; trace_xfs_buf_delwri_queue(bp, _RET_IP_); - ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); + ASSERT(!(bp->b_flags & XBF_READ)); - spin_lock(dwlk); - /* If already in the queue, dequeue and place at tail */ + spin_lock(&btp->bt_delwri_lock); if (!list_empty(&bp->b_list)) { + /* if already in the queue, move it to the tail */ ASSERT(bp->b_flags & _XBF_DELWRI_Q); - if (unlock) - atomic_dec(&bp->b_hold); - list_del(&bp->b_list); - } - - if (list_empty(dwq)) { + list_move_tail(&bp->b_list, &btp->bt_delwri_queue); + } else { /* start xfsbufd as it is about to have something to do */ - wake_up_process(bp->b_target->bt_task); - } + if (list_empty(&btp->bt_delwri_queue)) + wake_up_process(bp->b_target->bt_task); - bp->b_flags |= _XBF_DELWRI_Q; - list_add_tail(&bp->b_list, dwq); + atomic_inc(&bp->b_hold); + bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; + list_add_tail(&bp->b_list, &btp->bt_delwri_queue); + } bp->b_queuetime = jiffies; - spin_unlock(dwlk); - - if (unlock) - xfs_buf_unlock(bp); + spin_unlock(&btp->bt_delwri_lock); } void xfs_buf_delwri_dequeue( xfs_buf_t *bp) { - spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; int dequeued = 0; - spin_lock(dwlk); + spin_lock(&bp->b_target->bt_delwri_lock); if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { ASSERT(bp->b_flags & _XBF_DELWRI_Q); list_del_init(&bp->b_list); dequeued = 1; } bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - spin_unlock(dwlk); + spin_unlock(&bp->b_target->bt_delwri_lock); if (dequeued) xfs_buf_rele(bp); @@ -1646,16 +1618,9 @@ xfs_buf_delwri_promote( if (bp->b_queuetime < jiffies - age) return; bp->b_queuetime = jiffies - age; - spin_lock(&btp->bt_delwrite_lock); - list_move(&bp->b_list, &btp->bt_delwrite_queue); - spin_unlock(&btp->bt_delwrite_lock); -} - -STATIC void -xfs_buf_runall_queues( - struct workqueue_struct *queue) -{ - flush_workqueue(queue); + spin_lock(&btp->bt_delwri_lock); + list_move(&bp->b_list, &btp->bt_delwri_queue); + spin_unlock(&btp->bt_delwri_lock); } /* @@ -1669,15 +1634,13 @@ xfs_buf_delwri_split( unsigned long age) { xfs_buf_t *bp, *n; - struct list_head *dwq = &target->bt_delwrite_queue; - spinlock_t *dwlk = &target->bt_delwrite_lock; int skipped = 0; int force; force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); INIT_LIST_HEAD(list); - spin_lock(dwlk); - list_for_each_entry_safe(bp, n, dwq, b_list) { + spin_lock(&target->bt_delwri_lock); + list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) { ASSERT(bp->b_flags & XBF_DELWRI); if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { @@ -1694,10 +1657,9 @@ xfs_buf_delwri_split( } else skipped++; } - spin_unlock(dwlk); + spin_unlock(&target->bt_delwri_lock); return skipped; - } /* @@ -1739,15 +1701,11 @@ xfsbufd( struct list_head tmp; struct blk_plug plug; - if (unlikely(freezing(current))) { - set_bit(XBT_FORCE_SLEEP, &target->bt_flags); - refrigerator(); - } else { - clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); - } + if (unlikely(freezing(current))) + try_to_freeze(); /* sleep for a long time if there is nothing to do. */ - if (list_empty(&target->bt_delwrite_queue)) + if (list_empty(&target->bt_delwri_queue)) tout = MAX_SCHEDULE_TIMEOUT; schedule_timeout_interruptible(tout); @@ -1783,9 +1741,7 @@ xfs_flush_buftarg( LIST_HEAD(wait_list); struct blk_plug plug; - xfs_buf_runall_queues(xfsconvertd_workqueue); - xfs_buf_runall_queues(xfsdatad_workqueue); - xfs_buf_runall_queues(xfslogd_workqueue); + flush_workqueue(xfslogd_workqueue); set_bit(XBT_FORCE_FLUSH, &target->bt_flags); pincount = xfs_buf_delwri_split(target, &tmp_list, 0); @@ -1866,11 +1822,3 @@ xfs_buf_terminate(void) destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); } - -#ifdef CONFIG_KDB_MODULES -struct list_head * -xfs_get_buftarg_list(void) -{ - return &xfs_buftarg_list; -} -#endif diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 620972b..df7ffb0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -90,8 +90,7 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_DELWRI_Q, "DELWRI_Q" } typedef enum { - XBT_FORCE_SLEEP = 0, - XBT_FORCE_FLUSH = 1, + XBT_FORCE_FLUSH = 0, } xfs_buftarg_flags_t; typedef struct xfs_buftarg { @@ -105,8 +104,8 @@ typedef struct xfs_buftarg { /* per device delwri queue */ struct task_struct *bt_task; - struct list_head bt_delwrite_queue; - spinlock_t bt_delwrite_lock; + struct list_head bt_delwri_queue; + spinlock_t bt_delwri_lock; unsigned long bt_flags; /* LRU control structures */ @@ -175,7 +174,8 @@ extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); -extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); +struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t, + xfs_buf_flags_t); extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); @@ -197,14 +197,14 @@ extern void xfs_buf_unlock(xfs_buf_t *); ((bp)->b_sema.count <= 0) /* Buffer Read and Write Routines */ -extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); -extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); +extern int xfs_bwrite(struct xfs_buf *bp); extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); extern int xfs_bdstrat_cb(struct xfs_buf *); extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); extern int xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, @@ -221,38 +221,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp) extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); /* Delayed Write Buffer Routines */ -extern void xfs_buf_delwri_dequeue(xfs_buf_t *); -extern void xfs_buf_delwri_promote(xfs_buf_t *); +extern void xfs_buf_delwri_queue(struct xfs_buf *); +extern void xfs_buf_delwri_dequeue(struct xfs_buf *); +extern void xfs_buf_delwri_promote(struct xfs_buf *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); -static inline const char * -xfs_buf_target_name(struct xfs_buftarg *target) -{ - static char __b[BDEVNAME_SIZE]; - - return bdevname(target->bt_bdev, __b); -} - - #define XFS_BUF_ZEROFLAGS(bp) \ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) void xfs_buf_stale(struct xfs_buf *bp); -#define XFS_BUF_STALE(bp) xfs_buf_stale(bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_SUPER_STALE(bp) do { \ - XFS_BUF_STALE(bp); \ - xfs_buf_delwri_dequeue(bp); \ - XFS_BUF_DONE(bp); \ - } while (0) - -#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) -#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) + #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) @@ -280,23 +264,16 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) -static inline void -xfs_buf_set_ref( - struct xfs_buf *bp, - int lru_ref) +static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { atomic_set(&bp->b_lru_ref, lru_ref); } -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) -#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) static inline int xfs_buf_ispinned(struct xfs_buf *bp) { return atomic_read(&bp->b_pin_count); } -#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); - static inline void xfs_buf_relse(xfs_buf_t *bp) { xfs_buf_unlock(bp); @@ -313,14 +290,7 @@ extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); -#ifdef CONFIG_KDB_MODULES -extern struct list_head *xfs_get_buftarg_list(void); -#endif - #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1) -#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) - #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index ef43fce..eac97ef 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -656,7 +656,7 @@ xfs_buf_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_buf_item_ops = { +static const struct xfs_item_ops xfs_buf_item_ops = { .iop_size = xfs_buf_item_size, .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, @@ -967,7 +967,8 @@ xfs_buf_iodone_callbacks( * I/O errors, there's no point in giving this a retry. */ if (XFS_FORCED_SHUTDOWN(mp)) { - XFS_BUF_SUPER_STALE(bp); + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); trace_xfs_buf_item_iodone(bp, _RET_IP_); goto do_callbacks; } @@ -975,9 +976,7 @@ xfs_buf_iodone_callbacks( if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; - xfs_alert(mp, "Device %s: metadata write error block 0x%llx", - xfs_buf_target_name(bp->b_target), - (__uint64_t)XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); } lasttarg = bp->b_target; @@ -993,7 +992,7 @@ xfs_buf_iodone_callbacks( xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ if (!XFS_BUF_ISSTALE(bp)) { - XFS_BUF_DELAYWRITE(bp); + xfs_buf_delwri_queue(bp); XFS_BUF_DONE(bp); } ASSERT(bp->b_iodone != NULL); @@ -1006,9 +1005,8 @@ xfs_buf_iodone_callbacks( * If the write of the buffer was synchronous, we want to make * sure to return the error to the caller of xfs_bwrite(). */ - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); XFS_BUF_DONE(bp); - XFS_BUF_UNDELAYWRITE(bp); trace_xfs_buf_error_relse(bp, _RET_IP_); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index ee9d542..77c7425 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1578,9 +1578,8 @@ xfs_da_grow_inode_int( */ nmap = 1; ASSERT(args->firstblock != NULL); - error = xfs_bmapi(tp, dp, *bno, count, - xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| - XFS_BMAPI_CONTIG, + error = xfs_bmapi_write(tp, dp, *bno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, args->flist); if (error) @@ -1602,9 +1601,8 @@ xfs_da_grow_inode_int( for (b = *bno, mapi = 0; b < *bno + count; ) { nmap = MIN(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); - error = xfs_bmapi(tp, dp, b, c, - xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| - XFS_BMAPI_METADATA, + error = xfs_bmapi_write(tp, dp, b, c, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->firstblock, args->total, &mapp[mapi], &nmap, args->flist); if (error) @@ -1975,33 +1973,16 @@ xfs_da_do_buf( /* * Optimize the one-block case. */ - if (nfsb == 1) { - xfs_fsblock_t fsb; - - if ((error = - xfs_bmapi_single(trans, dp, whichfork, &fsb, - (xfs_fileoff_t)bno))) { - return error; - } + if (nfsb == 1) mapp = ↦ - if (fsb == NULLFSBLOCK) { - nmap = 0; - } else { - map.br_startblock = fsb; - map.br_startoff = (xfs_fileoff_t)bno; - map.br_blockcount = 1; - nmap = 1; - } - } else { + else mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); - nmap = nfsb; - if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, - nfsb, - XFS_BMAPI_METADATA | - xfs_bmapi_aflag(whichfork), - NULL, 0, mapp, &nmap, NULL))) - goto exit0; - } + + nmap = nfsb; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp, + &nmap, xfs_bmapi_aflag(whichfork)); + if (error) + goto exit0; } else { map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); map.br_startoff = (xfs_fileoff_t)bno; @@ -2072,13 +2053,10 @@ xfs_da_do_buf( if (!bp) continue; if (caller == 1) { - if (whichfork == XFS_ATTR_FORK) { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE, - XFS_ATTR_BTREE_REF); - } else { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE, - XFS_DIR_BTREE_REF); - } + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); + else + xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); } if (bplist) { bplist[nbplist++] = bp; diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 9a84a85..654dc6f 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -425,8 +425,8 @@ xfs_swap_extents( } - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_log_inode(tp, ip, ilf_fields); xfs_trans_log_inode(tp, tip, tilf_fields); @@ -438,7 +438,7 @@ xfs_swap_extents( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); + error = xfs_trans_commit(tp, 0); trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index ca2386d..66e108f 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -888,12 +888,10 @@ xfs_dir2_leaf_getdents( * we already have in the table. */ nmap = map_size - map_valid; - error = xfs_bmapi(NULL, dp, - map_off, + error = xfs_bmapi_read(dp, map_off, xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - map_off, - XFS_BMAPI_METADATA, NULL, 0, - &map[map_valid], &nmap, NULL); + &map[map_valid], &nmap, 0); /* * Don't know if we should ignore this or * try to return an error. diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 244e797..286a051 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -38,7 +38,7 @@ xfs_trim_extents( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_fsblock_t start, - xfs_fsblock_t len, + xfs_fsblock_t end, xfs_fsblock_t minlen, __uint64_t *blocks_trimmed) { @@ -68,7 +68,7 @@ xfs_trim_extents( * Look up the longest btree in the AGF and start with it. */ error = xfs_alloc_lookup_le(cur, 0, - XFS_BUF_TO_AGF(agbp)->agf_longest, &i); + be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); if (error) goto out_del_cursor; @@ -84,7 +84,7 @@ xfs_trim_extents( if (error) goto out_del_cursor; XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); - ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); + ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); /* * Too small? Give up. @@ -100,7 +100,7 @@ xfs_trim_extents( * down partially overlapping ranges for now. */ if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || - XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + XFS_AGB_TO_FSB(mp, agno, fbno) > end) { trace_xfs_discard_exclude(mp, agno, fbno, flen); goto next_extent; } @@ -145,7 +145,7 @@ xfs_ioc_trim( struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; - xfs_fsblock_t start, len, minlen; + xfs_fsblock_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; __uint64_t blocks_trimmed = 0; int error, last_error = 0; @@ -165,19 +165,19 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface. */ start = XFS_B_TO_FSBT(mp, range.start); - len = XFS_B_TO_FSBT(mp, range.len); + end = start + XFS_B_TO_FSBT(mp, range.len) - 1; minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); - start_agno = XFS_FSB_TO_AGNO(mp, start); - if (start_agno >= mp->m_sb.sb_agcount) + if (start >= mp->m_sb.sb_dblocks) return -XFS_ERROR(EINVAL); + if (end > mp->m_sb.sb_dblocks - 1) + end = mp->m_sb.sb_dblocks - 1; - end_agno = XFS_FSB_TO_AGNO(mp, start + len); - if (end_agno >= mp->m_sb.sb_agcount) - end_agno = mp->m_sb.sb_agcount - 1; + start_agno = XFS_FSB_TO_AGNO(mp, start); + end_agno = XFS_FSB_TO_AGNO(mp, end); for (agno = start_agno; agno <= end_agno; agno++) { - error = -xfs_trim_extents(mp, agno, start, len, minlen, + error = -xfs_trim_extents(mp, agno, start, end, minlen, &blocks_trimmed); if (error) last_error = error; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index db62959..b4ff40b 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -39,20 +39,19 @@ #include "xfs_qm.h" #include "xfs_trace.h" - /* - LOCK ORDER - - inode lock (ilock) - dquot hash-chain lock (hashlock) - xqm dquot freelist lock (freelistlock - mount's dquot list lock (mplistlock) - user dquot lock - lock ordering among dquots is based on the uid or gid - group dquot lock - similar to udquots. Between the two dquots, the udquot - has to be locked first. - pin lock - the dquot lock must be held to take this lock. - flush lock - ditto. -*/ + * Lock order: + * + * ip->i_lock + * qh->qh_lock + * qi->qi_dqlist_lock + * dquot->q_qlock (xfs_dqlock() and friends) + * dquot->q_flush (xfs_dqflock() and friends) + * xfs_Gqm->qm_dqfrlist_lock + * + * If two dquots need to be locked the order is user before group/project, + * otherwise by the lowest id first, see xfs_dqlock2. + */ #ifdef DEBUG xfs_buftarg_t *xfs_dqerror_target; @@ -155,24 +154,6 @@ xfs_qm_dqdestroy( } /* - * This is what a 'fresh' dquot inside a dquot chunk looks like on disk. - */ -STATIC void -xfs_qm_dqinit_core( - xfs_dqid_t id, - uint type, - xfs_dqblk_t *d) -{ - /* - * Caller has zero'd the entire dquot 'chunk' already. - */ - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_id = cpu_to_be32(id); - d->dd_diskdq.d_flags = type; -} - -/* * If default limits are in force, push them into the dquot now. * We overwrite the dquot limits only if they are zero and this * is not the root dquot. @@ -328,8 +309,13 @@ xfs_qm_init_dquot_blk( curid = id - (id % q->qi_dqperchunk); ASSERT(curid >= 0); memset(d, 0, BBTOB(q->qi_dqchunklen)); - for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) - xfs_qm_dqinit_core(curid, type, d); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_id = cpu_to_be32(curid); + d->dd_diskdq.d_flags = type; + } + xfs_trans_dquot_buf(tp, bp, (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : @@ -377,16 +363,14 @@ xfs_qm_dqalloc( return (ESRCH); } - xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); nmaps = 1; - if ((error = xfs_bmapi(tp, quotip, - offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, - XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, - &firstblock, - XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &flist))) { + error = xfs_bmapi_write(tp, quotip, offset_fsb, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &flist); + if (error) goto error0; - } ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -402,8 +386,11 @@ xfs_qm_dqalloc( dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp || (error = xfs_buf_geterror(bp))) + + error = xfs_buf_geterror(bp); + if (error) goto error1; + /* * Make a chunk of dquots out of this buffer and log * the entire thing. @@ -485,9 +472,8 @@ xfs_qm_dqtobp( /* * Find the block map; no allocations yet */ - error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, - XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, - NULL, 0, &map, &nmaps, NULL); + error = xfs_bmapi_read(quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); xfs_iunlock(quotip, XFS_ILOCK_SHARED); if (error) @@ -564,36 +550,62 @@ xfs_qm_dqtobp( * Read in the ondisk dquot using dqtobp() then copy it to an incore version, * and release the buffer immediately. * + * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed. */ -/* ARGSUSED */ -STATIC int +int xfs_qm_dqread( - xfs_trans_t **tpp, - xfs_dqid_t id, - xfs_dquot_t *dqp, /* dquot to get filled in */ - uint flags) + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + uint flags, + struct xfs_dquot **O_dqpp) { - xfs_disk_dquot_t *ddqp; - xfs_buf_t *bp; - int error; - xfs_trans_t *tp; + struct xfs_dquot *dqp; + struct xfs_disk_dquot *ddqp; + struct xfs_buf *bp; + struct xfs_trans *tp = NULL; + int error; + int cancelflags = 0; - ASSERT(tpp); + dqp = xfs_qm_dqinit(mp, id, type); trace_xfs_dqread(dqp); + if (flags & XFS_QMOPT_DQALLOC) { + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); + error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), + XFS_WRITE_LOG_RES(mp) + + /* + * Round the chunklen up to the next multiple + * of 128 (buf log item chunk size)). + */ + BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128, + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) + goto error1; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + } + /* * get a pointer to the on-disk dquot and the buffer containing it * dqp already knows its own type (GROUP/USER). */ - if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) { - return (error); + error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags); + if (error) { + /* + * This can happen if quotas got turned off (ESRCH), + * or if the dquot didn't exist on disk and we ask to + * allocate (ENOENT). + */ + trace_xfs_dqread_fail(dqp); + cancelflags |= XFS_TRANS_ABORT; + goto error1; } - tp = *tpp; /* copy everything from disk dquot to the incore dquot */ memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); - ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); xfs_qm_dquot_logitem_init(dqp); /* @@ -605,7 +617,7 @@ xfs_qm_dqread( dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); /* Mark the buf so that this will stay incore a little longer */ - XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) @@ -622,77 +634,22 @@ xfs_qm_dqread( ASSERT(xfs_buf_islocked(bp)); xfs_trans_brelse(tp, bp); - return (error); -} - - -/* - * allocate an incore dquot from the kernel heap, - * and fill its core with quota information kept on disk. - * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk - * if it wasn't already allocated. - */ -STATIC int -xfs_qm_idtodq( - xfs_mount_t *mp, - xfs_dqid_t id, /* gid or uid, depending on type */ - uint type, /* UDQUOT or GDQUOT */ - uint flags, /* DQALLOC, DQREPAIR */ - xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */ -{ - xfs_dquot_t *dqp; - int error; - xfs_trans_t *tp; - int cancelflags=0; - - dqp = xfs_qm_dqinit(mp, id, type); - tp = NULL; - if (flags & XFS_QMOPT_DQALLOC) { - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + - 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - if (error) { - cancelflags = 0; - goto error0; - } - cancelflags = XFS_TRANS_RELEASE_LOG_RES; - } - - /* - * Read it from disk; xfs_dqread() takes care of - * all the necessary initialization of dquot's fields (locks, etc) - */ - if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) { - /* - * This can happen if quotas got turned off (ESRCH), - * or if the dquot didn't exist on disk and we ask to - * allocate (ENOENT). - */ - trace_xfs_dqread_fail(dqp); - cancelflags |= XFS_TRANS_ABORT; - goto error0; - } if (tp) { - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) - goto error1; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error0; } *O_dqpp = dqp; - return (0); + return error; - error0: - ASSERT(error); +error1: if (tp) xfs_trans_cancel(tp, cancelflags); - error1: +error0: xfs_qm_dqdestroy(dqp); *O_dqpp = NULL; - return (error); + return error; } /* @@ -710,12 +667,9 @@ xfs_qm_dqlookup( xfs_dquot_t **O_dqpp) { xfs_dquot_t *dqp; - uint flist_locked; ASSERT(mutex_is_locked(&qh->qh_lock)); - flist_locked = B_FALSE; - /* * Traverse the hashchain looking for a match */ @@ -725,70 +679,31 @@ xfs_qm_dqlookup( * dqlock to look at the id field of the dquot, since the * id can't be modified without the hashlock anyway. */ - if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) { - trace_xfs_dqlookup_found(dqp); + if (be32_to_cpu(dqp->q_core.d_id) != id || dqp->q_mount != mp) + continue; - /* - * All in core dquots must be on the dqlist of mp - */ - ASSERT(!list_empty(&dqp->q_mplist)); - - xfs_dqlock(dqp); - if (dqp->q_nrefs == 0) { - ASSERT(!list_empty(&dqp->q_freelist)); - if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { - trace_xfs_dqlookup_want(dqp); - - /* - * We may have raced with dqreclaim_one() - * (and lost). So, flag that we don't - * want the dquot to be reclaimed. - */ - dqp->dq_flags |= XFS_DQ_WANT; - xfs_dqunlock(dqp); - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - xfs_dqlock(dqp); - dqp->dq_flags &= ~(XFS_DQ_WANT); - } - flist_locked = B_TRUE; - } + trace_xfs_dqlookup_found(dqp); - /* - * id couldn't have changed; we had the hashlock all - * along - */ - ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); - - if (flist_locked) { - if (dqp->q_nrefs != 0) { - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - flist_locked = B_FALSE; - } else { - /* take it off the freelist */ - trace_xfs_dqlookup_freelist(dqp); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - } - } + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + *O_dqpp = NULL; + xfs_dqunlock(dqp); + return -1; + } - XFS_DQHOLD(dqp); + dqp->q_nrefs++; - if (flist_locked) - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - /* - * move the dquot to the front of the hashchain - */ - ASSERT(mutex_is_locked(&qh->qh_lock)); - list_move(&dqp->q_hashlist, &qh->qh_list); - trace_xfs_dqlookup_done(dqp); - *O_dqpp = dqp; - return 0; - } + /* + * move the dquot to the front of the hashchain + */ + list_move(&dqp->q_hashlist, &qh->qh_list); + trace_xfs_dqlookup_done(dqp); + *O_dqpp = dqp; + return 0; } *O_dqpp = NULL; - ASSERT(mutex_is_locked(&qh->qh_lock)); - return (1); + return 1; } /* @@ -829,11 +744,7 @@ xfs_qm_dqget( return (EIO); } } -#endif - again: - -#ifdef DEBUG ASSERT(type == XFS_DQ_USER || type == XFS_DQ_PROJ || type == XFS_DQ_GROUP); @@ -845,13 +756,21 @@ xfs_qm_dqget( ASSERT(ip->i_gdquot == NULL); } #endif + +restart: mutex_lock(&h->qh_lock); /* * Look in the cache (hashtable). * The chain is kept locked during lookup. */ - if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) { + switch (xfs_qm_dqlookup(mp, id, h, O_dqpp)) { + case -1: + XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); + mutex_unlock(&h->qh_lock); + delay(1); + goto restart; + case 0: XQM_STATS_INC(xqmstats.xs_qm_dqcachehits); /* * The dquot was found, moved to the front of the chain, @@ -862,9 +781,11 @@ xfs_qm_dqget( ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); mutex_unlock(&h->qh_lock); trace_xfs_dqget_hit(*O_dqpp); - return (0); /* success */ + return 0; /* success */ + default: + XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); + break; } - XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); /* * Dquot cache miss. We don't want to keep the inode lock across @@ -882,41 +803,18 @@ xfs_qm_dqget( version = h->qh_version; mutex_unlock(&h->qh_lock); - /* - * Allocate the dquot on the kernel heap, and read the ondisk - * portion off the disk. Also, do all the necessary initialization - * This can return ENOENT if dquot didn't exist on disk and we didn't - * ask it to allocate; ESRCH if quotas got turned off suddenly. - */ - if ((error = xfs_qm_idtodq(mp, id, type, - flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR| - XFS_QMOPT_DOWARN), - &dqp))) { - if (ip) - xfs_ilock(ip, XFS_ILOCK_EXCL); - return (error); - } + error = xfs_qm_dqread(mp, id, type, flags, &dqp); - /* - * See if this is mount code calling to look at the overall quota limits - * which are stored in the id == 0 user or group's dquot. - * Since we may not have done a quotacheck by this point, just return - * the dquot without attaching it to any hashtables, lists, etc, or even - * taking a reference. - * The caller must dqdestroy this once done. - */ - if (flags & XFS_QMOPT_DQSUSER) { - ASSERT(id == 0); - ASSERT(! ip); - goto dqret; - } + if (ip) + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (error) + return error; /* * Dquot lock comes after hashlock in the lock ordering */ if (ip) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - /* * A dquot could be attached to this inode by now, since * we had dropped the ilock. @@ -961,16 +859,21 @@ xfs_qm_dqget( * lock order between the two dquots here since dqp isn't * on any findable lists yet. */ - if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) { + switch (xfs_qm_dqlookup(mp, id, h, &tmpdqp)) { + case 0: + case -1: /* - * Duplicate found. Just throw away the new dquot - * and start over. + * Duplicate found, either in cache or on its way out. + * Just throw away the new dquot and start over. */ - xfs_qm_dqput(tmpdqp); + if (tmpdqp) + xfs_qm_dqput(tmpdqp); mutex_unlock(&h->qh_lock); xfs_qm_dqdestroy(dqp); XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); - goto again; + goto restart; + default: + break; } } @@ -1015,67 +918,49 @@ xfs_qm_dqget( */ void xfs_qm_dqput( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { - xfs_dquot_t *gdqp; + struct xfs_dquot *gdqp; ASSERT(dqp->q_nrefs > 0); ASSERT(XFS_DQ_IS_LOCKED(dqp)); trace_xfs_dqput(dqp); - if (dqp->q_nrefs != 1) { - dqp->q_nrefs--; +recurse: + if (--dqp->q_nrefs > 0) { xfs_dqunlock(dqp); return; } + trace_xfs_dqput_free(dqp); + + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + if (list_empty(&dqp->q_freelist)) { + list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); + xfs_Gqm->qm_dqfrlist_cnt++; + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + /* - * drop the dqlock and acquire the freelist and dqlock - * in the right order; but try to get it out-of-order first + * If we just added a udquot to the freelist, then we want to release + * the gdquot reference that it (probably) has. Otherwise it'll keep + * the gdquot from getting reclaimed. */ - if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { - trace_xfs_dqput_wait(dqp); - xfs_dqunlock(dqp); - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - xfs_dqlock(dqp); + gdqp = dqp->q_gdquot; + if (gdqp) { + xfs_dqlock(gdqp); + dqp->q_gdquot = NULL; } + xfs_dqunlock(dqp); - while (1) { - gdqp = NULL; - - /* We can't depend on nrefs being == 1 here */ - if (--dqp->q_nrefs == 0) { - trace_xfs_dqput_free(dqp); - - list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); - xfs_Gqm->qm_dqfrlist_cnt++; - - /* - * If we just added a udquot to the freelist, then - * we want to release the gdquot reference that - * it (probably) has. Otherwise it'll keep the - * gdquot from getting reclaimed. - */ - if ((gdqp = dqp->q_gdquot)) { - /* - * Avoid a recursive dqput call - */ - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - } - xfs_dqunlock(dqp); - - /* - * If we had a group quota inside the user quota as a hint, - * release it now. - */ - if (! gdqp) - break; + /* + * If we had a group quota hint, release it now. + */ + if (gdqp) { dqp = gdqp; + goto recurse; } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); } /* @@ -1169,7 +1054,7 @@ xfs_qm_dqflush( * If not dirty, or it's pinned and we are not supposed to block, nada. */ if (!XFS_DQ_IS_DIRTY(dqp) || - (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { + ((flags & SYNC_TRYLOCK) && atomic_read(&dqp->q_pincount) > 0)) { xfs_dqfunlock(dqp); return 0; } @@ -1242,9 +1127,11 @@ xfs_qm_dqflush( } if (flags & SYNC_WAIT) - error = xfs_bwrite(mp, bp); + error = xfs_bwrite(bp); else - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + + xfs_buf_relse(bp); trace_xfs_dqflush_done(dqp); @@ -1255,40 +1142,17 @@ xfs_qm_dqflush( } -int -xfs_qm_dqlock_nowait( - xfs_dquot_t *dqp) -{ - return mutex_trylock(&dqp->q_qlock); -} - -void -xfs_dqlock( - xfs_dquot_t *dqp) -{ - mutex_lock(&dqp->q_qlock); -} - void xfs_dqunlock( xfs_dquot_t *dqp) { - mutex_unlock(&(dqp->q_qlock)); + xfs_dqunlock_nonotify(dqp); if (dqp->q_logitem.qli_dquot == dqp) { - /* Once was dqp->q_mount, but might just have been cleared */ xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp, - (xfs_log_item_t*)&(dqp->q_logitem)); + &dqp->q_logitem.qli_item); } } - -void -xfs_dqunlock_nonotify( - xfs_dquot_t *dqp) -{ - mutex_unlock(&(dqp->q_qlock)); -} - /* * Lock two xfs_dquot structures. * @@ -1317,43 +1181,18 @@ xfs_dqlock2( } } - /* - * Take a dquot out of the mount's dqlist as well as the hashlist. - * This is called via unmount as well as quotaoff, and the purge - * will always succeed unless there are soft (temp) references - * outstanding. - * - * This returns 0 if it was purged, 1 if it wasn't. It's not an error code - * that we're returning! XXXsup - not cool. + * Take a dquot out of the mount's dqlist as well as the hashlist. This is + * called via unmount as well as quotaoff, and the purge will always succeed. */ -/* ARGSUSED */ -int +void xfs_qm_dqpurge( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { - xfs_dqhash_t *qh = dqp->q_hash; - xfs_mount_t *mp = dqp->q_mount; - - ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock)); - ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); + struct xfs_mount *mp = dqp->q_mount; + struct xfs_dqhash *qh = dqp->q_hash; xfs_dqlock(dqp); - /* - * We really can't afford to purge a dquot that is - * referenced, because these are hard refs. - * It shouldn't happen in general because we went thru _all_ inodes in - * dqrele_all_inodes before calling this and didn't let the mountlock go. - * However it is possible that we have dquots with temporary - * references that are not attached to an inode. e.g. see xfs_setattr(). - */ - if (dqp->q_nrefs != 0) { - xfs_dqunlock(dqp); - mutex_unlock(&dqp->q_hash->qh_lock); - return (1); - } - - ASSERT(!list_empty(&dqp->q_freelist)); /* * If we're turning off quotas, we have to make sure that, for @@ -1368,23 +1207,18 @@ xfs_qm_dqpurge( * Block on the flush lock after nudging dquot buffer, * if it is incore. */ - xfs_qm_dqflock_pushbuf_wait(dqp); + xfs_dqflock_pushbuf_wait(dqp); } /* - * XXXIf we're turning this type of quotas off, we don't care + * If we are turning this type of quotas off, we don't care * about the dirty metadata sitting in this dquot. OTOH, if * we're unmounting, we do care, so we flush it and wait. */ if (XFS_DQ_IS_DIRTY(dqp)) { int error; - /* dqflush unlocks dqflock */ /* - * Given that dqpurge is a very rare occurrence, it is OK - * that we're holding the hashlist and mplist locks - * across the disk write. But, ... XXXsup - * * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ @@ -1394,38 +1228,44 @@ xfs_qm_dqpurge( __func__, dqp); xfs_dqflock(dqp); } + ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(XFS_FORCED_SHUTDOWN(mp) || !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + + mutex_lock(&qh->qh_lock); list_del_init(&dqp->q_hashlist); qh->qh_version++; + mutex_unlock(&qh->qh_lock); + + mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); list_del_init(&dqp->q_mplist); mp->m_quotainfo->qi_dqreclaims++; mp->m_quotainfo->qi_dquots--; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); + /* - * XXX Move this to the front of the freelist, if we can get the - * freelist lock. + * We move dquots to the freelist as soon as their reference count + * hits zero, so it really should be on the freelist here. */ + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); ASSERT(!list_empty(&dqp->q_freelist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - dqp->q_mount = NULL; - dqp->q_hash = NULL; - dqp->dq_flags = XFS_DQ_INACTIVE; - memset(&dqp->q_core, 0, sizeof(dqp->q_core)); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - mutex_unlock(&qh->qh_lock); - return (0); + xfs_qm_dqdestroy(dqp); } - /* * Give the buffer a little push if it is incore and * wait on the flush lock. */ void -xfs_qm_dqflock_pushbuf_wait( +xfs_dqflock_pushbuf_wait( xfs_dquot_t *dqp) { xfs_mount_t *mp = dqp->q_mount; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 34b7e94..a1d91d8 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -80,8 +80,6 @@ enum { XFS_QLOCK_NESTED, }; -#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) - /* * Manage the q_flush completion queue embedded in the dquot. This completion * queue synchronizes processes attempting to flush the in-core dquot back to @@ -102,6 +100,21 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) complete(&dqp->q_flush); } +static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp) +{ + return mutex_trylock(&dqp->q_qlock); +} + +static inline void xfs_dqlock(struct xfs_dquot *dqp) +{ + mutex_lock(&dqp->q_qlock); +} + +static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp) +{ + mutex_unlock(&dqp->q_qlock); +} + #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) @@ -116,12 +129,12 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ (XFS_IS_OQUOTA_ON((d)->q_mount)))) +extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, + uint, struct xfs_dquot **); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(xfs_dquot_t *, uint); -extern int xfs_qm_dqpurge(xfs_dquot_t *); +extern void xfs_qm_dqpurge(xfs_dquot_t *); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); -extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); -extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, @@ -129,9 +142,17 @@ extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, xfs_dqid_t, uint, uint, xfs_dquot_t **); extern void xfs_qm_dqput(xfs_dquot_t *); -extern void xfs_dqlock(xfs_dquot_t *); -extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *); -extern void xfs_dqunlock(xfs_dquot_t *); -extern void xfs_dqunlock_nonotify(xfs_dquot_t *); + +extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); +extern void xfs_dqunlock(struct xfs_dquot *); +extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp); + +static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) +{ + xfs_dqlock(dqp); + dqp->q_nrefs++; + xfs_dqunlock(dqp); + return dqp; +} #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index bb3f71d..34baeae 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -73,7 +73,6 @@ xfs_qm_dquot_logitem_format( logvec->i_len = sizeof(xfs_disk_dquot_t); logvec->i_type = XLOG_REG_TYPE_DQUOT; - ASSERT(2 == lip->li_desc->lid_size); qlip->qli_format.qlf_size = 2; } @@ -134,7 +133,7 @@ xfs_qm_dquot_logitem_push( * lock without sleeping, then there must not have been * anyone in the process of flushing the dquot. */ - error = xfs_qm_dqflush(dqp, 0); + error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK); if (error) xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", __func__, error, dqp); @@ -237,7 +236,7 @@ xfs_qm_dquot_logitem_trylock( if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; - if (!xfs_qm_dqlock_nowait(dqp)) + if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; if (!xfs_dqflock_nowait(dqp)) { @@ -295,7 +294,7 @@ xfs_qm_dquot_logitem_committing( /* * This is the ops vector for dquots */ -static struct xfs_item_ops xfs_dquot_item_ops = { +static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, @@ -483,7 +482,7 @@ xfs_qm_qoff_logitem_committing( { } -static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, @@ -498,7 +497,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { /* * This is the ops vector shared by all quotaoff-start log items. */ -static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 75e5d32..558910f 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -98,22 +98,22 @@ xfs_fs_encode_fh( switch (fileid_type) { case FILEID_INO32_GEN_PARENT: spin_lock(&dentry->d_lock); - fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino; + fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; spin_unlock(&dentry->d_lock); /*FALLTHRU*/ case FILEID_INO32_GEN: - fid->i32.ino = inode->i_ino; + fid->i32.ino = XFS_I(inode)->i_ino; fid->i32.gen = inode->i_generation; break; case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: spin_lock(&dentry->d_lock); - fid64->parent_ino = dentry->d_parent->d_inode->i_ino; + fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; fid64->parent_gen = dentry->d_parent->d_inode->i_generation; spin_unlock(&dentry->d_lock); /*FALLTHRU*/ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: - fid64->ino = inode->i_ino; + fid64->ino = XFS_I(inode)->i_ino; fid64->gen = inode->i_generation; break; } @@ -229,16 +229,16 @@ xfs_fs_nfs_commit_metadata( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - int error = 0; + xfs_lsn_t lsn = 0; xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, NULL); - } + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d22e626..35c2aff 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -217,7 +217,7 @@ xfs_efi_item_committing( /* * This is the ops vector shared by all efi log items. */ -static struct xfs_item_ops xfs_efi_item_ops = { +static const struct xfs_item_ops xfs_efi_item_ops = { .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_pin = xfs_efi_item_pin, @@ -477,7 +477,7 @@ xfs_efd_item_committing( /* * This is the ops vector shared by all efd log items. */ -static struct xfs_item_ops xfs_efd_item_ops = { +static const struct xfs_item_ops xfs_efd_item_ops = { .iop_size = xfs_efd_item_size, .iop_format = xfs_efd_item_format, .iop_pin = xfs_efd_item_pin, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7f7b424..f675f3d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -124,6 +124,35 @@ xfs_iozero( return (-status); } +/* + * Fsync operations on directories are much simpler than on regular files, + * as there is no file data to flush, and thus also no need for explicit + * cache flush operations, and there are no non-transaction metadata updates + * on directories either. + */ +STATIC int +xfs_dir_fsync( + struct file *file, + loff_t start, + loff_t end, + int datasync) +{ + struct xfs_inode *ip = XFS_I(file->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + xfs_lsn_t lsn = 0; + + trace_xfs_dir_fsync(ip); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); +} + STATIC int xfs_file_fsync( struct file *file, @@ -137,6 +166,7 @@ xfs_file_fsync( struct xfs_trans *tp; int error = 0; int log_flushed = 0; + xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); @@ -149,10 +179,6 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - xfs_ioend_wait(ip); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (mp->m_flags & XFS_MOUNT_BARRIER) { /* * If we have an RT and/or log subvolume we need to make sure @@ -183,10 +209,10 @@ xfs_file_fsync( /* * First check if the VFS inode is marked dirty. All the dirtying - * of non-transactional updates no goes through mark_inode_dirty*, - * which allows us to distinguish beteeen pure timestamp updates + * of non-transactional updates do not go through mark_inode_dirty*, + * which allows us to distinguish between pure timestamp updates * and i_size updates which need to be caught for fdatasync. - * After that also theck for the dirty state in the XFS inode, which + * After that also check for the dirty state in the XFS inode, which * might gets cleared when the inode gets written out via the AIL * or xfs_iflush_cluster. */ @@ -216,11 +242,11 @@ xfs_file_fsync( * transaction. So we play it safe and fire off the * transaction anyway. */ - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = _xfs_trans_commit(tp, 0, &log_flushed); + error = xfs_trans_commit(tp, 0); + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_EXCL); } else { /* @@ -231,14 +257,14 @@ xfs_file_fsync( * disk yet, the inode will be still be pinned. If it is, * force the log. */ - if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(mp, - ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, &log_flushed); - } + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; xfs_iunlock(ip, XFS_ILOCK_SHARED); } + if (!error && lsn) + error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + /* * If we only have a single device, and the log force about was * a no-op we might have to flush the data device cache here. @@ -317,7 +343,19 @@ xfs_file_aio_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (unlikely(ioflags & IO_ISDIRECT)) { + /* + * Locking is a bit tricky here. If we take an exclusive lock + * for direct IO, we effectively serialise all new concurrent + * read IO to this file and block it behind IO that is currently in + * progress because IO in progress holds the IO lock shared. We only + * need to hold the lock exclusive to blow away the page cache, so + * only take lock exclusively if the page cache needs invalidation. + * This allows the normal direct IO case of no page cache pages to + * proceeed concurrently without serialisation. + */ + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { @@ -330,8 +368,7 @@ xfs_file_aio_read( } } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - } else - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + } trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); @@ -407,11 +444,13 @@ xfs_aio_write_isize_update( */ STATIC void xfs_aio_write_newsize_update( - struct xfs_inode *ip) + struct xfs_inode *ip, + xfs_fsize_t new_size) { - if (ip->i_new_size) { + if (new_size == ip->i_new_size) { xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; + if (new_size == ip->i_new_size) + ip->i_new_size = 0; if (ip->i_d.di_size > ip->i_size) ip->i_d.di_size = ip->i_size; xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); @@ -462,7 +501,7 @@ xfs_file_splice_write( ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); xfs_aio_write_isize_update(inode, ppos, ret); - xfs_aio_write_newsize_update(ip); + xfs_aio_write_newsize_update(ip, new_size); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -500,11 +539,9 @@ xfs_zero_last_block( last_fsb = XFS_B_TO_FSBT(mp, isize); nimaps = 1; - error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, - &nimaps, NULL); - if (error) { + error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); + if (error) return error; - } ASSERT(nimaps > 0); /* * If the block underlying isize is just a hole, then there @@ -595,8 +632,8 @@ xfs_zero_eof( while (start_zero_fsb <= end_zero_fsb) { nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, - 0, NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, + &imap, &nimaps, 0); if (error) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; @@ -659,6 +696,7 @@ xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, + xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; @@ -666,6 +704,9 @@ xfs_file_aio_write_checks( xfs_fsize_t new_size; int error = 0; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + *new_sizep = 0; +restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); @@ -673,20 +714,41 @@ xfs_file_aio_write_checks( return error; } - new_size = *pos + *count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; - if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. + * write. There is no need to issue zeroing if another in-flght IO ends + * at or before this one If zeronig is needed and we are currently + * holding the iolock shared, we need to update it to exclusive which + * involves dropping all locks and relocking to maintain correct locking + * order. If we do this, restart the function to ensure all checks and + * values are still valid. */ - if (*pos > ip->i_size) + if ((ip->i_new_size && *pos > ip->i_new_size) || + (!ip->i_new_size && *pos > ip->i_size)) { + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + goto restart; + } error = -xfs_zero_eof(ip, *pos, ip->i_size); + } + + /* + * If this IO extends beyond EOF, we may need to update ip->i_new_size. + * We have already zeroed space beyond EOF (if necessary). Only update + * ip->i_new_size if this IO ends beyond any other in-flight writes. + */ + new_size = *pos + *count; + if (new_size > ip->i_size) { + if (new_size > ip->i_new_size) + ip->i_new_size = new_size; + *new_sizep = new_size; + } xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) @@ -721,7 +783,7 @@ xfs_file_aio_write_checks( * the dio layer. To avoid the problem with aio, we also need to wait for * outstanding IOs to complete so that unwritten extent conversion is completed * before we try to map the overlapping block. This is currently implemented by - * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * hitting it with a big hammer (i.e. inode_dio_wait()). * * Returns with locks held indicated by @iolock and errors indicated by * negative return values. @@ -733,6 +795,7 @@ xfs_file_dio_aio_write( unsigned long nr_segs, loff_t pos, size_t ocount, + xfs_fsize_t *new_size, int *iolock) { struct file *file = iocb->ki_filp; @@ -753,18 +816,35 @@ xfs_file_dio_aio_write( if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; - if (unaligned_io || mapping->nrpages || pos > ip->i_size) + /* + * We don't need to take an exclusive lock unless there page cache needs + * to be invalidated or unaligned IO is being executed. We don't need to + * consider the EOF extension case here because + * xfs_file_aio_write_checks() will relock the inode as necessary for + * EOF zeroing cases and fill out the new inode size as appropriate. + */ + if (unaligned_io || mapping->nrpages) *iolock = XFS_IOLOCK_EXCL; else *iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + /* + * Recheck if there are cached pages that need invalidate after we got + * the iolock to protect against other threads adding new pages while + * we were waiting for the iolock. + */ + if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + } + + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; if (mapping->nrpages) { - WARN_ON(*iolock != XFS_IOLOCK_EXCL); ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (ret) @@ -776,7 +856,7 @@ xfs_file_dio_aio_write( * otherwise demote the lock if we had to flush cached pages */ if (unaligned_io) - xfs_ioend_wait(ip); + inode_dio_wait(inode); else if (*iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); *iolock = XFS_IOLOCK_SHARED; @@ -798,6 +878,7 @@ xfs_file_buffered_aio_write( unsigned long nr_segs, loff_t pos, size_t ocount, + xfs_fsize_t *new_size, int *iolock) { struct file *file = iocb->ki_filp; @@ -809,9 +890,9 @@ xfs_file_buffered_aio_write( size_t count = ocount; *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; @@ -851,6 +932,7 @@ xfs_file_aio_write( ssize_t ret; int iolock; size_t ocount = 0; + xfs_fsize_t new_size = 0; XFS_STATS_INC(xs_write_calls); @@ -870,10 +952,10 @@ xfs_file_aio_write( if (unlikely(file->f_flags & O_DIRECT)) ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); + ocount, &new_size, &iolock); else ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); + ocount, &new_size, &iolock); xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); @@ -894,7 +976,7 @@ xfs_file_aio_write( } out_unlock: - xfs_aio_write_newsize_update(ip); + xfs_aio_write_newsize_update(ip, new_size); xfs_rw_iunlock(ip, iolock); return ret; } @@ -1087,7 +1169,7 @@ const struct file_operations xfs_dir_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, #endif - .fsync = xfs_file_fsync, + .fsync = xfs_dir_fsync, }; static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 3ff3d9e..5170306 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -682,7 +682,7 @@ xfs_filestream_new_ag( ip = ap->ip; mp = ip->i_mount; cache = mp->m_filestream; - minlen = ap->alen; + minlen = ap->length; *agp = NULLAGNUMBER; /* @@ -761,7 +761,7 @@ xfs_filestream_new_ag( */ ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | - (ap->low ? XFS_PICK_LOWSPACE : 0); + (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); if (err || *agp == NULLAGNUMBER) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 9153d2c..1c6fdeb 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -194,6 +194,10 @@ xfs_growfs_data_private( bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } agf = XFS_BUF_TO_AGF(bp); memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -216,16 +220,21 @@ xfs_growfs_data_private( tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * AG inode header block */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } agi = XFS_BUF_TO_AGI(bp); memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -240,10 +249,11 @@ xfs_growfs_data_private( agi->agi_dirino = cpu_to_be32(NULLAGINO); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * BNO btree root block */ @@ -251,6 +261,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); @@ -262,10 +276,11 @@ xfs_growfs_data_private( arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * CNT btree root block */ @@ -273,6 +288,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); @@ -285,10 +304,11 @@ xfs_growfs_data_private( arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * INO btree root block */ @@ -296,6 +316,10 @@ xfs_growfs_data_private( XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), XBF_LOCK | XBF_MAPPED); + if (!bp) { + error = ENOMEM; + goto error0; + } block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); @@ -303,10 +327,10 @@ xfs_growfs_data_private( block->bb_numrecs = 0; block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - error = xfs_bwrite(mp, bp); - if (error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } } xfs_trans_agblocks_delta(tp, nfree); /* @@ -396,9 +420,9 @@ xfs_growfs_data_private( * just issue a warning and continue. The real work is * already done and committed. */ - if (!(error = xfs_bwrite(mp, bp))) { - continue; - } else { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) { xfs_warn(mp, "write error %d updating secondary superblock for ag %d", error, agno); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 9f24ec2..dad1a31 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -150,7 +150,7 @@ xfs_check_agi_freecount( /* * Initialise a new set of inodes. */ -STATIC void +STATIC int xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, @@ -202,8 +202,8 @@ xfs_ialloc_inode_init( fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * blks_per_cluster, XBF_LOCK); - ASSERT(!xfs_buf_geterror(fbuf)); - + if (!fbuf) + return ENOMEM; /* * Initialize all inodes in this buffer and then log them. * @@ -225,6 +225,7 @@ xfs_ialloc_inode_init( } xfs_trans_inode_alloc_buf(tp, fbuf); } + return 0; } /* @@ -369,9 +370,11 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, - random32()); + error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, + args.len, random32()); + if (error) + return error; /* * Convert the results. */ @@ -444,7 +447,7 @@ STATIC xfs_buf_t * /* allocation group buffer */ xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ - mode_t mode, /* bits set to indicate file type */ + umode_t mode, /* bits set to indicate file type */ int okalloc) /* ok to allocate more space */ { xfs_buf_t *agbp; /* allocation group header buffer */ @@ -637,7 +640,7 @@ int xfs_dialloc( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ - mode_t mode, /* mode bits for new inode */ + umode_t mode, /* mode bits for new inode */ int okalloc, /* ok to allocate more space */ xfs_buf_t **IO_agbp, /* in/out ag header's buffer */ boolean_t *alloc_done, /* true if we needed to replenish @@ -1502,7 +1505,7 @@ xfs_read_agi( return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF); + xfs_buf_set_ref(*bpp, XFS_AGI_REF); xfs_check_agi_unlinked(agi); return 0; diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index bb53854..666a037 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -81,7 +81,7 @@ int /* error */ xfs_dialloc( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ - mode_t mode, /* mode bits for new inode */ + umode_t mode, /* mode bits for new inode */ int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ boolean_t *alloc_done, /* an allocation was done to replenish diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 7759812..3960a06 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -75,7 +75,6 @@ xfs_inode_alloc( return NULL; } - ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); @@ -108,7 +107,6 @@ xfs_inode_free_callback( struct inode *inode = container_of(head, struct inode, i_rcu); struct xfs_inode *ip = XFS_I(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_zone_free(xfs_inode_zone, ip); } @@ -150,7 +148,6 @@ xfs_inode_free( } /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0239a7c..9dda7cc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -190,12 +190,6 @@ xfs_imap_to_bp( } xfs_inobp_check(mp, bp); - - /* - * Mark the buffer as an inode buffer now that it looks good - */ - XFS_BUF_SET_VTYPE(bp, B_FS_INO); - *bpp = bp; return 0; } @@ -967,7 +961,7 @@ int xfs_ialloc( xfs_trans_t *tp, xfs_inode_t *pip, - mode_t mode, + umode_t mode, xfs_nlink_t nlink, xfs_dev_t rdev, prid_t prid, @@ -1008,7 +1002,7 @@ xfs_ialloc( return error; ASSERT(ip != NULL); - ip->i_d.di_mode = (__uint16_t)mode; + ip->i_d.di_mode = mode; ip->i_d.di_onlink = 0; ip->i_d.di_nlink = nlink; ASSERT(ip->i_d.di_nlink == nlink); @@ -1152,7 +1146,7 @@ xfs_ialloc( /* * Log the new values stuffed into the inode. */ - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, flags); /* now that we have an i_mode we can setup inode ops and unlock */ @@ -1187,6 +1181,7 @@ xfs_isize_check( xfs_fileoff_t map_first; int nimaps; xfs_bmbt_irec_t imaps[2]; + int error; if (!S_ISREG(ip->i_d.di_mode)) return; @@ -1203,13 +1198,12 @@ xfs_isize_check( * The filesystem could be shutting down, so bmapi may return * an error. */ - if (xfs_bmapi(NULL, ip, map_first, + error = xfs_bmapi_read(ip, map_first, (XFS_B_TO_FSB(mp, - (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - - map_first), - XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, - NULL)) - return; + (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), + imaps, &nimaps, XFS_BMAPI_ENTIRE); + if (error) + return; ASSERT(nimaps == 1); ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); } @@ -1297,7 +1291,7 @@ xfs_itruncate_extents( */ error = xfs_bmap_finish(&tp, &free_list, &committed); if (committed) - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); if (error) goto out_bmap_cancel; @@ -1313,7 +1307,7 @@ xfs_itruncate_extents( error = xfs_trans_commit(tp, 0); tp = ntp; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); if (error) goto out; @@ -1644,7 +1638,7 @@ xfs_iunlink_remove( * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. */ -STATIC void +STATIC int xfs_ifree_cluster( xfs_inode_t *free_ip, xfs_trans_t *tp, @@ -1690,6 +1684,8 @@ xfs_ifree_cluster( mp->m_bsize * blks_per_cluster, XBF_LOCK); + if (!bp) + return ENOMEM; /* * Walk the inodes already attached to the buffer and mark them * stale. These will all have the flush locks held, so an @@ -1799,6 +1795,7 @@ retry: } xfs_perag_put(pag); + return 0; } /* @@ -1878,10 +1875,10 @@ xfs_ifree( dip->di_mode = 0; if (delete) { - xfs_ifree_cluster(ip, tp, first_ino); + error = xfs_ifree_cluster(ip, tp, first_ino); } - return 0; + return error; } /* @@ -2472,11 +2469,11 @@ cluster_corrupt_out: */ if (bp->b_iodone) { XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioerror(bp, EIO); xfs_buf_ioend(bp, 0); } else { - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); } } @@ -2597,9 +2594,11 @@ xfs_iflush( goto cluster_corrupt_out; if (flags & SYNC_WAIT) - error = xfs_bwrite(mp, bp); + error = xfs_bwrite(bp); else - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + + xfs_buf_relse(bp); return error; corrupt_out: @@ -2836,6 +2835,27 @@ corrupt_out: return XFS_ERROR(EFSCORRUPTED); } +void +xfs_promote_inode( + struct xfs_inode *ip) +{ + struct xfs_buf *bp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + + bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno, + ip->i_imap.im_len, XBF_TRYLOCK); + if (!bp) + return; + + if (XFS_BUF_ISDELAYWRITE(bp)) { + xfs_buf_delwri_promote(bp); + wake_up_process(ip->i_mount->m_ddev_targp->bt_task); + } + + xfs_buf_relse(bp); +} + /* * Return a pointer to the extent record at file index idx. */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 2380a4b..f0e6b15 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -257,7 +257,6 @@ typedef struct xfs_inode { xfs_fsize_t i_size; /* in-memory size */ xfs_fsize_t i_new_size; /* size when write completes */ - atomic_t i_iocount; /* outstanding I/O count */ /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ @@ -482,7 +481,7 @@ void xfs_inode_free(struct xfs_inode *ip); /* * xfs_inode.c prototypes. */ -int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, +int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, boolean_t *, xfs_inode_t **); @@ -499,6 +498,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); int xfs_iflush(xfs_inode_t *, uint); +void xfs_promote_inode(struct xfs_inode *); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 836ad80..cfd6c7f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -437,7 +437,6 @@ xfs_inode_item_format( * Assert that no attribute-related log flags are set. */ if (!XFS_IFORK_Q(ip)) { - ASSERT(nvecs == lip->li_desc->lid_size); iip->ili_format.ilf_size = nvecs; ASSERT(!(iip->ili_format.ilf_fields & (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); @@ -521,7 +520,6 @@ xfs_inode_item_format( break; } - ASSERT(nvecs == lip->li_desc->lid_size); iip->ili_format.ilf_size = nvecs; } @@ -658,10 +656,8 @@ xfs_inode_item_unlock( lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; - if (lock_flags) { + if (lock_flags) xfs_iunlock(ip, lock_flags); - IRELE(ip); - } } /* @@ -797,7 +793,7 @@ xfs_inode_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_inode_item_ops = { +static const struct xfs_item_ops xfs_inode_item_ops = { .iop_size = xfs_inode_item_size, .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f7ce7de..76f3ca5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -559,23 +559,23 @@ xfs_attrmulti_by_handle( ops[i].am_flags); break; case ATTR_OP_SET: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( dentry->d_inode, attr_name, ops[i].am_attrvalue, ops[i].am_length, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( dentry->d_inode, attr_name, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; default: ops[i].am_error = EINVAL; @@ -1069,7 +1069,7 @@ xfs_ioctl_setattr( } } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 54e623b..f9ccb7b 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -454,23 +454,23 @@ xfs_compat_attrmulti_by_handle( &ops[i].am_length, ops[i].am_flags); break; case ATTR_OP_SET: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( dentry->d_inode, attr_name, compat_ptr(ops[i].am_attrvalue), ops[i].am_length, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( dentry->d_inode, attr_name, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; default: ops[i].am_error = EINVAL; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 091d82b..9afa282 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -208,22 +208,20 @@ xfs_iomap_write_direct( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); - bmapi_flag = XFS_BMAPI_WRITE; + bmapi_flag = 0; if (offset < ip->i_size || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* - * Issue the xfs_bmapi() call to allocate the blocks. - * * From this point onwards we overwrite the imap pointer that the * caller gave to us. */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, - &firstfsb, 0, imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, + &firstfsb, 0, imap, &nimaps, &free_list); if (error) goto error0; @@ -300,8 +298,8 @@ xfs_iomap_eof_want_preallocate( while (count_fsb > 0) { imaps = nimaps; firstblock = NULLFSBLOCK; - error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, - &firstblock, 0, imap, &imaps, NULL); + error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, + 0); if (error) return error; for (n = 0; n < imaps; n++) { @@ -381,7 +379,6 @@ xfs_iomap_write_delay( xfs_fileoff_t last_fsb; xfs_off_t aligned_offset; xfs_fileoff_t ioalign; - xfs_fsblock_t firstblock; xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; @@ -425,12 +422,8 @@ retry: } nimaps = XFS_WRITE_IMAPS; - firstblock = NULLFSBLOCK; - error = xfs_bmapi(NULL, ip, offset_fsb, - (xfs_filblks_t)(last_fsb - offset_fsb), - XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | - XFS_BMAPI_ENTIRE, &firstblock, 1, imap, - &nimaps, NULL); + error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, + imap, &nimaps, XFS_BMAPI_ENTIRE); switch (error) { case 0: case ENOSPC: @@ -535,7 +528,7 @@ xfs_iomap_write_allocate( return XFS_ERROR(error); } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_bmap_init(&free_list, &first_block); @@ -587,14 +580,12 @@ xfs_iomap_write_allocate( } /* - * Go get the actual blocks. - * * From this point onwards we overwrite the imap * pointer that the caller gave to us. */ - error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, - XFS_BMAPI_WRITE, &first_block, 1, - imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, map_start_fsb, + count_fsb, 0, &first_block, 1, + imap, &nimaps, &free_list); if (error) goto trans_cancel; @@ -701,15 +692,15 @@ xfs_iomap_write_unwritten( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Modify the unwritten extent state of the buffer. */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_CONVERT, &firstfsb, 1, &imap, &nimaps, &free_list); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 673704f..f9babd1 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -102,37 +102,38 @@ xfs_mark_inode_dirty( } + +int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + struct xfs_inode *ip = XFS_I(inode); + int error = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + error = xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); + if (error < 0) + break; + } + return error; +} + /* * Hook in SELinux. This is not quite correct yet, what we really need * here (as we do for default ACLs) is a mechanism by which creation of * these attrs can be journalled at inode creation time (along with the * inode, of course, such that log replay can't cause these to be lost). */ + STATIC int xfs_init_security( struct inode *inode, struct inode *dir, const struct qstr *qstr) { - struct xfs_inode *ip = XFS_I(inode); - size_t length; - void *value; - unsigned char *name; - int error; - - error = security_inode_init_security(inode, dir, qstr, (char **)&name, - &value, &length); - if (error) { - if (error == -EOPNOTSUPP) - return 0; - return -error; - } - - error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); - - kfree(name); - kfree(value); - return error; + return security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); } static void @@ -167,7 +168,7 @@ STATIC int xfs_vn_mknod( struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, dev_t rdev) { struct inode *inode; @@ -230,7 +231,7 @@ STATIC int xfs_vn_create( struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, struct nameidata *nd) { return xfs_vn_mknod(dir, dentry, mode, 0); @@ -240,7 +241,7 @@ STATIC int xfs_vn_mkdir( struct inode *dir, struct dentry *dentry, - int mode) + umode_t mode) { return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); } @@ -365,7 +366,7 @@ xfs_vn_symlink( struct xfs_inode *cip = NULL; struct xfs_name name; int error; - mode_t mode; + umode_t mode; mode = S_IFLNK | (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); @@ -465,7 +466,7 @@ xfs_vn_getattr( trace_xfs_getattr(ip); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -XFS_ERROR(EIO); stat->size = XFS_ISIZE(ip); stat->dev = inode->i_sb->s_dev; @@ -611,7 +612,7 @@ xfs_setattr_nonsize( } } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. @@ -833,16 +834,16 @@ xfs_setattr_size( * care about here. */ if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, - XBF_ASYNC, FI_NONE); + error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, + FI_NONE); if (error) goto out_unlock; } /* - * Wait for all I/O to complete. + * Wait for all direct I/O to complete. */ - xfs_ioend_wait(ip); + inode_dio_wait(inode); error = -block_truncate_page(inode->i_mapping, iattr->ia_size, xfs_get_blocks); @@ -863,7 +864,7 @@ xfs_setattr_size( xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Only change the c/mtime if we are changing the size or we are @@ -1152,7 +1153,7 @@ xfs_setup_inode( hlist_add_fake(&inode->i_hash); inode->i_mode = ip->i_d.di_mode; - inode->i_nlink = ip->i_d.di_nlink; + set_nlink(inode, ip->i_d.di_nlink); inode->i_uid = ip->i_d.di_uid; inode->i_gid = ip->i_d.di_gid; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3a8d4f6..e2cc356 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -150,6 +150,117 @@ xlog_grant_add_space( } while (head_val != old); } +STATIC bool +xlog_reserveq_wake( + struct log *log, + int *free_bytes) +{ + struct xlog_ticket *tic; + int need_bytes; + + list_for_each_entry(tic, &log->l_reserveq, t_queue) { + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + need_bytes = tic->t_unit_res * tic->t_cnt; + else + need_bytes = tic->t_unit_res; + + if (*free_bytes < need_bytes) + return false; + *free_bytes -= need_bytes; + + trace_xfs_log_grant_wake_up(log, tic); + wake_up(&tic->t_wait); + } + + return true; +} + +STATIC bool +xlog_writeq_wake( + struct log *log, + int *free_bytes) +{ + struct xlog_ticket *tic; + int need_bytes; + + list_for_each_entry(tic, &log->l_writeq, t_queue) { + ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); + + need_bytes = tic->t_unit_res; + + if (*free_bytes < need_bytes) + return false; + *free_bytes -= need_bytes; + + trace_xfs_log_regrant_write_wake_up(log, tic); + wake_up(&tic->t_wait); + } + + return true; +} + +STATIC int +xlog_reserveq_wait( + struct log *log, + struct xlog_ticket *tic, + int need_bytes) +{ + list_add_tail(&tic->t_queue, &log->l_reserveq); + + do { + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + trace_xfs_log_grant_sleep(log, tic); + + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + trace_xfs_log_grant_wake(log, tic); + + spin_lock(&log->l_grant_reserve_lock); + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes); + + list_del_init(&tic->t_queue); + return 0; +shutdown: + list_del_init(&tic->t_queue); + return XFS_ERROR(EIO); +} + +STATIC int +xlog_writeq_wait( + struct log *log, + struct xlog_ticket *tic, + int need_bytes) +{ + list_add_tail(&tic->t_queue, &log->l_writeq); + + do { + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + trace_xfs_log_regrant_write_sleep(log, tic); + + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); + trace_xfs_log_regrant_write_wake(log, tic); + + spin_lock(&log->l_grant_write_lock); + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes); + + list_del_init(&tic->t_queue); + return 0; +shutdown: + list_del_init(&tic->t_queue); + return XFS_ERROR(EIO); +} + static void xlog_tic_reset_res(xlog_ticket_t *tic) { @@ -350,8 +461,19 @@ xfs_log_reserve( retval = xlog_grant_log_space(log, internal_ticket); } + if (unlikely(retval)) { + /* + * If we are failing, make sure the ticket doesn't have any + * current reservations. We don't want to add this back + * when the ticket/ transaction gets cancelled. + */ + internal_ticket->t_curr_res = 0; + /* ungrant will give back unit_res * t_cnt. */ + internal_ticket->t_cnt = 0; + } + return retval; -} /* xfs_log_reserve */ +} /* @@ -626,7 +748,7 @@ xfs_log_item_init( struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops) + const struct xfs_item_ops *ops) { item->li_mountp = mp; item->li_ailp = mp->m_ail; @@ -638,38 +760,6 @@ xfs_log_item_init( INIT_LIST_HEAD(&item->li_cil); } -/* - * Write region vectors to log. The write happens using the space reservation - * of the ticket (tic). It is not a requirement that all writes for a given - * transaction occur with one call to xfs_log_write(). However, it is important - * to note that the transaction reservation code makes an assumption about the - * number of log headers a transaction requires that may be violated if you - * don't pass all the transaction vectors in one call.... - */ -int -xfs_log_write( - struct xfs_mount *mp, - struct xfs_log_iovec reg[], - int nentries, - struct xlog_ticket *tic, - xfs_lsn_t *start_lsn) -{ - struct log *log = mp->m_log; - int error; - struct xfs_log_vec vec = { - .lv_niovecs = nentries, - .lv_iovecp = reg, - }; - - if (XLOG_FORCED_SHUTDOWN(log)) - return XFS_ERROR(EIO); - - error = xlog_write(log, &vec, tic, start_lsn, NULL, 0); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - return error; -} - void xfs_log_move_tail(xfs_mount_t *mp, xfs_lsn_t tail_lsn) @@ -880,8 +970,8 @@ xlog_iodone(xfs_buf_t *bp) */ if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { - xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); - XFS_BUF_STALE(bp); + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_stale(bp); xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); /* * This flag will be propagated to the trans-committed @@ -1047,7 +1137,7 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_get_iclog_buffer_size(mp, log); error = ENOMEM; - bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); + bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0); if (!bp) goto out_free_log; bp->b_iodone = xlog_iodone; @@ -1247,7 +1337,7 @@ xlog_bdstrat( if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_ioend(bp, 0); /* * It would seem logical to return EIO here, but we rely on @@ -1387,9 +1477,9 @@ xlog_sync(xlog_t *log, */ XFS_BUF_WRITE(bp); - if ((error = xlog_bdstrat(bp))) { - xfs_ioerror_alert("xlog_sync", log->l_mp, bp, - XFS_BUF_ADDR(bp)); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync"); return error; } if (split) { @@ -1423,9 +1513,9 @@ xlog_sync(xlog_t *log, /* account for internal log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); XFS_BUF_WRITE(bp); - if ((error = xlog_bdstrat(bp))) { - xfs_ioerror_alert("xlog_sync (split)", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); return error; } } @@ -1563,7 +1653,7 @@ xlog_print_tic_res( }; xfs_warn(mp, - "xfs_log_write: reservation summary:\n" + "xlog_write: reservation summary:\n" " trans type = %s (%u)\n" " unit res = %d bytes\n" " current res = %d bytes\n" @@ -1592,7 +1682,7 @@ xlog_print_tic_res( } xfs_alert_tag(mp, XFS_PTAG_LOGRES, - "xfs_log_write: reservation ran out. Need to up reservation"); + "xlog_write: reservation ran out. Need to up reservation"); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } @@ -1846,23 +1936,21 @@ xlog_write( *start_lsn = 0; len = xlog_write_calc_vec_length(ticket, log_vector); - if (log->l_cilp) { - /* - * Region headers and bytes are already accounted for. - * We only need to take into account start records and - * split regions in this function. - */ - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - /* - * Commit record headers need to be accounted for. These - * come in as separate writes so are easy to detect. - */ - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - } else - ticket->t_curr_res -= len; + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); if (ticket->t_curr_res < 0) xlog_print_tic_res(log->l_mp, ticket); @@ -2481,8 +2569,8 @@ restart: /* * Atomically get the log space required for a log ticket. * - * Once a ticket gets put onto the reserveq, it will only return after - * the needed reservation is satisfied. + * Once a ticket gets put onto the reserveq, it will only return after the + * needed reservation is satisfied. * * This function is structured so that it has a lock free fast path. This is * necessary because every new transaction reservation will come through this @@ -2490,113 +2578,53 @@ restart: * every pass. * * As tickets are only ever moved on and off the reserveq under the - * l_grant_reserve_lock, we only need to take that lock if we are going - * to add the ticket to the queue and sleep. We can avoid taking the lock if the - * ticket was never added to the reserveq because the t_queue list head will be - * empty and we hold the only reference to it so it can safely be checked - * unlocked. + * l_grant_reserve_lock, we only need to take that lock if we are going to add + * the ticket to the queue and sleep. We can avoid taking the lock if the ticket + * was never added to the reserveq because the t_queue list head will be empty + * and we hold the only reference to it so it can safely be checked unlocked. */ STATIC int -xlog_grant_log_space(xlog_t *log, - xlog_ticket_t *tic) +xlog_grant_log_space( + struct log *log, + struct xlog_ticket *tic) { - int free_bytes; - int need_bytes; + int free_bytes, need_bytes; + int error = 0; -#ifdef DEBUG - if (log->l_flags & XLOG_ACTIVE_RECOVERY) - panic("grant Recovery problem"); -#endif + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); trace_xfs_log_grant_enter(log, tic); + /* + * If there are other waiters on the queue then give them a chance at + * logspace before us. Wake up the first waiters, if we do not wake + * up all the waiters then go to sleep waiting for more free space, + * otherwise try to get some space for this transaction. + */ need_bytes = tic->t_unit_res; if (tic->t_flags & XFS_LOG_PERM_RESERV) need_bytes *= tic->t_ocnt; - - /* something is already sleeping; insert new transaction at end */ - if (!list_empty_careful(&log->l_reserveq)) { - spin_lock(&log->l_grant_reserve_lock); - /* recheck the queue now we are locked */ - if (list_empty(&log->l_reserveq)) { - spin_unlock(&log->l_grant_reserve_lock); - goto redo; - } - list_add_tail(&tic->t_queue, &log->l_reserveq); - - trace_xfs_log_grant_sleep1(log, tic); - - /* - * Gotta check this before going to sleep, while we're - * holding the grant lock. - */ - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - - XFS_STATS_INC(xs_sleep_logspace); - xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); - - /* - * If we got an error, and the filesystem is shutting down, - * we'll catch it down below. So just continue... - */ - trace_xfs_log_grant_wake1(log, tic); - } - -redo: - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return_unlocked; - free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); - if (free_bytes < need_bytes) { + if (!list_empty_careful(&log->l_reserveq)) { spin_lock(&log->l_grant_reserve_lock); - if (list_empty(&tic->t_queue)) - list_add_tail(&tic->t_queue, &log->l_reserveq); - - trace_xfs_log_grant_sleep2(log, tic); - - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - - xlog_grant_push_ail(log, need_bytes); - - XFS_STATS_INC(xs_sleep_logspace); - xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); - - trace_xfs_log_grant_wake2(log, tic); - goto redo; - } - - if (!list_empty(&tic->t_queue)) { + if (!xlog_reserveq_wake(log, &free_bytes) || + free_bytes < need_bytes) + error = xlog_reserveq_wait(log, tic, need_bytes); + spin_unlock(&log->l_grant_reserve_lock); + } else if (free_bytes < need_bytes) { spin_lock(&log->l_grant_reserve_lock); - list_del_init(&tic->t_queue); + error = xlog_reserveq_wait(log, tic, need_bytes); spin_unlock(&log->l_grant_reserve_lock); } + if (error) + return error; - /* we've got enough space */ xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); trace_xfs_log_grant_exit(log, tic); xlog_verify_grant_tail(log); return 0; - -error_return_unlocked: - spin_lock(&log->l_grant_reserve_lock); -error_return: - list_del_init(&tic->t_queue); - spin_unlock(&log->l_grant_reserve_lock); - trace_xfs_log_grant_error(log, tic); - - /* - * If we are failing, make sure the ticket doesn't have any - * current reservations. We don't want to add this back when - * the ticket/transaction gets cancelled. - */ - tic->t_curr_res = 0; - tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - return XFS_ERROR(EIO); -} /* xlog_grant_log_space */ - +} /* * Replenish the byte reservation required by moving the grant write head. @@ -2605,10 +2633,12 @@ error_return: * free fast path. */ STATIC int -xlog_regrant_write_log_space(xlog_t *log, - xlog_ticket_t *tic) +xlog_regrant_write_log_space( + struct log *log, + struct xlog_ticket *tic) { - int free_bytes, need_bytes; + int free_bytes, need_bytes; + int error = 0; tic->t_curr_res = tic->t_unit_res; xlog_tic_reset_res(tic); @@ -2616,104 +2646,38 @@ xlog_regrant_write_log_space(xlog_t *log, if (tic->t_cnt > 0) return 0; -#ifdef DEBUG - if (log->l_flags & XLOG_ACTIVE_RECOVERY) - panic("regrant Recovery problem"); -#endif + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); trace_xfs_log_regrant_write_enter(log, tic); - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return_unlocked; - /* If there are other waiters on the queue then give them a - * chance at logspace before us. Wake up the first waiters, - * if we do not wake up all the waiters then go to sleep waiting - * for more free space, otherwise try to get some space for - * this transaction. + /* + * If there are other waiters on the queue then give them a chance at + * logspace before us. Wake up the first waiters, if we do not wake + * up all the waiters then go to sleep waiting for more free space, + * otherwise try to get some space for this transaction. */ need_bytes = tic->t_unit_res; - if (!list_empty_careful(&log->l_writeq)) { - struct xlog_ticket *ntic; - - spin_lock(&log->l_grant_write_lock); - free_bytes = xlog_space_left(log, &log->l_grant_write_head); - list_for_each_entry(ntic, &log->l_writeq, t_queue) { - ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); - - if (free_bytes < ntic->t_unit_res) - break; - free_bytes -= ntic->t_unit_res; - wake_up(&ntic->t_wait); - } - - if (ntic != list_first_entry(&log->l_writeq, - struct xlog_ticket, t_queue)) { - if (list_empty(&tic->t_queue)) - list_add_tail(&tic->t_queue, &log->l_writeq); - trace_xfs_log_regrant_write_sleep1(log, tic); - - xlog_grant_push_ail(log, need_bytes); - - XFS_STATS_INC(xs_sleep_logspace); - xlog_wait(&tic->t_wait, &log->l_grant_write_lock); - trace_xfs_log_regrant_write_wake1(log, tic); - } else - spin_unlock(&log->l_grant_write_lock); - } - -redo: - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return_unlocked; - free_bytes = xlog_space_left(log, &log->l_grant_write_head); - if (free_bytes < need_bytes) { + if (!list_empty_careful(&log->l_writeq)) { spin_lock(&log->l_grant_write_lock); - if (list_empty(&tic->t_queue)) - list_add_tail(&tic->t_queue, &log->l_writeq); - - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - - xlog_grant_push_ail(log, need_bytes); - - XFS_STATS_INC(xs_sleep_logspace); - trace_xfs_log_regrant_write_sleep2(log, tic); - xlog_wait(&tic->t_wait, &log->l_grant_write_lock); - - trace_xfs_log_regrant_write_wake2(log, tic); - goto redo; - } - - if (!list_empty(&tic->t_queue)) { + if (!xlog_writeq_wake(log, &free_bytes) || + free_bytes < need_bytes) + error = xlog_writeq_wait(log, tic, need_bytes); + spin_unlock(&log->l_grant_write_lock); + } else if (free_bytes < need_bytes) { spin_lock(&log->l_grant_write_lock); - list_del_init(&tic->t_queue); + error = xlog_writeq_wait(log, tic, need_bytes); spin_unlock(&log->l_grant_write_lock); } - /* we've got enough space */ + if (error) + return error; + xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); trace_xfs_log_regrant_write_exit(log, tic); xlog_verify_grant_tail(log); return 0; - - - error_return_unlocked: - spin_lock(&log->l_grant_write_lock); - error_return: - list_del_init(&tic->t_queue); - spin_unlock(&log->l_grant_write_lock); - trace_xfs_log_regrant_write_error(log, tic); - - /* - * If we are failing, make sure the ticket doesn't have any - * current reservations. We don't want to add this back when - * the ticket/transaction gets cancelled. - */ - tic->t_curr_res = 0; - tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - return XFS_ERROR(EIO); -} /* xlog_regrant_write_log_space */ - +} /* The first cnt-1 times through here we don't need to * move the grant write head because the permanent @@ -2933,8 +2897,7 @@ _xfs_log_force( XFS_STATS_INC(xs_log_force); - if (log->l_cilp) - xlog_cil_force(log); + xlog_cil_force(log); spin_lock(&log->l_icloglock); @@ -3083,11 +3046,9 @@ _xfs_log_force_lsn( XFS_STATS_INC(xs_log_force); - if (log->l_cilp) { - lsn = xlog_cil_force_lsn(log, lsn); - if (lsn == NULLCOMMITLSN) - return 0; - } + lsn = xlog_cil_force_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; try_again: spin_lock(&log->l_icloglock); @@ -3655,7 +3616,7 @@ xfs_log_force_umount( * completed transactions are flushed to disk with the xfs_log_force() * call below. */ - if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) + if (!logerror) xlog_cil_force(log); /* diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 78c9039..2aee3b2 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -137,7 +137,7 @@ struct xfs_trans; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops); + const struct xfs_item_ops *ops); xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, @@ -174,11 +174,6 @@ int xfs_log_reserve(struct xfs_mount *mp, __uint8_t clientid, uint flags, uint t_type); -int xfs_log_write(struct xfs_mount *mp, - xfs_log_iovec_t region[], - int nentries, - struct xlog_ticket *ticket, - xfs_lsn_t *start_lsn); int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); @@ -189,8 +184,7 @@ void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); -void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_log_vec *log_vector, +int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index c7755d5..d4fadbe 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -32,10 +32,7 @@ #include "xfs_discard.h" /* - * Perform initial CIL structure initialisation. If the CIL is not - * enabled in this filesystem, ensure the log->l_cilp is null so - * we can check this conditional to determine if we are doing delayed - * logging or not. + * Perform initial CIL structure initialisation. */ int xlog_cil_init( @@ -44,10 +41,6 @@ xlog_cil_init( struct xfs_cil *cil; struct xfs_cil_ctx *ctx; - log->l_cilp = NULL; - if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG)) - return 0; - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); if (!cil) return ENOMEM; @@ -80,9 +73,6 @@ void xlog_cil_destroy( struct log *log) { - if (!log->l_cilp) - return; - if (log->l_cilp->xc_ctx) { if (log->l_cilp->xc_ctx->ticket) xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); @@ -137,9 +127,6 @@ void xlog_cil_init_post_recovery( struct log *log) { - if (!log->l_cilp) - return; - log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); log->l_cilp->xc_ctx->sequence = 1; log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, @@ -172,37 +159,73 @@ xlog_cil_init_post_recovery( * format the regions into the iclog as though they are being formatted * directly out of the objects themselves. */ -static void -xlog_cil_format_items( - struct log *log, - struct xfs_log_vec *log_vector) +static struct xfs_log_vec * +xlog_cil_prepare_log_vecs( + struct xfs_trans *tp) { - struct xfs_log_vec *lv; + struct xfs_log_item_desc *lidp; + struct xfs_log_vec *lv = NULL; + struct xfs_log_vec *ret_lv = NULL; - ASSERT(log_vector); - for (lv = log_vector; lv; lv = lv->lv_next) { + + /* Bail out if we didn't find a log item. */ + if (list_empty(&tp->t_items)) { + ASSERT(0); + return NULL; + } + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_vec *new_lv; void *ptr; int index; int len = 0; + uint niovecs; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* Skip items that do not have any vectors for writing */ + niovecs = IOP_SIZE(lidp->lid_item); + if (!niovecs) + continue; + + new_lv = kmem_zalloc(sizeof(*new_lv) + + niovecs * sizeof(struct xfs_log_iovec), + KM_SLEEP); + + /* The allocated iovec region lies beyond the log vector. */ + new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; + new_lv->lv_niovecs = niovecs; + new_lv->lv_item = lidp->lid_item; /* build the vector array and calculate it's length */ - IOP_FORMAT(lv->lv_item, lv->lv_iovecp); - for (index = 0; index < lv->lv_niovecs; index++) - len += lv->lv_iovecp[index].i_len; + IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); + for (index = 0; index < new_lv->lv_niovecs; index++) + len += new_lv->lv_iovecp[index].i_len; - lv->lv_buf_len = len; - lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); - ptr = lv->lv_buf; + new_lv->lv_buf_len = len; + new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len, + KM_SLEEP|KM_NOFS); + ptr = new_lv->lv_buf; - for (index = 0; index < lv->lv_niovecs; index++) { - struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; + for (index = 0; index < new_lv->lv_niovecs; index++) { + struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index]; memcpy(ptr, vec->i_addr, vec->i_len); vec->i_addr = ptr; ptr += vec->i_len; } - ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); + ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); + + if (!ret_lv) + ret_lv = new_lv; + else + lv->lv_next = new_lv; + lv = new_lv; } + + return ret_lv; } /* @@ -256,7 +279,7 @@ xfs_cil_prepare_item( * Insert the log items into the CIL and calculate the difference in space * consumed by the item. Add the space to the checkpoint ticket and calculate * if the change requires additional log metadata. If it does, take that space - * as well. Remove the amount of space we addded to the checkpoint ticket from + * as well. Remove the amount of space we added to the checkpoint ticket from * the current transaction ticket so that the accounting works out correctly. */ static void @@ -635,28 +658,30 @@ out_abort: * background commit, returns without it held once background commits are * allowed again. */ -void +int xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_log_vec *log_vector, xfs_lsn_t *commit_lsn, int flags) { struct log *log = mp->m_log; int log_flags = 0; int push = 0; + struct xfs_log_vec *log_vector; if (flags & XFS_TRANS_RELEASE_LOG_RES) log_flags = XFS_LOG_REL_PERM_RESERV; /* - * do all the hard work of formatting items (including memory + * Do all the hard work of formatting items (including memory * allocation) outside the CIL context lock. This prevents stalling CIL * pushes when we are low on memory and a transaction commit spends a * lot of time in memory reclaim. */ - xlog_cil_format_items(log, log_vector); + log_vector = xlog_cil_prepare_log_vecs(tp); + if (!log_vector) + return ENOMEM; /* lock out background commit */ down_read(&log->l_cilp->xc_ctx_lock); @@ -709,6 +734,7 @@ xfs_log_commit_cil( */ if (push) xlog_cil_push(log, 0); + return 0; } /* @@ -786,8 +812,6 @@ xfs_log_item_in_current_chkpt( { struct xfs_cil_ctx *ctx; - if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) - return false; if (list_empty(&lip->li_cil)) return false; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a199dbc..541a508 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -183,8 +183,7 @@ xlog_bread_noalign( xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) - xfs_ioerror_alert("xlog_bread", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); return error; } @@ -268,9 +267,10 @@ xlog_bwrite( xfs_buf_lock(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - if ((error = xfs_bwrite(log->l_mp, bp))) - xfs_ioerror_alert("xlog_bwrite", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + error = xfs_bwrite(bp); + if (error) + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); return error; } @@ -361,9 +361,7 @@ xlog_recover_iodone( * We're not going to bother about retrying * this during recovery. One strike! */ - xfs_ioerror_alert("xlog_recover_iodone", - bp->b_target->bt_mount, bp, - XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_force_shutdown(bp->b_target->bt_mount, SHUTDOWN_META_IO_ERROR); } @@ -2135,8 +2133,7 @@ xlog_recover_buffer_pass2( return XFS_ERROR(ENOMEM); error = bp->b_error; if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, - bp, buf_f->blf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); xfs_buf_relse(bp); return error; } @@ -2171,15 +2168,16 @@ xlog_recover_buffer_pass2( be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { - XFS_BUF_STALE(bp); - error = xfs_bwrite(mp, bp); + xfs_buf_stale(bp); + error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); } - return (error); + xfs_buf_relse(bp); + return error; } STATIC int @@ -2230,8 +2228,7 @@ xlog_recover_inode_pass2( } error = bp->b_error; if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, - bp, in_f->ilf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); xfs_buf_relse(bp); goto error; } @@ -2439,7 +2436,8 @@ xlog_recover_inode_pass2( write_inode_buffer: ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); error: if (need_free) kmem_free(in_f); @@ -2537,8 +2535,7 @@ xlog_recover_dquot_pass2( XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, - bp, dq_f->qlf_blkno); + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)"); return error; } ASSERT(bp); @@ -2561,7 +2558,8 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); return (0); } @@ -3656,7 +3654,7 @@ xlog_do_recover( return error; } - XFS_bflush(log->l_mp->m_ddev_targp); + xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1); /* * If IO errors happened during recovery, bail out. @@ -3689,8 +3687,7 @@ xlog_do_recover( xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xlog_do_recover", - log->l_mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); ASSERT(0); xfs_buf_relse(bp); return error; diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 7fb7ea0..56dc0c1 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -3,31 +3,29 @@ struct xfs_mount; -extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_alert_tag(const struct xfs_mount *mp, int tag, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); -extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(3, 4) +void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); +extern __printf(2, 3) +void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_err(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_info(const struct xfs_mount *mp, const char *fmt, ...); #ifdef DEBUG -extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); +extern __printf(2, 3) +void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...); #else -static inline void -__attribute__ ((format (printf, 2, 3))) -xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +static inline __printf(2, 3) +void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) { } #endif diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0081657..d06afbc 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -44,9 +44,6 @@ #include "xfs_trace.h" -STATIC void xfs_unmountfs_wait(xfs_mount_t *); - - #ifdef HAVE_PERCPU_SB STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int); @@ -1484,7 +1481,7 @@ xfs_unmountfs( * state as much as possible. */ xfs_reclaim_inodes(mp, 0); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); @@ -1496,11 +1493,6 @@ xfs_unmountfs( */ xfs_log_force(mp, XFS_LOG_SYNC); - xfs_binval(mp->m_ddev_targp); - if (mp->m_rtdev_targp) { - xfs_binval(mp->m_rtdev_targp); - } - /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for @@ -1526,7 +1518,16 @@ xfs_unmountfs( xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); xfs_unmountfs_writesb(mp); - xfs_unmountfs_wait(mp); /* wait for async bufs */ + + /* + * Make sure all buffers have been flushed and completed before + * unmounting the log. + */ + error = xfs_flush_buftarg(mp->m_ddev_targp, 1); + if (error) + xfs_warn(mp, "%d busy buffers during unmount.", error); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); @@ -1537,16 +1538,6 @@ xfs_unmountfs( xfs_free_perag(mp); } -STATIC void -xfs_unmountfs_wait(xfs_mount_t *mp) -{ - if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_wait_buftarg(mp->m_logdev_targp); - if (mp->m_rtdev_targp) - xfs_wait_buftarg(mp->m_rtdev_targp); - xfs_wait_buftarg(mp->m_ddev_targp); -} - int xfs_fs_writable(xfs_mount_t *mp) { @@ -1612,15 +1603,14 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) XFS_BUF_UNDONE(sbp); XFS_BUF_UNREAD(sbp); - XFS_BUF_UNDELAYWRITE(sbp); + xfs_buf_delwri_dequeue(sbp); XFS_BUF_WRITE(sbp); XFS_BUF_UNASYNC(sbp); ASSERT(sbp->b_target == mp->m_ddev_targp); xfsbdstrat(mp, sbp); error = xfs_buf_iowait(sbp); if (error) - xfs_ioerror_alert("xfs_unmountfs_writesb", - mp, sbp, XFS_BUF_ADDR(sbp)); + xfs_buf_ioerror_alert(sbp, __func__); xfs_buf_relse(sbp); } return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bb24dac..19f69e2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -219,7 +219,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ -#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 9a0aa76..671f37e 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -154,12 +154,17 @@ STATIC void xfs_qm_destroy( struct xfs_qm *xqm) { - struct xfs_dquot *dqp, *n; int hsize, i; ASSERT(xqm != NULL); ASSERT(xqm->qm_nrefs == 0); + unregister_shrinker(&xfs_qm_shaker); + + mutex_lock(&xqm->qm_dqfrlist_lock); + ASSERT(list_empty(&xqm->qm_dqfrlist)); + mutex_unlock(&xqm->qm_dqfrlist_lock); + hsize = xqm->qm_dqhashmask + 1; for (i = 0; i < hsize; i++) { xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); @@ -171,17 +176,6 @@ xfs_qm_destroy( xqm->qm_grp_dqhtable = NULL; xqm->qm_dqhashmask = 0; - /* frlist cleanup */ - mutex_lock(&xqm->qm_dqfrlist_lock); - list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { - xfs_dqlock(dqp); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - } - mutex_unlock(&xqm->qm_dqfrlist_lock); - mutex_destroy(&xqm->qm_dqfrlist_lock); kmem_free(xqm); } @@ -232,34 +226,10 @@ STATIC void xfs_qm_rele_quotafs_ref( struct xfs_mount *mp) { - xfs_dquot_t *dqp, *n; - ASSERT(xfs_Gqm); ASSERT(xfs_Gqm->qm_nrefs > 0); /* - * Go thru the freelist and destroy all inactive dquots. - */ - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - - list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) { - xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(list_empty(&dqp->q_hashlist)); - ASSERT(list_empty(&dqp->q_mplist)); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - } else { - xfs_dqunlock(dqp); - } - } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - - /* * Destroy the entire XQM. If somebody mounts with quotaon, this'll * be restarted. */ @@ -415,8 +385,7 @@ xfs_qm_unmount_quotas( */ STATIC int xfs_qm_dqflush_all( - struct xfs_mount *mp, - int sync_mode) + struct xfs_mount *mp) { struct xfs_quotainfo *q = mp->m_quotainfo; int recl; @@ -429,7 +398,8 @@ again: mutex_lock(&q->qi_dqlist_lock); list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); - if (! XFS_DQ_IS_DIRTY(dqp)) { + if ((dqp->dq_flags & XFS_DQ_FREEING) || + !XFS_DQ_IS_DIRTY(dqp)) { xfs_dqunlock(dqp); continue; } @@ -444,14 +414,14 @@ again: * out immediately. We'll be able to acquire * the flush lock when the I/O completes. */ - xfs_qm_dqflock_pushbuf_wait(dqp); + xfs_dqflock_pushbuf_wait(dqp); } /* * Let go of the mplist lock. We don't want to hold it * across a disk write. */ mutex_unlock(&q->qi_dqlist_lock); - error = xfs_qm_dqflush(dqp, sync_mode); + error = xfs_qm_dqflush(dqp, 0); xfs_dqunlock(dqp); if (error) return error; @@ -468,6 +438,7 @@ again: /* return ! busy */ return 0; } + /* * Release the group dquot pointers the user dquots may be * carrying around as a hint. mplist is locked on entry and exit. @@ -478,31 +449,26 @@ xfs_qm_detach_gdquots( { struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dquot *dqp, *gdqp; - int nrecl; again: ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); - if ((gdqp = dqp->q_gdquot)) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - xfs_dqunlock(dqp); - - if (gdqp) { - /* - * Can't hold the mplist lock across a dqput. - * XXXmust convert to marker based iterations here. - */ - nrecl = q->qi_dqreclaims; + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); mutex_unlock(&q->qi_dqlist_lock); - xfs_qm_dqput(gdqp); - + delay(1); mutex_lock(&q->qi_dqlist_lock); - if (nrecl != q->qi_dqreclaims) - goto again; + goto again; } + + gdqp = dqp->q_gdquot; + if (gdqp) + dqp->q_gdquot = NULL; + xfs_dqunlock(dqp); + + if (gdqp) + xfs_qm_dqrele(gdqp); } } @@ -520,8 +486,8 @@ xfs_qm_dqpurge_int( struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dquot *dqp, *n; uint dqtype; - int nrecl; - int nmisses; + int nmisses = 0; + LIST_HEAD (dispose_list); if (!q) return 0; @@ -540,47 +506,26 @@ xfs_qm_dqpurge_int( */ xfs_qm_detach_gdquots(mp); - again: - nmisses = 0; - ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); /* - * Try to get rid of all of the unwanted dquots. The idea is to - * get them off mplist and hashlist, but leave them on freelist. + * Try to get rid of all of the unwanted dquots. */ list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) { - /* - * It's OK to look at the type without taking dqlock here. - * We're holding the mplist lock here, and that's needed for - * a dqreclaim. - */ - if ((dqp->dq_flags & dqtype) == 0) - continue; - - if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - nrecl = q->qi_dqreclaims; - mutex_unlock(&q->qi_dqlist_lock); - mutex_lock(&dqp->q_hash->qh_lock); - mutex_lock(&q->qi_dqlist_lock); - - /* - * XXXTheoretically, we can get into a very long - * ping pong game here. - * No one can be adding dquots to the mplist at - * this point, but somebody might be taking things off. - */ - if (nrecl != q->qi_dqreclaims) { - mutex_unlock(&dqp->q_hash->qh_lock); - goto again; - } + xfs_dqlock(dqp); + if ((dqp->dq_flags & dqtype) != 0 && + !(dqp->dq_flags & XFS_DQ_FREEING)) { + if (dqp->q_nrefs == 0) { + dqp->dq_flags |= XFS_DQ_FREEING; + list_move_tail(&dqp->q_mplist, &dispose_list); + } else + nmisses++; } - - /* - * Take the dquot off the mplist and hashlist. It may remain on - * freelist in INACTIVE state. - */ - nmisses += xfs_qm_dqpurge(dqp); + xfs_dqunlock(dqp); } mutex_unlock(&q->qi_dqlist_lock); + + list_for_each_entry_safe(dqp, n, &dispose_list, q_mplist) + xfs_qm_dqpurge(dqp); + return nmisses; } @@ -648,12 +593,9 @@ xfs_qm_dqattach_one( */ dqp = udqhint->q_gdquot; if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { - xfs_dqlock(dqp); - XFS_DQHOLD(dqp); ASSERT(*IO_idqpp == NULL); - *IO_idqpp = dqp; - xfs_dqunlock(dqp); + *IO_idqpp = xfs_qm_dqhold(dqp); xfs_dqunlock(udqhint); return 0; } @@ -674,7 +616,8 @@ xfs_qm_dqattach_one( * disk and we didn't ask it to allocate; * ESRCH if quotas got turned off suddenly. */ - error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp); + error = xfs_qm_dqget(ip->i_mount, ip, id, type, + doalloc | XFS_QMOPT_DOWARN, &dqp); if (error) return error; @@ -692,11 +635,7 @@ xfs_qm_dqattach_one( /* * Given a udquot and gdquot, attach a ptr to the group dquot in the - * udquot as a hint for future lookups. The idea sounds simple, but the - * execution isn't, because the udquot might have a group dquot attached - * already and getting rid of that gets us into lock ordering constraints. - * The process is complicated more by the fact that the dquots may or may not - * be locked on entry. + * udquot as a hint for future lookups. */ STATIC void xfs_qm_dqattach_grouphint( @@ -707,45 +646,17 @@ xfs_qm_dqattach_grouphint( xfs_dqlock(udq); - if ((tmp = udq->q_gdquot)) { - if (tmp == gdq) { - xfs_dqunlock(udq); - return; - } + tmp = udq->q_gdquot; + if (tmp) { + if (tmp == gdq) + goto done; udq->q_gdquot = NULL; - /* - * We can't keep any dqlocks when calling dqrele, - * because the freelist lock comes before dqlocks. - */ - xfs_dqunlock(udq); - /* - * we took a hard reference once upon a time in dqget, - * so give it back when the udquot no longer points at it - * dqput() does the unlocking of the dquot. - */ xfs_qm_dqrele(tmp); - - xfs_dqlock(udq); - xfs_dqlock(gdq); - - } else { - ASSERT(XFS_DQ_IS_LOCKED(udq)); - xfs_dqlock(gdq); - } - - ASSERT(XFS_DQ_IS_LOCKED(udq)); - ASSERT(XFS_DQ_IS_LOCKED(gdq)); - /* - * Somebody could have attached a gdquot here, - * when we dropped the uqlock. If so, just do nothing. - */ - if (udq->q_gdquot == NULL) { - XFS_DQHOLD(gdq); - udq->q_gdquot = gdq; } - xfs_dqunlock(gdq); + udq->q_gdquot = xfs_qm_dqhold(gdq); +done: xfs_dqunlock(udq); } @@ -812,17 +723,13 @@ xfs_qm_dqattach_locked( ASSERT(ip->i_gdquot); /* - * We may or may not have the i_udquot locked at this point, - * but this check is OK since we don't depend on the i_gdquot to - * be accurate 100% all the time. It is just a hint, and this - * will succeed in general. - */ - if (ip->i_udquot->q_gdquot == ip->i_gdquot) - goto done; - /* - * Attach i_gdquot to the gdquot hint inside the i_udquot. + * We do not have i_udquot locked at this point, but this check + * is OK since we don't depend on the i_gdquot to be accurate + * 100% all the time. It is just a hint, and this will + * succeed in general. */ - xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); + if (ip->i_udquot->q_gdquot != ip->i_gdquot) + xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); } done: @@ -878,100 +785,6 @@ xfs_qm_dqdetach( } } -int -xfs_qm_sync( - struct xfs_mount *mp, - int flags) -{ - struct xfs_quotainfo *q = mp->m_quotainfo; - int recl, restarts; - struct xfs_dquot *dqp; - int error; - - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) - return 0; - - restarts = 0; - - again: - mutex_lock(&q->qi_dqlist_lock); - /* - * dqpurge_all() also takes the mplist lock and iterate thru all dquots - * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared - * when we have the mplist lock, we know that dquots will be consistent - * as long as we have it locked. - */ - if (!XFS_IS_QUOTA_ON(mp)) { - mutex_unlock(&q->qi_dqlist_lock); - return 0; - } - ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); - list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { - /* - * If this is vfs_sync calling, then skip the dquots that - * don't 'seem' to be dirty. ie. don't acquire dqlock. - * This is very similar to what xfs_sync does with inodes. - */ - if (flags & SYNC_TRYLOCK) { - if (!XFS_DQ_IS_DIRTY(dqp)) - continue; - if (!xfs_qm_dqlock_nowait(dqp)) - continue; - } else { - xfs_dqlock(dqp); - } - - /* - * Now, find out for sure if this dquot is dirty or not. - */ - if (! XFS_DQ_IS_DIRTY(dqp)) { - xfs_dqunlock(dqp); - continue; - } - - /* XXX a sentinel would be better */ - recl = q->qi_dqreclaims; - if (!xfs_dqflock_nowait(dqp)) { - if (flags & SYNC_TRYLOCK) { - xfs_dqunlock(dqp); - continue; - } - /* - * If we can't grab the flush lock then if the caller - * really wanted us to give this our best shot, so - * see if we can give a push to the buffer before we wait - * on the flush lock. At this point, we know that - * even though the dquot is being flushed, - * it has (new) dirty data. - */ - xfs_qm_dqflock_pushbuf_wait(dqp); - } - /* - * Let go of the mplist lock. We don't want to hold it - * across a disk write - */ - mutex_unlock(&q->qi_dqlist_lock); - error = xfs_qm_dqflush(dqp, flags); - xfs_dqunlock(dqp); - if (error && XFS_FORCED_SHUTDOWN(mp)) - return 0; /* Need to prevent umount failure */ - else if (error) - return error; - - mutex_lock(&q->qi_dqlist_lock); - if (recl != q->qi_dqreclaims) { - if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) - break; - - mutex_unlock(&q->qi_dqlist_lock); - goto again; - } - } - - mutex_unlock(&q->qi_dqlist_lock); - return 0; -} - /* * The hash chains and the mplist use the same xfs_dqhash structure as * their list head, but we can take the mplist qh_lock and one of the @@ -1033,18 +846,21 @@ xfs_qm_init_quotainfo( /* * We try to get the limits from the superuser's limits fields. * This is quite hacky, but it is standard quota practice. + * * We look at the USR dquot with id == 0 first, but if user quotas * are not enabled we goto the GRP dquot with id == 0. * We don't really care to keep separate default limits for user * and group quotas, at least not at this point. + * + * Since we may not have done a quotacheck by this point, just read + * the dquot without attaching it to any hashtables or lists. */ - error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0, - XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : - (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : - XFS_DQ_PROJ), - XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN, - &dqp); - if (! error) { + error = xfs_qm_dqread(mp, 0, + XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : + (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : + XFS_DQ_PROJ), + XFS_QMOPT_DOWARN, &dqp); + if (!error) { xfs_disk_dquot_t *ddqp = &dqp->q_core; /* @@ -1071,11 +887,6 @@ xfs_qm_init_quotainfo( qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - /* - * We sent the XFS_QMOPT_DQSUSER flag to dqget because - * we don't want this dquot cached. We haven't done a - * quotacheck yet, and quotacheck doesn't like incore dquots. - */ xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -1296,7 +1107,8 @@ xfs_qm_dqiter_bufs( break; xfs_qm_reset_dqcounts(mp, bp, firstid, type); - xfs_bdwrite(mp, bp); + xfs_buf_delwri_queue(bp); + xfs_buf_relse(bp); /* * goto the next block. */ @@ -1346,11 +1158,8 @@ xfs_qm_dqiterate( * the inode is never added to the transaction. */ xfs_ilock(qip, XFS_ILOCK_SHARED); - error = xfs_bmapi(NULL, qip, lblkno, - maxlblkcnt - lblkno, - XFS_BMAPI_METADATA, - NULL, - 0, map, &nmaps, NULL); + error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, + map, &nmaps, 0); xfs_iunlock(qip, XFS_ILOCK_SHARED); if (error) break; @@ -1662,7 +1471,7 @@ xfs_qm_quotacheck( * successfully. */ if (!error) - error = xfs_qm_dqflush_all(mp, 0); + error = xfs_qm_dqflush_all(mp); /* * We can get this error if we couldn't do a dquot allocation inside @@ -1683,7 +1492,7 @@ xfs_qm_quotacheck( * quotacheck'd stamp on the superblock. So, here we do a synchronous * flush. */ - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); /* * If one type of quotas is off, then it will lose its @@ -1794,59 +1603,33 @@ xfs_qm_init_quotainos( /* - * Just pop the least recently used dquot off the freelist and - * recycle it. The returned dquot is locked. + * Pop the least recently used dquot off the freelist and recycle it. */ -STATIC xfs_dquot_t * +STATIC struct xfs_dquot * xfs_qm_dqreclaim_one(void) { - xfs_dquot_t *dqpout; - xfs_dquot_t *dqp; - int restarts; - int startagain; - - restarts = 0; - dqpout = NULL; + struct xfs_dquot *dqp; + int restarts = 0; - /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ -again: - startagain = 0; mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - +restart: list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { struct xfs_mount *mp = dqp->q_mount; - xfs_dqlock(dqp); + + if (!xfs_dqlock_nowait(dqp)) + continue; /* - * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. We release the - * freelist lock and start over, so that lookup will grab - * both the dquot and the freelistlock. + * This dquot has already been grabbed by dqlookup. + * Remove it from the freelist and try again. */ - if (dqp->dq_flags & XFS_DQ_WANT) { - ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); - + if (dqp->q_nrefs) { trace_xfs_dqreclaim_want(dqp); XQM_STATS_INC(xqmstats.xs_qm_dqwants); - restarts++; - startagain = 1; - goto dqunlock; - } - /* - * If the dquot is inactive, we are assured that it is - * not on the mplist or the hashlist, and that makes our - * life easier. - */ - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(mp == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(list_empty(&dqp->q_hashlist)); - ASSERT(list_empty(&dqp->q_mplist)); list_del_init(&dqp->q_freelist); xfs_Gqm->qm_dqfrlist_cnt--; - dqpout = dqp; - XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); + restarts++; goto dqunlock; } @@ -1875,64 +1658,49 @@ again: * We flush it delayed write, so don't bother * releasing the freelist lock. */ - error = xfs_qm_dqflush(dqp, 0); + error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK); if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); } goto dqunlock; } + xfs_dqfunlock(dqp); /* - * We're trying to get the hashlock out of order. This races - * with dqlookup; so, we giveup and goto the next dquot if - * we couldn't get the hashlock. This way, we won't starve - * a dqlookup process that holds the hashlock that is - * waiting for the freelist lock. + * Prevent lookup now that we are going to reclaim the dquot. + * Once XFS_DQ_FREEING is set lookup won't touch the dquot, + * thus we can drop the lock now. */ - if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - restarts++; - goto dqfunlock; - } + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); - /* - * This races with dquot allocation code as well as dqflush_all - * and reclaim code. So, if we failed to grab the mplist lock, - * giveup everything and start over. - */ - if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { - restarts++; - startagain = 1; - goto qhunlock; - } + mutex_lock(&dqp->q_hash->qh_lock); + list_del_init(&dqp->q_hashlist); + dqp->q_hash->qh_version++; + mutex_unlock(&dqp->q_hash->qh_lock); - ASSERT(dqp->q_nrefs == 0); + mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); list_del_init(&dqp->q_mplist); mp->m_quotainfo->qi_dquots--; mp->m_quotainfo->qi_dqreclaims++; - list_del_init(&dqp->q_hashlist); - dqp->q_hash->qh_version++; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); + + ASSERT(dqp->q_nrefs == 0); list_del_init(&dqp->q_freelist); xfs_Gqm->qm_dqfrlist_cnt--; - dqpout = dqp; - mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); -qhunlock: - mutex_unlock(&dqp->q_hash->qh_lock); -dqfunlock: - xfs_dqfunlock(dqp); + + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + return dqp; dqunlock: xfs_dqunlock(dqp); - if (dqpout) - break; if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) break; - if (startagain) { - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - goto again; - } + goto restart; } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - return dqpout; + return NULL; } /* @@ -2152,10 +1920,7 @@ xfs_qm_vop_dqalloc( * this to caller */ ASSERT(ip->i_udquot); - uq = ip->i_udquot; - xfs_dqlock(uq); - XFS_DQHOLD(uq); - xfs_dqunlock(uq); + uq = xfs_qm_dqhold(ip->i_udquot); } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { @@ -2176,10 +1941,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); - gq = ip->i_gdquot; - xfs_dqlock(gq); - XFS_DQHOLD(gq); - xfs_dqunlock(gq); + gq = xfs_qm_dqhold(ip->i_gdquot); } } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (xfs_get_projid(ip) != prid) { @@ -2199,10 +1961,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); - gq = ip->i_gdquot; - xfs_dqlock(gq); - XFS_DQHOLD(gq); - xfs_dqunlock(gq); + gq = xfs_qm_dqhold(ip->i_gdquot); } } if (uq) @@ -2252,14 +2011,10 @@ xfs_qm_vop_chown( xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* - * Take an extra reference, because the inode - * is going to keep this dquot pointer even - * after the trans_commit. + * Take an extra reference, because the inode is going to keep + * this dquot pointer even after the trans_commit. */ - xfs_dqlock(newdq); - XFS_DQHOLD(newdq); - xfs_dqunlock(newdq); - *IO_olddq = newdq; + *IO_olddq = xfs_qm_dqhold(newdq); return prevdq; } @@ -2391,25 +2146,21 @@ xfs_qm_vop_create_dqattach( ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if (udqp) { - xfs_dqlock(udqp); - XFS_DQHOLD(udqp); - xfs_dqunlock(udqp); ASSERT(ip->i_udquot == NULL); - ip->i_udquot = udqp; ASSERT(XFS_IS_UQUOTA_ON(mp)); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + + ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp) { - xfs_dqlock(gdqp); - XFS_DQHOLD(gdqp); - xfs_dqunlock(gdqp); ASSERT(ip->i_gdquot == NULL); - ip->i_gdquot = gdqp; ASSERT(XFS_IS_OQUOTA_ON(mp)); ASSERT((XFS_IS_GQUOTA_ON(mp) ? ip->i_d.di_gid : xfs_get_projid(ip)) == be32_to_cpu(gdqp->q_core.d_id)); + + ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 43b9abe..9b4f3ad 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -33,12 +33,6 @@ extern kmem_zone_t *qm_dqzone; extern kmem_zone_t *qm_dqtrxzone; /* - * Used in xfs_qm_sync called by xfs_sync to count the max times that it can - * iterate over the mountpt's dquot list in one call. - */ -#define XFS_QM_SYNC_MAX_RESTARTS 7 - -/* * Ditto, for xfs_qm_dqreclaim_one. */ #define XFS_QM_RECLAIM_MAX_RESTARTS 4 diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 609246f..5cc3dde 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -261,7 +261,7 @@ xfs_qm_scall_trunc_qfile( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, 0); if (error) { diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index a595f29..8a0807e 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -87,8 +87,7 @@ typedef struct xfs_dqblk { #define XFS_DQ_PROJ 0x0002 /* project quota */ #define XFS_DQ_GROUP 0x0004 /* a group quota */ #define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ -#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */ -#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */ +#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ #define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) @@ -97,8 +96,7 @@ typedef struct xfs_dqblk { { XFS_DQ_PROJ, "PROJ" }, \ { XFS_DQ_GROUP, "GROUP" }, \ { XFS_DQ_DIRTY, "DIRTY" }, \ - { XFS_DQ_WANT, "WANT" }, \ - { XFS_DQ_INACTIVE, "INACTIVE" } + { XFS_DQ_FREEING, "FREEING" } /* * In the worst case, when both user and group quotas are on, @@ -199,7 +197,6 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ #define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ @@ -326,7 +323,6 @@ extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); extern void xfs_qm_dqdetach(struct xfs_inode *); extern void xfs_qm_dqrele(struct xfs_dquot *); extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); -extern int xfs_qm_sync(struct xfs_mount *, int); extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); @@ -366,10 +362,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_dqdetach(ip) #define xfs_qm_dqrele(d) #define xfs_qm_statvfs(ip, s) -static inline int xfs_qm_sync(struct xfs_mount *mp, int flags) -{ - return 0; -} #define xfs_qm_newmount(mp, a, b) (0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index df78c29..866de27 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -170,12 +170,12 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. */ - xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) - xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow renames diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 35561a5..87323f1 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -112,7 +112,7 @@ xfs_growfs_rt_alloc( * Lock the inode. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_bmap_init(&flist, &firstblock); /* @@ -120,9 +120,9 @@ xfs_growfs_rt_alloc( */ nmap = 1; cancelflags |= XFS_TRANS_ABORT; - error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, - XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, - resblks, &map, &nmap, &flist); + error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); if (!error && nmap < 1) error = XFS_ERROR(ENOSPC); if (error) @@ -155,7 +155,7 @@ xfs_growfs_rt_alloc( * Lock the bitmap inode. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * Get a buffer for the block. */ @@ -856,33 +856,23 @@ xfs_rtbuf_get( xfs_buf_t **bpp) /* output: buffer for the block */ { xfs_buf_t *bp; /* block buffer, result */ - xfs_daddr_t d; /* disk addr of block */ - int error; /* error value */ - xfs_fsblock_t fsb; /* fs block number for block */ xfs_inode_t *ip; /* bitmap or summary inode */ + xfs_bmbt_irec_t map; + int nmap; + int error; /* error value */ ip = issum ? mp->m_rsumip : mp->m_rbmip; - /* - * Map from the file offset (block) and inode number to the - * file system block. - */ - error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block); - if (error) { + + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fsb != NULLFSBLOCK); - /* - * Convert to disk address for buffer cache. - */ - d = XFS_FSB_TO_DADDR(mp, fsb); - /* - * Read the buffer. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + + ASSERT(map.br_startblock != NULLFSBLOCK); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), mp->m_bsize, 0, &bp); - if (error) { + if (error) return error; - } ASSERT(!xfs_buf_geterror(bp)); *bpp = bp; return 0; @@ -1970,7 +1960,7 @@ xfs_growfs_rt( * Lock out other callers by grabbing the bitmap inode lock. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* * Update the bitmap inode's size. */ @@ -1982,7 +1972,7 @@ xfs_growfs_rt( * Get the summary inode into the transaction. */ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* * Update the summary inode's size. */ @@ -2153,7 +2143,7 @@ xfs_rtfree_extent( * Synchronize by locking the bitmap inode. */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); #if defined(__KERNEL__) && defined(DEBUG) /* diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index c96a8a0..597d044 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -92,24 +92,6 @@ xfs_do_force_shutdown( } /* - * Prints out an ALERT message about I/O error. - */ -void -xfs_ioerror_alert( - char *func, - struct xfs_mount *mp, - xfs_buf_t *bp, - xfs_daddr_t blkno) -{ - xfs_alert(mp, - "I/O error occurred: meta-data dev %s block 0x%llx" - " (\"%s\") error %d buf count %zd", - xfs_buf_target_name(bp->b_target), - (__uint64_t)blkno, func, - bp->b_error, XFS_BUF_COUNT(bp)); -} - -/* * This isn't an absolute requirement, but it is * just a good idea to call xfs_read_buf instead of * directly doing a read_buf call. For one, we shouldn't @@ -143,14 +125,13 @@ xfs_read_buf( } else { *bpp = NULL; if (error) { - xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); } else { error = XFS_ERROR(EIO); } if (bp) { XFS_BUF_UNDONE(bp); - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); /* * brelse clears B_ERROR and b_error */ diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index 11c41ec..bbdb9ad 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -42,8 +42,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, xfs_daddr_t blkno, int len, uint flags, struct xfs_buf **bpp); -extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, - xfs_buf_t *bp, xfs_daddr_t blkno); extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); #endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 5cf06b8..281961c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -199,7 +199,6 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_BARRIER; mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - mp->m_flags |= XFS_MOUNT_DELAYLOG; /* * These can be overridden by the mount option parsing. @@ -353,11 +352,11 @@ xfs_parseargs( mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); mp->m_qflags &= ~XFS_OQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { - mp->m_flags |= XFS_MOUNT_DELAYLOG; + xfs_warn(mp, + "delaylog is the default now, option is deprecated."); } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { - mp->m_flags &= ~XFS_MOUNT_DELAYLOG; xfs_warn(mp, - "nodelaylog is deprecated and will be removed in Linux 3.3"); + "nodelaylog support has been removed, option is deprecated."); } else if (!strcmp(this_char, MNTOPT_DISCARD)) { mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { @@ -395,13 +394,6 @@ xfs_parseargs( return EINVAL; } - if ((mp->m_flags & XFS_MOUNT_DISCARD) && - !(mp->m_flags & XFS_MOUNT_DELAYLOG)) { - xfs_warn(mp, - "the discard option is incompatible with the nodelaylog option"); - return EINVAL; - } - #ifndef CONFIG_XFS_QUOTA if (XFS_IS_QUOTA_RUNNING(mp)) { xfs_warn(mp, "quota support not available in this kernel."); @@ -501,7 +493,6 @@ xfs_showargs( { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, - { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { 0, NULL } }; @@ -796,8 +787,6 @@ xfs_fs_destroy_inode( if (is_bad_inode(inode)) goto out_reclaim; - xfs_ioend_wait(ip); - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); /* @@ -837,7 +826,6 @@ xfs_fs_inode_init_once( inode_init_once(VFS_I(ip)); /* xfs inode */ - atomic_set(&ip->i_iocount, 0); atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); init_waitqueue_head(&ip->i_ipin_wait); @@ -872,27 +860,6 @@ xfs_fs_dirty_inode( } STATIC int -xfs_log_inode( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); -} - -STATIC int xfs_fs_write_inode( struct inode *inode, struct writeback_control *wbc) @@ -905,23 +872,23 @@ xfs_fs_write_inode( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - if (!ip->i_update_core) - return 0; - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { /* * Make sure the inode has made it it into the log. Instead * of forcing it all the way to stable storage using a * synchronous transaction we let the log force inside the * ->sync_fs call do that for thus, which reduces the number - * of synchronous log foces dramatically. + * of synchronous log forces dramatically. */ - xfs_ioend_wait(ip); - error = xfs_log_inode(ip); + error = xfs_log_dirty_inode(ip, NULL, 0); if (error) goto out; return 0; } else { + if (!ip->i_update_core) + return 0; + /* * We make this non-blocking if the inode is contended, return * EAGAIN to indicate to the caller that they did not succeed. @@ -1019,7 +986,7 @@ xfs_fs_put_super( */ xfs_filestream_unmount(mp); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_unmountfs(mp); xfs_freesb(mp); @@ -1038,17 +1005,10 @@ xfs_fs_sync_fs( int error; /* - * Not much we can do for the first async pass. Writing out the - * superblock would be counter-productive as we are going to redirty - * when writing out other data and metadata (and writing out a single - * block is quite fast anyway). - * - * Try to asynchronously kick off quota syncing at least. + * Doing anything during the async pass would be counterproductive. */ - if (!wait) { - xfs_qm_sync(mp, SYNC_TRYLOCK); + if (!wait) return 0; - } error = xfs_quiesce_data(mp); if (error) @@ -1262,9 +1222,9 @@ xfs_fs_unfreeze( STATIC int xfs_fs_show_options( struct seq_file *m, - struct vfsmount *mnt) + struct dentry *root) { - return -xfs_showargs(XFS_M(mnt->mnt_sb), m); + return -xfs_showargs(XFS_M(root->d_sb), m); } /* @@ -1443,7 +1403,7 @@ xfs_fs_fill_super( */ xfs_filestream_unmount(mp); - XFS_bflush(mp->m_ddev_targp); + xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_unmountfs(mp); goto out_free_sb; @@ -1645,12 +1605,12 @@ STATIC int __init xfs_init_workqueues(void) { /* - * max_active is set to 8 to give enough concurency to allow - * multiple work operations on each CPU to run. This allows multiple - * filesystems to be running sync work concurrently, and scales with - * the number of CPUs in the system. + * We never want to the same work item to run twice, reclaiming inodes + * or idling the log is not going to get any faster by multiple CPUs + * competing for ressources. Use the default large max_active value + * so that even lots of filesystems can perform these task in parallel. */ - xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); + xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); if (!xfs_syncd_wq) return -ENOMEM; return 0; @@ -1670,7 +1630,6 @@ init_xfs_fs(void) printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"); - xfs_ioend_init(); xfs_dir_startup(); error = xfs_init_zones(); diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 4604f90..72c01a1 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -227,21 +227,17 @@ xfs_sync_inode_data( int error = 0; if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - goto out_wait; + return 0; if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { if (flags & SYNC_TRYLOCK) - goto out_wait; + return 0; xfs_ilock(ip, XFS_IOLOCK_SHARED); } error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 0 : XBF_ASYNC, FI_NONE); xfs_iunlock(ip, XFS_IOLOCK_SHARED); - - out_wait: - if (flags & SYNC_WAIT) - xfs_ioend_wait(ip); return error; } @@ -322,6 +318,7 @@ xfs_sync_fsdata( struct xfs_mount *mp) { struct xfs_buf *bp; + int error; /* * If the buffer is pinned then push on the log so we won't get stuck @@ -334,8 +331,35 @@ xfs_sync_fsdata( bp = xfs_getsb(mp, 0); if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + return error; +} - return xfs_bwrite(mp, bp); +int +xfs_log_dirty_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + if (!ip->i_update_core) + return 0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp, 0); } /* @@ -361,10 +385,17 @@ xfs_quiesce_data( { int error, error2 = 0; - xfs_qm_sync(mp, SYNC_TRYLOCK); - xfs_qm_sync(mp, SYNC_WAIT); + /* + * Log all pending size and timestamp updates. The vfs writeback + * code is supposed to do this, but due to its overagressive + * livelock detection it will skip inodes where appending writes + * were written out in the first non-blocking sync phase if their + * completion took long enough that it happened after taking the + * timestamp for the cut-off in the blocking phase. + */ + xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); - /* force out the newly dirtied log buffers */ + /* force out the log */ xfs_log_force(mp, XFS_LOG_SYNC); /* write superblock and hoover up shutdown errors */ @@ -379,7 +410,7 @@ xfs_quiesce_data( /* flush data-only devices */ if (mp->m_rtdev_targp) - XFS_bflush(mp->m_rtdev_targp); + xfs_flush_buftarg(mp->m_rtdev_targp, 1); return error ? error : error2; } @@ -472,7 +503,6 @@ xfs_sync_worker( error = xfs_fs_log_dummy(mp); else xfs_log_force(mp, 0); - error = xfs_qm_sync(mp, SYNC_TRYLOCK); /* start pushing all the metadata that is currently dirty */ xfs_ail_push_all(mp->m_ail); @@ -772,6 +802,17 @@ restart: if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; + + /* + * If we only have a single dirty inode in a cluster there is + * a fair chance that the AIL push may have pushed it into + * the buffer, but xfsbufd won't touch it until 30 seconds + * from now, and thus we will lock up here. + * + * Promote the inode buffer to the front of the delwri list + * and wake up xfsbufd now. + */ + xfs_promote_inode(ip); xfs_iflock(ip); } diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 941202e..fa96547 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -34,6 +34,8 @@ void xfs_quiesce_attr(struct xfs_mount *mp); void xfs_flush_inodes(struct xfs_inode *ip); +int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags); + int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 690fc7a..a9d5b1e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -30,6 +30,7 @@ struct xfs_buf_log_item; struct xfs_da_args; struct xfs_da_node_entry; struct xfs_dquot; +struct xfs_log_item; struct xlog_ticket; struct log; struct xlog_recover; @@ -320,7 +321,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); DEFINE_BUF_EVENT(xfs_buf_iorequest); DEFINE_BUF_EVENT(xfs_buf_bawrite); -DEFINE_BUF_EVENT(xfs_buf_bdwrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); DEFINE_BUF_EVENT(xfs_buf_trylock); @@ -577,6 +577,7 @@ DEFINE_INODE_EVENT(xfs_vm_bmap); DEFINE_INODE_EVENT(xfs_file_ioctl); DEFINE_INODE_EVENT(xfs_file_compat_ioctl); DEFINE_INODE_EVENT(xfs_ioctl_setattr); +DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); DEFINE_INODE_EVENT(xfs_write_inode); @@ -742,8 +743,6 @@ DEFINE_DQUOT_EVENT(xfs_dqtobp_read); DEFINE_DQUOT_EVENT(xfs_dqread); DEFINE_DQUOT_EVENT(xfs_dqread_fail); DEFINE_DQUOT_EVENT(xfs_dqlookup_found); -DEFINE_DQUOT_EVENT(xfs_dqlookup_want); -DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); DEFINE_DQUOT_EVENT(xfs_dqlookup_done); DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); @@ -833,18 +832,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); @@ -853,6 +848,42 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); +DECLARE_EVENT_CLASS(xfs_log_item_class, + TP_PROTO(struct xfs_log_item *lip), + TP_ARGS(lip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->lsn = lip->li_lsn; + ), + TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_LOG_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_log_item_class, name, \ + TP_PROTO(struct xfs_log_item *lip), \ + TP_ARGS(lip)) +DEFINE_LOG_ITEM_EVENT(xfs_ail_push); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); + + DECLARE_EVENT_CLASS(xfs_file_class, TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), TP_ARGS(ip, count, offset, flags), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index efc147f..329b06a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1158,7 +1158,6 @@ xfs_trans_add_item( lidp->lid_item = lip; lidp->lid_flags = 0; - lidp->lid_size = 0; list_add_tail(&lidp->lid_trans, &tp->t_items); lip->li_desc = lidp; @@ -1210,219 +1209,6 @@ xfs_trans_free_items( } } -/* - * Unlock the items associated with a transaction. - * - * Items which were not logged should be freed. Those which were logged must - * still be tracked so they can be unpinned when the transaction commits. - */ -STATIC void -xfs_trans_unlock_items( - struct xfs_trans *tp, - xfs_lsn_t commit_lsn) -{ - struct xfs_log_item_desc *lidp, *next; - - list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; - - lip->li_desc = NULL; - - if (commit_lsn != NULLCOMMITLSN) - IOP_COMMITTING(lip, commit_lsn); - IOP_UNLOCK(lip); - - /* - * Free the descriptor if the item is not dirty - * within this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - xfs_trans_free_item_desc(lidp); - } -} - -/* - * Total up the number of log iovecs needed to commit this - * transaction. The transaction itself needs one for the - * transaction header. Ask each dirty item in turn how many - * it needs to get the total. - */ -static uint -xfs_trans_count_vecs( - struct xfs_trans *tp) -{ - int nvecs; - struct xfs_log_item_desc *lidp; - - nvecs = 1; - - /* In the non-debug case we need to start bailing out if we - * didn't find a log_item here, return zero and let trans_commit - * deal with it. - */ - if (list_empty(&tp->t_items)) { - ASSERT(0); - return 0; - } - - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - /* - * Skip items which aren't dirty in this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - lidp->lid_size = IOP_SIZE(lidp->lid_item); - nvecs += lidp->lid_size; - } - - return nvecs; -} - -/* - * Fill in the vector with pointers to data to be logged - * by this transaction. The transaction header takes - * the first vector, and then each dirty item takes the - * number of vectors it indicated it needed in xfs_trans_count_vecs(). - * - * As each item fills in the entries it needs, also pin the item - * so that it cannot be flushed out until the log write completes. - */ -static void -xfs_trans_fill_vecs( - struct xfs_trans *tp, - struct xfs_log_iovec *log_vector) -{ - struct xfs_log_item_desc *lidp; - struct xfs_log_iovec *vecp; - uint nitems; - - /* - * Skip over the entry for the transaction header, we'll - * fill that in at the end. - */ - vecp = log_vector + 1; - - nitems = 0; - ASSERT(!list_empty(&tp->t_items)); - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - - /* - * The item may be marked dirty but not log anything. This can - * be used to get called when a transaction is committed. - */ - if (lidp->lid_size) - nitems++; - IOP_FORMAT(lidp->lid_item, vecp); - vecp += lidp->lid_size; - IOP_PIN(lidp->lid_item); - } - - /* - * Now that we've counted the number of items in this transaction, fill - * in the transaction header. Note that the transaction header does not - * have a log item. - */ - tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_header.th_type = tp->t_type; - tp->t_header.th_num_items = nitems; - log_vector->i_addr = (xfs_caddr_t)&tp->t_header; - log_vector->i_len = sizeof(xfs_trans_header_t); - log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; -} - -/* - * The committed item processing consists of calling the committed routine of - * each logged item, updating the item's position in the AIL if necessary, and - * unpinning each item. If the committed routine returns -1, then do nothing - * further with the item because it may have been freed. - * - * Since items are unlocked when they are copied to the incore log, it is - * possible for two transactions to be completing and manipulating the same - * item simultaneously. The AIL lock will protect the lsn field of each item. - * The value of this field can never go backwards. - * - * We unpin the items after repositioning them in the AIL, because otherwise - * they could be immediately flushed and we'd have to race with the flusher - * trying to pull the item from the AIL as we add it. - */ -static void -xfs_trans_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t commit_lsn, - int aborted) -{ - xfs_lsn_t item_lsn; - struct xfs_ail *ailp; - - if (aborted) - lip->li_flags |= XFS_LI_ABORTED; - item_lsn = IOP_COMMITTED(lip, commit_lsn); - - /* item_lsn of -1 means the item needs no further processing */ - if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) - return; - - /* - * If the returned lsn is greater than what it contained before, update - * the location of the item in the AIL. If it is not, then do nothing. - * Items can never move backwards in the AIL. - * - * While the new lsn should usually be greater, it is possible that a - * later transaction completing simultaneously with an earlier one - * using the same item could complete first with a higher lsn. This - * would cause the earlier transaction to fail the test below. - */ - ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); - if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { - /* - * This will set the item's lsn to item_lsn and update the - * position of the item in the AIL. - * - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_update(ailp, lip, item_lsn); - } else { - spin_unlock(&ailp->xa_lock); - } - - /* - * Now that we've repositioned the item in the AIL, unpin it so it can - * be flushed. Pass information about buffer stale state down from the - * log item flags, if anyone else stales the buffer we do not want to - * pay any attention to it. - */ - IOP_UNPIN(lip, 0); -} - -/* - * This is typically called by the LM when a transaction has been fully - * committed to disk. It needs to unpin the items which have - * been logged by the transaction and update their positions - * in the AIL if necessary. - * - * This also gets called when the transactions didn't get written out - * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. - */ -STATIC void -xfs_trans_committed( - void *arg, - int abortflag) -{ - struct xfs_trans *tp = arg; - struct xfs_log_item_desc *lidp, *next; - - list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { - xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); - xfs_trans_free_item_desc(lidp); - } - - xfs_trans_free(tp); -} - static inline void xfs_log_item_batch_insert( struct xfs_ail *ailp, @@ -1538,261 +1324,7 @@ xfs_trans_committed_bulk( } /* - * Called from the trans_commit code when we notice that the filesystem is in - * the middle of a forced shutdown. - * - * When we are called here, we have already pinned all the items in the - * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called - * so we can simply walk the items in the transaction, unpin them with an abort - * flag and then free the items. Note that unpinning the items can result in - * them being freed immediately, so we need to use a safe list traversal method - * here. - */ -STATIC void -xfs_trans_uncommit( - struct xfs_trans *tp, - uint flags) -{ - struct xfs_log_item_desc *lidp, *n; - - list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) { - if (lidp->lid_flags & XFS_LID_DIRTY) - IOP_UNPIN(lidp->lid_item, 1); - } - - xfs_trans_unreserve_and_mod_sb(tp); - xfs_trans_unreserve_and_mod_dquots(tp); - - xfs_trans_free_items(tp, NULLCOMMITLSN, flags); - xfs_trans_free(tp); -} - -/* - * Format the transaction direct to the iclog. This isolates the physical - * transaction commit operation from the logical operation and hence allows - * other methods to be introduced without affecting the existing commit path. - */ -static int -xfs_trans_commit_iclog( - struct xfs_mount *mp, - struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, - int flags) -{ - int shutdown; - int error; - int log_flags = 0; - struct xlog_in_core *commit_iclog; -#define XFS_TRANS_LOGVEC_COUNT 16 - struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; - struct xfs_log_iovec *log_vector; - uint nvec; - - - /* - * Ask each log item how many log_vector entries it will - * need so we can figure out how many to allocate. - * Try to avoid the kmem_alloc() call in the common case - * by using a vector from the stack when it fits. - */ - nvec = xfs_trans_count_vecs(tp); - if (nvec == 0) { - return ENOMEM; /* triggers a shutdown! */ - } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { - log_vector = log_vector_fast; - } else { - log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec * - sizeof(xfs_log_iovec_t), - KM_SLEEP); - } - - /* - * Fill in the log_vector and pin the logged items, and - * then write the transaction to the log. - */ - xfs_trans_fill_vecs(tp, log_vector); - - if (flags & XFS_TRANS_RELEASE_LOG_RES) - log_flags = XFS_LOG_REL_PERM_RESERV; - - error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); - - /* - * The transaction is committed incore here, and can go out to disk - * at any time after this call. However, all the items associated - * with the transaction are still locked and pinned in memory. - */ - *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); - - tp->t_commit_lsn = *commit_lsn; - trace_xfs_trans_commit_lsn(tp); - - if (nvec > XFS_TRANS_LOGVEC_COUNT) - kmem_free(log_vector); - - /* - * If we got a log write error. Unpin the logitems that we - * had pinned, clean up, free trans structure, and return error. - */ - if (error || *commit_lsn == -1) { - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); - return XFS_ERROR(EIO); - } - - /* - * Once the transaction has committed, unused - * reservations need to be released and changes to - * the superblock need to be reflected in the in-core - * version. Do that now. - */ - xfs_trans_unreserve_and_mod_sb(tp); - - /* - * Tell the LM to call the transaction completion routine - * when the log write with LSN commit_lsn completes (e.g. - * when the transaction commit really hits the on-disk log). - * After this call we cannot reference tp, because the call - * can happen at any time and the call will free the transaction - * structure pointed to by tp. The only case where we call - * the completion routine (xfs_trans_committed) directly is - * if the log is turned off on a debug kernel or we're - * running in simulation mode (the log is explicitly turned - * off). - */ - tp->t_logcb.cb_func = xfs_trans_committed; - tp->t_logcb.cb_arg = tp; - - /* - * We need to pass the iclog buffer which was used for the - * transaction commit record into this function, and attach - * the callback to it. The callback must be attached before - * the items are unlocked to avoid racing with other threads - * waiting for an item to unlock. - */ - shutdown = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb)); - - /* - * Mark this thread as no longer being in a transaction - */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - - /* - * Once all the items of the transaction have been copied - * to the in core log and the callback is attached, the - * items can be unlocked. - * - * This will free descriptors pointing to items which were - * not logged since there is nothing more to do with them. - * For items which were logged, we will keep pointers to them - * so they can be unpinned after the transaction commits to disk. - * This will also stamp each modified meta-data item with - * the commit lsn of this transaction for dependency tracking - * purposes. - */ - xfs_trans_unlock_items(tp, *commit_lsn); - - /* - * If we detected a log error earlier, finish committing - * the transaction now (unpin log items, etc). - * - * Order is critical here, to avoid using the transaction - * pointer after its been freed (by xfs_trans_committed - * either here now, or as a callback). We cannot do this - * step inside xfs_log_notify as was done earlier because - * of this issue. - */ - if (shutdown) - xfs_trans_committed(tp, XFS_LI_ABORTED); - - /* - * Now that the xfs_trans_committed callback has been attached, - * and the items are released we can finally allow the iclog to - * go to disk. - */ - return xfs_log_release_iclog(mp, commit_iclog); -} - -/* - * Walk the log items and allocate log vector structures for - * each item large enough to fit all the vectors they require. - * Note that this format differs from the old log vector format in - * that there is no transaction header in these log vectors. - */ -STATIC struct xfs_log_vec * -xfs_trans_alloc_log_vecs( - xfs_trans_t *tp) -{ - struct xfs_log_item_desc *lidp; - struct xfs_log_vec *lv = NULL; - struct xfs_log_vec *ret_lv = NULL; - - - /* Bail out if we didn't find a log item. */ - if (list_empty(&tp->t_items)) { - ASSERT(0); - return NULL; - } - - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_vec *new_lv; - - /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - - /* Skip items that do not have any vectors for writing */ - lidp->lid_size = IOP_SIZE(lidp->lid_item); - if (!lidp->lid_size) - continue; - - new_lv = kmem_zalloc(sizeof(*new_lv) + - lidp->lid_size * sizeof(struct xfs_log_iovec), - KM_SLEEP); - - /* The allocated iovec region lies beyond the log vector. */ - new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; - new_lv->lv_niovecs = lidp->lid_size; - new_lv->lv_item = lidp->lid_item; - if (!ret_lv) - ret_lv = new_lv; - else - lv->lv_next = new_lv; - lv = new_lv; - } - - return ret_lv; -} - -static int -xfs_trans_commit_cil( - struct xfs_mount *mp, - struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, - int flags) -{ - struct xfs_log_vec *log_vector; - - /* - * Get each log item to allocate a vector structure for - * the log item to to pass to the log write code. The - * CIL commit code will format the vector and save it away. - */ - log_vector = xfs_trans_alloc_log_vecs(tp); - if (!log_vector) - return ENOMEM; - - xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); - - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free(tp); - return 0; -} - -/* - * xfs_trans_commit - * - * Commit the given transaction to the log a/synchronously. + * Commit the given transaction to the log. * * XFS disk error handling mechanism is not based on a typical * transaction abort mechanism. Logically after the filesystem @@ -1804,10 +1336,9 @@ xfs_trans_commit_cil( * Do not reference the transaction structure after this call. */ int -_xfs_trans_commit( +xfs_trans_commit( struct xfs_trans *tp, - uint flags, - int *log_flushed) + uint flags) { struct xfs_mount *mp = tp->t_mountp; xfs_lsn_t commit_lsn = -1; @@ -1848,17 +1379,16 @@ _xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - if (mp->m_flags & XFS_MOUNT_DELAYLOG) - error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags); - else - error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); - + error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags); if (error == ENOMEM) { xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); error = XFS_ERROR(EIO); goto out_unreserve; } + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free(tp); + /* * If the transaction needs to be synchronous, then force the * log out now and wait for it. @@ -1866,7 +1396,7 @@ _xfs_trans_commit( if (sync) { if (!error) { error = _xfs_log_force_lsn(mp, commit_lsn, - XFS_LOG_SYNC, log_flushed); + XFS_LOG_SYNC, NULL); } XFS_STATS_INC(xs_trans_sync); } else { @@ -2021,6 +1551,6 @@ xfs_trans_roll( if (error) return error; - xfs_trans_ijoin(trans, dp); + xfs_trans_ijoin(trans, dp, 0); return 0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 53597f4..f611870 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -163,9 +163,8 @@ typedef struct xfs_trans_header { */ struct xfs_log_item_desc { struct xfs_log_item *lid_item; - ushort lid_size; - unsigned char lid_flags; struct list_head lid_trans; + unsigned char lid_flags; }; #define XFS_LID_DIRTY 0x1 @@ -326,7 +325,7 @@ typedef struct xfs_log_item { struct xfs_log_item *); /* buffer item iodone */ /* callback func */ - struct xfs_item_ops *li_ops; /* function list */ + const struct xfs_item_ops *li_ops; /* function list */ /* delayed logging */ struct list_head li_cil; /* CIL pointers */ @@ -341,7 +340,7 @@ typedef struct xfs_log_item { { XFS_LI_IN_AIL, "IN_AIL" }, \ { XFS_LI_ABORTED, "ABORTED" } -typedef struct xfs_item_ops { +struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); @@ -352,7 +351,7 @@ typedef struct xfs_item_ops { void (*iop_push)(xfs_log_item_t *); bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); -} xfs_item_ops_t; +}; #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) @@ -470,8 +469,7 @@ void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); -void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); -void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); +void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); @@ -487,10 +485,7 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t); -int _xfs_trans_commit(xfs_trans_t *, - uint flags, - int *); -#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) +int xfs_trans_commit(xfs_trans_t *, uint flags); void xfs_trans_cancel(xfs_trans_t *, int); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 3a1e7ca5..ed9252b 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -26,6 +26,7 @@ #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" +#include "xfs_trace.h" #include "xfs_error.h" #ifdef DEBUG @@ -364,12 +365,24 @@ xfsaild_push( xfs_lsn_t lsn; xfs_lsn_t target; long tout = 10; - int flush_log = 0; int stuck = 0; int count = 0; int push_xfsbufd = 0; + /* + * If last time we ran we encountered pinned items, force the log first + * and wait for it before pushing again. + */ spin_lock(&ailp->xa_lock); + if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush && + !list_empty(&ailp->xa_ail)) { + ailp->xa_log_flush = 0; + spin_unlock(&ailp->xa_lock); + XFS_STATS_INC(xs_push_ail_flush); + xfs_log_force(mp, XFS_LOG_SYNC); + spin_lock(&ailp->xa_lock); + } + target = ailp->xa_target; lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); if (!lip || XFS_FORCED_SHUTDOWN(mp)) { @@ -413,16 +426,20 @@ xfsaild_push( switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); + trace_xfs_ail_push(lip); + IOP_PUSH(lip); ailp->xa_last_pushed_lsn = lsn; break; case XFS_ITEM_PUSHBUF: XFS_STATS_INC(xs_push_ail_pushbuf); + trace_xfs_ail_pushbuf(lip); if (!IOP_PUSHBUF(lip)) { + trace_xfs_ail_pushbuf_pinned(lip); stuck++; - flush_log = 1; + ailp->xa_log_flush++; } else { ailp->xa_last_pushed_lsn = lsn; } @@ -431,12 +448,15 @@ xfsaild_push( case XFS_ITEM_PINNED: XFS_STATS_INC(xs_push_ail_pinned); + trace_xfs_ail_pinned(lip); + stuck++; - flush_log = 1; + ailp->xa_log_flush++; break; case XFS_ITEM_LOCKED: XFS_STATS_INC(xs_push_ail_locked); + trace_xfs_ail_locked(lip); stuck++; break; @@ -476,16 +496,6 @@ xfsaild_push( xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); - if (flush_log) { - /* - * If something we need to push out was pinned, then - * push out the log so it will become unpinned and - * move forward in the AIL. - */ - XFS_STATS_INC(xs_push_ail_flush); - xfs_log_force(mp, 0); - } - if (push_xfsbufd) { /* we've got delayed write buffers to flush */ wake_up_process(mp->m_ddev_targp->bt_task); @@ -496,6 +506,7 @@ out_done: if (!count) { /* We're past our target or empty, so idle */ ailp->xa_last_pushed_lsn = 0; + ailp->xa_log_flush = 0; tout = 50; } else if (XFS_LSN_CMP(lsn, target) >= 0) { @@ -514,9 +525,13 @@ out_done: * were stuck. * * Backoff a bit more to allow some I/O to complete before - * continuing from where we were. + * restarting from the start of the AIL. This prevents us + * from spinning on the same items, and if they are pinned will + * all the restart to issue a log force to unpin the stuck + * items. */ tout = 20; + ailp->xa_last_pushed_lsn = 0; } return tout; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 137e2b9..475a4de 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -160,8 +160,10 @@ xfs_trans_get_buf(xfs_trans_t *tp, bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); - if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) - XFS_BUF_SUPER_STALE(bp); + if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + } /* * If the buffer is stale then it was binval'ed @@ -294,8 +296,7 @@ xfs_trans_read_buf( if (bp->b_error) { error = bp->b_error; - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); return error; } @@ -337,8 +338,7 @@ xfs_trans_read_buf( xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); /* * We can gracefully recover from most read @@ -387,9 +387,9 @@ xfs_trans_read_buf( } if (bp->b_error) { error = bp->b_error; - XFS_BUF_SUPER_STALE(bp); - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + xfs_buf_ioerror_alert(bp, __func__); if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); @@ -643,13 +643,14 @@ xfs_trans_log_buf(xfs_trans_t *tp, * inside the b_bdstrat callback so that this won't get written to * disk. */ - XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); ASSERT(atomic_read(&bip->bli_refcount) > 0); bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; + xfs_buf_delwri_queue(bp); + trace_xfs_trans_log_buf(bip); /* @@ -738,8 +739,7 @@ xfs_trans_binval( * We set the stale bit in the buffer as well since we're getting * rid of it. */ - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index c8dea2fd..32f0288 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -47,11 +47,13 @@ xfs_trans_inode_broot_debug( * Add a locked inode to the transaction. * * The inode must be locked, and it cannot be associated with any transaction. + * If lock_flags is non-zero the inode will be unlocked on transaction commit. */ void xfs_trans_ijoin( struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_inode *ip, + uint lock_flags) { xfs_inode_log_item_t *iip; @@ -59,7 +61,9 @@ xfs_trans_ijoin( if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; + ASSERT(iip->ili_lock_flags == 0); + iip->ili_lock_flags = lock_flags; /* * Get a log_item_desc to point at the new item. @@ -70,25 +74,6 @@ xfs_trans_ijoin( } /* - * Add a locked inode to the transaction. - * - * - * Grabs a reference to the inode which will be dropped when the transaction - * is committed. The inode will also be unlocked at that point. The inode - * must be locked, and it cannot be associated with any transaction. - */ -void -xfs_trans_ijoin_ref( - struct xfs_trans *tp, - struct xfs_inode *ip, - uint lock_flags) -{ - xfs_trans_ijoin(tp, ip); - IHOLD(ip); - ip->i_itemp->ili_lock_flags = lock_flags; -} - -/* * Transactional inode timestamp update. Requires the inode to be locked and * joined to the transaction supplied. Relies on the transaction subsystem to * track dirty state and update/writeback the inode accordingly. diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 22750b5..44820b9 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -70,6 +70,7 @@ struct xfs_ail { struct list_head xa_cursors; spinlock_t xa_lock; xfs_lsn_t xa_last_pushed_lsn; + int xa_log_flush; }; /* diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 8b32d1a..89dbb4a 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -53,7 +53,7 @@ xfs_dir_ialloc( output: may be a new transaction. */ xfs_inode_t *dp, /* directory within whose allocate the inode. */ - mode_t mode, + umode_t mode, xfs_nlink_t nlink, xfs_dev_t rdev, prid_t prid, /* project id */ diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index 456fca3..5eeab46 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h @@ -18,7 +18,7 @@ #ifndef __XFS_UTILS_H__ #define __XFS_UTILS_H__ -extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, +extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, xfs_inode_t **, int *); extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 51fc429..f2fea86 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -72,8 +72,8 @@ xfs_readlink_bmap( xfs_buf_t *bp; int error = 0; - error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, - mval, &nmaps, NULL); + error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps, + 0); if (error) goto out; @@ -87,8 +87,7 @@ xfs_readlink_bmap( return XFS_ERROR(ENOMEM); error = bp->b_error; if (error) { - xfs_ioerror_alert("xfs_readlink", - ip->i_mount, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); goto out; } @@ -113,7 +112,7 @@ xfs_readlink( char *link) { xfs_mount_t *mp = ip->i_mount; - int pathlen; + xfs_fsize_t pathlen; int error = 0; trace_xfs_readlink(ip); @@ -123,13 +122,19 @@ xfs_readlink( xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT(S_ISLNK(ip->i_d.di_mode)); - ASSERT(ip->i_d.di_size <= MAXPATHLEN); - pathlen = ip->i_d.di_size; if (!pathlen) goto out; + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { memcpy(link, ip->i_df.if_u1.if_data, pathlen); link[pathlen] = '\0'; @@ -178,8 +183,7 @@ xfs_free_eofblocks( nimaps = 1; xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, - NULL, 0, &imap, &nimaps, NULL); + error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!error && (nimaps != 0) && @@ -220,7 +224,7 @@ xfs_free_eofblocks( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, ip->i_size); if (error) { @@ -289,7 +293,7 @@ xfs_inactive_symlink_rmt( xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); size = (int)ip->i_d.di_size; ip->i_d.di_size = 0; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Find the block(s) so we can inval and unmap them. @@ -297,9 +301,9 @@ xfs_inactive_symlink_rmt( done = 0; xfs_bmap_init(&free_list, &first_block); nmaps = ARRAY_SIZE(mval); - if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), - XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, - &free_list))) + error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size), + mval, &nmaps, 0); + if (error) goto error0; /* * Invalidate the block(s). @@ -308,6 +312,10 @@ xfs_inactive_symlink_rmt( bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + if (!bp) { + error = ENOMEM; + goto error1; + } xfs_trans_binval(tp, bp); } /* @@ -333,7 +341,7 @@ xfs_inactive_symlink_rmt( * Mark it dirty so it will be logged and moved forward in the log as * part of every commit. */ - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Get a new, empty transaction to return to our caller. @@ -466,7 +474,7 @@ xfs_inactive_attrs( goto error_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); xfs_idestroy_fork(ip, XFS_ATTR_FORK); ASSERT(ip->i_d.di_anextents == 0); @@ -647,8 +655,6 @@ xfs_inactive( if (truncate) { xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ioend_wait(ip); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, @@ -662,7 +668,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); error = xfs_itruncate_data(&tp, ip, 0); if (error) { @@ -686,7 +692,7 @@ xfs_inactive( return VN_INACTIVE_CACHE; } - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); } else { error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), @@ -699,7 +705,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); } /* @@ -816,7 +822,7 @@ int xfs_create( xfs_inode_t *dp, struct xfs_name *name, - mode_t mode, + umode_t mode, xfs_dev_t rdev, xfs_inode_t **ipp) { @@ -939,7 +945,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1260,8 +1266,8 @@ xfs_remove( xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * If we're removing a directory perform some additional validation. @@ -1406,8 +1412,8 @@ xfs_link( xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); /* * If the source has too many links, we can't make any more to it. @@ -1475,7 +1481,7 @@ xfs_symlink( xfs_inode_t *dp, struct xfs_name *link_name, const char *target_path, - mode_t mode, + umode_t mode, xfs_inode_t **ipp) { xfs_mount_t *mp = dp->i_mount; @@ -1601,7 +1607,7 @@ xfs_symlink( * transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; /* @@ -1632,10 +1638,9 @@ xfs_symlink( first_fsb = 0; nmaps = SYMLINK_MAPS; - error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, - XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, - &first_block, resblks, mval, &nmaps, - &free_list); + error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_METADATA, &first_block, resblks, + mval, &nmaps, &free_list); if (error) goto error2; @@ -1650,7 +1655,10 @@ xfs_symlink( byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); - ASSERT(!xfs_buf_geterror(bp)); + if (!bp) { + error = ENOMEM; + goto error2; + } if (pathlen < byte_cnt) { byte_cnt = pathlen; } @@ -1732,7 +1740,7 @@ xfs_set_dmattrs( return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); ip->i_d.di_dmevmask = evmask; ip->i_d.di_dmstate = state; @@ -1778,7 +1786,6 @@ xfs_alloc_file_space( xfs_fileoff_t startoffset_fsb; xfs_fsblock_t firstfsb; int nimaps; - int bmapi_flag; int quota_flag; int rt; xfs_trans_t *tp; @@ -1806,7 +1813,6 @@ xfs_alloc_file_space( count = len; imapp = &imaps[0]; nimaps = 1; - bmapi_flag = XFS_BMAPI_WRITE | alloc_type; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); allocatesize_fsb = XFS_B_TO_FSB(mp, count); @@ -1877,16 +1883,12 @@ xfs_alloc_file_space( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); - /* - * Issue the xfs_bmapi() call to allocate the blocks - */ xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bmapi(tp, ip, startoffset_fsb, - allocatesize_fsb, bmapi_flag, - &firstfsb, 0, imapp, &nimaps, - &free_list); + error = xfs_bmapi_write(tp, ip, startoffset_fsb, + allocatesize_fsb, alloc_type, &firstfsb, + 0, imapp, &nimaps, &free_list); if (error) { goto error0; } @@ -1976,8 +1978,7 @@ xfs_zero_remaining_bytes( for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; - error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, - NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); if (error || nimap < 1) break; ASSERT(imap.br_blockcount >= 1); @@ -1997,8 +1998,8 @@ xfs_zero_remaining_bytes( xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", - mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(read)"); break; } memset(bp->b_addr + @@ -2010,8 +2011,8 @@ xfs_zero_remaining_bytes( xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", - mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(write)"); break; } } @@ -2076,7 +2077,7 @@ xfs_free_file_space( if (need_iolock) { xfs_ilock(ip, XFS_IOLOCK_EXCL); /* wait for the completion of any pending DIOs */ - xfs_ioend_wait(ip); + inode_dio_wait(VFS_I(ip)); } rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); @@ -2096,8 +2097,8 @@ xfs_free_file_space( */ if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { nimap = 1; - error = xfs_bmapi(NULL, ip, startoffset_fsb, - 1, 0, NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, startoffset_fsb, 1, + &imap, &nimap, 0); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); @@ -2111,8 +2112,8 @@ xfs_free_file_space( startoffset_fsb += mp->m_sb.sb_rextsize - mod; } nimap = 1; - error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, - 1, 0, NULL, 0, &imap, &nimap, NULL); + error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, + &imap, &nimap, 0); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); @@ -2180,7 +2181,7 @@ xfs_free_file_space( if (error) goto error1; - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * issue the bunmapi() call to free the blocks @@ -2353,8 +2354,7 @@ xfs_change_file_space( } xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); if ((attr_flags & XFS_ATTR_DMI) == 0) { ip->i_d.di_mode &= ~S_ISUID; @@ -2379,10 +2379,5 @@ xfs_change_file_space( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (attr_flags & XFS_ATTR_SYNC) xfs_trans_set_sync(tp); - - error = xfs_trans_commit(tp, 0); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - return error; + return xfs_trans_commit(tp, 0); } diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 35d3d51..0c877cb 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -26,7 +26,7 @@ int xfs_release(struct xfs_inode *ip); int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); -int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, +int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); @@ -35,7 +35,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, - const char *target_path, mode_t mode, struct xfs_inode **ipp); + const char *target_path, umode_t mode, struct xfs_inode **ipp); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); int xfs_change_file_space(struct xfs_inode *ip, int cmd, xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); |