diff options
Diffstat (limited to 'fs')
179 files changed, 10092 insertions, 2679 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 047c791..c061c3f 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -55,7 +55,7 @@ enum { Opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_debug, "debug=%x"}, {Opt_dfltuid, "dfltuid=%u"}, {Opt_dfltgid, "dfltgid=%u"}, @@ -160,7 +160,7 @@ config EXT4_FS filesystem initially. To compile this file system support as a module, choose M here. The - module will be called ext4dev. + module will be called ext4. If unsure, say N. @@ -220,17 +220,16 @@ config JBD tristate help This is a generic journalling layer for block devices. It is - currently used by the ext3 and OCFS2 file systems, but it could - also be used to add journal support to other file systems or block + currently used by the ext3 file system, but it could also be + used to add journal support to other file systems or block devices such as RAID or LVM. - If you are using the ext3 or OCFS2 file systems, you need to - say Y here. If you are not using ext3 OCFS2 then you will probably - want to say N. + If you are using the ext3 file system, you need to say Y here. + If you are not using ext3 then you will probably want to say N. To compile this device as a module, choose M here: the module will be - called jbd. If you are compiling ext3 or OCFS2 into the kernel, - you cannot compile this code as a module. + called jbd. If you are compiling ext3 into the kernel, you + cannot compile this code as a module. config JBD_DEBUG bool "JBD (ext3) debugging support" @@ -254,15 +253,16 @@ config JBD2 help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by - the ext4 filesystem, but it could also be used to add + the ext4 and OCFS2 filesystems, but it could also be used to add journal support to other file systems or block devices such as RAID or LVM. - If you are using ext4, you need to say Y here. If you are not - using ext4 then you will probably want to say N. + If you are using ext4 or OCFS2, you need to say Y here. + If you are not using ext4 or OCFS2 then you will + probably want to say N. To compile this device as a module, choose M here. The module will be - called jbd2. If you are compiling ext4 into the kernel, + called jbd2. If you are compiling ext4 or OCFS2 into the kernel, you cannot compile this code as a module. config JBD2_DEBUG @@ -433,6 +433,14 @@ config FS_POSIX_ACL bool default n +config FILE_LOCKING + bool "Enable POSIX file locking API" if EMBEDDED + default y + help + This option enables standard file locking support, required + for filesystems like NFS and for the flock() system + call. Disabling this option saves about 11k. + source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" @@ -440,7 +448,7 @@ config OCFS2_FS tristate "OCFS2 file system support" depends on NET && SYSFS select CONFIGFS_FS - select JBD + select JBD2 select CRC32 help OCFS2 is a general purpose extent based shared disk cluster file @@ -511,6 +519,16 @@ config OCFS2_DEBUG_FS this option for debugging only as it is likely to decrease performance of the filesystem. +config OCFS2_COMPAT_JBD + bool "Use JBD for compatibility" + depends on OCFS2_FS + default n + select JBD + help + The ocfs2 filesystem now uses JBD2 for its journalling. JBD2 + is backwards compatible with JBD. It is safe to say N here. + However, if you really want to use the original JBD, say Y here. + endif # BLOCK config DNOTIFY @@ -1779,6 +1797,28 @@ config SUNRPC_XPRT_RDMA If unsure, say N. +config SUNRPC_REGISTER_V4 + bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)" + depends on SUNRPC && EXPERIMENTAL + default n + help + Sun added support for registering RPC services at an IPv6 + address by creating two new versions of the rpcbind protocol + (RFC 1833). + + This option enables support in the kernel RPC server for + registering kernel RPC services via version 4 of the rpcbind + protocol. If you enable this option, you must run a portmapper + daemon that supports rpcbind protocol version 4. + + Serving NFS over IPv6 from knfsd (the kernel's NFS server) + requires that you enable this option and use a portmapper that + supports rpcbind version 4. + + If unsure, say N to get traditional behavior (register kernel + RPC services using only rpcbind version 2). Distributions + using the legacy Linux portmapper daemon must say N here. + config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 4a551af..801db13 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -25,7 +25,7 @@ config BINFMT_ELF config COMPAT_BINFMT_ELF bool - depends on COMPAT && MMU + depends on COMPAT && BINFMT_ELF config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" @@ -59,10 +59,12 @@ config BINFMT_SHARED_FLAT help Support FLAT shared libraries +config HAVE_AOUT + def_bool n + config BINFMT_AOUT tristate "Kernel support for a.out and ECOFF binaries" - depends on ARCH_SUPPORTS_AOUT && \ - (X86_32 || ALPHA || ARM || M68K) + depends on HAVE_AOUT ---help--- A.out (Assembler.OUTput) is a set of formats for libraries and executables used in the earliest versions of UNIX. Linux used diff --git a/fs/Makefile b/fs/Makefile index de404b0..2168c90 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -7,8 +7,8 @@ obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ - ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ - attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ + ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ + attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ stack.o @@ -27,6 +27,8 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o nfsd-$(CONFIG_NFSD) := nfsctl.o @@ -69,7 +71,7 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 -obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev +obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4 obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_EXT2_FS) += ext2/ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 26f3b43..7f83a46 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -157,7 +157,7 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err}; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_ownmask, "ownmask=%o"}, diff --git a/fs/affs/super.c b/fs/affs/super.c index 3a89094..8989c93 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -135,7 +135,7 @@ enum { Opt_verbose, Opt_volume, Opt_ignore, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bs, "bs=%u"}, {Opt_mode, "mode=%o"}, {Opt_mufs, "mufs"}, diff --git a/fs/afs/file.c b/fs/afs/file.c index 525f7c5..a390176 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -50,8 +50,8 @@ const struct address_space_operations afs_fs_aops = { .launder_page = afs_launder_page, .releasepage = afs_releasepage, .invalidatepage = afs_invalidatepage, - .prepare_write = afs_prepare_write, - .commit_write = afs_commit_write, + .write_begin = afs_write_begin, + .write_end = afs_write_end, .writepage = afs_writepage, .writepages = afs_writepages, }; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 3cb6920..67f259d 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -728,8 +728,12 @@ extern int afs_volume_release_fileserver(struct afs_vnode *, */ extern int afs_set_page_dirty(struct page *); extern void afs_put_writeback(struct afs_writeback *); -extern int afs_prepare_write(struct file *, struct page *, unsigned, unsigned); -extern int afs_commit_write(struct file *, struct page *, unsigned, unsigned); +extern int afs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern int afs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); extern int afs_write_inode(struct inode *, int); diff --git a/fs/afs/super.c b/fs/afs/super.c index 250d8c4..aee239a 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -64,7 +64,7 @@ enum { afs_opt_vol, }; -static match_table_t afs_options_list = { +static const match_table_t afs_options_list = { { afs_opt_cell, "cell=%s" }, { afs_opt_rwpath, "rwpath" }, { afs_opt_vol, "vol=%s" }, diff --git a/fs/afs/write.c b/fs/afs/write.c index 065b4e1..d6b85da 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -84,15 +84,23 @@ void afs_put_writeback(struct afs_writeback *wb) * partly or wholly fill a page that's under preparation for writing */ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - unsigned start, unsigned len, struct page *page) + loff_t pos, unsigned len, struct page *page) { + loff_t i_size; + unsigned eof; int ret; - _enter(",,%u,%u", start, len); + _enter(",,%llu,%u", (unsigned long long)pos, len); - ASSERTCMP(start + len, <=, PAGE_SIZE); + ASSERTCMP(len, <=, PAGE_CACHE_SIZE); - ret = afs_vnode_fetch_data(vnode, key, start, len, page); + i_size = i_size_read(&vnode->vfs_inode); + if (pos + len > i_size) + eof = i_size; + else + eof = PAGE_CACHE_SIZE; + + ret = afs_vnode_fetch_data(vnode, key, 0, eof, page); if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -107,109 +115,55 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, } /* - * prepare a page for being written to - */ -static int afs_prepare_page(struct afs_vnode *vnode, struct page *page, - struct key *key, unsigned offset, unsigned to) -{ - unsigned eof, tail, start, stop, len; - loff_t i_size, pos; - void *p; - int ret; - - _enter(""); - - if (offset == 0 && to == PAGE_SIZE) - return 0; - - p = kmap_atomic(page, KM_USER0); - - i_size = i_size_read(&vnode->vfs_inode); - pos = (loff_t) page->index << PAGE_SHIFT; - if (pos >= i_size) { - /* partial write, page beyond EOF */ - _debug("beyond"); - if (offset > 0) - memset(p, 0, offset); - if (to < PAGE_SIZE) - memset(p + to, 0, PAGE_SIZE - to); - kunmap_atomic(p, KM_USER0); - return 0; - } - - if (i_size - pos >= PAGE_SIZE) { - /* partial write, page entirely before EOF */ - _debug("before"); - tail = eof = PAGE_SIZE; - } else { - /* partial write, page overlaps EOF */ - eof = i_size - pos; - _debug("overlap %u", eof); - tail = max(eof, to); - if (tail < PAGE_SIZE) - memset(p + tail, 0, PAGE_SIZE - tail); - if (offset > eof) - memset(p + eof, 0, PAGE_SIZE - eof); - } - - kunmap_atomic(p, KM_USER0); - - ret = 0; - if (offset > 0 || eof > to) { - /* need to fill one or two bits that aren't going to be written - * (cover both fillers in one read if there are two) */ - start = (offset > 0) ? 0 : to; - stop = (eof > to) ? eof : offset; - len = stop - start; - _debug("wr=%u-%u av=0-%u rd=%u@%u", - offset, to, eof, start, len); - ret = afs_fill_page(vnode, key, start, len, page); - } - - _leave(" = %d", ret); - return ret; -} - -/* * prepare to perform part of a write to a page - * - the caller holds the page locked, preventing it from being written out or - * modified by anyone else */ -int afs_prepare_write(struct file *file, struct page *page, - unsigned offset, unsigned to) +int afs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { struct afs_writeback *candidate, *wb; struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + struct page *page; struct key *key = file->private_data; - pgoff_t index; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; int ret; _enter("{%x:%u},{%lx},%u,%u", - vnode->fid.vid, vnode->fid.vnode, page->index, offset, to); + vnode->fid.vid, vnode->fid.vnode, index, from, to); candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); if (!candidate) return -ENOMEM; candidate->vnode = vnode; - candidate->first = candidate->last = page->index; - candidate->offset_first = offset; + candidate->first = candidate->last = index; + candidate->offset_first = from; candidate->to_last = to; candidate->usage = 1; candidate->state = AFS_WBACK_PENDING; init_waitqueue_head(&candidate->waitq); + page = __grab_cache_page(mapping, index); + if (!page) { + kfree(candidate); + return -ENOMEM; + } + *pagep = page; + /* page won't leak in error case: it eventually gets cleaned off LRU */ + if (!PageUptodate(page)) { _debug("not up to date"); - ret = afs_prepare_page(vnode, page, key, offset, to); + ret = afs_fill_page(vnode, key, pos, len, page); if (ret < 0) { kfree(candidate); _leave(" = %d [prep]", ret); return ret; } + SetPageUptodate(page); } try_again: - index = page->index; spin_lock(&vnode->writeback_lock); /* see if this page is already pending a writeback under a suitable key @@ -242,8 +196,8 @@ try_again: subsume_in_current_wb: _debug("subsume"); ASSERTRANGE(wb->first, <=, index, <=, wb->last); - if (index == wb->first && offset < wb->offset_first) - wb->offset_first = offset; + if (index == wb->first && from < wb->offset_first) + wb->offset_first = from; if (index == wb->last && to > wb->to_last) wb->to_last = to; spin_unlock(&vnode->writeback_lock); @@ -289,17 +243,17 @@ flush_conflicting_wb: /* * finalise part of a write to a page */ -int afs_commit_write(struct file *file, struct page *page, - unsigned offset, unsigned to) +int afs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); loff_t i_size, maybe_i_size; - _enter("{%x:%u},{%lx},%u,%u", - vnode->fid.vid, vnode->fid.vnode, page->index, offset, to); + _enter("{%x:%u},{%lx}", + vnode->fid.vid, vnode->fid.vnode, page->index); - maybe_i_size = (loff_t) page->index << PAGE_SHIFT; - maybe_i_size += to; + maybe_i_size = pos + copied; i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) { @@ -310,12 +264,13 @@ int afs_commit_write(struct file *file, struct page *page, spin_unlock(&vnode->writeback_lock); } - SetPageUptodate(page); set_page_dirty(page); if (PageDirty(page)) _debug("dirtied"); + unlock_page(page); + page_cache_release(page); - return 0; + return copied; } /* diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index dda510d..b70eea1 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -59,7 +59,7 @@ static const struct super_operations autofs_sops = { enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto}; -static match_table_t autofs_tokens = { +static const match_table_t autofs_tokens = { {Opt_fd, "fd=%u"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, diff --git a/fs/autofs4/Makefile b/fs/autofs4/Makefile index f2c3b79..a811c1f 100644 --- a/fs/autofs4/Makefile +++ b/fs/autofs4/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_AUTOFS4_FS) += autofs4.o -autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o +autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 69a2f5c..e0f16da 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -14,6 +14,7 @@ /* Internal header file for autofs */ #include <linux/auto_fs4.h> +#include <linux/auto_dev-ioctl.h> #include <linux/mutex.h> #include <linux/list.h> @@ -21,6 +22,11 @@ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY #define AUTOFS_IOC_COUNT 32 +#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) +#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) + +#define AUTOFS_TYPE_TRIGGER (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET) + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/time.h> @@ -35,11 +41,27 @@ /* #define DEBUG */ #ifdef DEBUG -#define DPRINTK(fmt,args...) do { printk(KERN_DEBUG "pid %d: %s: " fmt "\n" , current->pid , __func__ , ##args); } while(0) +#define DPRINTK(fmt, args...) \ +do { \ + printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) #else -#define DPRINTK(fmt,args...) do {} while(0) +#define DPRINTK(fmt, args...) do {} while (0) #endif +#define AUTOFS_WARN(fmt, args...) \ +do { \ + printk(KERN_WARNING "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) + +#define AUTOFS_ERROR(fmt, args...) \ +do { \ + printk(KERN_ERR "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) + /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. It holds a reference to the dentry, so dentries are never @@ -61,6 +83,9 @@ struct autofs_info { unsigned long last_used; atomic_t count; + uid_t uid; + gid_t gid; + mode_t mode; size_t size; @@ -92,10 +117,6 @@ struct autofs_wait_queue { #define AUTOFS_SBI_MAGIC 0x6d4a556d -#define AUTOFS_TYPE_INDIRECT 0x0001 -#define AUTOFS_TYPE_DIRECT 0x0002 -#define AUTOFS_TYPE_OFFSET 0x0004 - struct autofs_sb_info { u32 magic; int pipefd; @@ -169,6 +190,17 @@ int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_packet_expire __user *); int autofs4_expire_multi(struct super_block *, struct vfsmount *, struct autofs_sb_info *, int __user *); +struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); +struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); + +/* Device node initialization */ + +int autofs_dev_ioctl_init(void); +void autofs_dev_ioctl_exit(void); /* Operations structures */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c new file mode 100644 index 0000000..625abf5 --- /dev/null +++ b/fs/autofs4/dev-ioctl.c @@ -0,0 +1,863 @@ +/* + * Copyright 2008 Red Hat, Inc. All rights reserved. + * Copyright 2008 Ian Kent <raven@themaw.net> + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/init.h> +#include <linux/wait.h> +#include <linux/namei.h> +#include <linux/fcntl.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/sched.h> +#include <linux/compat.h> +#include <linux/syscalls.h> +#include <linux/smp_lock.h> +#include <linux/magic.h> +#include <linux/dcache.h> +#include <linux/uaccess.h> + +#include "autofs_i.h" + +/* + * This module implements an interface for routing autofs ioctl control + * commands via a miscellaneous device file. + * + * The alternate interface is needed because we need to be able open + * an ioctl file descriptor on an autofs mount that may be covered by + * another mount. This situation arises when starting automount(8) + * or other user space daemon which uses direct mounts or offset + * mounts (used for autofs lazy mount/umount of nested mount trees), + * which have been left busy at at service shutdown. + */ + +#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) + +typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, + struct autofs_dev_ioctl *); + +static int check_name(const char *name) +{ + if (!strchr(name, '/')) + return -EINVAL; + return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from user land. + */ +static int invalid_str(char *str, void *end) +{ + while ((void *) str <= end) + if (!*str++) + return 0; + return -EINVAL; +} + +/* + * Check that the user compiled against correct version of autofs + * misc device code. + * + * As well as checking the version compatibility this always copies + * the kernel interface version out. + */ +static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) +{ + int err = 0; + + if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) || + (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) { + AUTOFS_WARN("ioctl control interface version mismatch: " + "kernel(%u.%u), user(%u.%u), cmd(%d)", + AUTOFS_DEV_IOCTL_VERSION_MAJOR, + AUTOFS_DEV_IOCTL_VERSION_MINOR, + param->ver_major, param->ver_minor, cmd); + err = -EINVAL; + } + + /* Fill in the kernel version. */ + param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + + return err; +} + +/* + * Copy parameter control struct, including a possible path allocated + * at the end of the struct. + */ +static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) +{ + struct autofs_dev_ioctl tmp, *ads; + + if (copy_from_user(&tmp, in, sizeof(tmp))) + return ERR_PTR(-EFAULT); + + if (tmp.size < sizeof(tmp)) + return ERR_PTR(-EINVAL); + + ads = kmalloc(tmp.size, GFP_KERNEL); + if (!ads) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(ads, in, tmp.size)) { + kfree(ads); + return ERR_PTR(-EFAULT); + } + + return ads; +} + +static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) +{ + kfree(param); + return; +} + +/* + * Check sanity of parameter control fields and if a path is present + * check that it has a "/" and is terminated. + */ +static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) +{ + int err = -EINVAL; + + if (check_dev_ioctl_version(cmd, param)) { + AUTOFS_WARN("invalid device control module version " + "supplied for cmd(0x%08x)", cmd); + goto out; + } + + if (param->size > sizeof(*param)) { + err = check_name(param->path); + if (err) { + AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", + cmd); + goto out; + } + + err = invalid_str(param->path, + (void *) ((size_t) param + param->size)); + if (err) { + AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", + cmd); + goto out; + } + } + + err = 0; +out: + return err; +} + +/* + * Get the autofs super block info struct from the file opened on + * the autofs mount point. + */ +static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) +{ + struct autofs_sb_info *sbi = NULL; + struct inode *inode; + + if (f) { + inode = f->f_path.dentry->d_inode; + sbi = autofs4_sbi(inode->i_sb); + } + return sbi; +} + +/* Return autofs module protocol version */ +static int autofs_dev_ioctl_protover(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->arg1 = sbi->version; + return 0; +} + +/* Return autofs module protocol sub version */ +static int autofs_dev_ioctl_protosubver(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->arg1 = sbi->sub_version; + return 0; +} + +/* + * Walk down the mount stack looking for an autofs mount that + * has the requested device number (aka. new_encode_dev(sb->s_dev). + */ +static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno) +{ + struct dentry *dentry; + struct inode *inode; + struct super_block *sb; + dev_t s_dev; + unsigned int err; + + err = -ENOENT; + + /* Lookup the dentry name at the base of our mount point */ + dentry = d_lookup(nd->path.dentry, &nd->last); + if (!dentry) + goto out; + + dput(nd->path.dentry); + nd->path.dentry = dentry; + + /* And follow the mount stack looking for our autofs mount */ + while (follow_down(&nd->path.mnt, &nd->path.dentry)) { + inode = nd->path.dentry->d_inode; + if (!inode) + break; + + sb = inode->i_sb; + s_dev = new_encode_dev(sb->s_dev); + if (devno == s_dev) { + if (sb->s_magic == AUTOFS_SUPER_MAGIC) { + err = 0; + break; + } + } + } +out: + return err; +} + +/* + * Walk down the mount stack looking for an autofs mount that + * has the requested mount type (ie. indirect, direct or offset). + */ +static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type) +{ + struct dentry *dentry; + struct autofs_info *ino; + unsigned int err; + + err = -ENOENT; + + /* Lookup the dentry name at the base of our mount point */ + dentry = d_lookup(nd->path.dentry, &nd->last); + if (!dentry) + goto out; + + dput(nd->path.dentry); + nd->path.dentry = dentry; + + /* And follow the mount stack looking for our autofs mount */ + while (follow_down(&nd->path.mnt, &nd->path.dentry)) { + ino = autofs4_dentry_ino(nd->path.dentry); + if (ino && ino->sbi->type & type) { + err = 0; + break; + } + } +out: + return err; +} + +static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + BUG_ON(fdt->fd[fd] != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + FD_SET(fd, fdt->close_on_exec); + spin_unlock(&files->file_lock); +} + + +/* + * Open a file descriptor on the autofs mount point corresponding + * to the given path and device number (aka. new_encode_dev(sb->s_dev)). + */ +static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid) +{ + struct file *filp; + struct nameidata nd; + int err, fd; + + fd = get_unused_fd(); + if (likely(fd >= 0)) { + /* Get nameidata of the parent directory */ + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + /* + * Search down, within the parent, looking for an + * autofs super block that has the device number + * corresponding to the autofs fs we want to open. + */ + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) { + path_put(&nd.path); + goto out; + } + + filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto out; + } + + autofs_dev_ioctl_fd_install(fd, filp); + } + + return fd; + +out: + put_unused_fd(fd); + return err; +} + +/* Open a file descriptor on an autofs mount point */ +static int autofs_dev_ioctl_openmount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + const char *path; + dev_t devid; + int err, fd; + + /* param->path has already been checked */ + if (!param->arg1) + return -EINVAL; + + param->ioctlfd = -1; + + path = param->path; + devid = param->arg1; + + err = 0; + fd = autofs_dev_ioctl_open_mountpoint(path, devid); + if (unlikely(fd < 0)) { + err = fd; + goto out; + } + + param->ioctlfd = fd; +out: + return err; +} + +/* Close file descriptor allocated above (user can also use close(2)). */ +static int autofs_dev_ioctl_closemount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + return sys_close(param->ioctlfd); +} + +/* + * Send "ready" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_ready(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + + token = (autofs_wqt_t) param->arg1; + return autofs4_wait_release(sbi, token, 0); +} + +/* + * Send "fail" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_fail(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + int status; + + token = (autofs_wqt_t) param->arg1; + status = param->arg2 ? param->arg2 : -ENOENT; + return autofs4_wait_release(sbi, token, status); +} + +/* + * Set the pipe fd for kernel communication to the daemon. + * + * Normally this is set at mount using an option but if we + * are reconnecting to a busy mount then we need to use this + * to tell the autofs mount about the new kernel pipe fd. In + * order to protect mounts against incorrectly setting the + * pipefd we also require that the autofs mount be catatonic. + * + * This also sets the process group id used to identify the + * controlling process (eg. the owning automount(8) daemon). + */ +static int autofs_dev_ioctl_setpipefd(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + int pipefd; + int err = 0; + + if (param->arg1 == -1) + return -EINVAL; + + pipefd = param->arg1; + + mutex_lock(&sbi->wq_mutex); + if (!sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return -EBUSY; + } else { + struct file *pipe = fget(pipefd); + if (!pipe->f_op || !pipe->f_op->write) { + err = -EPIPE; + fput(pipe); + goto out; + } + sbi->oz_pgrp = task_pgrp_nr(current); + sbi->pipefd = pipefd; + sbi->pipe = pipe; + sbi->catatonic = 0; + } +out: + mutex_unlock(&sbi->wq_mutex); + return err; +} + +/* + * Make the autofs mount point catatonic, no longer responsive to + * mount requests. Also closes the kernel pipe file descriptor. + */ +static int autofs_dev_ioctl_catatonic(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs4_catatonic_mode(sbi); + return 0; +} + +/* Set the autofs mount timeout */ +static int autofs_dev_ioctl_timeout(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + unsigned long timeout; + + timeout = param->arg1; + param->arg1 = sbi->exp_timeout / HZ; + sbi->exp_timeout = timeout * HZ; + return 0; +} + +/* + * Return the uid and gid of the last request for the mount + * + * When reconstructing an autofs mount tree with active mounts + * we need to re-connect to mounts that may have used the original + * process uid and gid (or string variations of them) for mount + * lookups within the map entry. + */ +static int autofs_dev_ioctl_requester(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct autofs_info *ino; + struct nameidata nd; + const char *path; + dev_t devid; + int err = -ENOENT; + + if (param->size <= sizeof(*param)) { + err = -EINVAL; + goto out; + } + + path = param->path; + devid = sbi->sb->s_dev; + + param->arg1 = param->arg2 = -1; + + /* Get nameidata of the parent directory */ + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) + goto out_release; + + ino = autofs4_dentry_ino(nd.path.dentry); + if (ino) { + err = 0; + autofs4_expire_wait(nd.path.dentry); + spin_lock(&sbi->fs_lock); + param->arg1 = ino->uid; + param->arg2 = ino->gid; + spin_unlock(&sbi->fs_lock); + } + +out_release: + path_put(&nd.path); +out: + return err; +} + +/* + * Call repeatedly until it returns -EAGAIN, meaning there's nothing + * more that can be done. + */ +static int autofs_dev_ioctl_expire(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct dentry *dentry; + struct vfsmount *mnt; + int err = -EAGAIN; + int how; + + how = param->arg1; + mnt = fp->f_path.mnt; + + if (sbi->type & AUTOFS_TYPE_TRIGGER) + dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how); + else + dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how); + + if (dentry) { + struct autofs_info *ino = autofs4_dentry_ino(dentry); + + /* + * This is synchronous because it makes the daemon a + * little easier + */ + err = autofs4_wait(sbi, dentry, NFY_EXPIRE); + + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_MOUNTPOINT) { + ino->flags &= ~AUTOFS_INF_MOUNTPOINT; + sbi->sb->s_root->d_mounted++; + } + ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + dput(dentry); + } + + return err; +} + +/* Check if autofs mount point is in use */ +static int autofs_dev_ioctl_askumount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->arg1 = 0; + if (may_umount(fp->f_path.mnt)) + param->arg1 = 1; + return 0; +} + +/* + * Check if the given path is a mountpoint. + * + * If we are supplied with the file descriptor of an autofs + * mount we're looking for a specific mount. In this case + * the path is considered a mountpoint if it is itself a + * mountpoint or contains a mount, such as a multi-mount + * without a root mount. In this case we return 1 if the + * path is a mount point and the super magic of the covering + * mount if there is one or 0 if it isn't a mountpoint. + * + * If we aren't supplied with a file descriptor then we + * lookup the nameidata of the path and check if it is the + * root of a mount. If a type is given we are looking for + * a particular autofs mount and if we don't find a match + * we return fail. If the located nameidata path is the + * root of a mount we return 1 along with the super magic + * of the mount or 0 otherwise. + * + * In both cases the the device number (as returned by + * new_encode_dev()) is also returned. + */ +static int autofs_dev_ioctl_ismountpoint(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct nameidata nd; + const char *path; + unsigned int type; + int err = -ENOENT; + + if (param->size <= sizeof(*param)) { + err = -EINVAL; + goto out; + } + + path = param->path; + type = param->arg1; + + param->arg1 = 0; + param->arg2 = 0; + + if (!fp || param->ioctlfd == -1) { + if (type == AUTOFS_TYPE_ANY) { + struct super_block *sb; + + err = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (err) + goto out; + + sb = nd.path.dentry->d_sb; + param->arg1 = new_encode_dev(sb->s_dev); + } else { + struct autofs_info *ino; + + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + err = autofs_dev_ioctl_find_sbi_type(&nd, type); + if (err) + goto out_release; + + ino = autofs4_dentry_ino(nd.path.dentry); + param->arg1 = autofs4_get_dev(ino->sbi); + } + + err = 0; + if (nd.path.dentry->d_inode && + nd.path.mnt->mnt_root == nd.path.dentry) { + err = 1; + param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic; + } + } else { + dev_t devid = new_encode_dev(sbi->sb->s_dev); + + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) + goto out_release; + + param->arg1 = autofs4_get_dev(sbi); + + err = have_submounts(nd.path.dentry); + + if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { + if (follow_down(&nd.path.mnt, &nd.path.dentry)) { + struct inode *inode = nd.path.dentry->d_inode; + param->arg2 = inode->i_sb->s_magic; + } + } + } + +out_release: + path_put(&nd.path); +out: + return err; +} + +/* + * Our range of ioctl numbers isn't 0 based so we need to shift + * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table + * lookup. + */ +#define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST)) + +static ioctl_fn lookup_dev_ioctl(unsigned int cmd) +{ + static struct { + int cmd; + ioctl_fn fn; + } _ioctls[] = { + {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL}, + {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD), + autofs_dev_ioctl_protover}, + {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD), + autofs_dev_ioctl_protosubver}, + {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD), + autofs_dev_ioctl_openmount}, + {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD), + autofs_dev_ioctl_closemount}, + {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD), + autofs_dev_ioctl_ready}, + {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD), + autofs_dev_ioctl_fail}, + {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD), + autofs_dev_ioctl_setpipefd}, + {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD), + autofs_dev_ioctl_catatonic}, + {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD), + autofs_dev_ioctl_timeout}, + {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD), + autofs_dev_ioctl_requester}, + {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD), + autofs_dev_ioctl_expire}, + {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD), + autofs_dev_ioctl_askumount}, + {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD), + autofs_dev_ioctl_ismountpoint} + }; + unsigned int idx = cmd_idx(cmd); + + return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn; +} + +/* ioctl dispatcher */ +static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) +{ + struct autofs_dev_ioctl *param; + struct file *fp; + struct autofs_sb_info *sbi; + unsigned int cmd_first, cmd; + ioctl_fn fn = NULL; + int err = 0; + + /* only root can play with this */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); + cmd = _IOC_NR(command); + + if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || + cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) { + return -ENOTTY; + } + + /* Copy the parameters into kernel space. */ + param = copy_dev_ioctl(user); + if (IS_ERR(param)) + return PTR_ERR(param); + + err = validate_dev_ioctl(command, param); + if (err) + goto out; + + /* The validate routine above always sets the version */ + if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD) + goto done; + + fn = lookup_dev_ioctl(cmd); + if (!fn) { + AUTOFS_WARN("unknown command 0x%08x", command); + return -ENOTTY; + } + + fp = NULL; + sbi = NULL; + + /* + * For obvious reasons the openmount can't have a file + * descriptor yet. We don't take a reference to the + * file during close to allow for immediate release. + */ + if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && + cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { + fp = fget(param->ioctlfd); + if (!fp) { + if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) + goto cont; + err = -EBADF; + goto out; + } + + if (!fp->f_op) { + err = -ENOTTY; + fput(fp); + goto out; + } + + sbi = autofs_dev_ioctl_sbi(fp); + if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) { + err = -EINVAL; + fput(fp); + goto out; + } + + /* + * Admin needs to be able to set the mount catatonic in + * order to be able to perform the re-open. + */ + if (!autofs4_oz_mode(sbi) && + cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { + err = -EACCES; + fput(fp); + goto out; + } + } +cont: + err = fn(fp, sbi, param); + + if (fp) + fput(fp); +done: + if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) + err = -EFAULT; +out: + free_dev_ioctl(param); + return err; +} + +static long autofs_dev_ioctl(struct file *file, uint command, ulong u) +{ + int err; + err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); + return (long) err; +} + +#ifdef CONFIG_COMPAT +static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u) +{ + return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u)); +} +#else +#define autofs_dev_ioctl_compat NULL +#endif + +static const struct file_operations _dev_ioctl_fops = { + .unlocked_ioctl = autofs_dev_ioctl, + .compat_ioctl = autofs_dev_ioctl_compat, + .owner = THIS_MODULE, +}; + +static struct miscdevice _autofs_dev_ioctl_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = AUTOFS_DEVICE_NAME, + .fops = &_dev_ioctl_fops +}; + +/* Register/deregister misc character device */ +int autofs_dev_ioctl_init(void) +{ + int r; + + r = misc_register(&_autofs_dev_ioctl_misc); + if (r) { + AUTOFS_ERROR("misc_register failed for control device"); + return r; + } + + return 0; +} + +void autofs_dev_ioctl_exit(void) +{ + misc_deregister(&_autofs_dev_ioctl_misc); + return; +} + diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index cdabb79..cde2f8e 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -244,10 +244,10 @@ cont: } /* Check if we can expire a direct mount (possibly a tree) */ -static struct dentry *autofs4_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = dget(sb->s_root); @@ -283,10 +283,10 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb, * - it is unused by any user process * - it has been unused for exp_timeout time */ -static struct dentry *autofs4_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = sb->s_root; @@ -479,7 +479,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (arg && get_user(do_now, arg)) return -EFAULT; - if (sbi->type & AUTOFS_TYPE_DIRECT) + if (sbi->type & AUTOFS_TYPE_TRIGGER) dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); else dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index 723a1c5e..9722e4b 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -29,11 +29,20 @@ static struct file_system_type autofs_fs_type = { static int __init init_autofs4_fs(void) { - return register_filesystem(&autofs_fs_type); + int err; + + err = register_filesystem(&autofs_fs_type); + if (err) + return err; + + autofs_dev_ioctl_init(); + + return err; } static void __exit exit_autofs4_fs(void) { + autofs_dev_ioctl_exit(); unregister_filesystem(&autofs_fs_type); } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 7bb3e5b..c7e65bb 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -53,6 +53,8 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino, atomic_set(&ino->count, 0); } + ino->uid = 0; + ino->gid = 0; ino->mode = mode; ino->last_used = jiffies; @@ -213,7 +215,7 @@ static const struct super_operations autofs4_sops = { enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, Opt_indirect, Opt_direct, Opt_offset}; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_fd, "fd=%u"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, @@ -288,7 +290,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, *type = AUTOFS_TYPE_DIRECT; break; case Opt_offset: - *type = AUTOFS_TYPE_DIRECT | AUTOFS_TYPE_OFFSET; + *type = AUTOFS_TYPE_OFFSET; break; default: return 1; @@ -336,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; - sbi->type = 0; + sbi->type = AUTOFS_TYPE_INDIRECT; sbi->min_proto = 0; sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); @@ -378,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) } root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = sbi->type & AUTOFS_TYPE_DIRECT ? + root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ? &autofs4_direct_root_inode_operations : &autofs4_indirect_root_inode_operations; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 35216d1..4b67c2a 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, * is very similar for indirect mounts except only dentrys * in the root of the autofs file system may be negative. */ - if (sbi->type & (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)) + if (sbi->type & AUTOFS_TYPE_TRIGGER) return -ENOENT; else if (!IS_ROOT(dentry->d_parent)) return -ENOENT; @@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, return -ENOMEM; /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT)) + if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER) qstr.len = sprintf(name, "%p", dentry); else { qstr.len = autofs4_getpath(sbi, dentry, &name); @@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, type = autofs_ptype_expire_multi; } else { if (notify == NFY_MOUNT) - type = (sbi->type & AUTOFS_TYPE_DIRECT) ? + type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? autofs_ptype_missing_direct : autofs_ptype_missing_indirect; else - type = (sbi->type & AUTOFS_TYPE_DIRECT) ? + type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? autofs_ptype_expire_direct : autofs_ptype_expire_indirect; } @@ -457,6 +457,40 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, status = wq->status; + /* + * For direct and offset mounts we need to track the requester's + * uid and gid in the dentry info struct. This is so it can be + * supplied, on request, by the misc device ioctl interface. + * This is needed during daemon resatart when reconnecting + * to existing, active, autofs mounts. The uid and gid (and + * related string values) may be used for macro substitution + * in autofs mount maps. + */ + if (!status) { + struct autofs_info *ino; + struct dentry *de = NULL; + + /* direct mount or browsable map */ + ino = autofs4_dentry_ino(dentry); + if (!ino) { + /* If not lookup actual dentry used */ + de = d_lookup(dentry->d_parent, &dentry->d_name); + if (de) + ino = autofs4_dentry_ino(de); + } + + /* Set mount requester */ + if (ino) { + spin_lock(&sbi->fs_lock); + ino->uid = wq->uid; + ino->gid = wq->gid; + spin_unlock(&sbi->fs_lock); + } + + if (de) + dput(de); + } + /* Are we the last process to need status? */ mutex_lock(&sbi->wq_mutex); if (!--wq->wait_ctr) diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h index e2595c2..7893eaa 100644 --- a/fs/befs/befs_fs_types.h +++ b/fs/befs/befs_fs_types.h @@ -55,8 +55,12 @@ enum super_flags { }; #define BEFS_BYTEORDER_NATIVE 0x42494745 +#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE) +#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE) #define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1 +#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1) +#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1) /* * Flags of inode diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 740f536..b6dfee3 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -650,7 +650,7 @@ enum { Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, }; -static match_table_t befs_tokens = { +static const match_table_t befs_tokens = { {Opt_uid, "uid=%d"}, {Opt_gid, "gid=%d"}, {Opt_charset, "iocharset=%s"}, @@ -809,8 +809,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) /* account for offset of super block on x86 */ disk_sb = (befs_super_block *) bh->b_data; - if ((le32_to_cpu(disk_sb->magic1) == BEFS_SUPER_MAGIC1) || - (be32_to_cpu(disk_sb->magic1) == BEFS_SUPER_MAGIC1)) { + if ((disk_sb->magic1 == BEFS_SUPER_MAGIC1_LE) || + (disk_sb->magic1 == BEFS_SUPER_MAGIC1_BE)) { befs_debug(sb, "Using PPC superblock location"); } else { befs_debug(sb, "Using x86 superblock location"); diff --git a/fs/befs/super.c b/fs/befs/super.c index 8c3401f..41f2b4d 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -26,10 +26,10 @@ befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) befs_sb_info *befs_sb = BEFS_SB(sb); /* Check the byte order of the filesystem */ - if (le32_to_cpu(disk_sb->fs_byte_order) == BEFS_BYTEORDER_NATIVE) + if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE) befs_sb->byte_order = BEFS_BYTESEX_LE; - else if (be32_to_cpu(disk_sb->fs_byte_order) == BEFS_BYTEORDER_NATIVE) - befs_sb->byte_order = BEFS_BYTESEX_BE; + else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE) + befs_sb->byte_order = BEFS_BYTESEX_BE; befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1); befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 655ed8d..c76afa2 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -683,7 +683,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * switch really is going to happen - do this in * flush_thread(). - akpm */ - SET_PERSONALITY(loc->elf_ex, 0); + SET_PERSONALITY(loc->elf_ex); interpreter = open_exec(elf_interpreter); retval = PTR_ERR(interpreter); @@ -734,7 +734,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto out_free_dentry; } else { /* Executables without an interpreter also need a personality */ - SET_PERSONALITY(loc->elf_ex, 0); + SET_PERSONALITY(loc->elf_ex); } /* Flush all traces of the currently running executable */ @@ -748,7 +748,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ - SET_PERSONALITY(loc->elf_ex, 0); + SET_PERSONALITY(loc->elf_ex); if (elf_read_implies_exec(loc->elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 80c1f95..0e8367c 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -25,6 +25,7 @@ #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/security.h> #include <linux/highmem.h> #include <linux/highuid.h> #include <linux/personality.h> @@ -455,8 +456,19 @@ error_kill: } /*****************************************************************************/ + +#ifndef ELF_BASE_PLATFORM /* - * present useful information to the program + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. + * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. + */ +#define ELF_BASE_PLATFORM NULL +#endif + +/* + * present useful information to the program by shovelling it onto the new + * process's stack */ static int create_elf_fdpic_tables(struct linux_binprm *bprm, struct mm_struct *mm, @@ -466,15 +478,19 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, unsigned long sp, csp, nitems; elf_caddr_t __user *argv, *envp; size_t platform_len = 0, len; - char *k_platform; - char __user *u_platform, *p; + char *k_platform, *k_base_platform; + char __user *u_platform, *u_base_platform, *p; long hwcap; int loop; int nr; /* reset for each csp adjustment */ - /* we're going to shovel a whole load of stuff onto the stack */ #ifdef CONFIG_MMU - sp = bprm->p; + /* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions + * by the processes running on the same package. One thing we can do is + * to shuffle the initial stack for them, so we give the architecture + * an opportunity to do so here. + */ + sp = arch_align_stack(bprm->p); #else sp = mm->start_stack; @@ -483,11 +499,14 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, return -EFAULT; #endif - /* get hold of platform and hardware capabilities masks for the machine - * we are running on. In some cases (Sparc), this info is impossible - * to get, in others (i386) it is merely difficult. - */ hwcap = ELF_HWCAP; + + /* + * If this architecture has a platform capability string, copy it + * to userspace. In some cases (Sparc), this info is impossible + * for userspace to get any other way, in others (i386) it is + * merely difficult. + */ k_platform = ELF_PLATFORM; u_platform = NULL; @@ -499,19 +518,20 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, return -EFAULT; } -#if defined(__i386__) && defined(CONFIG_SMP) - /* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions - * by the processes running on the same package. One thing we can do is - * to shuffle the initial stack for them. - * - * the conditionals here are unneeded, but kept in to make the code - * behaviour the same as pre change unless we have hyperthreaded - * processors. This keeps Mr Marcelo Person happier but should be - * removed for 2.5 + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. */ - if (smp_num_siblings > 1) - sp = sp - ((current->pid % 64) << 7); -#endif + k_base_platform = ELF_BASE_PLATFORM; + u_base_platform = NULL; + + if (k_base_platform) { + platform_len = strlen(k_base_platform) + 1; + sp -= platform_len; + u_base_platform = (char __user *) sp; + if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0) + return -EFAULT; + } sp &= ~7UL; @@ -541,9 +561,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, } /* force 16 byte _final_ alignment here for generality */ -#define DLINFO_ITEMS 13 +#define DLINFO_ITEMS 15 + + nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + + (k_base_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; - nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) + nitems++; csp = sp; sp -= nitems * 2 * sizeof(unsigned long); @@ -575,6 +599,19 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, (elf_addr_t) (unsigned long) u_platform); } + if (k_base_platform) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t) (unsigned long) u_base_platform); + } + + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); + } + nr = 0; csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); NEW_AUX_ENT(AT_HWCAP, hwcap); @@ -590,6 +627,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->euid); NEW_AUX_ENT(AT_GID, (elf_addr_t) current->gid); NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->egid); + NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_EXECFN, bprm->exec); #ifdef ARCH_DLINFO nr = 0; diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index f9c88d0..32fb00b 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -43,7 +43,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) return -ENOEXEC; } - bprm->sh_bang = 1; /* Well, the bang-shell is implicit... */ + bprm->recursion_depth++; /* Well, the bang-shell is implicit... */ allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index dfc0197..ccb781a 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -229,13 +229,13 @@ static int decompress_exec( ret = 10; if (buf[3] & EXTRA_FIELD) { ret += 2 + buf[10] + (buf[11] << 8); - if (unlikely(LBUFSIZE == ret)) { + if (unlikely(LBUFSIZE <= ret)) { DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n"); goto out_free_buf; } } if (buf[3] & ORIG_NAME) { - for (; ret < LBUFSIZE && (buf[ret] != 0); ret++) + while (ret < LBUFSIZE && buf[ret++] != 0) ; if (unlikely(LBUFSIZE == ret)) { DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n"); @@ -243,7 +243,7 @@ static int decompress_exec( } } if (buf[3] & COMMENT) { - for (; ret < LBUFSIZE && (buf[ret] != 0); ret++) + while (ret < LBUFSIZE && buf[ret++] != 0) ; if (unlikely(LBUFSIZE == ret)) { DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n"); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 8d7e88e..f2744ab 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -117,7 +117,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto _ret; retval = -ENOEXEC; - if (bprm->misc_bang) + if (bprm->recursion_depth > BINPRM_MAX_RECURSION) goto _ret; /* to keep locking time low, we copy the interpreter string */ @@ -197,7 +197,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval < 0) goto _error; - bprm->misc_bang = 1; + bprm->recursion_depth++; retval = search_binary_handler (bprm, regs); if (retval < 0) diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 9e3963f..0834350 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -22,14 +22,15 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) char interp[BINPRM_BUF_SIZE]; int retval; - if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || (bprm->sh_bang)) + if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || + (bprm->recursion_depth > BINPRM_MAX_RECURSION)) return -ENOEXEC; /* * This section does the #! interpretation. * Sorta complicated, but hopefully it will work. -TYT */ - bprm->sh_bang = 1; + bprm->recursion_depth++; allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 68be580..74e587a 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -306,3 +306,5 @@ static void __exit exit_som_binfmt(void) core_initcall(init_som_binfmt); module_exit(exit_som_binfmt); + +MODULE_LICENSE("GPL"); diff --git a/fs/block_dev.c b/fs/block_dev.c index d84f0469..218408e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1262,7 +1262,7 @@ EXPORT_SYMBOL(ioctl_by_bdev); /** * lookup_bdev - lookup a struct block_device by name - * @pathname: special file representing the block device + * @path: special file representing the block device * * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) diff --git a/fs/char_dev.c b/fs/char_dev.c index 3cb7cda..262fa10 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -22,9 +22,6 @@ #include <linux/mutex.h> #include <linux/backing-dev.h> -#ifdef CONFIG_KMOD -#include <linux/kmod.h> -#endif #include "internal.h" /* diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 0d9b80e..cfd29da 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -362,9 +362,8 @@ static int init_coda_psdev(void) goto out_chrdev; } for (i = 0; i < MAX_CODADEVS; i++) - device_create_drvdata(coda_psdev_class, NULL, - MKDEV(CODA_PSDEV_MAJOR, i), - NULL, "cfs%d", i); + device_create(coda_psdev_class, NULL, + MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i); coda_sysctl_init(); goto out; diff --git a/fs/compat.c b/fs/compat.c index 075d050..5f9ec44 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -137,6 +137,45 @@ asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval _ return compat_sys_futimesat(AT_FDCWD, filename, t); } +static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) +{ + compat_ino_t ino = stat->ino; + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + int err; + + SET_UID(uid, stat->uid); + SET_GID(gid, stat->gid); + + if ((u64) stat->size > MAX_NON_LFS || + !old_valid_dev(stat->dev) || + !old_valid_dev(stat->rdev)) + return -EOVERFLOW; + if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) + return -EOVERFLOW; + + if (clear_user(ubuf, sizeof(*ubuf))) + return -EFAULT; + + err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev); + err |= __put_user(ino, &ubuf->st_ino); + err |= __put_user(stat->mode, &ubuf->st_mode); + err |= __put_user(stat->nlink, &ubuf->st_nlink); + err |= __put_user(uid, &ubuf->st_uid); + err |= __put_user(gid, &ubuf->st_gid); + err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev); + err |= __put_user(stat->size, &ubuf->st_size); + err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime); + err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec); + err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime); + err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec); + err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime); + err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec); + err |= __put_user(stat->blksize, &ubuf->st_blksize); + err |= __put_user(stat->blocks, &ubuf->st_blocks); + return err; +} + asmlinkage long compat_sys_newstat(char __user * filename, struct compat_stat __user *statbuf) { @@ -1239,7 +1278,7 @@ static int compat_count(compat_uptr_t __user *argv, int max) if (!p) break; argv++; - if(++i > max) + if (i++ >= max) return -E2BIG; } } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 08e28c9..3dbe216 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -26,8 +26,7 @@ #include <linux/debugfs.h> #include <linux/fsnotify.h> #include <linux/string.h> - -#define DEBUGFS_MAGIC 0x64626720 +#include <linux/magic.h> static struct vfsmount *debugfs_mount; static int debugfs_mount_count; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index a70d5d0..4a714f6c 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -49,7 +49,7 @@ enum { Opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%o"}, diff --git a/fs/direct-io.c b/fs/direct-io.c index 9606ee8..af0558d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -5,11 +5,11 @@ * * O_DIRECT * - * 04Jul2002 akpm@zip.com.au + * 04Jul2002 Andrew Morton * Initial version * 11Sep2002 janetinc@us.ibm.com * added readv/writev support. - * 29Oct2002 akpm@zip.com.au + * 29Oct2002 Andrew Morton * rewrote bio_add_page() support. * 30Oct2002 pbadari@us.ibm.com * added support for non-aligned IO. @@ -9,8 +9,6 @@ * implementation is based on one of the several variants of the LINUX * inode-subsystem with added complexity of the diskquota system. * - * Version: $Id: dquot.c,v 6.3 1996/11/17 18:35:34 mvw Exp mvw $ - * * Author: Marco van Wieringen <mvw@planets.elm.net> * * Fixes: Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96 diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index b4755a8..2cc9ee4 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o kthread.o debug.o +ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index b73fb75..3504cf9 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -79,11 +79,6 @@ #define ECRYPTFS_MAX_PKI_NAME_BYTES 16 #define ECRYPTFS_DEFAULT_NUM_USERS 4 #define ECRYPTFS_MAX_NUM_USERS 32768 -#define ECRYPTFS_TRANSPORT_NETLINK 0 -#define ECRYPTFS_TRANSPORT_CONNECTOR 1 -#define ECRYPTFS_TRANSPORT_RELAYFS 2 -#define ECRYPTFS_TRANSPORT_MISCDEV 3 -#define ECRYPTFS_DEFAULT_TRANSPORT ECRYPTFS_TRANSPORT_MISCDEV #define ECRYPTFS_XATTR_NAME "user.ecryptfs" #define RFC2440_CIPHER_DES3_EDE 0x02 @@ -400,8 +395,6 @@ struct ecryptfs_msg_ctx { struct mutex mux; }; -extern unsigned int ecryptfs_transport; - struct ecryptfs_daemon; struct ecryptfs_daemon { @@ -627,31 +620,20 @@ int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); -int ecryptfs_process_helo(unsigned int transport, uid_t euid, - struct user_namespace *user_ns, struct pid *pid); +int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, + struct pid *pid); int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, struct pid *pid); int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, struct user_namespace *user_ns, struct pid *pid, u32 seq); -int ecryptfs_send_message(unsigned int transport, char *data, int data_len, +int ecryptfs_send_message(char *data, int data_len, struct ecryptfs_msg_ctx **msg_ctx); int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, struct ecryptfs_message **emsg); -int ecryptfs_init_messaging(unsigned int transport); -void ecryptfs_release_messaging(unsigned int transport); +int ecryptfs_init_messaging(void); +void ecryptfs_release_messaging(void); -int ecryptfs_send_netlink(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, - u16 msg_flags, struct pid *daemon_pid); -int ecryptfs_init_netlink(void); -void ecryptfs_release_netlink(void); - -int ecryptfs_send_connector(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, - u16 msg_flags, struct pid *daemon_pid); -int ecryptfs_init_connector(void); -void ecryptfs_release_connector(void); void ecryptfs_write_header_metadata(char *virt, struct ecryptfs_crypt_stat *crypt_stat, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 9244d65..eb3dc4c 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -71,12 +71,11 @@ struct ecryptfs_getdents_callback { void *dirent; struct dentry *dentry; filldir_t filldir; - int err; int filldir_called; int entries_written; }; -/* Inspired by generic filldir in fs/readir.c */ +/* Inspired by generic filldir in fs/readdir.c */ static int ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) @@ -125,18 +124,18 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) buf.dirent = dirent; buf.dentry = file->f_path.dentry; buf.filldir = filldir; -retry: buf.filldir_called = 0; buf.entries_written = 0; - buf.err = 0; rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf); - if (buf.err) - rc = buf.err; - if (buf.filldir_called && !buf.entries_written) - goto retry; file->f_pos = lower_file->f_pos; + if (rc < 0) + goto out; + if (buf.filldir_called && !buf.entries_written) + goto out; if (rc >= 0) - fsstack_copy_attr_atime(inode, lower_file->f_path.dentry->d_inode); + fsstack_copy_attr_atime(inode, + lower_file->f_path.dentry->d_inode); +out: return rc; } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index f5b76a3..e22bc39 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -234,8 +234,8 @@ parse_tag_65_packet(struct ecryptfs_session_key *session_key, u8 *cipher_code, } i += data_len; if (message_len < (i + m_size)) { - ecryptfs_printk(KERN_ERR, "The received netlink message is " - "shorter than expected\n"); + ecryptfs_printk(KERN_ERR, "The message received from ecryptfsd " + "is shorter than expected\n"); rc = -EIO; goto out; } @@ -438,8 +438,8 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_msg_ctx *msg_ctx; struct ecryptfs_message *msg = NULL; char *auth_tok_sig; - char *netlink_message; - size_t netlink_message_length; + char *payload; + size_t payload_len; int rc; rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); @@ -449,15 +449,15 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, goto out; } rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key), - &netlink_message, &netlink_message_length); + &payload, &payload_len); if (rc) { ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n"); goto out; } - rc = ecryptfs_send_message(ecryptfs_transport, netlink_message, - netlink_message_length, &msg_ctx); + rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); if (rc) { - ecryptfs_printk(KERN_ERR, "Error sending netlink message\n"); + ecryptfs_printk(KERN_ERR, "Error sending message to " + "ecryptfsd\n"); goto out; } rc = ecryptfs_wait_for_response(msg_ctx, &msg); @@ -1333,23 +1333,22 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_key_record *key_rec) { struct ecryptfs_msg_ctx *msg_ctx = NULL; - char *netlink_payload; - size_t netlink_payload_length; + char *payload = NULL; + size_t payload_len; struct ecryptfs_message *msg; int rc; rc = write_tag_66_packet(auth_tok->token.private_key.signature, ecryptfs_code_for_cipher_string(crypt_stat), - crypt_stat, &netlink_payload, - &netlink_payload_length); + crypt_stat, &payload, &payload_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); goto out; } - rc = ecryptfs_send_message(ecryptfs_transport, netlink_payload, - netlink_payload_length, &msg_ctx); + rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); if (rc) { - ecryptfs_printk(KERN_ERR, "Error sending netlink message\n"); + ecryptfs_printk(KERN_ERR, "Error sending message to " + "ecryptfsd\n"); goto out; } rc = ecryptfs_wait_for_response(msg_ctx, &msg); @@ -1364,8 +1363,7 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, ecryptfs_printk(KERN_ERR, "Error parsing tag 67 packet\n"); kfree(msg); out: - if (netlink_payload) - kfree(netlink_payload); + kfree(payload); return rc; } /** diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 448dfd5..046e027 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -30,7 +30,6 @@ #include <linux/namei.h> #include <linux/skbuff.h> #include <linux/crypto.h> -#include <linux/netlink.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/key.h> @@ -49,8 +48,7 @@ MODULE_PARM_DESC(ecryptfs_verbosity, "0, which is Quiet)"); /** - * Module parameter that defines the number of netlink message buffer - * elements + * Module parameter that defines the number of message buffer elements */ unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; @@ -60,9 +58,9 @@ MODULE_PARM_DESC(ecryptfs_message_buf_len, /** * Module parameter that defines the maximum guaranteed amount of time to wait - * for a response through netlink. The actual sleep time will be, more than + * for a response from ecryptfsd. The actual sleep time will be, more than * likely, a small amount greater than this specified value, but only less if - * the netlink message successfully arrives. + * the message successfully arrives. */ signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; @@ -83,8 +81,6 @@ module_param(ecryptfs_number_of_users, uint, 0); MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of " "concurrent users of eCryptfs"); -unsigned int ecryptfs_transport = ECRYPTFS_DEFAULT_TRANSPORT; - void __ecryptfs_printk(const char *fmt, ...) { va_list args; @@ -211,7 +207,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, {ecryptfs_opt_cipher, "cipher=%s"}, @@ -779,10 +775,11 @@ static int __init ecryptfs_init(void) "rc = [%d]\n", __func__, rc); goto out_do_sysfs_unregistration; } - rc = ecryptfs_init_messaging(ecryptfs_transport); + rc = ecryptfs_init_messaging(); if (rc) { printk(KERN_ERR "Failure occured while attempting to " - "initialize the eCryptfs netlink socket\n"); + "initialize the communications channel to " + "ecryptfsd\n"); goto out_destroy_kthread; } rc = ecryptfs_init_crypto(); @@ -797,7 +794,7 @@ static int __init ecryptfs_init(void) goto out; out_release_messaging: - ecryptfs_release_messaging(ecryptfs_transport); + ecryptfs_release_messaging(); out_destroy_kthread: ecryptfs_destroy_kthread(); out_do_sysfs_unregistration: @@ -818,7 +815,7 @@ static void __exit ecryptfs_exit(void) if (rc) printk(KERN_ERR "Failure whilst attempting to destroy crypto; " "rc = [%d]\n", rc); - ecryptfs_release_messaging(ecryptfs_transport); + ecryptfs_release_messaging(); ecryptfs_destroy_kthread(); do_sysfs_unregistration(); unregister_filesystem(&ecryptfs_fs_type); diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 1b5c200..c698397 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -134,12 +134,11 @@ out: } static int -ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, - u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx); +ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, + struct ecryptfs_msg_ctx **msg_ctx); /** * ecryptfs_send_raw_message - * @transport: Transport type * @msg_type: Message type * @daemon: Daemon struct for recipient of message * @@ -150,38 +149,25 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_send_raw_message(unsigned int transport, u8 msg_type, +static int ecryptfs_send_raw_message(u8 msg_type, struct ecryptfs_daemon *daemon) { struct ecryptfs_msg_ctx *msg_ctx; int rc; - switch(transport) { - case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_send_netlink(NULL, 0, NULL, msg_type, 0, - daemon->pid); - break; - case ECRYPTFS_TRANSPORT_MISCDEV: - rc = ecryptfs_send_message_locked(transport, NULL, 0, msg_type, - &msg_ctx); - if (rc) { - printk(KERN_ERR "%s: Error whilst attempting to send " - "message via procfs; rc = [%d]\n", __func__, rc); - goto out; - } - /* Raw messages are logically context-free (e.g., no - * reply is expected), so we set the state of the - * ecryptfs_msg_ctx object to indicate that it should - * be freed as soon as the transport sends out the message. */ - mutex_lock(&msg_ctx->mux); - msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY; - mutex_unlock(&msg_ctx->mux); - break; - case ECRYPTFS_TRANSPORT_CONNECTOR: - case ECRYPTFS_TRANSPORT_RELAYFS: - default: - rc = -ENOSYS; + rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx); + if (rc) { + printk(KERN_ERR "%s: Error whilst attempting to send " + "message to ecryptfsd; rc = [%d]\n", __func__, rc); + goto out; } + /* Raw messages are logically context-free (e.g., no + * reply is expected), so we set the state of the + * ecryptfs_msg_ctx object to indicate that it should + * be freed as soon as the message is sent. */ + mutex_lock(&msg_ctx->mux); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY; + mutex_unlock(&msg_ctx->mux); out: return rc; } @@ -227,7 +213,6 @@ out: /** * ecryptfs_process_helo - * @transport: The underlying transport (netlink, etc.) * @euid: The user ID owner of the message * @user_ns: The namespace in which @euid applies * @pid: The process ID for the userspace program that sent the @@ -239,8 +224,8 @@ out: * Returns zero after adding a new daemon to the hash list; * non-zero otherwise. */ -int ecryptfs_process_helo(unsigned int transport, uid_t euid, - struct user_namespace *user_ns, struct pid *pid) +int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, + struct pid *pid) { struct ecryptfs_daemon *new_daemon; struct ecryptfs_daemon *old_daemon; @@ -252,8 +237,7 @@ int ecryptfs_process_helo(unsigned int transport, uid_t euid, printk(KERN_WARNING "Received request from user [%d] " "to register daemon [0x%p]; unregistering daemon " "[0x%p]\n", euid, pid, old_daemon->pid); - rc = ecryptfs_send_raw_message(transport, ECRYPTFS_MSG_QUIT, - old_daemon); + rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon); if (rc) printk(KERN_WARNING "Failed to send QUIT " "message to daemon [0x%p]; rc = [%d]\n", @@ -467,8 +451,6 @@ out: /** * ecryptfs_send_message_locked - * @transport: The transport over which to send the message (i.e., - * netlink) * @data: The data to send * @data_len: The length of data * @msg_ctx: The message context allocated for the send @@ -478,8 +460,8 @@ out: * Returns zero on success; non-zero otherwise */ static int -ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, - u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx) +ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, + struct ecryptfs_msg_ctx **msg_ctx) { struct ecryptfs_daemon *daemon; int rc; @@ -503,20 +485,8 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, ecryptfs_msg_ctx_free_to_alloc(*msg_ctx); mutex_unlock(&(*msg_ctx)->mux); mutex_unlock(&ecryptfs_msg_ctx_lists_mux); - switch (transport) { - case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_send_netlink(data, data_len, *msg_ctx, msg_type, - 0, daemon->pid); - break; - case ECRYPTFS_TRANSPORT_MISCDEV: - rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, - 0, daemon); - break; - case ECRYPTFS_TRANSPORT_CONNECTOR: - case ECRYPTFS_TRANSPORT_RELAYFS: - default: - rc = -ENOSYS; - } + rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, 0, + daemon); if (rc) printk(KERN_ERR "%s: Error attempting to send message to " "userspace daemon; rc = [%d]\n", __func__, rc); @@ -526,8 +496,6 @@ out: /** * ecryptfs_send_message - * @transport: The transport over which to send the message (i.e., - * netlink) * @data: The data to send * @data_len: The length of data * @msg_ctx: The message context allocated for the send @@ -536,14 +504,14 @@ out: * * Returns zero on success; non-zero otherwise */ -int ecryptfs_send_message(unsigned int transport, char *data, int data_len, +int ecryptfs_send_message(char *data, int data_len, struct ecryptfs_msg_ctx **msg_ctx) { int rc; mutex_lock(&ecryptfs_daemon_hash_mux); - rc = ecryptfs_send_message_locked(transport, data, data_len, - ECRYPTFS_MSG_REQUEST, msg_ctx); + rc = ecryptfs_send_message_locked(data, data_len, ECRYPTFS_MSG_REQUEST, + msg_ctx); mutex_unlock(&ecryptfs_daemon_hash_mux); return rc; } @@ -586,7 +554,7 @@ sleep: return rc; } -int ecryptfs_init_messaging(unsigned int transport) +int ecryptfs_init_messaging(void) { int i; int rc = 0; @@ -639,27 +607,14 @@ int ecryptfs_init_messaging(unsigned int transport) mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); } mutex_unlock(&ecryptfs_msg_ctx_lists_mux); - switch(transport) { - case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_init_netlink(); - if (rc) - ecryptfs_release_messaging(transport); - break; - case ECRYPTFS_TRANSPORT_MISCDEV: - rc = ecryptfs_init_ecryptfs_miscdev(); - if (rc) - ecryptfs_release_messaging(transport); - break; - case ECRYPTFS_TRANSPORT_CONNECTOR: - case ECRYPTFS_TRANSPORT_RELAYFS: - default: - rc = -ENOSYS; - } + rc = ecryptfs_init_ecryptfs_miscdev(); + if (rc) + ecryptfs_release_messaging(); out: return rc; } -void ecryptfs_release_messaging(unsigned int transport) +void ecryptfs_release_messaging(void) { if (ecryptfs_msg_ctx_arr) { int i; @@ -698,17 +653,6 @@ void ecryptfs_release_messaging(unsigned int transport) kfree(ecryptfs_daemon_hash); mutex_unlock(&ecryptfs_daemon_hash_mux); } - switch(transport) { - case ECRYPTFS_TRANSPORT_NETLINK: - ecryptfs_release_netlink(); - break; - case ECRYPTFS_TRANSPORT_MISCDEV: - ecryptfs_destroy_ecryptfs_miscdev(); - break; - case ECRYPTFS_TRANSPORT_CONNECTOR: - case ECRYPTFS_TRANSPORT_RELAYFS: - default: - break; - } + ecryptfs_destroy_ecryptfs_miscdev(); return; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 245c2dc..04d7b3f 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -265,22 +265,34 @@ out: } /** - * ecryptfs_prepare_write + * ecryptfs_write_begin * @file: The eCryptfs file - * @page: The eCryptfs page - * @from: The start byte from which we will write - * @to: The end byte to which we will write + * @mapping: The eCryptfs object + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused) * * This function must zero any hole we create * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ecryptfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; loff_t prev_page_end_size; int rc = 0; + page = __grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + *pagep = page; + if (!PageUptodate(page)) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private( @@ -289,8 +301,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { rc = ecryptfs_read_lower_page_segment( - page, page->index, 0, PAGE_CACHE_SIZE, - page->mapping->host); + page, index, 0, PAGE_CACHE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error attemping to read " "lower page segment; rc = [%d]\n", @@ -316,8 +327,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, SetPageUptodate(page); } else { rc = ecryptfs_read_lower_page_segment( - page, page->index, 0, PAGE_CACHE_SIZE, - page->mapping->host); + page, index, 0, PAGE_CACHE_SIZE, + mapping->host); if (rc) { printk(KERN_ERR "%s: Error reading " "page; rc = [%d]\n", @@ -339,10 +350,10 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, SetPageUptodate(page); } } - prev_page_end_size = ((loff_t)page->index << PAGE_CACHE_SHIFT); + prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); /* If creating a page or more of holes, zero them out via truncate. * Note, this will increase i_size. */ - if (page->index != 0) { + if (index != 0) { if (prev_page_end_size > i_size_read(page->mapping->host)) { rc = ecryptfs_truncate(file->f_path.dentry, prev_page_end_size); @@ -357,8 +368,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, } /* Writing to a new page, and creating a small hole from start * of page? Zero it out. */ - if ((i_size_read(page->mapping->host) == prev_page_end_size) - && (from != 0)) + if ((i_size_read(mapping->host) == prev_page_end_size) + && (pos != 0)) zero_user(page, 0, PAGE_CACHE_SIZE); out: return rc; @@ -445,21 +456,28 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) } /** - * ecryptfs_commit_write + * ecryptfs_write_end * @file: The eCryptfs file object + * @mapping: The eCryptfs object + * @pos: The file position + * @len: The length of the data (unused) + * @copied: The amount of data copied * @page: The eCryptfs page - * @from: Ignored (we rotate the page IV on each write) - * @to: Ignored + * @fsdata: The fsdata (unused) * * This is where we encrypt the data and pass the encrypted data to * the lower filesystem. In OpenPGP-compatible mode, we operate on * entire underlying packets. */ -static int ecryptfs_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ecryptfs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { - loff_t pos; - struct inode *ecryptfs_inode = page->mapping->host; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + copied; + struct inode *ecryptfs_inode = mapping->host; struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; int rc; @@ -471,25 +489,22 @@ static int ecryptfs_commit_write(struct file *file, struct page *page, } else ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" - "(page w/ index = [0x%.16x], to = [%d])\n", page->index, - to); + "(page w/ index = [0x%.16x], to = [%d])\n", index, to); /* Fills in zeros if 'to' goes beyond inode size */ rc = fill_zeros_to_end_of_page(page, to); if (rc) { ecryptfs_printk(KERN_WARNING, "Error attempting to fill " - "zeros in page with index = [0x%.16x]\n", - page->index); + "zeros in page with index = [0x%.16x]\n", index); goto out; } rc = ecryptfs_encrypt_page(page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " - "index [0x%.16x])\n", page->index); + "index [0x%.16x])\n", index); goto out; } - pos = (((loff_t)page->index) << PAGE_CACHE_SHIFT) + to; - if (pos > i_size_read(ecryptfs_inode)) { - i_size_write(ecryptfs_inode, pos); + if (pos + copied > i_size_read(ecryptfs_inode)) { + i_size_write(ecryptfs_inode, pos + copied); ecryptfs_printk(KERN_DEBUG, "Expanded file size to " "[0x%.16x]\n", i_size_read(ecryptfs_inode)); } @@ -497,7 +512,11 @@ static int ecryptfs_commit_write(struct file *file, struct page *page, if (rc) printk(KERN_ERR "Error writing inode size to metadata; " "rc = [%d]\n", rc); + else + rc = copied; out: + unlock_page(page); + page_cache_release(page); return rc; } @@ -518,7 +537,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block) struct address_space_operations ecryptfs_aops = { .writepage = ecryptfs_writepage, .readpage = ecryptfs_readpage, - .prepare_write = ecryptfs_prepare_write, - .commit_write = ecryptfs_commit_write, + .write_begin = ecryptfs_write_begin, + .write_end = ecryptfs_write_end, .bmap = ecryptfs_bmap, }; diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c deleted file mode 100644 index e0abad6..0000000 --- a/fs/ecryptfs/netlink.c +++ /dev/null @@ -1,249 +0,0 @@ -/** - * eCryptfs: Linux filesystem encryption layer - * - * Copyright (C) 2004-2006 International Business Machines Corp. - * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> - * Tyler Hicks <tyhicks@ou.edu> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA - * 02111-1307, USA. - */ - -#include <net/sock.h> -#include <linux/hash.h> -#include <linux/random.h> -#include "ecryptfs_kernel.h" - -static struct sock *ecryptfs_nl_sock; - -/** - * ecryptfs_send_netlink - * @data: The data to include as the payload - * @data_len: The byte count of the data - * @msg_ctx: The netlink context that will be used to handle the - * response message - * @msg_type: The type of netlink message to send - * @msg_flags: The flags to include in the netlink header - * @daemon_pid: The process id of the daemon to send the message to - * - * Sends the data to the specified daemon pid and uses the netlink - * context element to store the data needed for validation upon - * receiving the response. The data and the netlink context can be - * null if just sending a netlink header is sufficient. Returns zero - * upon sending the message; non-zero upon error. - */ -int ecryptfs_send_netlink(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, - u16 msg_flags, struct pid *daemon_pid) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - struct ecryptfs_message *msg; - size_t payload_len; - int rc; - - payload_len = ((data && data_len) ? (sizeof(*msg) + data_len) : 0); - skb = alloc_skb(NLMSG_SPACE(payload_len), GFP_KERNEL); - if (!skb) { - rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Failed to allocate socket buffer\n"); - goto out; - } - nlh = NLMSG_PUT(skb, pid_nr(daemon_pid), msg_ctx ? msg_ctx->counter : 0, - msg_type, payload_len); - nlh->nlmsg_flags = msg_flags; - if (msg_ctx && payload_len) { - msg = (struct ecryptfs_message *)NLMSG_DATA(nlh); - msg->index = msg_ctx->index; - msg->data_len = data_len; - memcpy(msg->data, data, data_len); - } - rc = netlink_unicast(ecryptfs_nl_sock, skb, pid_nr(daemon_pid), 0); - if (rc < 0) { - ecryptfs_printk(KERN_ERR, "Failed to send eCryptfs netlink " - "message; rc = [%d]\n", rc); - goto out; - } - rc = 0; - goto out; -nlmsg_failure: - rc = -EMSGSIZE; - kfree_skb(skb); -out: - return rc; -} - -/** - * ecryptfs_process_nl_reponse - * @skb: The socket buffer containing the netlink message of state - * RESPONSE - * - * Processes a response message after sending a operation request to - * userspace. Attempts to assign the msg to a netlink context element - * at the index specified in the msg. The sk_buff and nlmsghdr must - * be validated before this function. Returns zero upon delivery to - * desired context element; non-zero upon delivery failure or error. - */ -static int ecryptfs_process_nl_response(struct sk_buff *skb) -{ - struct nlmsghdr *nlh = nlmsg_hdr(skb); - struct ecryptfs_message *msg = NLMSG_DATA(nlh); - struct pid *pid; - int rc; - - if (skb->len - NLMSG_HDRLEN - sizeof(*msg) != msg->data_len) { - rc = -EINVAL; - ecryptfs_printk(KERN_ERR, "Received netlink message with " - "incorrectly specified data length\n"); - goto out; - } - pid = find_get_pid(NETLINK_CREDS(skb)->pid); - rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid, NULL, - pid, nlh->nlmsg_seq); - put_pid(pid); - if (rc) - printk(KERN_ERR - "Error processing response message; rc = [%d]\n", rc); -out: - return rc; -} - -/** - * ecryptfs_process_nl_helo - * @skb: The socket buffer containing the nlmsghdr in HELO state - * - * Gets uid and pid of the skb and adds the values to the daemon id - * hash. Returns zero after adding a new daemon id to the hash list; - * non-zero otherwise. - */ -static int ecryptfs_process_nl_helo(struct sk_buff *skb) -{ - struct pid *pid; - int rc; - - pid = find_get_pid(NETLINK_CREDS(skb)->pid); - rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_NETLINK, - NETLINK_CREDS(skb)->uid, NULL, pid); - put_pid(pid); - if (rc) - printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc); - return rc; -} - -/** - * ecryptfs_process_nl_quit - * @skb: The socket buffer containing the nlmsghdr in QUIT state - * - * Gets uid and pid of the skb and deletes the corresponding daemon - * id, if it is the registered that is requesting the - * deletion. Returns zero after deleting the desired daemon id; - * non-zero otherwise. - */ -static int ecryptfs_process_nl_quit(struct sk_buff *skb) -{ - struct pid *pid; - int rc; - - pid = find_get_pid(NETLINK_CREDS(skb)->pid); - rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid, NULL, pid); - put_pid(pid); - if (rc) - printk(KERN_WARNING - "Error processing QUIT message; rc = [%d]\n", rc); - return rc; -} - -/** - * ecryptfs_receive_nl_message - * - * Callback function called by netlink system when a message arrives. - * If the message looks to be valid, then an attempt is made to assign - * it to its desired netlink context element and wake up the process - * that is waiting for a response. - */ -static void ecryptfs_receive_nl_message(struct sk_buff *skb) -{ - struct nlmsghdr *nlh; - - nlh = nlmsg_hdr(skb); - if (!NLMSG_OK(nlh, skb->len)) { - ecryptfs_printk(KERN_ERR, "Received corrupt netlink " - "message\n"); - goto free; - } - switch (nlh->nlmsg_type) { - case ECRYPTFS_MSG_RESPONSE: - if (ecryptfs_process_nl_response(skb)) { - ecryptfs_printk(KERN_WARNING, "Failed to " - "deliver netlink response to " - "requesting operation\n"); - } - break; - case ECRYPTFS_MSG_HELO: - if (ecryptfs_process_nl_helo(skb)) { - ecryptfs_printk(KERN_WARNING, "Failed to " - "fulfill HELO request\n"); - } - break; - case ECRYPTFS_MSG_QUIT: - if (ecryptfs_process_nl_quit(skb)) { - ecryptfs_printk(KERN_WARNING, "Failed to " - "fulfill QUIT request\n"); - } - break; - default: - ecryptfs_printk(KERN_WARNING, "Dropping netlink " - "message of unrecognized type [%d]\n", - nlh->nlmsg_type); - break; - } -free: - kfree_skb(skb); -} - -/** - * ecryptfs_init_netlink - * - * Initializes the daemon id hash list, netlink context array, and - * necessary locks. Returns zero upon success; non-zero upon error. - */ -int ecryptfs_init_netlink(void) -{ - int rc; - - ecryptfs_nl_sock = netlink_kernel_create(&init_net, NETLINK_ECRYPTFS, 0, - ecryptfs_receive_nl_message, - NULL, THIS_MODULE); - if (!ecryptfs_nl_sock) { - rc = -EIO; - ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n"); - goto out; - } - ecryptfs_nl_sock->sk_sndtimeo = ECRYPTFS_DEFAULT_SEND_TIMEOUT; - rc = 0; -out: - return rc; -} - -/** - * ecryptfs_release_netlink - * - * Frees all memory used by the netlink context array and releases the - * netlink socket. - */ -void ecryptfs_release_netlink(void) -{ - netlink_kernel_release(ecryptfs_nl_sock); - ecryptfs_nl_sock = NULL; -} diff --git a/fs/efs/super.c b/fs/efs/super.c index 567b134..73b19cf 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -341,8 +341,6 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { sb->inode_blocks * (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); buf->f_ffree = sb->inode_free; /* free inodes */ - buf->f_fsid.val[0] = (sb->fs_magic >> 16) & 0xffff; /* fs ID */ - buf->f_fsid.val[1] = sb->fs_magic & 0xffff; /* fs ID */ buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ return 0; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7cc0eb7..99368bd 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -927,14 +927,11 @@ errxit: /* * During the time we spent in the loop above, some other events * might have been queued by the poll callback. We re-insert them - * here (in case they are not already queued, or they're one-shot). + * inside the main ready-list here. */ for (nepi = ep->ovflist; (epi = nepi) != NULL; - nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { - if (!ep_is_linked(&epi->rdllink) && - (epi->event.events & ~EP_PRIVATE_BITS)) - list_add_tail(&epi->rdllink, &ep->rdllist); - } + nepi = epi->next, epi->next = EP_UNACTIVE_PTR) + list_add_tail(&epi->rdllink, &ep->rdllist); /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside @@ -50,15 +50,12 @@ #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/tracehook.h> +#include <linux/kmod.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> -#ifdef CONFIG_KMOD -#include <linux/kmod.h> -#endif - #ifdef __alpha__ /* for /sbin/loader handling in search_binary_handler() */ #include <linux/a.out.h> @@ -391,7 +388,7 @@ static int count(char __user * __user * argv, int max) if (!p) break; argv++; - if(++i > max) + if (i++ >= max) return -E2BIG; cond_resched(); } @@ -825,8 +822,6 @@ static int de_thread(struct task_struct *tsk) schedule(); } - if (unlikely(task_child_reaper(tsk) == leader)) - task_active_pid_ns(tsk)->child_reaper = tsk; /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. @@ -1189,7 +1184,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) return retval; /* Remember if the application is TASO. */ - bprm->sh_bang = eh->ah.entry < 0x100000000UL; + bprm->taso = eh->ah.entry < 0x100000000UL; bprm->file = file; bprm->loader = loader; @@ -1247,8 +1242,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) read_unlock(&binfmt_lock); if (retval != -ENOEXEC || bprm->mm == NULL) { break; -#ifdef CONFIG_KMOD - }else{ +#ifdef CONFIG_MODULES + } else { #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 10bb02c..6dac7ba 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -1295,6 +1295,7 @@ retry_alloc: * turn off reservation for this allocation */ if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) && (rsv_is_empty(&my_rsv->rsv_window))) my_rsv = NULL; @@ -1332,7 +1333,7 @@ retry_alloc: * free blocks is less than half of the reservation * window size. */ - if (free_blocks <= (windowsz/2)) + if (my_rsv && (free_blocks <= (windowsz/2))) continue; brelse(bitmap_bh); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index a78c6b4..11a49ce 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -103,7 +103,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) return err; } -static void ext2_check_page(struct page *page) +static void ext2_check_page(struct page *page, int quiet) { struct inode *dir = page->mapping->host; struct super_block *sb = dir->i_sb; @@ -146,10 +146,10 @@ out: /* Too bad, we had an error */ Ebadsize: - ext2_error(sb, "ext2_check_page", - "size of directory #%lu is not a multiple of chunk size", - dir->i_ino - ); + if (!quiet) + ext2_error(sb, __func__, + "size of directory #%lu is not a multiple " + "of chunk size", dir->i_ino); goto fail; Eshort: error = "rec_len is smaller than minimal"; @@ -166,32 +166,36 @@ Espan: Einumber: error = "inode out of bounds"; bad_entry: - ext2_error (sb, "ext2_check_page", "bad entry in directory #%lu: %s - " - "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, - (unsigned long) le32_to_cpu(p->inode), - rec_len, p->name_len); + if (!quiet) + ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le32_to_cpu(p->inode), + rec_len, p->name_len); goto fail; Eend: - p = (ext2_dirent *)(kaddr + offs); - ext2_error (sb, "ext2_check_page", - "entry in directory #%lu spans the page boundary" - "offset=%lu, inode=%lu", - dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, - (unsigned long) le32_to_cpu(p->inode)); + if (!quiet) { + p = (ext2_dirent *)(kaddr + offs); + ext2_error(sb, "ext2_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le32_to_cpu(p->inode)); + } fail: SetPageChecked(page); SetPageError(page); } -static struct page * ext2_get_page(struct inode *dir, unsigned long n) +static struct page * ext2_get_page(struct inode *dir, unsigned long n, + int quiet) { struct address_space *mapping = dir->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); if (!IS_ERR(page)) { kmap(page); if (!PageChecked(page)) - ext2_check_page(page); + ext2_check_page(page, quiet); if (PageError(page)) goto fail; } @@ -292,7 +296,7 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; ext2_dirent *de; - struct page *page = ext2_get_page(inode, n); + struct page *page = ext2_get_page(inode, n, 0); if (IS_ERR(page)) { ext2_error(sb, __func__, @@ -361,6 +365,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, struct page *page = NULL; struct ext2_inode_info *ei = EXT2_I(dir); ext2_dirent * de; + int dir_has_error = 0; if (npages == 0) goto out; @@ -374,7 +379,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, n = start; do { char *kaddr; - page = ext2_get_page(dir, n); + page = ext2_get_page(dir, n, dir_has_error); if (!IS_ERR(page)) { kaddr = page_address(page); de = (ext2_dirent *) kaddr; @@ -391,7 +396,9 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, de = ext2_next_entry(de); } ext2_put_page(page); - } + } else + dir_has_error = 1; + if (++n >= npages) n = 0; /* next page is past the blocks we've got */ @@ -414,7 +421,7 @@ found: struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p) { - struct page *page = ext2_get_page(dir, 0); + struct page *page = ext2_get_page(dir, 0, 0); ext2_dirent *de = NULL; if (!IS_ERR(page)) { @@ -487,7 +494,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) for (n = 0; n <= npages; n++) { char *dir_end; - page = ext2_get_page(dir, n); + page = ext2_get_page(dir, n, 0); err = PTR_ERR(page); if (IS_ERR(page)) goto out; @@ -655,14 +662,17 @@ int ext2_empty_dir (struct inode * inode) { struct page *page = NULL; unsigned long i, npages = dir_pages(inode); + int dir_has_error = 0; for (i = 0; i < npages; i++) { char *kaddr; ext2_dirent * de; - page = ext2_get_page(inode, i); + page = ext2_get_page(inode, i, dir_has_error); - if (IS_ERR(page)) + if (IS_ERR(page)) { + dir_has_error = 1; continue; + } kaddr = page_address(page); de = (ext2_dirent *)kaddr; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index fd88c7b4..647cd88 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -393,7 +393,7 @@ enum { Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bsd_df, "bsddf"}, {Opt_minix_df, "minixdf"}, {Opt_grpid, "grpid"}, diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f38a5af..399a96a 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -760,7 +760,7 @@ enum { Opt_grpquota }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bsd_df, "bsddf"}, {Opt_minix_df, "minixdf"}, {Opt_grpid, "grpid"}, diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index bd2ece2..b9821be 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, /* this isn't the right place to decide whether block is metadata * inode.c/extents.c knows better, but for safety ... */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || - ext4_should_journal_data(inode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + metadata = 1; + + /* We need to make sure we don't reuse + * block released untill the transaction commit. + * writeback mode have weak data consistency so + * don't force data as metadata when freeing block + * for writeback mode. + */ + if (metadata == 0 && !ext4_should_writeback_data(inode)) metadata = 1; sb = inode->i_sb; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6690a41..4880cc3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -511,7 +511,6 @@ do { \ /* * Mount flags */ -#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */ #define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 6a0b40d..445fde6 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -99,9 +99,6 @@ struct ext4_sb_info { struct inode *s_buddy_cache; long s_blocks_reserved; spinlock_t s_reserve_lock; - struct list_head s_active_transaction; - struct list_head s_closed_transaction; - struct list_head s_committed_transaction; spinlock_t s_md_lock; tid_t s_last_transaction; unsigned short *s_mb_offsets, *s_mb_maxs; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b4ec9d..8dbf695 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) int ret = 0, err, nr_pages, i; unsigned long index, end; struct pagevec pvec; + long pages_skipped; BUG_ON(mpd->next_page <= mpd->first_page); pagevec_init(&pvec, 0); @@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) end = mpd->next_page - 1; while (index <= end) { - /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + /* + * We can use PAGECACHE_TAG_DIRTY lookup here because + * even though we have cleared the dirty flag on the page + * We still keep the page in the radix tree with tag + * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. + * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback + * which is called via the below writepage callback. + */ + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) - break; - index++; - + pages_skipped = mpd->wbc->pages_skipped; err = mapping->a_ops->writepage(page, mpd->wbc); - if (!err) + if (!err && (pages_skipped == mpd->wbc->pages_skipped)) + /* + * have successfully written the page + * without skipping the same + */ mpd->pages_written++; /* * In error case, we have to continue because @@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd) { - long to_write; int ret; if (!mpd->get_block) @@ -2119,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping, mpd->pages_written = 0; mpd->retval = 0; - to_write = wbc->nr_to_write; - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); - /* * Handle last extent of pages */ if (!mpd->io_done && mpd->next_page != mpd->first_page) { if (mpage_da_map_blocks(mpd) == 0) mpage_da_submit_io(mpd); - } - wbc->nr_to_write = to_write - mpd->pages_written; + mpd->io_done = 1; + ret = MPAGE_DA_EXTENT_TAIL; + } + wbc->nr_to_write -= mpd->pages_written; return ret; } @@ -2360,12 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { + pgoff_t index; + int range_whole = 0; handle_t *handle = NULL; - loff_t range_start = 0; struct mpage_da_data mpd; struct inode *inode = mapping->host; + int no_nrwrite_index_update; + long pages_written = 0, pages_skipped; int needed_blocks, ret = 0, nr_to_writebump = 0; - long to_write, pages_skipped = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); /* @@ -2385,23 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping, nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; wbc->nr_to_write = sbi->s_mb_stream_request; } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; - if (!wbc->range_cyclic) - /* - * If range_cyclic is not set force range_cont - * and save the old writeback_index - */ - wbc->range_cont = 1; - - range_start = wbc->range_start; - pages_skipped = wbc->pages_skipped; + if (wbc->range_cyclic) + index = mapping->writeback_index; + else + index = wbc->range_start >> PAGE_CACHE_SHIFT; mpd.wbc = wbc; mpd.inode = mapping->host; -restart_loop: - to_write = wbc->nr_to_write; - while (!ret && to_write > 0) { + /* + * we don't want write_cache_pages to update + * nr_to_write and writeback_index + */ + no_nrwrite_index_update = wbc->no_nrwrite_index_update; + wbc->no_nrwrite_index_update = 1; + pages_skipped = wbc->pages_skipped; + + while (!ret && wbc->nr_to_write > 0) { /* * we insert one extent at a time. So we need @@ -2422,48 +2436,53 @@ restart_loop: dump_stack(); goto out_writepages; } - to_write -= wbc->nr_to_write; - mpd.get_block = ext4_da_get_block_write; ret = mpage_da_writepages(mapping, wbc, &mpd); ext4_journal_stop(handle); - if (mpd.retval == -ENOSPC) + if (mpd.retval == -ENOSPC) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ jbd2_journal_force_commit_nested(sbi->s_journal); - - /* reset the retry count */ - if (ret == MPAGE_DA_EXTENT_TAIL) { + wbc->pages_skipped = pages_skipped; + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* * got one extent now try with * rest of the pages */ - to_write += wbc->nr_to_write; + pages_written += mpd.pages_written; + wbc->pages_skipped = pages_skipped; ret = 0; - } else if (wbc->nr_to_write) { + } else if (wbc->nr_to_write) /* * There is no more writeout needed * or we requested for a noblocking writeout * and we found the device congested */ - to_write += wbc->nr_to_write; break; - } - wbc->nr_to_write = to_write; - } - - if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { - /* We skipped pages in this loop */ - wbc->range_start = range_start; - wbc->nr_to_write = to_write + - wbc->pages_skipped - pages_skipped; - wbc->pages_skipped = pages_skipped; - goto restart_loop; } + if (pages_skipped != wbc->pages_skipped) + printk(KERN_EMERG "This should not happen leaving %s " + "with nr_to_write = %ld ret = %d\n", + __func__, wbc->nr_to_write, ret); + + /* Update index */ + index += pages_written; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = index; out_writepages: - wbc->nr_to_write = to_write - nr_to_writebump; - wbc->range_start = range_start; + if (!no_nrwrite_index_update) + wbc->no_nrwrite_index_update = 0; + wbc->nr_to_write -= nr_to_writebump; return ret; } @@ -4175,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle, struct inode *inode = &(ei->vfs_inode); u64 i_blocks = inode->i_blocks; struct super_block *sb = inode->i_sb; - int err = 0; if (i_blocks <= ~0U) { /* @@ -4185,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle, raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ei->i_flags &= ~EXT4_HUGE_FILE_FL; - } else if (i_blocks <= 0xffffffffffffULL) { + return 0; + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + return -EFBIG; + + if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; - /* i_block is stored in the split 48 bit fields */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ei->i_flags &= ~EXT4_HUGE_FILE_FL; } else { - /* - * i_blocks should be represented in a 48 bit variable - * as multiple of file system block size - */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; ei->i_flags |= EXT4_HUGE_FILE_FL; /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } -err_out: - return err; + return 0; } /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b580714..dfe17a1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + meta_group_info[i]->bb_free_root.rb_node = NULL;; #ifdef DOUBLE_CHECK { @@ -2522,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) } spin_lock_init(&sbi->s_md_lock); - INIT_LIST_HEAD(&sbi->s_active_transaction); - INIT_LIST_HEAD(&sbi->s_closed_transaction); - INIT_LIST_HEAD(&sbi->s_committed_transaction); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; @@ -2553,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); + sbi->s_journal->j_commit_callback = release_blocks_on_commit; + printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; } @@ -2568,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; - kfree(pa); + kmem_cache_free(ext4_pspace_cachep, pa); } if (count) mb_debug("mballoc: %u PAs left\n", count); @@ -2582,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb) struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); - /* release freed, non-committed blocks */ - spin_lock(&sbi->s_md_lock); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_committed_transaction); - spin_unlock(&sbi->s_md_lock); - ext4_mb_free_committed_blocks(sb); - if (sbi->s_group_info) { for (i = 0; i < sbi->s_groups_count; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2644,61 +2635,57 @@ int ext4_mb_release(struct super_block *sb) return 0; } -static noinline_for_stack void -ext4_mb_free_committed_blocks(struct super_block *sb) +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; - int i; - int count = 0; - int count2 = 0; - struct ext4_free_metadata *md; + struct super_block *sb = journal->j_private; struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + struct ext4_free_data *entry; + ext4_fsblk_t discard_block; + struct list_head *l, *ltmp; - if (list_empty(&sbi->s_committed_transaction)) - return; - - /* there is committed blocks to be freed yet */ - do { - /* get next array of blocks */ - md = NULL; - spin_lock(&sbi->s_md_lock); - if (!list_empty(&sbi->s_committed_transaction)) { - md = list_entry(sbi->s_committed_transaction.next, - struct ext4_free_metadata, list); - list_del(&md->list); - } - spin_unlock(&sbi->s_md_lock); - - if (md == NULL) - break; + list_for_each_safe(l, ltmp, &txn->t_private_list) { + entry = list_entry(l, struct ext4_free_data, list); mb_debug("gonna free %u blocks in group %lu (0x%p):", - md->num, md->group, md); + entry->count, entry->group, entry); - err = ext4_mb_load_buddy(sb, md->group, &e4b); + err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); + db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ - count += md->num; + count += entry->count; count2++; - ext4_lock_group(sb, md->group); - for (i = 0; i < md->num; i++) { - mb_debug(" %u", md->blocks[i]); - mb_free_blocks(NULL, &e4b, md->blocks[i], 1); + ext4_lock_group(sb, entry->group); + /* Take it out of per group rb tree */ + rb_erase(&entry->node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); } - mb_debug("\n"); - ext4_unlock_group(sb, md->group); - - /* balance refcounts from ext4_mb_free_metadata() */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - - kfree(md); + ext4_unlock_group(sb, entry->group); + discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + + entry->start_blk + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, + (unsigned long long) discard_block, entry->count); + sb_issue_discard(sb, discard_block, entry->count); + + kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); - - } while (md); + } mb_debug("freed %u blocks in %u structures\n", count, count2); } @@ -2712,6 +2699,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) static int ext4_mb_init_per_dev_proc(struct super_block *sb) { +#ifdef CONFIG_PROC_FS mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; struct ext4_sb_info *sbi = EXT4_SB(sb); struct proc_dir_entry *proc; @@ -2735,10 +2723,14 @@ err_out: remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); return -ENOMEM; +#else + return 0; +#endif } static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) { +#ifdef CONFIG_PROC_FS struct ext4_sb_info *sbi = EXT4_SB(sb); if (sbi->s_proc == NULL) @@ -2750,7 +2742,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); - +#endif return 0; } @@ -2771,6 +2763,16 @@ int __init init_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } + + ext4_free_ext_cachep = + kmem_cache_create("ext4_free_block_extents", + sizeof(struct ext4_free_data), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (ext4_free_ext_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } return 0; } @@ -2779,6 +2781,7 @@ void exit_ext4_mballoc(void) /* XXX: synchronize_rcu(); */ kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); + kmem_cache_destroy(ext4_free_ext_cachep); } @@ -4324,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, goto out1; } - ext4_mb_poll_new_transaction(sb, handle); - *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; @@ -4384,35 +4385,20 @@ out1: return block; } -static void ext4_mb_poll_new_transaction(struct super_block *sb, - handle_t *handle) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_last_transaction == handle->h_transaction->t_tid) - return; - - /* new transaction! time to close last one and free blocks for - * committed transaction. we know that only transaction can be - * active, so previos transaction can be being logged and we - * know that transaction before previous is known to be already - * logged. this means that now we may free blocks freed in all - * transactions before previous one. hope I'm clear enough ... */ - spin_lock(&sbi->s_md_lock); - if (sbi->s_last_transaction != handle->h_transaction->t_tid) { - mb_debug("new transaction %lu, old %lu\n", - (unsigned long) handle->h_transaction->t_tid, - (unsigned long) sbi->s_last_transaction); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_closed_transaction); - sbi->s_last_transaction = handle->h_transaction->t_tid; - } - spin_unlock(&sbi->s_md_lock); - - ext4_mb_free_committed_blocks(sb); +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. + */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->t_tid == entry2->t_tid) && + (entry1->group == entry2->group) && + ((entry1->start_blk + entry1->count) == entry2->start_blk)) + return 1; + return 0; } static noinline_for_stack int @@ -4422,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_free_metadata *md; - int i; + struct ext4_free_data *entry, *new_entry; + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry->start_blk = block; + new_entry->group = group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + new_node = &new_entry->node; + ext4_lock_group(sb, group); - for (i = 0; i < count; i++) { - md = db->bb_md_cur; - if (md && db->bb_tid != handle->h_transaction->t_tid) { - db->bb_md_cur = NULL; - md = NULL; + if (!*n) { + /* first free block exent. We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, node); + if (block < entry->start_blk) + n = &(*n)->rb_left; + else if (block >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + ext4_error(sb, __func__, + "Double free of blocks %d (%d %d)\n", + block, entry->start_blk, entry->count); + return 0; } + } - if (md == NULL) { - ext4_unlock_group(sb, group); - md = kmalloc(sizeof(*md), GFP_NOFS); - if (md == NULL) - return -ENOMEM; - md->num = 0; - md->group = group; - - ext4_lock_group(sb, group); - if (db->bb_md_cur == NULL) { - spin_lock(&sbi->s_md_lock); - list_add(&md->list, &sbi->s_active_transaction); - spin_unlock(&sbi->s_md_lock); - /* protect buddy cache from being freed, - * otherwise we'll refresh it from - * on-disk bitmap and lose not-yet-available - * blocks */ - page_cache_get(e4b->bd_buddy_page); - page_cache_get(e4b->bd_bitmap_page); - db->bb_md_cur = md; - db->bb_tid = handle->h_transaction->t_tid; - mb_debug("new md 0x%p for group %lu\n", - md, md->group); - } else { - kfree(md); - md = db->bb_md_cur; - } + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } + } - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); - md->blocks[md->num] = block + i; - md->num++; - if (md->num == EXT4_BB_MAX_BLOCKS) { - /* no more space, put full container on a sb's list */ - db->bb_md_cur = NULL; + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } } + /* Add the extent to transaction's private list */ + spin_lock(&sbi->s_md_lock); + list_add(&new_entry->list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); ext4_unlock_group(sb, group); return 0; } @@ -4500,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, *freed = 0; - ext4_mb_poll_new_transaction(sb, handle); - sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; if (block < le32_to_cpu(es->s_first_data_block) || diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b3b4828..b5dff1f 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/seq_file.h> #include <linux/version.h> +#include <linux/blkdev.h> +#include <linux/marker.h> #include "ext4_jbd2.h" #include "ext4.h" #include "group.h" @@ -98,23 +100,29 @@ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; -#ifdef EXT4_BB_MAX_BLOCKS -#undef EXT4_BB_MAX_BLOCKS -#endif -#define EXT4_BB_MAX_BLOCKS 30 +struct ext4_free_data { + /* this links the free block information from group_info */ + struct rb_node node; -struct ext4_free_metadata { - ext4_group_t group; - unsigned short num; - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; + /* this links the free block information from ext4_sb_info */ struct list_head list; + + /* group which free block extent belongs */ + ext4_group_t group; + + /* free block extent */ + ext4_grpblk_t start_blk; + ext4_grpblk_t count; + + /* transaction which freed this extent */ + tid_t t_tid; }; struct ext4_group_info { unsigned long bb_state; - unsigned long bb_tid; - struct ext4_free_metadata *bb_md_cur; + struct rb_root bb_free_root; unsigned short bb_first_free; unsigned short bb_free; unsigned short bb_fragments; @@ -261,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); -static void ext4_mb_free_committed_blocks(struct super_block *); static void ext4_mb_return_to_preallocation(struct inode *inode, struct ext4_buddy *e4b, sector_t block, int count); @@ -270,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *, struct super_block *, struct ext4_prealloc_space *pa); static int ext4_mb_init_per_dev_proc(struct super_block *sb); static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fb940c2..9b2b2bc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -374,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } -int ext4_update_compat_feature(handle_t *handle, - struct super_block *sb, __u32 compat) -{ - int err = 0; - if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_COMPAT_FEATURE(sb, compat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_rocompat_feature(handle_t *handle, - struct super_block *sb, __u32 rocompat) -{ - int err = 0; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_incompat_feature(handle_t *handle, - struct super_block *sb, __u32 incompat) -{ - int err = 0; - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_INCOMPAT_FEATURE(sb, incompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - /* * Open the external journal device */ @@ -904,7 +844,7 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, @@ -915,11 +855,11 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_inode_readahead_blks }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bsd_df, "bsddf"}, {Opt_minix_df, "minixdf"}, {Opt_grpid, "grpid"}, @@ -933,8 +873,6 @@ static match_table_t tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_nouid32, "nouid32"}, - {Opt_nocheck, "nocheck"}, - {Opt_nocheck, "check=none"}, {Opt_debug, "debug"}, {Opt_oldalloc, "oldalloc"}, {Opt_orlov, "orlov"}, @@ -973,8 +911,6 @@ static match_table_t tokens = { {Opt_extents, "extents"}, {Opt_noextents, "noextents"}, {Opt_i_version, "i_version"}, - {Opt_mballoc, "mballoc"}, - {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, @@ -1073,9 +1009,6 @@ static int parse_options(char *options, struct super_block *sb, case Opt_nouid32: set_opt(sbi->s_mount_opt, NO_UID32); break; - case Opt_nocheck: - clear_opt(sbi->s_mount_opt, CHECK); - break; case Opt_debug: set_opt(sbi->s_mount_opt, DEBUG); break; @@ -1618,14 +1551,14 @@ static int ext4_check_descriptors(struct super_block *sb) if (block_bitmap < first_block || block_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Block bitmap for group %lu not in group " - "(block %llu)!", i, block_bitmap); + "(block %llu)!\n", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || inode_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode bitmap for group %lu not in group " - "(block %llu)!", i, inode_bitmap); + "(block %llu)!\n", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); @@ -1633,7 +1566,7 @@ static int ext4_check_descriptors(struct super_block *sb) inode_table + sbi->s_itb_per_group - 1 > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode table for group %lu not in group " - "(block %llu)!", i, inode_table); + "(block %llu)!\n", i, inode_table); return 0; } spin_lock(sb_bgl_lock(sbi, i)); @@ -1778,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb, * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ -static loff_t ext4_max_size(int blkbits) +static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; /* small i_blocks in vfs inode? */ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* * CONFIG_LSF is not enabled implies the inode * i_block represent total blocks in 512 bytes @@ -1814,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits) * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ -static loff_t ext4_max_bitmap_size(int bits) +static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t res = EXT4_NDIR_BLOCKS; int meta_blocks; @@ -1827,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits) * total number of 512 bytes blocks of the file */ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * CONFIG_LSF is not enabled implies the inode - * i_block represent total blocks in 512 bytes - * 32 == size of vfs inode i_blocks * 8 + * !has_huge_files or CONFIG_LSF is not enabled + * implies the inode i_block represent total blocks in + * 512 bytes 32 == size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; @@ -1940,7 +1873,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int blocksize; int db_count; int i; - int needs_recovery; + int needs_recovery, has_huge_files; __le32 features; __u64 blocks_count; int err; @@ -2081,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_id, le32_to_cpu(features)); goto failed_mount; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + if (has_huge_files) { /* * Large file size enabled file system can only be * mount if kernel is build with CONFIG_LSF @@ -2131,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits); - sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; @@ -2456,6 +2392,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "available.\n"); } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " + "requested data journaling mode\n"); + clear_opt(sbi->s_mount_opt, DELALLOC); + } else if (test_opt(sb, DELALLOC)) + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + + ext4_ext_init(sb); + err = ext4_mb_init(sb, needs_recovery); + if (err) { + printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", + err); + goto failed_mount4; + } + /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock @@ -2475,21 +2426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": "writeback"); - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " - "requested data journaling mode\n"); - clear_opt(sbi->s_mount_opt, DELALLOC); - } else if (test_opt(sb, DELALLOC)) - printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); - - ext4_ext_init(sb); - err = ext4_mb_init(sb, needs_recovery); - if (err) { - printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", - err); - goto failed_mount4; - } - lock_kernel(); return 0; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 80ff338..d12cdf2 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -855,7 +855,7 @@ enum { Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err, }; -static match_table_t fat_tokens = { +static const match_table_t fat_tokens = { {Opt_check_r, "check=relaxed"}, {Opt_check_s, "check=strict"}, {Opt_check_n, "check=normal"}, @@ -890,14 +890,14 @@ static match_table_t fat_tokens = { {Opt_tz_utc, "tz=UTC"}, {Opt_err, NULL}, }; -static match_table_t msdos_tokens = { +static const match_table_t msdos_tokens = { {Opt_nodots, "nodots"}, {Opt_nodots, "dotsOK=no"}, {Opt_dots, "dots"}, {Opt_dots, "dotsOK=yes"}, {Opt_err, NULL} }; -static match_table_t vfat_tokens = { +static const match_table_t vfat_tokens = { {Opt_charset, "iocharset=%s"}, {Opt_shortname_lower, "shortname=lower"}, {Opt_shortname_win95, "shortname=win95"}, diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 25adfc3..d0ff0b8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -8,7 +8,7 @@ * pages against inodes. ie: data writeback. Writeout of the * inode itself is not handled here. * - * 10Apr2002 akpm@zip.com.au + * 10Apr2002 Andrew Morton * Split out of fs/inode.c * Additions for address_space-based writeback */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d2249f1..6a84388 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -354,7 +354,7 @@ enum { OPT_ERR }; -static match_table_t tokens = { +static const match_table_t tokens = { {OPT_FD, "fd=%u"}, {OPT_ROOTMODE, "rootmode=%o"}, {OPT_USER_ID, "user_id=%u"}, diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index df48333..f96eb90 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -46,7 +46,7 @@ enum { Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_lockproto, "lockproto=%s"}, {Opt_locktable, "locktable=%s"}, {Opt_hostdata, "hostdata=%s"}, diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index ba85157..6d98f11 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -190,6 +190,10 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid, fd->search_key->cat.ParID = rec.thread.ParID; len = fd->search_key->cat.CName.len = rec.thread.CName.len; + if (len > HFS_NAMELEN) { + printk(KERN_ERR "hfs: bad catalog namelength\n"); + return -EIO; + } memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len); return hfs_brec_find(fd); } diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4abb104..3c7c763 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -173,7 +173,7 @@ enum { opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { { opt_uid, "uid=%u" }, { opt_gid, "gid=%u" }, { opt_umask, "umask=%o" }, diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index d128a25b..ea30afc 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -32,6 +32,10 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } pptr = kmap(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; i = offset % 32; @@ -73,6 +77,10 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma break; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } curr = pptr = kmap(page); if ((size ^ offset) / PAGE_CACHE_BITS) end = pptr + PAGE_CACHE_BITS / 32; @@ -120,6 +128,10 @@ found: offset += PAGE_CACHE_BITS; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } pptr = kmap(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index ba117c4..f6874ac 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -168,6 +168,11 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, return -EIO; } + if (be16_to_cpu(tmp.thread.nodeName.length) > 255) { + printk(KERN_ERR "hfs: catalog name length corrupted\n"); + return -EIO; + } + hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID), &tmp.thread.nodeName); return hfs_brec_find(fd); diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 9997cbf..9699c56 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -25,7 +25,7 @@ enum { opt_force, opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { { opt_creator, "creator=%s" }, { opt_type, "type=%s" }, { opt_umask, "umask=%o" }, diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index e834e57..eb74531 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -356,7 +356,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; - } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { + } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) { printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, " "use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index b8ae9c9..29ad461 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -215,7 +215,7 @@ enum { Opt_timeshift, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_help, "help"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3f58923..61edc70 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -57,7 +57,7 @@ enum { Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_size, "size=%s"}, {Opt_nr_inodes, "nr_inodes=%s"}, {Opt_mode, "mode=%o"}, diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 26948a6..3f8af0f 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -310,7 +310,7 @@ enum { Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_norock, "norock"}, {Opt_nojoliet, "nojoliet"}, {Opt_unhide, "unhide"}, diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0abe02c..8b119e1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -995,6 +995,9 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", journal->j_devname, commit_transaction->t_tid, journal->j_tail_sequence); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e5d5405..39b7805 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); INIT_LIST_HEAD(&transaction->t_inode_list); + INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 3630718..0dae345 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -199,7 +199,7 @@ enum { Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_integrity, "integrity"}, {Opt_nointegrity, "nointegrity"}, {Opt_iocharset, "iocharset=%s"}, diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 7725a0a..97f6073 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -5,6 +5,6 @@ obj-$(CONFIG_LOCKD) += lockd.o lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ - svcproc.o svcsubs.o mon.o xdr.o + svcproc.o svcsubs.o mon.o xdr.o grace.o lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 0b45fd3..8307dd6 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4; int status; - status = lockd_up(nlm_init->protocol); + status = lockd_up(); if (status < 0) return ERR_PTR(status); - host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address, + host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, nlm_init->protocol, nlm_version, - nlm_init->hostname, - strlen(nlm_init->hostname)); + nlm_init->hostname); if (host == NULL) { lockd_down(); return ERR_PTR(-ENOLCK); @@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) /* * The server lockd has called us back to tell us the lock was granted */ -__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) +__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) { const struct file_lock *fl = &lock->fl; const struct nfs_fh *fh = &lock->fh; @@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) + if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; @@ -216,7 +215,7 @@ reclaimer(void *ptr) /* This one ensures that our parent doesn't terminate while the * reclaim is in progress */ lock_kernel(); - lockd_up(0); /* note: this cannot fail as lockd is already running */ + lockd_up(); /* note: this cannot fail as lockd is already running */ dprintk("lockd: reclaiming locks for host %s\n", host->h_name); diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c new file mode 100644 index 0000000..183cc1f --- /dev/null +++ b/fs/lockd/grace.c @@ -0,0 +1,59 @@ +/* + * Common code for control of lockd and nfsv4 grace periods. + */ + +#include <linux/module.h> +#include <linux/lockd/bind.h> + +static LIST_HEAD(grace_list); +static DEFINE_SPINLOCK(grace_lock); + +/** + * locks_start_grace + * @lm: who this grace period is for + * + * A grace period is a period during which locks should not be given + * out. Currently grace periods are only enforced by the two lock + * managers (lockd and nfsd), using the locks_in_grace() function to + * check when they are in a grace period. + * + * This function is called to start a grace period. + */ +void locks_start_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_add(&lm->list, &grace_list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_start_grace); + +/** + * locks_end_grace + * @lm: who this grace period is for + * + * Call this function to state that the given lock manager is ready to + * resume regular locking. The grace period will not end until all lock + * managers that called locks_start_grace() also call locks_end_grace(). + * Note that callers count on it being safe to call this more than once, + * and the second call should be a no-op. + */ +void locks_end_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_del_init(&lm->list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_end_grace); + +/** + * locks_in_grace + * + * Lock managers call this function to determine when it is OK for them + * to answer ordinary lock requests, and when they should accept only + * lock reclaims. + */ +int locks_in_grace(void) +{ + return !list_empty(&grace_list); +} +EXPORT_SYMBOL_GPL(locks_in_grace); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index a17664c..9fd8889 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -11,16 +11,17 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> +#include <linux/in6.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> #include <linux/lockd/sm_inter.h> #include <linux/mutex.h> +#include <net/ipv6.h> #define NLMDBG_FACILITY NLMDBG_HOSTCACHE #define NLM_HOST_NRHASH 32 -#define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1)) #define NLM_HOST_REBIND (60 * HZ) #define NLM_HOST_EXPIRE (300 * HZ) #define NLM_HOST_COLLECT (120 * HZ) @@ -30,42 +31,115 @@ static unsigned long next_gc; static int nrhosts; static DEFINE_MUTEX(nlm_host_mutex); - static void nlm_gc_hosts(void); -static struct nsm_handle * __nsm_find(const struct sockaddr_in *, - const char *, unsigned int, int); -static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, - const char *hostname, - unsigned int hostname_len); +static struct nsm_handle *nsm_find(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len, + const int create); + +struct nlm_lookup_host_info { + const int server; /* search for server|client */ + const struct sockaddr *sap; /* address to search for */ + const size_t salen; /* it's length */ + const unsigned short protocol; /* transport to search for*/ + const u32 version; /* NLM version to search for */ + const char *hostname; /* remote's hostname */ + const size_t hostname_len; /* it's length */ + const struct sockaddr *src_sap; /* our address (optional) */ + const size_t src_len; /* it's length */ +}; + +/* + * Hash function must work well on big- and little-endian platforms + */ +static unsigned int __nlm_hash32(const __be32 n) +{ + unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16); + return hash ^ (hash >> 8); +} + +static unsigned int __nlm_hash_addr4(const struct sockaddr *sap) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + return __nlm_hash32(sin->sin_addr.s_addr); +} + +static unsigned int __nlm_hash_addr6(const struct sockaddr *sap) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr addr = sin6->sin6_addr; + return __nlm_hash32(addr.s6_addr32[0]) ^ + __nlm_hash32(addr.s6_addr32[1]) ^ + __nlm_hash32(addr.s6_addr32[2]) ^ + __nlm_hash32(addr.s6_addr32[3]); +} + +static unsigned int nlm_hash_address(const struct sockaddr *sap) +{ + unsigned int hash; + + switch (sap->sa_family) { + case AF_INET: + hash = __nlm_hash_addr4(sap); + break; + case AF_INET6: + hash = __nlm_hash_addr6(sap); + break; + default: + hash = 0; + } + return hash & (NLM_HOST_NRHASH - 1); +} + +static void nlm_clear_port(struct sockaddr *sap) +{ + switch (sap->sa_family) { + case AF_INET: + ((struct sockaddr_in *)sap)->sin_port = 0; + break; + case AF_INET6: + ((struct sockaddr_in6 *)sap)->sin6_port = 0; + break; + } +} + +static void nlm_display_address(const struct sockaddr *sap, + char *buf, const size_t len) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + + switch (sap->sa_family) { + case AF_UNSPEC: + snprintf(buf, len, "unspecified"); + break; + case AF_INET: + snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr)); + break; + case AF_INET6: + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) + snprintf(buf, len, NIPQUAD_FMT, + NIPQUAD(sin6->sin6_addr.s6_addr32[3])); + else + snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr)); + break; + default: + snprintf(buf, len, "unsupported address family"); + break; + } +} /* * Common host lookup routine for server & client */ -static struct nlm_host *nlm_lookup_host(int server, - const struct sockaddr_in *sin, - int proto, u32 version, - const char *hostname, - unsigned int hostname_len, - const struct sockaddr_in *ssin) +static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) { struct hlist_head *chain; struct hlist_node *pos; struct nlm_host *host; struct nsm_handle *nsm = NULL; - int hash; - - dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT - ", p=%d, v=%u, my role=%s, name=%.*s)\n", - NIPQUAD(ssin->sin_addr.s_addr), - NIPQUAD(sin->sin_addr.s_addr), proto, version, - server? "server" : "client", - hostname_len, - hostname? hostname : "<none>"); - - hash = NLM_ADDRHASH(sin->sin_addr.s_addr); - - /* Lock hash table */ mutex_lock(&nlm_host_mutex); if (time_after_eq(jiffies, next_gc)) @@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server, * different NLM rpc_clients into one single nlm_host object. * This would allow us to have one nlm_host per address. */ - chain = &nlm_hosts[hash]; + chain = &nlm_hosts[nlm_hash_address(ni->sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { - if (!nlm_cmp_addr(&host->h_addr, sin)) + if (!nlm_cmp_addr(nlm_addr(host), ni->sap)) continue; /* See if we have an NSM handle for this client */ if (!nsm) nsm = host->h_nsmhandle; - if (host->h_proto != proto) + if (host->h_proto != ni->protocol) continue; - if (host->h_version != version) + if (host->h_version != ni->version) continue; - if (host->h_server != server) + if (host->h_server != ni->server) continue; - if (!nlm_cmp_addr(&host->h_saddr, ssin)) + if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap)) continue; /* Move to head of hash chain. */ @@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server, hlist_add_head(&host->h_hash, chain); nlm_get_host(host); + dprintk("lockd: nlm_lookup_host found host %s (%s)\n", + host->h_name, host->h_addrbuf); goto out; } - if (nsm) - atomic_inc(&nsm->sm_count); - - host = NULL; - /* Sadly, the host isn't in our hash table yet. See if - * we have an NSM handle for it. If not, create one. + /* + * The host wasn't in our hash table. If we don't + * have an NSM handle for it yet, create one. */ - if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len))) - goto out; + if (nsm) + atomic_inc(&nsm->sm_count); + else { + host = NULL; + nsm = nsm_find(ni->sap, ni->salen, + ni->hostname, ni->hostname_len, 1); + if (!nsm) { + dprintk("lockd: nlm_lookup_host failed; " + "no nsm handle\n"); + goto out; + } + } host = kzalloc(sizeof(*host), GFP_KERNEL); if (!host) { nsm_release(nsm); + dprintk("lockd: nlm_lookup_host failed; no memory\n"); goto out; } host->h_name = nsm->sm_name; - host->h_addr = *sin; - host->h_addr.sin_port = 0; /* ouch! */ - host->h_saddr = *ssin; - host->h_version = version; - host->h_proto = proto; + memcpy(nlm_addr(host), ni->sap, ni->salen); + host->h_addrlen = ni->salen; + nlm_clear_port(nlm_addr(host)); + memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); + host->h_version = ni->version; + host->h_proto = ni->protocol; host->h_rpcclnt = NULL; mutex_init(&host->h_mutex); host->h_nextrebind = jiffies + NLM_HOST_REBIND; @@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server, host->h_state = 0; /* pseudo NSM state */ host->h_nsmstate = 0; /* real NSM state */ host->h_nsmhandle = nsm; - host->h_server = server; + host->h_server = ni->server; hlist_add_head(&host->h_hash, chain); INIT_LIST_HEAD(&host->h_lockowners); spin_lock_init(&host->h_lock); @@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server, INIT_LIST_HEAD(&host->h_reclaim); nrhosts++; + + nlm_display_address((struct sockaddr *)&host->h_addr, + host->h_addrbuf, sizeof(host->h_addrbuf)); + nlm_display_address((struct sockaddr *)&host->h_srcaddr, + host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf)); + + dprintk("lockd: nlm_lookup_host created host %s\n", + host->h_name); + out: mutex_unlock(&nlm_host_mutex); return host; @@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host) kfree(host); } -/* - * Find an NLM server handle in the cache. If there is none, create it. +/** + * nlmclnt_lookup_host - Find an NLM host handle matching a remote server + * @sap: network address of server + * @salen: length of server address + * @protocol: transport protocol to use + * @version: NLM protocol version + * @hostname: '\0'-terminated hostname of server + * + * Returns an nlm_host structure that matches the passed-in + * [server address, transport protocol, NLM version, server hostname]. + * If one doesn't already exist in the host cache, a new handle is + * created and returned. */ -struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, - int proto, u32 version, - const char *hostname, - unsigned int hostname_len) +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, + const size_t salen, + const unsigned short protocol, + const u32 version, const char *hostname) { - struct sockaddr_in ssin = {0}; - - return nlm_lookup_host(0, sin, proto, version, - hostname, hostname_len, &ssin); + const struct sockaddr source = { + .sa_family = AF_UNSPEC, + }; + struct nlm_lookup_host_info ni = { + .server = 0, + .sap = sap, + .salen = salen, + .protocol = protocol, + .version = version, + .hostname = hostname, + .hostname_len = strlen(hostname), + .src_sap = &source, + .src_len = sizeof(source), + }; + + dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, + (hostname ? hostname : "<none>"), version, + (protocol == IPPROTO_UDP ? "udp" : "tcp")); + + return nlm_lookup_host(&ni); } -/* - * Find an NLM client handle in the cache. If there is none, create it. +/** + * nlmsvc_lookup_host - Find an NLM host handle matching a remote client + * @rqstp: incoming NLM request + * @hostname: name of client host + * @hostname_len: length of client hostname + * + * Returns an nlm_host structure that matches the [client address, + * transport protocol, NLM version, client hostname] of the passed-in + * NLM request. If one doesn't already exist in the host cache, a + * new handle is created and returned. + * + * Before possibly creating a new nlm_host, construct a sockaddr + * for a specific source address in case the local system has + * multiple network addresses. The family of the address in + * rq_daddr is guaranteed to be the same as the family of the + * address in rq_addr, so it's safe to use the same family for + * the source address. */ -struct nlm_host * -nlmsvc_lookup_host(struct svc_rqst *rqstp, - const char *hostname, unsigned int hostname_len) +struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, + const char *hostname, + const size_t hostname_len) { - struct sockaddr_in ssin = {0}; + struct sockaddr_in sin = { + .sin_family = AF_INET, + }; + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + }; + struct nlm_lookup_host_info ni = { + .server = 1, + .sap = svc_addr(rqstp), + .salen = rqstp->rq_addrlen, + .protocol = rqstp->rq_prot, + .version = rqstp->rq_vers, + .hostname = hostname, + .hostname_len = hostname_len, + .src_len = rqstp->rq_addrlen, + }; + + dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, + (int)hostname_len, hostname, rqstp->rq_vers, + (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); + + switch (ni.sap->sa_family) { + case AF_INET: + sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; + ni.src_sap = (struct sockaddr *)&sin; + break; + case AF_INET6: + ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); + ni.src_sap = (struct sockaddr *)&sin6; + break; + default: + return NULL; + } - ssin.sin_addr = rqstp->rq_daddr.addr; - return nlm_lookup_host(1, svc_addr_in(rqstp), - rqstp->rq_prot, rqstp->rq_vers, - hostname, hostname_len, &ssin); + return nlm_lookup_host(&ni); } /* @@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host) { struct rpc_clnt *clnt; - dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n", - NIPQUAD(host->h_saddr.sin_addr), - NIPQUAD(host->h_addr.sin_addr)); + dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", + host->h_name, host->h_addrbuf, host->h_srcaddrbuf); /* Lock host handle */ mutex_lock(&host->h_mutex); @@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host) if (time_after_eq(jiffies, host->h_nextrebind)) { rpc_force_rebind(clnt); host->h_nextrebind = jiffies + NLM_HOST_REBIND; - dprintk("lockd: next rebind in %ld jiffies\n", + dprintk("lockd: next rebind in %lu jiffies\n", host->h_nextrebind - jiffies); } } else { @@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host) }; struct rpc_create_args args = { .protocol = host->h_proto, - .address = (struct sockaddr *)&host->h_addr, - .addrsize = sizeof(host->h_addr), - .saddress = (struct sockaddr *)&host->h_saddr, + .address = nlm_addr(host), + .addrsize = host->h_addrlen, + .saddress = nlm_srcaddr(host), .timeout = &timeparms, .servername = host->h_name, .program = &nlm_program, @@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin, struct nsm_handle *nsm; struct nlm_host *host; - dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n", - hostname, NIPQUAD(sin->sin_addr)); - - /* Find the NSM handle for this peer */ - if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0))) + nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), + hostname, hostname_len, 0); + if (nsm == NULL) { + dprintk("lockd: never saw rebooted peer '%.*s' before\n", + hostname_len, hostname); return; + } + + dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n", + hostname_len, hostname, nsm->sm_addrbuf); /* When reclaiming locks on this peer, make sure that * we set up a new notification */ @@ -461,22 +628,23 @@ nlm_gc_hosts(void) static LIST_HEAD(nsm_handles); static DEFINE_SPINLOCK(nsm_lock); -static struct nsm_handle * -__nsm_find(const struct sockaddr_in *sin, - const char *hostname, unsigned int hostname_len, - int create) +static struct nsm_handle *nsm_find(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len, + const int create) { struct nsm_handle *nsm = NULL; struct nsm_handle *pos; - if (!sin) + if (!sap) return NULL; if (hostname && memchr(hostname, '/', hostname_len) != NULL) { if (printk_ratelimit()) { printk(KERN_WARNING "Invalid hostname \"%.*s\" " "in NFS lock request\n", - hostname_len, hostname); + (int)hostname_len, hostname); } return NULL; } @@ -489,7 +657,7 @@ retry: if (strlen(pos->sm_name) != hostname_len || memcmp(pos->sm_name, hostname, hostname_len)) continue; - } else if (!nlm_cmp_addr(&pos->sm_addr, sin)) + } else if (!nlm_cmp_addr(nsm_addr(pos), sap)) continue; atomic_inc(&pos->sm_count); kfree(nsm); @@ -509,10 +677,13 @@ retry: if (nsm == NULL) return NULL; - nsm->sm_addr = *sin; + memcpy(nsm_addr(nsm), sap, salen); + nsm->sm_addrlen = salen; nsm->sm_name = (char *) (nsm + 1); memcpy(nsm->sm_name, hostname, hostname_len); nsm->sm_name[hostname_len] = '\0'; + nlm_display_address((struct sockaddr *)&nsm->sm_addr, + nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf)); atomic_set(&nsm->sm_count, 1); goto retry; @@ -521,13 +692,6 @@ found: return nsm; } -static struct nsm_handle * -nsm_find(const struct sockaddr_in *sin, const char *hostname, - unsigned int hostname_len) -{ - return __nsm_find(sin, hostname, hostname_len, 1); -} - /* * Release an NSM handle */ diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index e4d5635..4e7e958 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) memset(&args, 0, sizeof(args)); args.mon_name = nsm->sm_name; - args.addr = nsm->sm_addr.sin_addr.s_addr; + args.addr = nsm_addr_in(nsm)->sin_addr.s_addr; args.prog = NLM_PROGRAM; args.vers = 3; args.proc = NLMPROC_NSM_NOTIFY; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 5bd9bf0..c631a83 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -51,7 +51,6 @@ static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; -int nlmsvc_grace_period; unsigned long nlmsvc_timeout; /* @@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void) return nlm_timeout * 5 * HZ; } -unsigned long get_nfs_grace_period(void) -{ - unsigned long lockdgrace = get_lockd_grace_period(); - unsigned long nfsdgrace = 0; - - if (nlmsvc_ops) - nfsdgrace = nlmsvc_ops->get_grace_period(); - - return max(lockdgrace, nfsdgrace); -} -EXPORT_SYMBOL(get_nfs_grace_period); +static struct lock_manager lockd_manager = { +}; -static unsigned long set_grace_period(void) +static void grace_ender(struct work_struct *not_used) { - nlmsvc_grace_period = 1; - return get_nfs_grace_period() + jiffies; + locks_end_grace(&lockd_manager); } -static inline void clear_grace_period(void) +static DECLARE_DELAYED_WORK(grace_period_end, grace_ender); + +static void set_grace_period(void) { - nlmsvc_grace_period = 0; + unsigned long grace_period = get_lockd_grace_period(); + + locks_start_grace(&lockd_manager); + cancel_delayed_work_sync(&grace_period_end); + schedule_delayed_work(&grace_period_end, grace_period); } /* @@ -116,7 +111,6 @@ lockd(void *vrqstp) { int err = 0, preverr = 0; struct svc_rqst *rqstp = vrqstp; - unsigned long grace_period_expire; /* try_to_freeze() is called from svc_recv() */ set_freezable(); @@ -139,7 +133,7 @@ lockd(void *vrqstp) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; - grace_period_expire = set_grace_period(); + set_grace_period(); /* * The main request loop. We don't terminate until the last @@ -153,21 +147,12 @@ lockd(void *vrqstp) flush_signals(current); if (nlmsvc_ops) { nlmsvc_invalidate_all(); - grace_period_expire = set_grace_period(); + set_grace_period(); } continue; } - /* - * Retry any blocked locks that have been notified by - * the VFS. Don't do this during grace period. - * (Theoretically, there shouldn't even be blocked locks - * during grace period). - */ - if (!nlmsvc_grace_period) { - timeout = nlmsvc_retry_blocked(); - } else if (time_before(grace_period_expire, jiffies)) - clear_grace_period(); + timeout = nlmsvc_retry_blocked(); /* * Find a socket with data available and call its @@ -195,6 +180,7 @@ lockd(void *vrqstp) svc_process(rqstp); } flush_signals(current); + cancel_delayed_work_sync(&grace_period_end); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); @@ -203,25 +189,28 @@ lockd(void *vrqstp) } /* - * Make any sockets that are needed but not present. - * If nlm_udpport or nlm_tcpport were set as module - * options, make those sockets unconditionally + * Ensure there are active UDP and TCP listeners for lockd. + * + * Even if we have only TCP NFS mounts and/or TCP NFSDs, some + * local services (such as rpc.statd) still require UDP, and + * some NFS servers do not yet support NLM over TCP. + * + * Returns zero if all listeners are available; otherwise a + * negative errno value is returned. */ -static int make_socks(struct svc_serv *serv, int proto) +static int make_socks(struct svc_serv *serv) { static int warned; struct svc_xprt *xprt; int err = 0; - if (proto == IPPROTO_UDP || nlm_udpport) { - xprt = svc_find_xprt(serv, "udp", 0, 0); - if (!xprt) - err = svc_create_xprt(serv, "udp", nlm_udpport, - SVC_SOCK_DEFAULTS); - else - svc_xprt_put(xprt); - } - if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) { + xprt = svc_find_xprt(serv, "udp", 0, 0); + if (!xprt) + err = svc_create_xprt(serv, "udp", nlm_udpport, + SVC_SOCK_DEFAULTS); + else + svc_xprt_put(xprt); + if (err >= 0) { xprt = svc_find_xprt(serv, "tcp", 0, 0); if (!xprt) err = svc_create_xprt(serv, "tcp", nlm_tcpport, @@ -241,8 +230,7 @@ static int make_socks(struct svc_serv *serv, int proto) /* * Bring up the lockd process if it's not already up. */ -int -lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ +int lockd_up(void) { struct svc_serv *serv; int error = 0; @@ -251,11 +239,8 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ /* * Check whether we're already up and running. */ - if (nlmsvc_rqst) { - if (proto) - error = make_socks(nlmsvc_rqst->rq_server, proto); + if (nlmsvc_rqst) goto out; - } /* * Sanity check: if there's no pid, @@ -266,13 +251,14 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; } - if ((error = make_socks(serv, proto)) < 0) + error = make_socks(serv); + if (error < 0) goto destroy_and_out; /* diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4a714f64..014f6ce 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -88,12 +88,6 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, dprintk("lockd: TEST4 called\n"); resp->cookie = argp->cookie; - /* Don't accept test requests during grace period */ - if (nlmsvc_grace_period) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -122,12 +116,6 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; - /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -146,7 +134,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, /* Now try to lock the file */ resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, - argp->block, &argp->cookie); + argp->block, &argp->cookie, + argp->reclaim); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -169,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -202,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -231,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } @@ -341,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { + if (locks_in_grace() && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -374,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -432,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, { struct sockaddr_in saddr; - memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr)); - dprintk("lockd: SM_NOTIFY called\n"); - if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) - || ntohs(saddr.sin_port) >= 1024) { + + if (!nlm_privileged_requester(rqstp)) { char buf[RPC_MAX_ADDRBUFLEN]; printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index cf0d5c2c..6063a8e 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -360,7 +360,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) __be32 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, - struct nlm_cookie *cookie) + struct nlm_cookie *cookie, int reclaim) { struct nlm_block *block = NULL; int error; @@ -406,6 +406,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + if (locks_in_grace() && !reclaim) { + ret = nlm_lck_denied_grace_period; + goto out; + } + if (reclaim && !locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } + if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); @@ -502,6 +511,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + if (locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } error = vfs_test_lock(file->f_file, &lock->fl); if (error == FILE_LOCK_DEFERRED) { ret = nlmsvc_defer_lock_rqst(rqstp, block); @@ -582,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); + if (locks_in_grace()) + return nlm_lck_denied_grace_period; + mutex_lock(&file->f_mutex); block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 76262c1..548b0bb 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -117,12 +117,6 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, dprintk("lockd: TEST called\n"); resp->cookie = argp->cookie; - /* Don't accept test requests during grace period */ - if (nlmsvc_grace_period) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -152,12 +146,6 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; - /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -176,7 +164,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, /* Now try to lock the file */ resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, - argp->block, &argp->cookie)); + argp->block, &argp->cookie, + argp->reclaim)); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -199,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -232,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -261,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } @@ -373,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { + if (locks_in_grace() && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -406,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -464,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, { struct sockaddr_in saddr; - memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr)); - dprintk("lockd: SM_NOTIFY called\n"); - if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) - || ntohs(saddr.sin_port) >= 1024) { + + if (!nlm_privileged_requester(rqstp)) { char buf[RPC_MAX_ADDRBUFLEN]; printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 198b4e5..34c2766 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); static int nlmsvc_match_ip(void *datap, struct nlm_host *host) { - return nlm_cmp_addr(&host->h_saddr, datap); + return nlm_cmp_addr(nlm_srcaddr(host), datap); } /** diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 3e459e1..1f22629 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) argp->state = ntohl(*p++); /* Preserve the address in network byte order */ argp->addr = *p++; - argp->vers = *p++; - argp->proto = *p++; return xdr_argsize_check(rqstp, p); } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 43ff939..50c493a 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp argp->state = ntohl(*p++); /* Preserve the address in network byte order */ argp->addr = *p++; - argp->vers = *p++; - argp->proto = *p++; return xdr_argsize_check(rqstp, p); } @@ -6,7 +6,7 @@ * Contains functions related to preparing and submitting BIOs which contain * multiple pagecache pages. * - * 15May2002 akpm@zip.com.au + * 15May2002 Andrew Morton * Initial version * 27Jun2002 axboe@suse.de * use bio_add_page() to build bio's just the right size diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index f447f4b..6a09760 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -105,7 +105,8 @@ int nfs_callback_up(void) mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, + AF_INET, NULL); ret = -ENOMEM; if (!serv) goto out_err; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5ee23e7..7547600 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -675,7 +675,7 @@ static int nfs_init_server(struct nfs_server *server, server->nfs_client = clp; /* Initialise the client representation from the mount data */ - server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->flags = data->flags; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -850,7 +850,6 @@ static struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); - init_waitqueue_head(&server->active_wq); atomic_set(&server->active, 0); server->io_stats = nfs_alloc_iostats(); @@ -1073,7 +1072,7 @@ static int nfs4_init_server(struct nfs_server *server, goto error; /* Initialise the client representation from the mount data */ - server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->flags = data->flags; server->caps |= NFS_CAP_ATOMIC_OPEN; if (data->rsize) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 74f92b7..2ab70d4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -156,6 +156,7 @@ typedef struct { decode_dirent_t decode; int plus; unsigned long timestamp; + unsigned long gencount; int timestamp_valid; } nfs_readdir_descriptor_t; @@ -177,7 +178,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) struct file *file = desc->file; struct inode *inode = file->f_path.dentry->d_inode; struct rpc_cred *cred = nfs_file_cred(file); - unsigned long timestamp; + unsigned long timestamp, gencount; int error; dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", @@ -186,6 +187,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) again: timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { @@ -199,6 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) goto error; } desc->timestamp = timestamp; + desc->gencount = gencount; desc->timestamp_valid = 1; SetPageUptodate(page); /* Ensure consistent page alignment of the data. @@ -224,9 +227,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc) if (IS_ERR(p)) return PTR_ERR(p); desc->ptr = p; - if (desc->timestamp_valid) + if (desc->timestamp_valid) { desc->entry->fattr->time_start = desc->timestamp; - else + desc->entry->fattr->gencount = desc->gencount; + } else desc->entry->fattr->valid &= ~NFS_ATTR_FATTR; return 0; } @@ -471,7 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, struct rpc_cred *cred = nfs_file_cred(file); struct page *page = NULL; int status; - unsigned long timestamp; + unsigned long timestamp, gencount; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); @@ -482,6 +486,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, goto out; } timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, *desc->dir_cookie, page, NFS_SERVER(inode)->dtsize, @@ -490,6 +495,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ if (status >= 0) { desc->timestamp = timestamp; + desc->gencount = gencount; desc->timestamp_valid = 1; if ((status = dir_decode(desc)) == 0) desc->entry->prev_cookie = *desc->dir_cookie; @@ -655,7 +661,7 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) */ void nfs_force_lookup_revalidate(struct inode *dir) { - NFS_I(dir)->cache_change_attribute = jiffies; + NFS_I(dir)->cache_change_attribute++; } /* @@ -667,6 +673,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) { if (IS_ROOT(dentry)) return 1; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + return 0; if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ @@ -750,6 +758,8 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, /* Don't revalidate a negative dentry if we're creating a new file */ if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) return 0; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) + return 1; return !nfs_check_verifier(dir, dentry); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7846065..d319b49 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -188,13 +188,16 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; + int retval = nfs_revalidate_file_size(inode, filp); if (retval < 0) return (loff_t)retval; - } - lock_kernel(); /* BKL needed? */ - loff = generic_file_llseek_unlocked(filp, offset, origin); - unlock_kernel(); + + spin_lock(&inode->i_lock); + loff = generic_file_llseek_unlocked(filp, offset, origin); + spin_unlock(&inode->i_lock); + } else + loff = generic_file_llseek_unlocked(filp, offset, origin); return loff; } @@ -699,13 +702,6 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) filp->f_path.dentry->d_name.name, fl->fl_type, fl->fl_flags); - /* - * No BSD flocks over NFS allowed. - * Note: we could try to fake a POSIX lock request here by - * using ((u32) filp | 0x80000000) or some such as the pid. - * Not sure whether that would be unique, though, or whether - * that would break in other places. - */ if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 52daefa..b9195c0 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -305,8 +305,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) init_special_inode(inode, inode->i_mode, fattr->rdev); nfsi->read_cache_jiffies = fattr->time_start; - nfsi->last_updated = now; - nfsi->cache_change_attribute = now; + nfsi->attr_gencount = fattr->gencount; inode->i_atime = fattr->atime; inode->i_mtime = fattr->mtime; inode->i_ctime = fattr->ctime; @@ -453,6 +452,7 @@ out_big: void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) { if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + spin_lock(&inode->i_lock); if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -462,7 +462,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; - spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; spin_unlock(&inode->i_lock); } @@ -472,37 +471,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) } } -static int nfs_wait_schedule(void *word) -{ - if (signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - -/* - * Wait for the inode to get unlocked. - */ -static int nfs_wait_on_inode(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - int error; - - error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING, - nfs_wait_schedule, TASK_KILLABLE); - - return error; -} - -static void nfs_wake_up_inode(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - clear_bit(NFS_INO_REVALIDATING, &nfsi->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING); -} - int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; @@ -697,20 +665,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); - nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); if (is_bad_inode(inode)) - goto out_nowait; + goto out; if (NFS_STALE(inode)) - goto out_nowait; - - status = nfs_wait_on_inode(inode); - if (status < 0) goto out; - status = -ESTALE; if (NFS_STALE(inode)) goto out; + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", @@ -724,16 +687,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) goto out; } - spin_lock(&inode->i_lock); - status = nfs_update_inode(inode, &fattr); + status = nfs_refresh_inode(inode, &fattr); if (status) { - spin_unlock(&inode->i_lock); dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), status); goto out; } - spin_unlock(&inode->i_lock); if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); @@ -743,9 +703,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) (long long)NFS_FILEID(inode)); out: - nfs_wake_up_inode(inode); - - out_nowait: return status; } @@ -908,9 +865,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return -EIO; } - /* Do atomic weak cache consistency updates */ - nfs_wcc_update_inode(inode, fattr); - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && nfsi->change_attr != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; @@ -939,15 +893,81 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (invalid != 0) nfsi->cache_validity |= invalid; - else - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); nfsi->read_cache_jiffies = fattr->time_start; return 0; } +static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; +} + +static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); +} + +static unsigned long nfs_attr_generation_counter; + +static unsigned long nfs_read_attr_generation_counter(void) +{ + smp_rmb(); + return nfs_attr_generation_counter; +} + +unsigned long nfs_inc_attr_generation_counter(void) +{ + unsigned long ret; + smp_rmb(); + ret = ++nfs_attr_generation_counter; + smp_wmb(); + return ret; +} + +void nfs_fattr_init(struct nfs_fattr *fattr) +{ + fattr->valid = 0; + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); +} + +/** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode + * @fattr - attributes + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. + * + * To do so, the function first assumes that a more recent ctime means + * that the attributes in fattr are newer, however it also attempt to + * catch the case where ctime either didn't change, or went backwards + * (if someone reset the clock on the server) by looking at whether + * or not this RPC call was started after the inode was last updated. + * Note also the check for wraparound of 'attr_gencount' + * + * The function returns 'true' if it thinks the attributes in 'fattr' are + * more recent than the ones cached in the inode. + * + */ +static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + const struct nfs_inode *nfsi = NFS_I(inode); + + return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || + nfs_ctime_need_update(inode, fattr) || + nfs_size_need_update(inode, fattr) || + ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); +} + +static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + if (nfs_inode_attrs_need_update(inode, fattr)) + return nfs_update_inode(inode, fattr); + return nfs_check_inode_attributes(inode, fattr); +} + /** * nfs_refresh_inode - try to update the inode attribute cache * @inode - pointer to inode @@ -960,21 +980,28 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat */ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); int status; if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; spin_lock(&inode->i_lock); - if (time_after(fattr->time_start, nfsi->last_updated)) - status = nfs_update_inode(inode, fattr); - else - status = nfs_check_inode_attributes(inode, fattr); - + status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } +static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + return nfs_refresh_inode_locked(inode, fattr); +} + /** * nfs_post_op_update_inode - try to update the inode attribute cache * @inode - pointer to inode @@ -991,14 +1018,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) */ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); + int status; spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + status = nfs_post_op_update_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); - return nfs_refresh_inode(inode, fattr); + return status; } /** @@ -1014,6 +1039,15 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) { + int status; + + spin_lock(&inode->i_lock); + /* Don't do a WCC update if these attributes are already stale */ + if ((fattr->valid & NFS_ATTR_FATTR) == 0 || + !nfs_inode_attrs_need_update(inode, fattr)) { + fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); + goto out_noforce; + } if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && (fattr->valid & NFS_ATTR_WCC_V4) == 0) { fattr->pre_change_attr = NFS_I(inode)->change_attr; @@ -1026,7 +1060,10 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa fattr->pre_size = i_size_read(inode); fattr->valid |= NFS_ATTR_WCC; } - return nfs_post_op_update_inode(inode, fattr); +out_noforce: + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; } /* @@ -1092,7 +1129,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) - invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } else if (nfsi->change_attr != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); @@ -1126,6 +1163,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_gid != fattr->gid) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (inode->i_nlink != fattr->nlink) + invalid |= NFS_INO_INVALID_ATTR; + inode->i_mode = fattr->mode; inode->i_nlink = fattr->nlink; inode->i_uid = fattr->uid; @@ -1145,18 +1185,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; - nfsi->last_updated = now; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; } - /* - * Avoid jiffy wraparound issues with nfsi->last_updated - */ - if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now)) - nfsi->last_updated = nfsi->read_cache_jiffies; } invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 24241fc..d212ee4 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -153,6 +153,7 @@ extern void nfs4_clear_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); /* super.c */ +void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); extern struct file_system_type nfs_xdev_fs_type; #ifdef CONFIG_NFS_V4 extern struct file_system_type nfs4_xdev_fs_type; @@ -163,8 +164,8 @@ extern struct rpc_stat nfs_rpcstat; extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); -extern void nfs_sb_active(struct nfs_server *server); -extern void nfs_sb_deactive(struct nfs_server *server); +extern void nfs_sb_active(struct super_block *sb); +extern void nfs_sb_deactive(struct super_block *sb); /* namespace.c */ extern char *nfs_path(const char *base, @@ -276,3 +277,23 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } +#define IPV6_SCOPE_DELIMITER '%' + +/* + * Set the port number in an address. Be agnostic about the address + * family. + */ +static inline void nfs_set_port(struct sockaddr *sap, unsigned short port) +{ + struct sockaddr_in *ap = (struct sockaddr_in *)sap; + struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap; + + switch (sap->sa_family) { + case AF_INET: + ap->sin_port = htons(port); + break; + case AF_INET6: + ap6->sin6_port = htons(port); + break; + } +} diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 779d2eb..086a683 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -14,6 +14,7 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> #include <linux/nfs_fs.h> +#include "internal.h" #ifdef RPC_DEBUG # define NFSDBG_FACILITY NFSDBG_MOUNT @@ -98,7 +99,7 @@ out_call_err: out_mnt_err: dprintk("NFS: MNT server returned result %d\n", result.status); - status = -EACCES; + status = nfs_stat_to_errno(result.status); goto out; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 66df08d..64a288e 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -105,7 +105,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) dprintk("--> nfs_follow_mountpoint()\n"); - BUG_ON(IS_ROOT(dentry)); + err = -ESTALE; + if (IS_ROOT(dentry)) + goto out_err; + dprintk("%s: enter\n", __func__); dput(nd->path.dentry); nd->path.dentry = dget(dentry); @@ -189,7 +192,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, struct nfs_clone_mount *mountdata) { #ifdef CONFIG_NFS_V4 - struct vfsmount *mnt = NULL; + struct vfsmount *mnt = ERR_PTR(-EINVAL); switch (server->nfs_client->rpc_ops->version) { case 2: case 3: diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 423842f..cef6255 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -229,6 +229,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) dprintk("NFS call getacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; + nfs_fattr_init(&fattr); status = rpc_call_sync(server->client_acl, &msg, 0); dprintk("NFS reply getacl: %d\n", status); @@ -322,6 +323,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, dprintk("NFS call setacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; + nfs_fattr_init(&fattr); status = rpc_call_sync(server->client_acl, &msg, 0); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1e750e45..c55be7a 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -699,7 +699,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, } static int -nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, +do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { struct rpc_message msg = { @@ -711,11 +711,27 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("NFS call fsinfo\n"); nfs_fattr_init(info->fattr); - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(client, &msg, 0); dprintk("NFS reply fsinfo: %d\n", status); return status; } +/* + * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via + * nfs_create_server + */ +static int +nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_fsinfo(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + static int nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_pathconf *info) diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index b112857..30befc3 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -93,21 +93,52 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, return 0; } -/* - * Check if the string represents a "valid" IPv4 address - */ -static inline int valid_ipaddr4(const char *buf) +static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, + char *page, char *page2, + const struct nfs4_fs_location *location) { - int rc, count, in[4]; - - rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); - if (rc != 4) - return -EINVAL; - for (count = 0; count < 4; count++) { - if (in[count] > 255) - return -EINVAL; + struct vfsmount *mnt = ERR_PTR(-ENOENT); + char *mnt_path; + int page2len; + unsigned int s; + + mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); + if (IS_ERR(mnt_path)) + return mnt; + mountdata->mnt_path = mnt_path; + page2 += strlen(mnt_path) + 1; + page2len = PAGE_SIZE - strlen(mnt_path) - 1; + + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + struct sockaddr_storage addr; + + if (buf->len <= 0 || buf->len >= PAGE_SIZE) + continue; + + mountdata->addr = (struct sockaddr *)&addr; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) + continue; + nfs_parse_ip_address(buf->data, buf->len, + mountdata->addr, &mountdata->addrlen); + if (mountdata->addr->sa_family == AF_UNSPEC) + continue; + nfs_set_port(mountdata->addr, NFS_PORT); + + strncpy(page2, buf->data, page2len); + page2[page2len] = '\0'; + mountdata->hostname = page2; + + snprintf(page, PAGE_SIZE, "%s:%s", + mountdata->hostname, + mountdata->mnt_path); + + mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + if (!IS_ERR(mnt)) + break; } - return 0; + return mnt; } /** @@ -128,7 +159,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, }; char *page = NULL, *page2 = NULL; - unsigned int s; int loc, error; if (locations == NULL || locations->nlocations <= 0) @@ -152,53 +182,16 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, goto out; } - loc = 0; - while (loc < locations->nlocations && IS_ERR(mnt)) { + for (loc = 0; loc < locations->nlocations; loc++) { const struct nfs4_fs_location *location = &locations->locations[loc]; - char *mnt_path; if (location == NULL || location->nservers <= 0 || - location->rootpath.ncomponents == 0) { - loc++; + location->rootpath.ncomponents == 0) continue; - } - mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); - if (IS_ERR(mnt_path)) { - loc++; - continue; - } - mountdata.mnt_path = mnt_path; - - s = 0; - while (s < location->nservers) { - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_port = htons(NFS_PORT), - }; - - if (location->servers[s].len <= 0 || - valid_ipaddr4(location->servers[s].data) < 0) { - s++; - continue; - } - - mountdata.hostname = location->servers[s].data; - addr.sin_addr.s_addr = in_aton(mountdata.hostname), - mountdata.addr = (struct sockaddr *)&addr; - mountdata.addrlen = sizeof(addr); - - snprintf(page, PAGE_SIZE, "%s:%s", - mountdata.hostname, - mountdata.mnt_path); - - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata); - if (!IS_ERR(mnt)) { - break; - } - s++; - } - loc++; + mnt = try_location(&mountdata, page, page2, location); + if (!IS_ERR(mnt)) + break; } out: diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 46763d1c..8478fc2 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -127,7 +127,7 @@ enum { Opt_err }; -static match_table_t __initdata tokens = { +static match_table_t __initconst tokens = { {Opt_port, "port=%u"}, {Opt_rsize, "rsize=%u"}, {Opt_wsize, "wsize=%u"}, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 4dbb84d..1934652 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -65,14 +65,20 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("%s: call getattr\n", __func__); nfs_fattr_init(fattr); - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("%s: reply getattr: %d\n", __func__, status); if (status) return status; dprintk("%s: call statfs\n", __func__); msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; msg.rpc_resp = &fsinfo; - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("%s: reply statfs: %d\n", __func__, status); if (status) return status; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e9b2017..8b28b95c 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -91,6 +91,7 @@ enum { /* Mount options that take string arguments */ Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, + Opt_lookupcache, /* Special mount options */ Opt_userspace, Opt_deprecated, Opt_sloppy, @@ -98,7 +99,7 @@ enum { Opt_err }; -static match_table_t nfs_mount_option_tokens = { +static const match_table_t nfs_mount_option_tokens = { { Opt_userspace, "bg" }, { Opt_userspace, "fg" }, { Opt_userspace, "retry=%s" }, @@ -154,6 +155,8 @@ static match_table_t nfs_mount_option_tokens = { { Opt_mounthost, "mounthost=%s" }, { Opt_mountaddr, "mountaddr=%s" }, + { Opt_lookupcache, "lookupcache=%s" }, + { Opt_err, NULL } }; @@ -163,7 +166,7 @@ enum { Opt_xprt_err }; -static match_table_t nfs_xprt_protocol_tokens = { +static const match_table_t nfs_xprt_protocol_tokens = { { Opt_xprt_udp, "udp" }, { Opt_xprt_tcp, "tcp" }, { Opt_xprt_rdma, "rdma" }, @@ -180,7 +183,7 @@ enum { Opt_sec_err }; -static match_table_t nfs_secflavor_tokens = { +static const match_table_t nfs_secflavor_tokens = { { Opt_sec_none, "none" }, { Opt_sec_none, "null" }, { Opt_sec_sys, "sys" }, @@ -200,6 +203,22 @@ static match_table_t nfs_secflavor_tokens = { { Opt_sec_err, NULL } }; +enum { + Opt_lookupcache_all, Opt_lookupcache_positive, + Opt_lookupcache_none, + + Opt_lookupcache_err +}; + +static match_table_t nfs_lookupcache_tokens = { + { Opt_lookupcache_all, "all" }, + { Opt_lookupcache_positive, "pos" }, + { Opt_lookupcache_positive, "positive" }, + { Opt_lookupcache_none, "none" }, + + { Opt_lookupcache_err, NULL } +}; + static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); @@ -209,7 +228,6 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static void nfs_kill_super(struct super_block *); -static void nfs_put_super(struct super_block *); static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); static struct file_system_type nfs_fs_type = { @@ -232,7 +250,6 @@ static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, - .put_super = nfs_put_super, .statfs = nfs_statfs, .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, @@ -337,26 +354,20 @@ void __exit unregister_nfs_fs(void) unregister_filesystem(&nfs_fs_type); } -void nfs_sb_active(struct nfs_server *server) +void nfs_sb_active(struct super_block *sb) { - atomic_inc(&server->active); -} + struct nfs_server *server = NFS_SB(sb); -void nfs_sb_deactive(struct nfs_server *server) -{ - if (atomic_dec_and_test(&server->active)) - wake_up(&server->active_wq); + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); } -static void nfs_put_super(struct super_block *sb) +void nfs_sb_deactive(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); - /* - * Make sure there are no outstanding ops to this server. - * If so, wait for them to finish before allowing the - * unmount to continue. - */ - wait_event(server->active_wq, atomic_read(&server->active) == 0); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); } /* @@ -664,25 +675,6 @@ static void nfs_umount_begin(struct super_block *sb) } /* - * Set the port number in an address. Be agnostic about the address family. - */ -static void nfs_set_port(struct sockaddr *sap, unsigned short port) -{ - switch (sap->sa_family) { - case AF_INET: { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - ap->sin_port = htons(port); - break; - } - case AF_INET6: { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - ap->sin6_port = htons(port); - break; - } - } -} - -/* * Sanity-check a server address provided by the mount command. * * Address family must be initialized, and address must not be @@ -724,20 +716,22 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len, *addr_len = 0; } -#define IPV6_SCOPE_DELIMITER '%' - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, - const char *delim, - struct sockaddr_in6 *sin6) +static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, + const char *delim, + struct sockaddr_in6 *sin6) { char *p; size_t len; - if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) - return ; + if ((string + str_len) == delim) + return 1; + if (*delim != IPV6_SCOPE_DELIMITER) - return; + return 0; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return 0; len = (string + str_len) - delim - 1; p = kstrndup(delim + 1, len, GFP_KERNEL); @@ -750,14 +744,20 @@ static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, scope_id = dev->ifindex; dev_put(dev); } else { - /* scope_id is set to zero on error */ - strict_strtoul(p, 10, &scope_id); + if (strict_strtoul(p, 10, &scope_id) == 0) { + kfree(p); + return 0; + } } kfree(p); + sin6->sin6_scope_id = scope_id; dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); + return 1; } + + return 0; } static void nfs_parse_ipv6_address(char *string, size_t str_len, @@ -773,9 +773,11 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len, sin6->sin6_family = AF_INET6; *addr_len = sizeof(*sin6); - if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) { - nfs_parse_ipv6_scope_id(string, str_len, delim, sin6); - return; + if (in6_pton(string, str_len, addr, + IPV6_SCOPE_DELIMITER, &delim) != 0) { + if (nfs_parse_ipv6_scope_id(string, str_len, + delim, sin6) != 0) + return; } } @@ -798,7 +800,7 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len, * If there is a problem constructing the new sockaddr, set the address * family to AF_UNSPEC. */ -static void nfs_parse_ip_address(char *string, size_t str_len, +void nfs_parse_ip_address(char *string, size_t str_len, struct sockaddr *sap, size_t *addr_len) { unsigned int i, colons; @@ -1258,6 +1260,30 @@ static int nfs_parse_mount_options(char *raw, &mnt->mount_server.addrlen); kfree(string); break; + case Opt_lookupcache: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_lookupcache_tokens, args); + kfree(string); + switch (token) { + case Opt_lookupcache_all: + mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); + break; + case Opt_lookupcache_positive: + mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; + break; + case Opt_lookupcache_none: + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; + break; + default: + errors++; + dfprintk(MOUNT, "NFS: invalid " + "lookupcache argument\n"); + }; + break; /* * Special options @@ -1558,7 +1584,7 @@ static int nfs_validate_mount_data(void *options, * Translate to nfs_parsed_mount_data, which nfs_fill_super * can deal with. */ - args->flags = data->flags; + args->flags = data->flags & NFS_MOUNT_FLAGMASK; args->rsize = data->rsize; args->wsize = data->wsize; args->timeo = data->timeo; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index f089e58..ecc2953 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -99,7 +99,7 @@ static void nfs_async_unlink_release(void *calldata) nfs_dec_sillycount(data->dir); nfs_free_unlinkdata(data); - nfs_sb_deactive(NFS_SB(sb)); + nfs_sb_deactive(sb); } static const struct rpc_call_ops nfs_unlink_ops = { @@ -118,6 +118,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n .rpc_message = &msg, .callback_ops = &nfs_unlink_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; struct rpc_task *task; @@ -149,7 +150,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n nfs_dec_sillycount(dir); return 0; } - nfs_sb_active(NFS_SERVER(dir)); + nfs_sb_active(dir->i_sb); data->args.fh = NFS_FH(dir); nfs_fattr_init(&data->res.dir_attr); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3229e21..9f98458 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1427,8 +1427,9 @@ static int nfs_write_mapping(struct address_space *mapping, int how) .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_NONE, .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, .for_writepages = 1, - .range_cyclic = 1, }; int ret; diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 15c6fae..b2786a5 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -70,7 +70,6 @@ nlm_fclose(struct file *filp) static struct nlmsvc_binding nfsd_nlm_ops = { .fopen = nlm_fopen, /* open file for locking */ .fclose = nlm_fclose, /* close file */ - .get_grace_period = get_nfs4_grace_period, }; void diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 4d617ea..9dbd2eb 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); if (nfserr) RETURN_STATUS(nfserr); @@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: FSSTAT(3) %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); fh_put(&argp->fh); RETURN_STATUS(nfserr); } @@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; - nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &argp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); /* Check special features of the file system. May request * different read/write sizes for file systems known to have diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 702fa57..094747a 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); WRITE32(OP_CB_RECALL); - WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t)); + WRITE32(cb_rec->cbr_stateid.si_generation); + WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); WRITE32(cb_rec->cbr_trunc); WRITE32(len); WRITEMEM(cb_rec->cbr_fhval, len); @@ -379,6 +380,7 @@ static int do_probe_callback(void *data) .addrsize = sizeof(addr), .timeout = &timeparms, .program = &cb_program, + .prognumber = cb->cb_prog, .version = nfs_cb_version[1]->number, .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), @@ -396,9 +398,6 @@ static int do_probe_callback(void *data) addr.sin_port = htons(cb->cb_port); addr.sin_addr.s_addr = htonl(cb->cb_addr); - /* Initialize rpc_stat */ - memset(args.program->stats, 0, sizeof(struct rpc_stat)); - /* Create RPC client */ client = rpc_create(&args); if (IS_ERR(client)) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e5b51ff..669461e 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -201,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Openowner is now set, so sequence id will get bumped. Now we need * these checks before we do any creates: */ status = nfserr_grace; - if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) goto out; status = nfserr_no_grace; - if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) goto out; switch (open->op_claim_type) { @@ -575,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - if (nfs4_in_grace()) + if (locks_in_grace()) return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); @@ -596,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!cstate->save_fh.fh_dentry) return status; - if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags + if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1578d7a..0cc7ff5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -61,7 +61,6 @@ static time_t lease_time = 90; /* default lease time */ static time_t user_lease_time = 90; static time_t boot_time; -static int in_grace = 1; static u32 current_ownerid = 1; static u32 current_fileid = 1; static u32 current_delegid = 1; @@ -1640,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta case NFS4_OPEN_CLAIM_NULL: /* Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... */ - if (nfs4_in_grace()) + if (locks_in_grace()) goto out; if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) goto out; @@ -1816,12 +1815,15 @@ out: return status; } +struct lock_manager nfsd4_manager = { +}; + static void -end_grace(void) +nfsd4_end_grace(void) { dprintk("NFSD: end of grace period\n"); nfsd4_recdir_purge_old(); - in_grace = 0; + locks_end_grace(&nfsd4_manager); } static time_t @@ -1838,8 +1840,8 @@ nfs4_laundromat(void) nfs4_lock_state(); dprintk("NFSD: laundromat service - starting\n"); - if (in_grace) - end_grace(); + if (locks_in_grace()) + nfsd4_end_grace(); list_for_each_safe(pos, next, &client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { @@ -1974,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) return nfserr_bad_stateid; else if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; - else if (nfs4_in_grace()) { + else if (locks_in_grace()) { /* Answer in remaining cases depends on existance of * conflicting state; so we must wait out the grace period. */ return nfserr_grace; @@ -1993,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) static inline int io_during_grace_disallowed(struct inode *inode, int flags) { - return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE)) + return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) && mandatory_lock(inode); } @@ -2693,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, filp = lock_stp->st_vfs_file; status = nfserr_grace; - if (nfs4_in_grace() && !lock->lk_reclaim) + if (locks_in_grace() && !lock->lk_reclaim) goto out; status = nfserr_no_grace; - if (!nfs4_in_grace() && lock->lk_reclaim) + if (!locks_in_grace() && lock->lk_reclaim) goto out; locks_init_lock(&file_lock); @@ -2779,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int error; __be32 status; - if (nfs4_in_grace()) + if (locks_in_grace()) return nfserr_grace; if (check_lock_length(lockt->lt_offset, lockt->lt_length)) @@ -3192,9 +3194,9 @@ __nfs4_state_start(void) unsigned long grace_time; boot_time = get_seconds(); - grace_time = get_nfs_grace_period(); + grace_time = get_nfs4_grace_period(); lease_time = user_lease_time; - in_grace = 1; + locks_start_grace(&nfsd4_manager); printk(KERN_INFO "NFSD: starting %ld-second grace period\n", grace_time/HZ); laundry_wq = create_singlethread_workqueue("nfsd4"); @@ -3213,12 +3215,6 @@ nfs4_state_start(void) return; } -int -nfs4_in_grace(void) -{ - return in_grace; -} - time_t nfs4_lease_time(void) { diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 14ba4d9..afcdf4b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -413,6 +413,18 @@ out_nfserr: } static __be32 +nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + READ32(sid->si_generation); + COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) { DECODE_HEAD; @@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) DECODE_HEAD; close->cl_stateowner = NULL; - READ_BUF(4 + sizeof(stateid_t)); + READ_BUF(4); READ32(close->cl_seqid); - READ32(close->cl_stateid.si_generation); - COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t)); + return nfsd4_decode_stateid(argp, &close->cl_stateid); DECODE_TAIL; } @@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create static inline __be32 nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) { - DECODE_HEAD; - - READ_BUF(sizeof(stateid_t)); - READ32(dr->dr_stateid.si_generation); - COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t)); - - DECODE_TAIL; + return nfsd4_decode_stateid(argp, &dr->dr_stateid); } static inline __be32 @@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) READ32(lock->lk_is_new); if (lock->lk_is_new) { - READ_BUF(36); + READ_BUF(4); READ32(lock->lk_new_open_seqid); - READ32(lock->lk_new_open_stateid.si_generation); - - COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); + if (status) + return status; + READ_BUF(8 + sizeof(clientid_t)); READ32(lock->lk_new_lock_seqid); COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); READ32(lock->lk_new_owner.len); READ_BUF(lock->lk_new_owner.len); READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); } else { - READ_BUF(20); - READ32(lock->lk_old_lock_stateid.si_generation); - COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + READ_BUF(4); READ32(lock->lk_old_lock_seqid); } @@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) DECODE_HEAD; locku->lu_stateowner = NULL; - READ_BUF(24 + sizeof(stateid_t)); + READ_BUF(8); READ32(locku->lu_type); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) goto xdr_error; READ32(locku->lu_seqid); - READ32(locku->lu_stateid.si_generation); - COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &locku->lu_stateid); + if (status) + return status; + READ_BUF(16); READ64(locku->lu_offset); READ64(locku->lu_length); @@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ32(open->op_delegate_type); break; case NFS4_OPEN_CLAIM_DELEGATE_CUR: - READ_BUF(sizeof(stateid_t) + 4); - COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + READ_BUF(4); READ32(open->op_fname.len); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); @@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con DECODE_HEAD; open_conf->oc_stateowner = NULL; - READ_BUF(4 + sizeof(stateid_t)); - READ32(open_conf->oc_req_stateid.si_generation); - COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); + if (status) + return status; + READ_BUF(4); READ32(open_conf->oc_seqid); DECODE_TAIL; @@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d DECODE_HEAD; open_down->od_stateowner = NULL; - READ_BUF(12 + sizeof(stateid_t)); - READ32(open_down->od_stateid.si_generation); - COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &open_down->od_stateid); + if (status) + return status; + READ_BUF(12); READ32(open_down->od_seqid); READ32(open_down->od_share_access); READ32(open_down->od_share_deny); @@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) { DECODE_HEAD; - READ_BUF(sizeof(stateid_t) + 12); - READ32(read->rd_stateid.si_generation); - COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &read->rd_stateid); + if (status) + return status; + READ_BUF(12); READ64(read->rd_offset); READ32(read->rd_length); @@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) { - DECODE_HEAD; - - READ_BUF(sizeof(stateid_t)); - READ32(setattr->sa_stateid.si_generation); - COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t)); - if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl))) - goto out; + __be32 status; - DECODE_TAIL; + status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); + if (status) + return status; + return nfsd4_decode_fattr(argp, setattr->sa_bmval, + &setattr->sa_iattr, &setattr->sa_acl); } static __be32 @@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) int len; DECODE_HEAD; - READ_BUF(sizeof(stateid_opaque_t) + 20); - READ32(write->wr_stateid.si_generation); - COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &write->wr_stateid); + if (status) + return status; + READ_BUF(16); READ64(write->wr_offset); READ32(write->wr_stable_how); if (write->wr_stable_how > 2) @@ -1183,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) * Header routine to setup seqid operation replay cache */ #define ENCODE_SEQID_OP_HEAD \ - __be32 *p; \ __be32 *save; \ \ save = resp->p; @@ -1950,6 +1962,17 @@ fail: return -EINVAL; } +static void +nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid) +{ + ENCODE_HEAD; + + RESERVE_SPACE(sizeof(stateid_t)); + WRITE32(sid->si_generation); + WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + ADJUST_ARGS(); +} + static __be32 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) { @@ -1969,12 +1992,9 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(close->cl_stateid.si_generation); - WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + if (!nfserr) + nfsd4_encode_stateid(resp, &close->cl_stateid); + ENCODE_SEQID_OP_TAIL(close->cl_stateowner); return nfserr; } @@ -2074,12 +2094,9 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(4 + sizeof(stateid_t)); - WRITE32(lock->lk_resp_stateid.si_generation); - WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } else if (nfserr == nfserr_denied) + if (!nfserr) + nfsd4_encode_stateid(resp, &lock->lk_resp_stateid); + else if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lock->lk_denied); ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); @@ -2099,13 +2116,9 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(locku->lu_stateid.si_generation); - WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } - + if (!nfserr) + nfsd4_encode_stateid(resp, &locku->lu_stateid); + ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); return nfserr; } @@ -2128,14 +2141,14 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { + ENCODE_HEAD; ENCODE_SEQID_OP_HEAD; if (nfserr) goto out; - RESERVE_SPACE(36 + sizeof(stateid_t)); - WRITE32(open->op_stateid.si_generation); - WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t)); + nfsd4_encode_stateid(resp, &open->op_stateid); + RESERVE_SPACE(40); WRITECINFO(open->op_cinfo); WRITE32(open->op_rflags); WRITE32(2); @@ -2148,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op case NFS4_OPEN_DELEGATE_NONE: break; case NFS4_OPEN_DELEGATE_READ: - RESERVE_SPACE(20 + sizeof(stateid_t)); - WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(20); WRITE32(open->op_recall); /* @@ -2162,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op ADJUST_ARGS(); break; case NFS4_OPEN_DELEGATE_WRITE: - RESERVE_SPACE(32 + sizeof(stateid_t)); - WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(32); WRITE32(0); /* @@ -2195,13 +2208,9 @@ static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { ENCODE_SEQID_OP_HEAD; - - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(oc->oc_resp_stateid.si_generation); - WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + + if (!nfserr) + nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); return nfserr; @@ -2211,13 +2220,9 @@ static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { ENCODE_SEQID_OP_HEAD; - - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(od->od_stateid.si_generation); - WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + + if (!nfserr) + nfsd4_encode_stateid(resp, &od->od_stateid); ENCODE_SEQID_OP_TAIL(od->od_stateowner); return nfserr; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c53e65f..97543df 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -614,10 +614,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) return -EINVAL; err = nfsd_create_serv(); if (!err) { - int proto = 0; - err = svc_addsock(nfsd_serv, fd, buf, &proto); + err = svc_addsock(nfsd_serv, fd, buf); if (err >= 0) { - err = lockd_up(proto); + err = lockd_up(); if (err < 0) svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf); } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ea37c96..cd25d91 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -302,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) if (error) goto out; - if (!(access & NFSD_MAY_LOCK)) { - /* - * pseudoflavor restrictions are not enforced on NLM, - * which clients virtually always use auth_sys for, - * even while using RPCSEC_GSS for NFS. - */ - error = check_nfsd_access(exp, rqstp); - if (error) - goto out; - } + /* + * pseudoflavor restrictions are not enforced on NLM, + * which clients virtually always use auth_sys for, + * even while using RPCSEC_GSS for NFS. + */ + if (access & NFSD_MAY_LOCK) + goto skip_pseudoflavor_check; + /* + * Clients may expect to be able to use auth_sys during mount, + * even if they use gss for everything else; see section 2.3.2 + * of rfc 2623. + */ + if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT + && exp->ex_path.dentry == dentry) + goto skip_pseudoflavor_check; + + error = check_nfsd_access(exp, rqstp); + if (error) + goto out; +skip_pseudoflavor_check: /* Finally, check access permissions. */ error = nfsd_permission(rqstp, exp, dentry, access); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0766f95..5cffeca 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); return nfsd_return_attrs(nfserr, resp); } @@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, + NFSD_MAY_BYPASS_GSS_ON_ROOT); fh_put(&argp->fh); return nfserr; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 80292ff..59eeb46 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -229,6 +229,7 @@ int nfsd_create_serv(void) atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + AF_INET, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; @@ -243,25 +244,20 @@ static int nfsd_init_socks(int port) if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = lockd_up(IPPROTO_UDP); - if (error >= 0) { - error = svc_create_xprt(nfsd_serv, "udp", port, + error = svc_create_xprt(nfsd_serv, "udp", port, SVC_SOCK_DEFAULTS); - if (error < 0) - lockd_down(); - } if (error < 0) return error; - error = lockd_up(IPPROTO_TCP); - if (error >= 0) { - error = svc_create_xprt(nfsd_serv, "tcp", port, + error = svc_create_xprt(nfsd_serv, "tcp", port, SVC_SOCK_DEFAULTS); - if (error < 0) - lockd_down(); - } if (error < 0) return error; + + error = lockd_up(); + if (error < 0) + return error; + return 0; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 18060be..aa1d0d6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -83,7 +83,6 @@ struct raparm_hbucket { spinlock_t pb_lock; } ____cacheline_aligned_in_smp; -static struct raparms * raparml; #define RAPARM_HASH_BITS 4 #define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) @@ -1866,9 +1865,9 @@ out: * N.B. After this call fhp needs an fh_put */ __be32 -nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) +nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) { - __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); if (!err && vfs_statfs(fhp->fh_dentry,stat)) err = nfserr_io; return err; @@ -1966,11 +1965,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, void nfsd_racache_shutdown(void) { - if (!raparml) - return; + struct raparms *raparm, *last_raparm; + unsigned int i; + dprintk("nfsd: freeing readahead buffers.\n"); - kfree(raparml); - raparml = NULL; + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { + raparm = raparm_hash[i].pb_head; + while(raparm) { + last_raparm = raparm; + raparm = raparm->p_next; + kfree(last_raparm); + } + raparm_hash[i].pb_head = NULL; + } } /* * Initialize readahead param cache @@ -1981,35 +1989,38 @@ nfsd_racache_init(int cache_size) int i; int j = 0; int nperbucket; + struct raparms **raparm = NULL; - if (raparml) + if (raparm_hash[0].pb_head) return 0; - if (cache_size < 2*RAPARM_HASH_SIZE) - cache_size = 2*RAPARM_HASH_SIZE; - raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); - - if (!raparml) { - printk(KERN_WARNING - "nfsd: Could not allocate memory read-ahead cache.\n"); - return -ENOMEM; - } + nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); + if (nperbucket < 2) + nperbucket = 2; + cache_size = nperbucket * RAPARM_HASH_SIZE; dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); - for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { - raparm_hash[i].pb_head = NULL; + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { spin_lock_init(&raparm_hash[i].pb_lock); - } - nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - for (i = 0; i < cache_size - 1; i++) { - if (i % nperbucket == 0) - raparm_hash[j++].pb_head = raparml + i; - if (i % nperbucket < nperbucket-1) - raparml[i].p_next = raparml + i + 1; + + raparm = &raparm_hash[i].pb_head; + for (j = 0; j < nperbucket; j++) { + *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); + if (!*raparm) + goto out_nomem; + raparm = &(*raparm)->p_next; + } + *raparm = NULL; } nfsdstats.ra_size = cache_size; return 0; + +out_nomem: + dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); + nfsd_racache_shutdown(); + return -ENOMEM; } #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 64965e1..9b0efda 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -13,9 +13,7 @@ #include <linux/nls.h> #include <linux/kernel.h> #include <linux/errno.h> -#ifdef CONFIG_KMOD #include <linux/kmod.h> -#endif #include <linux/spinlock.h> static struct nls_table default_table; @@ -215,24 +213,7 @@ static struct nls_table *find_nls(char *charset) struct nls_table *load_nls(char *charset) { - struct nls_table *nls; -#ifdef CONFIG_KMOD - int ret; -#endif - - nls = find_nls(charset); - if (nls) - return nls; - -#ifdef CONFIG_KMOD - ret = request_module("nls_%s", charset); - if (ret != 0) { - printk("Unable to load NLS charset %s\n", charset); - return NULL; - } - nls = find_nls(charset); -#endif - return nls; + return try_then_request_module(find_nls(charset), "nls_%s", charset); } void unload_nls(struct nls_table *nls) diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index f6956de..589dcdf 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -34,7 +34,8 @@ ocfs2-objs := \ symlink.o \ sysfile.o \ uptodate.o \ - ver.o + ver.o \ + xattr.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 29ff57e..0cc2deb 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -49,6 +49,340 @@ #include "buffer_head_io.h" + +/* + * Operations for a specific extent tree type. + * + * To implement an on-disk btree (extent tree) type in ocfs2, add + * an ocfs2_extent_tree_operations structure and the matching + * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it + * for the allocation portion of the extent tree. + */ +struct ocfs2_extent_tree_operations { + /* + * last_eb_blk is the block number of the right most leaf extent + * block. Most on-disk structures containing an extent tree store + * this value for fast access. The ->eo_set_last_eb_blk() and + * ->eo_get_last_eb_blk() operations access this value. They are + * both required. + */ + void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et, + u64 blkno); + u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et); + + /* + * The on-disk structure usually keeps track of how many total + * clusters are stored in this extent tree. This function updates + * that value. new_clusters is the delta, and must be + * added to the total. Required. + */ + void (*eo_update_clusters)(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 new_clusters); + + /* + * If ->eo_insert_check() exists, it is called before rec is + * inserted into the extent tree. It is optional. + */ + int (*eo_insert_check)(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et); + + /* + * -------------------------------------------------------------- + * The remaining are internal to ocfs2_extent_tree and don't have + * accessor functions + */ + + /* + * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el. + * It is required. + */ + void (*eo_fill_root_el)(struct ocfs2_extent_tree *et); + + /* + * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if + * it exists. If it does not, et->et_max_leaf_clusters is set + * to 0 (unlimited). Optional. + */ + void (*eo_fill_max_leaf_clusters)(struct inode *inode, + struct ocfs2_extent_tree *et); +}; + + +/* + * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check + * in the methods. + */ +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno); +static void ocfs2_dinode_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters); +static int ocfs2_dinode_insert_check(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static int ocfs2_dinode_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et); +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); +static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { + .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, + .eo_update_clusters = ocfs2_dinode_update_clusters, + .eo_insert_check = ocfs2_dinode_insert_check, + .eo_sanity_check = ocfs2_dinode_sanity_check, + .eo_fill_root_el = ocfs2_dinode_fill_root_el, +}; + +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + di->i_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + return le64_to_cpu(di->i_last_eb_blk); +} + +static void ocfs2_dinode_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_dinode *di = et->et_object; + + le32_add_cpu(&di->i_clusters, clusters); + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +static int ocfs2_dinode_insert_check(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); + mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && + (OCFS2_I(inode)->ip_clusters != rec->e_cpos), + "Device %s, asking for sparse allocation: inode %llu, " + "cpos %u, clusters %u\n", + osb->dev_str, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + rec->e_cpos, + OCFS2_I(inode)->ip_clusters); + + return 0; +} + +static int ocfs2_dinode_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + int ret = 0; + struct ocfs2_dinode *di; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + + di = et->et_object; + if (!OCFS2_IS_VALID_DINODE(di)) { + ret = -EIO; + ocfs2_error(inode->i_sb, + "Inode %llu has invalid path root", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + } + + return ret; +} + +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + et->et_root_el = &di->id2.i_list; +} + + +static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_root *xv = et->et_object; + + et->et_root_el = &xv->xr_list; +} + +static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_value_root *xv = + (struct ocfs2_xattr_value_root *)et->et_object; + + xv->xr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_root *xv = + (struct ocfs2_xattr_value_root *) et->et_object; + + return le64_to_cpu(xv->xr_last_eb_blk); +} + +static void ocfs2_xattr_value_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_value_root *xv = + (struct ocfs2_xattr_value_root *)et->et_object; + + le32_add_cpu(&xv->xr_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_value_update_clusters, + .eo_fill_root_el = ocfs2_xattr_value_fill_root_el, +}; + +static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + et->et_root_el = &xb->xb_attrs.xb_root.xt_list; +} + +static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + et->et_max_leaf_clusters = + ocfs2_clusters_for_bytes(inode->i_sb, + OCFS2_MAX_XATTR_TREE_LEAF_SIZE); +} + +static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + xt->xt_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + return le64_to_cpu(xt->xt_last_eb_blk); +} + +static void ocfs2_xattr_tree_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_tree_update_clusters, + .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el, + .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, +}; + +static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh, + void *obj, + struct ocfs2_extent_tree_operations *ops) +{ + et->et_ops = ops; + et->et_root_bh = bh; + if (!obj) + obj = (void *)bh->b_data; + et->et_object = obj; + + et->et_ops->eo_fill_root_el(et); + if (!et->et_ops->eo_fill_max_leaf_clusters) + et->et_max_leaf_clusters = 0; + else + et->et_ops->eo_fill_max_leaf_clusters(inode, et); +} + +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); +} + +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, inode, bh, NULL, + &ocfs2_xattr_tree_et_ops); +} + +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_value_root *xv) +{ + __ocfs2_init_extent_tree(et, inode, bh, xv, + &ocfs2_xattr_value_et_ops); +} + +static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 new_last_eb_blk) +{ + et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk); +} + +static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + return et->et_ops->eo_get_last_eb_blk(et); +} + +static inline void ocfs2_et_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + et->et_ops->eo_update_clusters(inode, et, clusters); +} + +static inline int ocfs2_et_insert_check(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + int ret = 0; + + if (et->et_ops->eo_insert_check) + ret = et->et_ops->eo_insert_check(inode, et, rec); + return ret; +} + +static inline int ocfs2_et_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + int ret = 0; + + if (et->et_ops->eo_sanity_check) + ret = et->et_ops->eo_sanity_check(inode, et); + return ret; +} + static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, struct ocfs2_extent_block *eb); @@ -205,17 +539,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, } /* - * Allocate and initialize a new path based on a disk inode tree. - */ -static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) -{ - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - struct ocfs2_extent_list *el = &di->id2.i_list; - - return ocfs2_new_path(di_bh, el); -} - -/* * Convenience function to journal all components in a path. */ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, @@ -368,39 +691,35 @@ struct ocfs2_merge_ctxt { */ int ocfs2_num_free_extents(struct ocfs2_super *osb, struct inode *inode, - struct ocfs2_dinode *fe) + struct ocfs2_extent_tree *et) { int retval; - struct ocfs2_extent_list *el; + struct ocfs2_extent_list *el = NULL; struct ocfs2_extent_block *eb; struct buffer_head *eb_bh = NULL; + u64 last_eb_blk = 0; mlog_entry_void(); - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - retval = -EIO; - goto bail; - } + el = et->et_root_el; + last_eb_blk = ocfs2_et_get_last_eb_blk(et); - if (fe->i_last_eb_blk) { - retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &eb_bh, OCFS2_BH_CACHED, inode); + if (last_eb_blk) { + retval = ocfs2_read_block(inode, last_eb_blk, + &eb_bh); if (retval < 0) { mlog_errno(retval); goto bail; } eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; - } else - el = &fe->id2.i_list; + } BUG_ON(el->l_tree_depth != 0); retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); bail: - if (eb_bh) - brelse(eb_bh); + brelse(eb_bh); mlog_exit(retval); return retval; @@ -486,8 +805,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, bail: if (status < 0) { for(i = 0; i < wanted; i++) { - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); bhs[i] = NULL; } } @@ -531,7 +849,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) static int ocfs2_add_branch(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, struct buffer_head *eb_bh, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) @@ -540,7 +858,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, u64 next_blkno, new_last_eb_blk; struct buffer_head *bh; struct buffer_head **new_eb_bhs = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *eb_el; struct ocfs2_extent_list *el; @@ -550,13 +867,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, BUG_ON(!last_eb_bh || !*last_eb_bh); - fe = (struct ocfs2_dinode *) fe_bh->b_data; - if (eb_bh) { eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; } else - el = &fe->id2.i_list; + el = et->et_root_el; /* we never add a branch to a leaf. */ BUG_ON(!el->l_tree_depth); @@ -646,7 +961,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, mlog_errno(status); goto bail; } - status = ocfs2_journal_access(handle, inode, fe_bh, + status = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -662,7 +977,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, } /* Link the new branch into the rest of the tree (el will - * either be on the fe, or the extent block passed in. */ + * either be on the root_bh, or the extent block passed in. */ i = le16_to_cpu(el->l_next_free_rec); el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); @@ -671,7 +986,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, /* fe needs a new last extent block pointer, as does the * next_leaf on the previously last-extent-block. */ - fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); + ocfs2_et_set_last_eb_blk(et, new_last_eb_blk); eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); @@ -679,7 +994,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, status = ocfs2_journal_dirty(handle, *last_eb_bh); if (status < 0) mlog_errno(status); - status = ocfs2_journal_dirty(handle, fe_bh); + status = ocfs2_journal_dirty(handle, et->et_root_bh); if (status < 0) mlog_errno(status); if (eb_bh) { @@ -700,8 +1015,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, bail: if (new_eb_bhs) { for (i = 0; i < new_blocks; i++) - if (new_eb_bhs[i]) - brelse(new_eb_bhs[i]); + brelse(new_eb_bhs[i]); kfree(new_eb_bhs); } @@ -717,16 +1031,15 @@ bail: static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, struct ocfs2_alloc_context *meta_ac, struct buffer_head **ret_new_eb_bh) { int status, i; u32 new_clusters; struct buffer_head *new_eb_bh = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *fe_el; + struct ocfs2_extent_list *root_el; struct ocfs2_extent_list *eb_el; mlog_entry_void(); @@ -746,8 +1059,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, } eb_el = &eb->h_list; - fe = (struct ocfs2_dinode *) fe_bh->b_data; - fe_el = &fe->id2.i_list; + root_el = et->et_root_el; status = ocfs2_journal_access(handle, inode, new_eb_bh, OCFS2_JOURNAL_ACCESS_CREATE); @@ -756,11 +1068,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } - /* copy the fe data into the new extent block */ - eb_el->l_tree_depth = fe_el->l_tree_depth; - eb_el->l_next_free_rec = fe_el->l_next_free_rec; - for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) - eb_el->l_recs[i] = fe_el->l_recs[i]; + /* copy the root extent list data into the new extent block */ + eb_el->l_tree_depth = root_el->l_tree_depth; + eb_el->l_next_free_rec = root_el->l_next_free_rec; + for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) + eb_el->l_recs[i] = root_el->l_recs[i]; status = ocfs2_journal_dirty(handle, new_eb_bh); if (status < 0) { @@ -768,7 +1080,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } - status = ocfs2_journal_access(handle, inode, fe_bh, + status = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -777,21 +1089,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, new_clusters = ocfs2_sum_rightmost_rec(eb_el); - /* update fe now */ - le16_add_cpu(&fe_el->l_tree_depth, 1); - fe_el->l_recs[0].e_cpos = 0; - fe_el->l_recs[0].e_blkno = eb->h_blkno; - fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); - for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) - memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); - fe_el->l_next_free_rec = cpu_to_le16(1); + /* update root_bh now */ + le16_add_cpu(&root_el->l_tree_depth, 1); + root_el->l_recs[0].e_cpos = 0; + root_el->l_recs[0].e_blkno = eb->h_blkno; + root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); + for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + root_el->l_next_free_rec = cpu_to_le16(1); /* If this is our 1st tree depth shift, then last_eb_blk * becomes the allocated extent block */ - if (fe_el->l_tree_depth == cpu_to_le16(1)) - fe->i_last_eb_blk = eb->h_blkno; + if (root_el->l_tree_depth == cpu_to_le16(1)) + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); - status = ocfs2_journal_dirty(handle, fe_bh); + status = ocfs2_journal_dirty(handle, et->et_root_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -801,8 +1113,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, new_eb_bh = NULL; status = 0; bail: - if (new_eb_bh) - brelse(new_eb_bh); + brelse(new_eb_bh); mlog_exit(status); return status; @@ -817,22 +1128,21 @@ bail: * 1) a lowest extent block is found, then we pass it back in * *lowest_eb_bh and return '0' * - * 2) the search fails to find anything, but the dinode has room. We + * 2) the search fails to find anything, but the root_el has room. We * pass NULL back in *lowest_eb_bh, but still return '0' * - * 3) the search fails to find anything AND the dinode is full, in + * 3) the search fails to find anything AND the root_el is full, in * which case we return > 0 * * return status < 0 indicates an error. */ static int ocfs2_find_branch_target(struct ocfs2_super *osb, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, struct buffer_head **target_bh) { int status = 0, i; u64 blkno; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct buffer_head *bh = NULL; @@ -842,8 +1152,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, *target_bh = NULL; - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; + el = et->et_root_el; while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { @@ -864,13 +1173,10 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, goto bail; } - if (bh) { - brelse(bh); - bh = NULL; - } + brelse(bh); + bh = NULL; - status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, - inode); + status = ocfs2_read_block(inode, blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -886,8 +1192,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, if (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) { - if (lowest_bh) - brelse(lowest_bh); + brelse(lowest_bh); lowest_bh = bh; get_bh(lowest_bh); } @@ -895,14 +1200,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, /* If we didn't find one and the fe doesn't have any room, * then return '1' */ - if (!lowest_bh - && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) + el = et->et_root_el; + if (!lowest_bh && (el->l_next_free_rec == el->l_count)) status = 1; *target_bh = lowest_bh; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -919,19 +1223,19 @@ bail: * *last_eb_bh will be updated by ocfs2_add_branch(). */ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, - struct buffer_head *di_bh, int *final_depth, + struct ocfs2_extent_tree *et, int *final_depth, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { int ret, shift; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - int depth = le16_to_cpu(di->id2.i_list.l_tree_depth); + struct ocfs2_extent_list *el = et->et_root_el; + int depth = le16_to_cpu(el->l_tree_depth); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *bh = NULL; BUG_ON(meta_ac == NULL); - shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh); + shift = ocfs2_find_branch_target(osb, inode, et, &bh); if (shift < 0) { ret = shift; mlog_errno(ret); @@ -948,7 +1252,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* ocfs2_shift_tree_depth will return us a buffer with * the new extent block (so we can pass that to * ocfs2_add_branch). */ - ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh, + ret = ocfs2_shift_tree_depth(osb, handle, inode, et, meta_ac, &bh); if (ret < 0) { mlog_errno(ret); @@ -975,7 +1279,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* call ocfs2_add_branch to add the final part of the tree with * the new data. */ mlog(0, "add branch. bh = %p\n", bh); - ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh, + ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh, meta_ac); if (ret < 0) { mlog_errno(ret); @@ -1236,8 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode, brelse(bh); bh = NULL; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, - &bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, blkno, &bh); if (ret) { mlog_errno(ret); goto out; @@ -2058,11 +2361,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, struct ocfs2_path *right_path, int subtree_index, struct ocfs2_cached_dealloc_ctxt *dealloc, - int *deleted) + int *deleted, + struct ocfs2_extent_tree *et) { int ret, i, del_right_subtree = 0, right_has_empty = 0; - struct buffer_head *root_bh, *di_bh = path_root_bh(right_path); - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; struct ocfs2_extent_block *eb; @@ -2114,7 +2417,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, * We have to update i_last_eb_blk during the meta * data delete. */ - ret = ocfs2_journal_access(handle, inode, di_bh, + ret = ocfs2_journal_access(handle, inode, et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -2189,7 +2492,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, ocfs2_update_edge_lengths(inode, handle, left_path); eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; - di->i_last_eb_blk = eb->h_blkno; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); /* * Removal of the extent in the left leaf was skipped @@ -2199,7 +2502,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, if (right_has_empty) ocfs2_remove_empty_extent(left_leaf_el); - ret = ocfs2_journal_dirty(handle, di_bh); + ret = ocfs2_journal_dirty(handle, et_root_bh); if (ret) mlog_errno(ret); @@ -2322,7 +2625,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, int orig_credits, struct ocfs2_path *path, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_path **empty_extent_path) + struct ocfs2_path **empty_extent_path, + struct ocfs2_extent_tree *et) { int ret, subtree_root, deleted; u32 right_cpos; @@ -2395,7 +2699,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, ret = ocfs2_rotate_subtree_left(inode, handle, left_path, right_path, subtree_root, - dealloc, &deleted); + dealloc, &deleted, et); if (ret == -EAGAIN) { /* * The rotation has to temporarily stop due to @@ -2438,29 +2742,20 @@ out: } static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, - struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *et) { int ret, subtree_index; u32 cpos; struct ocfs2_path *left_path = NULL; - struct ocfs2_dinode *di; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - /* - * XXX: This code assumes that the root is an inode, which is - * true for now but may change as tree code gets generic. - */ - di = (struct ocfs2_dinode *)path_root_bh(path)->b_data; - if (!OCFS2_IS_VALID_DINODE(di)) { - ret = -EIO; - ocfs2_error(inode->i_sb, - "Inode %llu has invalid path root", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - goto out; - } + ret = ocfs2_et_sanity_check(inode, et); + if (ret) + goto out; /* * There's two ways we handle this depending on * whether path is the only existing one. @@ -2517,7 +2812,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, ocfs2_update_edge_lengths(inode, handle, left_path); eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; - di->i_last_eb_blk = eb->h_blkno; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); } else { /* * 'path' is also the leftmost path which @@ -2528,12 +2823,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, */ ocfs2_unlink_path(inode, handle, dealloc, path, 1); - el = &di->id2.i_list; + el = et->et_root_el; el->l_tree_depth = 0; el->l_next_free_rec = 0; memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); - di->i_last_eb_blk = 0; + ocfs2_et_set_last_eb_blk(et, 0); } ocfs2_journal_dirty(handle, path_root_bh(path)); @@ -2561,7 +2856,8 @@ out: */ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *et) { int ret, orig_credits = handle->h_buffer_credits; struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; @@ -2575,7 +2871,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, if (path->p_tree_depth == 0) { rightmost_no_delete: /* - * In-inode extents. This is trivially handled, so do + * Inline extents. This is trivially handled, so do * it up front. */ ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, @@ -2629,7 +2925,7 @@ rightmost_no_delete: */ ret = ocfs2_remove_rightmost_path(inode, handle, path, - dealloc); + dealloc, et); if (ret) mlog_errno(ret); goto out; @@ -2641,7 +2937,7 @@ rightmost_no_delete: */ try_rotate: ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, - dealloc, &restart_path); + dealloc, &restart_path, et); if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out; @@ -2653,7 +2949,7 @@ try_rotate: ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, tmp_path, dealloc, - &restart_path); + &restart_path, et); if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out; @@ -2939,6 +3235,7 @@ static int ocfs2_merge_rec_left(struct inode *inode, handle_t *handle, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *et, int index) { int ret, i, subtree_index = 0, has_empty_extent = 0; @@ -3059,7 +3356,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, le16_to_cpu(el->l_next_free_rec) == 1) { ret = ocfs2_remove_rightmost_path(inode, handle, - right_path, dealloc); + right_path, + dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -3086,7 +3384,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, int split_index, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_merge_ctxt *ctxt) + struct ocfs2_merge_ctxt *ctxt, + struct ocfs2_extent_tree *et) { int ret = 0; @@ -3104,7 +3403,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * illegal. */ ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc); + dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -3147,7 +3446,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); /* The merge left us with an empty extent, remove it. */ - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(inode, handle, path, + dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -3161,7 +3461,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, */ ret = ocfs2_merge_rec_left(inode, path, handle, rec, - dealloc, + dealloc, et, split_index); if (ret) { @@ -3170,7 +3470,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, } ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc); + dealloc, et); /* * Error from this last rotate is not critical, so * print but don't bubble it up. @@ -3190,7 +3490,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, ret = ocfs2_merge_rec_left(inode, path, handle, split_rec, - dealloc, + dealloc, et, split_index); if (ret) { mlog_errno(ret); @@ -3213,7 +3513,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * our leaf. Try to rotate it away. */ ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc); + dealloc, et); if (ret) mlog_errno(ret); ret = 0; @@ -3347,16 +3647,6 @@ rotate: ocfs2_rotate_leaf(el, insert_rec); } -static inline void ocfs2_update_dinode_clusters(struct inode *inode, - struct ocfs2_dinode *di, - u32 clusters) -{ - le32_add_cpu(&di->i_clusters, clusters); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); - spin_unlock(&OCFS2_I(inode)->ip_lock); -} - static void ocfs2_adjust_rightmost_records(struct inode *inode, handle_t *handle, struct ocfs2_path *path, @@ -3558,8 +3848,8 @@ static void ocfs2_split_record(struct inode *inode, } /* - * This function only does inserts on an allocation b-tree. For dinode - * lists, ocfs2_insert_at_leaf() is called directly. + * This function only does inserts on an allocation b-tree. For tree + * depth = 0, ocfs2_insert_at_leaf() is called directly. * * right_path is the path we want to do the actual insert * in. left_path should only be passed in if we need to update that @@ -3656,7 +3946,7 @@ out: static int ocfs2_do_insert_extent(struct inode *inode, handle_t *handle, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *insert_rec, struct ocfs2_insert_type *type) { @@ -3664,13 +3954,11 @@ static int ocfs2_do_insert_extent(struct inode *inode, u32 cpos; struct ocfs2_path *right_path = NULL; struct ocfs2_path *left_path = NULL; - struct ocfs2_dinode *di; struct ocfs2_extent_list *el; - di = (struct ocfs2_dinode *) di_bh->b_data; - el = &di->id2.i_list; + el = et->et_root_el; - ret = ocfs2_journal_access(handle, inode, di_bh, + ret = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -3682,7 +3970,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, goto out_update_clusters; } - right_path = ocfs2_new_inode_path(di_bh); + right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!right_path) { ret = -ENOMEM; mlog_errno(ret); @@ -3732,7 +4020,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, * ocfs2_rotate_tree_right() might have extended the * transaction without re-journaling our tree root. */ - ret = ocfs2_journal_access(handle, inode, di_bh, + ret = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -3757,10 +4045,10 @@ static int ocfs2_do_insert_extent(struct inode *inode, out_update_clusters: if (type->ins_split == SPLIT_NONE) - ocfs2_update_dinode_clusters(inode, di, - le16_to_cpu(insert_rec->e_leaf_clusters)); + ocfs2_et_update_clusters(inode, et, + le16_to_cpu(insert_rec->e_leaf_clusters)); - ret = ocfs2_journal_dirty(handle, di_bh); + ret = ocfs2_journal_dirty(handle, et->et_root_bh); if (ret) mlog_errno(ret); @@ -3890,7 +4178,8 @@ out: static void ocfs2_figure_contig_type(struct inode *inode, struct ocfs2_insert_type *insert, struct ocfs2_extent_list *el, - struct ocfs2_extent_rec *insert_rec) + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_extent_tree *et) { int i; enum ocfs2_contig_type contig_type = CONTIG_NONE; @@ -3906,6 +4195,21 @@ static void ocfs2_figure_contig_type(struct inode *inode, } } insert->ins_contig = contig_type; + + if (insert->ins_contig != CONTIG_NONE) { + struct ocfs2_extent_rec *rec = + &el->l_recs[insert->ins_contig_index]; + unsigned int len = le16_to_cpu(rec->e_leaf_clusters) + + le16_to_cpu(insert_rec->e_leaf_clusters); + + /* + * Caller might want us to limit the size of extents, don't + * calculate contiguousness if we might exceed that limit. + */ + if (et->et_max_leaf_clusters && + (len > et->et_max_leaf_clusters)) + insert->ins_contig = CONTIG_NONE; + } } /* @@ -3914,8 +4218,8 @@ static void ocfs2_figure_contig_type(struct inode *inode, * ocfs2_figure_appending_type() will figure out whether we'll have to * insert at the tail of the rightmost leaf. * - * This should also work against the dinode list for tree's with 0 - * depth. If we consider the dinode list to be the rightmost leaf node + * This should also work against the root extent list for tree's with 0 + * depth. If we consider the root extent list to be the rightmost leaf node * then the logic here makes sense. */ static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, @@ -3966,14 +4270,13 @@ set_tail_append: * structure. */ static int ocfs2_figure_insert_type(struct inode *inode, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, struct buffer_head **last_eb_bh, struct ocfs2_extent_rec *insert_rec, int *free_records, struct ocfs2_insert_type *insert) { int ret; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct ocfs2_path *path = NULL; @@ -3981,7 +4284,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, insert->ins_split = SPLIT_NONE; - el = &di->id2.i_list; + el = et->et_root_el; insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); if (el->l_tree_depth) { @@ -3991,9 +4294,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * ocfs2_figure_insert_type() and ocfs2_add_branch() * may want it later. */ - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), &bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh); if (ret) { mlog_exit(ret); goto out; @@ -4014,12 +4315,12 @@ static int ocfs2_figure_insert_type(struct inode *inode, le16_to_cpu(el->l_next_free_rec); if (!insert->ins_tree_depth) { - ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); ocfs2_figure_appending_type(insert, el, insert_rec); return 0; } - path = ocfs2_new_inode_path(di_bh); + path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -4048,7 +4349,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * into two types of appends: simple record append, or a * rotate inside the tail leaf. */ - ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); /* * The insert code isn't quite ready to deal with all cases of @@ -4069,7 +4370,8 @@ static int ocfs2_figure_insert_type(struct inode *inode, * the case that we're doing a tail append, so maybe we can * take advantage of that information somehow. */ - if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { + if (ocfs2_et_get_last_eb_blk(et) == + path_leaf_bh(path)->b_blocknr) { /* * Ok, ocfs2_find_path() returned us the rightmost * tree path. This might be an appending insert. There are @@ -4099,7 +4401,7 @@ out: int ocfs2_insert_extent(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, u32 new_clusters, @@ -4112,26 +4414,21 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, struct ocfs2_insert_type insert = {0, }; struct ocfs2_extent_rec rec; - BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); - mlog(0, "add %u clusters at position %u to inode %llu\n", new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); - mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && - (OCFS2_I(inode)->ip_clusters != cpos), - "Device %s, asking for sparse allocation: inode %llu, " - "cpos %u, clusters %u\n", - osb->dev_str, - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, - OCFS2_I(inode)->ip_clusters); - memset(&rec, 0, sizeof(rec)); rec.e_cpos = cpu_to_le32(cpos); rec.e_blkno = cpu_to_le64(start_blk); rec.e_leaf_clusters = cpu_to_le16(new_clusters); rec.e_flags = flags; + status = ocfs2_et_insert_check(inode, et, &rec); + if (status) { + mlog_errno(status); + goto bail; + } - status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, + status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec, &free_records, &insert); if (status < 0) { mlog_errno(status); @@ -4145,7 +4442,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, free_records, insert.ins_tree_depth); if (insert.ins_contig == CONTIG_NONE && free_records == 0) { - status = ocfs2_grow_tree(inode, handle, fe_bh, + status = ocfs2_grow_tree(inode, handle, et, &insert.ins_tree_depth, &last_eb_bh, meta_ac); if (status) { @@ -4155,17 +4452,124 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, } /* Finally, we can add clusters. This might rotate the tree for us. */ - status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); + status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert); if (status < 0) mlog_errno(status); - else + else if (et->et_ops == &ocfs2_dinode_et_ops) ocfs2_extent_map_insert_rec(inode, &rec); bail: - if (last_eb_bh) - brelse(last_eb_bh); + brelse(last_eb_bh); + + mlog_exit(status); + return status; +} + +/* + * Allcate and add clusters into the extent b-tree. + * The new clusters(clusters_to_add) will be inserted at logical_offset. + * The extent b-tree's root is specified by et, and + * it is not limited to the file storage. Any extent tree can use this + * function if it implements the proper ocfs2_extent_tree. + */ +int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_extent_tree *et, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int status = 0; + int free_extents; + enum ocfs2_alloc_restarted reason = RESTART_NONE; + u32 bit_off, num_bits; + u64 block; + u8 flags = 0; + + BUG_ON(!clusters_to_add); + + if (mark_unwritten) + flags = OCFS2_EXT_UNWRITTEN; + + free_extents = ocfs2_num_free_extents(osb, inode, et); + if (free_extents < 0) { + status = free_extents; + mlog_errno(status); + goto leave; + } + + /* there are two cases which could cause us to EAGAIN in the + * we-need-more-metadata case: + * 1) we haven't reserved *any* + * 2) we are so fragmented, we've needed to add metadata too + * many times. */ + if (!free_extents && !meta_ac) { + mlog(0, "we haven't reserved any metadata!\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } else if ((!free_extents) + && (ocfs2_alloc_context_bits_left(meta_ac) + < ocfs2_extend_meta_needed(et->et_root_el))) { + mlog(0, "filesystem is really fragmented...\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } + + status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + /* reserve our write early -- insert_extent may update the inode */ + status = ocfs2_journal_access(handle, inode, et->et_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "Allocating %u clusters at block %u for inode %llu\n", + num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); + status = ocfs2_insert_extent(osb, handle, inode, et, + *logical_offset, block, + num_bits, flags, meta_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_dirty(handle, et->et_root_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + clusters_to_add -= num_bits; + *logical_offset += num_bits; + + if (clusters_to_add) { + mlog(0, "need to alloc once more, wanted = %u\n", + clusters_to_add); + status = -EAGAIN; + reason = RESTART_TRANS; + } + +leave: mlog_exit(status); + if (reason_ret) + *reason_ret = reason; return status; } @@ -4192,7 +4596,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb, static int ocfs2_split_and_insert(struct inode *inode, handle_t *handle, struct ocfs2_path *path, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, struct buffer_head **last_eb_bh, int split_index, struct ocfs2_extent_rec *orig_split_rec, @@ -4206,7 +4610,6 @@ static int ocfs2_split_and_insert(struct inode *inode, struct ocfs2_extent_rec split_rec = *orig_split_rec; struct ocfs2_insert_type insert; struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di; leftright: /* @@ -4215,8 +4618,7 @@ leftright: */ rec = path_leaf_el(path)->l_recs[split_index]; - di = (struct ocfs2_dinode *)di_bh->b_data; - rightmost_el = &di->id2.i_list; + rightmost_el = et->et_root_el; depth = le16_to_cpu(rightmost_el->l_tree_depth); if (depth) { @@ -4227,8 +4629,8 @@ leftright: if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh, - meta_ac); + ret = ocfs2_grow_tree(inode, handle, et, + &depth, last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); goto out; @@ -4265,8 +4667,7 @@ leftright: do_leftright = 1; } - ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, - &insert); + ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); if (ret) { mlog_errno(ret); goto out; @@ -4308,8 +4709,9 @@ out: * of the tree is required. All other cases will degrade into a less * optimal tree layout. * - * last_eb_bh should be the rightmost leaf block for any inode with a - * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call. + * last_eb_bh should be the rightmost leaf block for any extent + * btree. Since a split may grow the tree or a merge might shrink it, + * the caller cannot trust the contents of that buffer after this call. * * This code is optimized for readability - several passes might be * made over certain portions of the tree. All of those blocks will @@ -4317,7 +4719,7 @@ out: * extra overhead is not expressed in terms of disk reads. */ static int __ocfs2_mark_extent_written(struct inode *inode, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, handle_t *handle, struct ocfs2_path *path, int split_index, @@ -4357,11 +4759,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode, */ if (path->p_tree_depth) { struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret) { mlog_exit(ret); goto out; @@ -4394,7 +4794,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, if (ctxt.c_split_covers_rec) el->l_recs[split_index] = *split_rec; else - ret = ocfs2_split_and_insert(inode, handle, path, di_bh, + ret = ocfs2_split_and_insert(inode, handle, path, et, &last_eb_bh, split_index, split_rec, meta_ac); if (ret) @@ -4402,7 +4802,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, } else { ret = ocfs2_try_to_merge_extent(inode, handle, path, split_index, split_rec, - dealloc, &ctxt); + dealloc, &ctxt, et); if (ret) mlog_errno(ret); } @@ -4420,7 +4820,8 @@ out: * * The caller is responsible for passing down meta_ac if we'll need it. */ -int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) @@ -4446,10 +4847,14 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, /* * XXX: This should be fixed up so that we just re-insert the * next extent records. + * + * XXX: This is a hack on the extent tree, maybe it should be + * an op? */ - ocfs2_extent_map_trunc(inode, 0); + if (et->et_ops == &ocfs2_dinode_et_ops) + ocfs2_extent_map_trunc(inode, 0); - left_path = ocfs2_new_inode_path(di_bh); + left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -4480,8 +4885,9 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; - ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path, - index, &split_rec, meta_ac, dealloc); + ret = __ocfs2_mark_extent_written(inode, et, handle, left_path, + index, &split_rec, meta_ac, + dealloc); if (ret) mlog_errno(ret); @@ -4490,13 +4896,12 @@ out: return ret; } -static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, +static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, handle_t *handle, struct ocfs2_path *path, int index, u32 new_range, struct ocfs2_alloc_context *meta_ac) { int ret, depth, credits = handle->h_buffer_credits; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *rightmost_el, *el; @@ -4513,9 +4918,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, depth = path->p_tree_depth; if (depth > 0) { - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -4526,7 +4930,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, } else rightmost_el = path_leaf_el(path); - credits += path->p_tree_depth + ocfs2_extend_meta_needed(di); + credits += path->p_tree_depth + + ocfs2_extend_meta_needed(et->et_root_el); ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); @@ -4535,7 +4940,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh, + ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); @@ -4549,7 +4954,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, insert.ins_split = SPLIT_RIGHT; insert.ins_tree_depth = depth; - ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert); + ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); if (ret) mlog_errno(ret); @@ -4561,7 +4966,8 @@ out: static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, struct ocfs2_path *path, int index, struct ocfs2_cached_dealloc_ctxt *dealloc, - u32 cpos, u32 len) + u32 cpos, u32 len, + struct ocfs2_extent_tree *et) { int ret; u32 left_cpos, rec_range, trunc_range; @@ -4573,7 +4979,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, struct ocfs2_extent_block *eb; if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -4704,7 +5110,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, path_leaf_bh(path)); - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -4715,7 +5121,8 @@ out: return ret; } -int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_remove_extent(struct inode *inode, + struct ocfs2_extent_tree *et, u32 cpos, u32 len, handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) @@ -4724,11 +5131,11 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, u32 rec_range, trunc_range; struct ocfs2_extent_rec *rec; struct ocfs2_extent_list *el; - struct ocfs2_path *path; + struct ocfs2_path *path = NULL; ocfs2_extent_map_trunc(inode, 0); - path = ocfs2_new_inode_path(di_bh); + path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -4781,13 +5188,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, - cpos, len); + cpos, len, et); if (ret) { mlog_errno(ret); goto out; } } else { - ret = ocfs2_split_tree(inode, di_bh, handle, path, index, + ret = ocfs2_split_tree(inode, et, handle, path, index, trunc_range, meta_ac); if (ret) { mlog_errno(ret); @@ -4836,7 +5243,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, } ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, - cpos, len); + cpos, len, et); if (ret) { mlog_errno(ret); goto out; @@ -5179,8 +5586,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, goto bail; } - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); if (status < 0) { iput(inode); mlog_errno(status); @@ -5255,8 +5661,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, bail: if (tl_inode) iput(tl_inode); - if (tl_bh) - brelse(tl_bh); + brelse(tl_bh); if (status < 0 && (*tl_copy)) { kfree(*tl_copy); @@ -5999,20 +6404,13 @@ bail: return status; } -static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) +static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) { set_buffer_uptodate(bh); mark_buffer_dirty(bh); return 0; } -static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) -{ - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - return ocfs2_journal_dirty_data(handle, bh); -} - static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, unsigned int from, unsigned int to, struct page *page, int zero, u64 *phys) @@ -6031,17 +6429,18 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, * here if they aren't - ocfs2_map_page_blocks() * might've skipped some */ - if (ocfs2_should_order_data(inode)) { - ret = walk_page_buffers(handle, - page_buffers(page), - from, to, &partial, - ocfs2_ordered_zero_func); - if (ret < 0) - mlog_errno(ret); - } else { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_zero_func); + if (ret < 0) + mlog_errno(ret); + else if (ocfs2_should_order_data(inode)) { + ret = ocfs2_jbd2_file_inode(handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD ret = walk_page_buffers(handle, page_buffers(page), from, to, &partial, - ocfs2_writeback_zero_func); + ocfs2_journal_dirty_data); +#endif if (ret < 0) mlog_errno(ret); } @@ -6206,20 +6605,29 @@ out: return ret; } -static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di) +static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode, + struct ocfs2_dinode *di) { unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); - memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2)); + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2) - + xattrsize); + else + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2)); } void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di) { - ocfs2_zero_dinode_id2(inode, di); + ocfs2_zero_dinode_id2_with_xattr(inode, di); di->id2.i_list.l_tree_depth = 0; di->id2.i_list.l_next_free_rec = 0; - di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb)); + di->id2.i_list.l_count = cpu_to_le16( + ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di)); } void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) @@ -6236,9 +6644,10 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) * We clear the entire i_data structure here so that all * fields can be properly initialized. */ - ocfs2_zero_dinode_id2(inode, di); + ocfs2_zero_dinode_id2_with_xattr(inode, di); - idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb)); + idata->id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(inode->i_sb, di)); } int ocfs2_convert_inline_data_to_extents(struct inode *inode, @@ -6253,6 +6662,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct ocfs2_alloc_context *data_ac = NULL; struct page **pages = NULL; loff_t end = osb->s_clustersize; + struct ocfs2_extent_tree et; has_data = i_size_read(inode) ? 1 : 0; @@ -6352,7 +6762,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * this proves to be false, we could always re-build * the in-inode data from our pages. */ - ret = ocfs2_insert_extent(osb, handle, inode, di_bh, + ocfs2_init_dinode_extent_tree(&et, inode, di_bh); + ret = ocfs2_insert_extent(osb, handle, inode, &et, 0, block, 1, 0, NULL); if (ret) { mlog_errno(ret); @@ -6395,13 +6806,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, handle_t *handle = NULL; struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_path *path = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; mlog_entry_void(); new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); - path = ocfs2_new_inode_path(fe_bh); + path = ocfs2_new_path(fe_bh, &di->id2.i_list); if (!path) { status = -ENOMEM; mlog_errno(status); @@ -6572,8 +6984,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -6686,8 +7098,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) mlog(ML_NOTICE, "Truncate completion has non-empty dealloc context\n"); - if (tc->tc_last_eb_bh) - brelse(tc->tc_last_eb_bh); + brelse(tc->tc_last_eb_bh); kfree(tc); } diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 60cd3d5..70257c8 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -26,30 +26,102 @@ #ifndef OCFS2_ALLOC_H #define OCFS2_ALLOC_H + +/* + * For xattr tree leaf, we limit the leaf byte size to be 64K. + */ +#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536 + +/* + * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract + * the b-tree operations in ocfs2. Now all the b-tree operations are not + * limited to ocfs2_dinode only. Any data which need to allocate clusters + * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree + * and operation. + * + * ocfs2_extent_tree becomes the first-class object for extent tree + * manipulation. Callers of the alloc.c code need to fill it via one of + * the ocfs2_init_*_extent_tree() operations below. + * + * ocfs2_extent_tree contains info for the root of the b-tree, it must have a + * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree + * functions. + * ocfs2_extent_tree_operations abstract the normal operations we do for + * the root of extent b-tree. + */ +struct ocfs2_extent_tree_operations; +struct ocfs2_extent_tree { + struct ocfs2_extent_tree_operations *et_ops; + struct buffer_head *et_root_bh; + struct ocfs2_extent_list *et_root_el; + void *et_object; + unsigned int et_max_leaf_clusters; +}; + +/* + * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the + * specified object buffer. + */ +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh); +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh); +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_value_root *xv); + struct ocfs2_alloc_context; int ocfs2_insert_extent(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, u32 new_clusters, u8 flags, struct ocfs2_alloc_context *meta_ac); + +enum ocfs2_alloc_restarted { + RESTART_NONE = 0, + RESTART_TRANS, + RESTART_META +}; +int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_extent_tree *et, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); struct ocfs2_cached_dealloc_ctxt; -int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); -int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_remove_extent(struct inode *inode, + struct ocfs2_extent_tree *et, u32 cpos, u32 len, handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_num_free_extents(struct ocfs2_super *osb, struct inode *inode, - struct ocfs2_dinode *fe); -/* how many new metadata chunks would an allocation need at maximum? */ -static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) + struct ocfs2_extent_tree *et); + +/* + * how many new metadata chunks would an allocation need at maximum? + * + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ +static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el) { /* * Rather than do all the work of determining how much we need @@ -59,7 +131,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) * new tree_depth==0 extent_block, and one block at the new * top-of-the tree. */ - return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; + return le16_to_cpu(root_el->l_tree_depth) + 2; } void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a53da14..c22543b 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -68,9 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, goto bail; } - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - &bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -128,8 +126,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, err = 0; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(err); return err; @@ -261,13 +258,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) { int ret; struct buffer_head *di_bh = NULL; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); BUG_ON(!PageLocked(page)); BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); - ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -485,11 +480,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, } if (ocfs2_should_order_data(inode)) { + ret = ocfs2_jbd2_file_inode(handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD ret = walk_page_buffers(handle, page_buffers(page), from, to, NULL, ocfs2_journal_dirty_data); - if (ret < 0) +#endif + if (ret < 0) mlog_errno(ret); } out: @@ -669,7 +667,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - journal_invalidatepage(journal, page, offset); + jbd2_journal_invalidatepage(journal, page, offset); } static int ocfs2_releasepage(struct page *page, gfp_t wait) @@ -678,7 +676,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) if (!page_has_buffers(page)) return 0; - return journal_try_to_free_buffers(journal, page, wait); + return jbd2_journal_try_to_free_buffers(journal, page, wait); } static ssize_t ocfs2_direct_IO(int rw, @@ -1074,11 +1072,15 @@ static void ocfs2_write_failure(struct inode *inode, tmppage = wc->w_pages[i]; if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) + if (ocfs2_should_order_data(inode)) { + ocfs2_jbd2_file_inode(wc->w_handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD walk_page_buffers(wc->w_handle, page_buffers(tmppage), from, to, NULL, ocfs2_journal_dirty_data); +#endif + } block_commit_write(tmppage, from, to); } @@ -1242,6 +1244,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, int ret, i, new, should_zero = 0; u64 v_blkno, p_blkno; struct inode *inode = mapping->host; + struct ocfs2_extent_tree et; new = phys == 0 ? 1 : 0; if (new || unwritten) @@ -1255,10 +1258,10 @@ static int ocfs2_write_cluster(struct address_space *mapping, * any additional semaphores or cluster locks. */ tmp_pos = cpos; - ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, - &tmp_pos, 1, 0, wc->w_di_bh, - wc->w_handle, data_ac, - meta_ac, NULL); + ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, + &tmp_pos, 1, 0, wc->w_di_bh, + wc->w_handle, data_ac, + meta_ac, NULL); /* * This shouldn't happen because we must have already * calculated the correct meta data allocation required. The @@ -1276,7 +1279,8 @@ static int ocfs2_write_cluster(struct address_space *mapping, goto out; } } else if (unwritten) { - ret = ocfs2_mark_extent_written(inode, wc->w_di_bh, + ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ret = ocfs2_mark_extent_written(inode, &et, wc->w_handle, cpos, 1, phys, meta_ac, &wc->w_dealloc); if (ret < 0) { @@ -1665,6 +1669,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle; + struct ocfs2_extent_tree et; ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); if (ret) { @@ -1712,14 +1717,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * ocfs2_lock_allocators(). It greatly over-estimates * the work to be done. */ - ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, - extents_to_split, &data_ac, &meta_ac); + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u," + " clusters_to_add = %u, extents_to_split = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), + clusters_to_alloc, extents_to_split); + + ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ret = ocfs2_lock_allocators(inode, &et, + clusters_to_alloc, extents_to_split, + &data_ac, &meta_ac); if (ret) { mlog_errno(ret); goto out; } - credits = ocfs2_calc_extend_credits(inode->i_sb, di, + credits = ocfs2_calc_extend_credits(inode->i_sb, + &di->id2.i_list, clusters_to_alloc); } @@ -1905,11 +1919,15 @@ int ocfs2_write_end_nolock(struct address_space *mapping, } if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) + if (ocfs2_should_order_data(inode)) { + ocfs2_jbd2_file_inode(wc->w_handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD walk_page_buffers(wc->w_handle, page_buffers(tmppage), from, to, NULL, ocfs2_journal_dirty_data); +#endif + } block_commit_write(tmppage, from, to); } } diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index f136639..7e947c6 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, /* remove from dirty list before I/O. */ clear_buffer_dirty(bh); - get_bh(bh); /* for end_buffer_write_sync() */ + get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; submit_bh(WRITE, bh); @@ -88,22 +88,103 @@ out: return ret; } -int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, - struct buffer_head *bhs[], int flags, - struct inode *inode) +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]) +{ + int status = 0; + unsigned int i; + struct buffer_head *bh; + + if (!nr) { + mlog(ML_BH_IO, "No buffers will be read!\n"); + goto bail; + } + + for (i = 0 ; i < nr ; i++) { + if (bhs[i] == NULL) { + bhs[i] = sb_getblk(osb->sb, block++); + if (bhs[i] == NULL) { + status = -EIO; + mlog_errno(status); + goto bail; + } + } + bh = bhs[i]; + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "trying to sync read a jbd " + "managed bh (blocknr = %llu), skipping\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + if (buffer_dirty(bh)) { + /* This should probably be a BUG, or + * at least return an error. */ + mlog(ML_ERROR, + "trying to sync read a dirty " + "buffer! (blocknr = %llu), skipping\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + lock_buffer(bh); + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + } + + for (i = nr; i > 0; i--) { + bh = bhs[i - 1]; + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "the journal got the buffer while it was " + "locked for io! (blocknr = %llu)\n", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. */ + status = -EIO; + put_bh(bh); + bhs[i - 1] = NULL; + } + } + +bail: + return status; +} + +int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, + struct buffer_head *bhs[], int flags) { int status = 0; - struct super_block *sb; int i, ignore_cache = 0; struct buffer_head *bh; - mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n", - (unsigned long long)block, nr, flags, inode); + mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n", + inode, (unsigned long long)block, nr, flags); + BUG_ON(!inode); BUG_ON((flags & OCFS2_BH_READAHEAD) && - (!inode || !(flags & OCFS2_BH_CACHED))); + (flags & OCFS2_BH_IGNORE_CACHE)); - if (osb == NULL || osb->sb == NULL || bhs == NULL) { + if (bhs == NULL) { status = -EINVAL; mlog_errno(status); goto bail; @@ -122,26 +203,19 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, goto bail; } - sb = osb->sb; - - if (flags & OCFS2_BH_CACHED && !inode) - flags &= ~OCFS2_BH_CACHED; - - if (inode) - mutex_lock(&OCFS2_I(inode)->ip_io_mutex); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { - bhs[i] = sb_getblk(sb, block++); + bhs[i] = sb_getblk(inode->i_sb, block++); if (bhs[i] == NULL) { - if (inode) - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); status = -EIO; mlog_errno(status); goto bail; } } bh = bhs[i]; - ignore_cache = 0; + ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE); /* There are three read-ahead cases here which we need to * be concerned with. All three assume a buffer has @@ -167,26 +241,27 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, * before our is-it-in-flight check. */ - if (flags & OCFS2_BH_CACHED && - !ocfs2_buffer_uptodate(inode, bh)) { + if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) { mlog(ML_UPTODATE, "bh (%llu), inode %llu not uptodate\n", (unsigned long long)bh->b_blocknr, (unsigned long long)OCFS2_I(inode)->ip_blkno); + /* We're using ignore_cache here to say + * "go to disk" */ ignore_cache = 1; } /* XXX: Can we ever get this and *not* have the cached * flag set? */ if (buffer_jbd(bh)) { - if (!(flags & OCFS2_BH_CACHED) || ignore_cache) + if (ignore_cache) mlog(ML_BH_IO, "trying to sync read a jbd " "managed bh (blocknr = %llu)\n", (unsigned long long)bh->b_blocknr); continue; } - if (!(flags & OCFS2_BH_CACHED) || ignore_cache) { + if (ignore_cache) { if (buffer_dirty(bh)) { /* This should probably be a BUG, or * at least return an error. */ @@ -221,7 +296,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, * previously read-ahead buffer may have * completed I/O while we were waiting for the * buffer lock. */ - if ((flags & OCFS2_BH_CACHED) + if (!(flags & OCFS2_BH_IGNORE_CACHE) && !(flags & OCFS2_BH_READAHEAD) && ocfs2_buffer_uptodate(inode, bh)) { unlock_buffer(bh); @@ -265,15 +340,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, /* Always set the buffer in the cache, even if it was * a forced read, or read-ahead which hasn't yet * completed. */ - if (inode) - ocfs2_set_buffer_uptodate(inode, bh); + ocfs2_set_buffer_uptodate(inode, bh); } - if (inode) - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", (unsigned long long)block, nr, - (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags); + ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes", + flags); bail: diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index c2e7861..75e1dcb 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -31,31 +31,29 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh, int uptodate); -static inline int ocfs2_read_block(struct ocfs2_super *osb, +static inline int ocfs2_read_block(struct inode *inode, u64 off, - struct buffer_head **bh, - int flags, - struct inode *inode); + struct buffer_head **bh); int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct inode *inode); -int ocfs2_read_blocks(struct ocfs2_super *osb, +int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, struct buffer_head *bhs[], - int flags, - struct inode *inode); + int flags); +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]); int ocfs2_write_super_or_backup(struct ocfs2_super *osb, struct buffer_head *bh); -#define OCFS2_BH_CACHED 1 +#define OCFS2_BH_IGNORE_CACHE 1 #define OCFS2_BH_READAHEAD 8 -static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, - struct buffer_head **bh, int flags, - struct inode *inode) +static inline int ocfs2_read_block(struct inode *inode, u64 off, + struct buffer_head **bh) { int status = 0; @@ -65,8 +63,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, goto bail; } - status = ocfs2_read_blocks(osb, off, 1, bh, - flags, inode); + status = ocfs2_read_blocks(inode, off, 1, bh, 0); bail: return status; diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 23c732f..d8a0cb9 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(CONN), define_mask(QUORUM), define_mask(EXPORT), + define_mask(XATTR), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 597e064..57670c6 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -112,6 +112,7 @@ #define ML_CONN 0x0000000004000000ULL /* net connection management */ #define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ +#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ /* bits that are infrequently given and frequently matched in the high word */ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 9cce563..026e6eb 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb, struct ocfs2_alloc_context *meta_ac, struct buffer_head **new_bh); +static struct buffer_head *ocfs2_bread(struct inode *inode, + int block, int *err, int reada) +{ + struct buffer_head *bh = NULL; + int tmperr; + u64 p_blkno; + int readflags = 0; + + if (reada) + readflags |= OCFS2_BH_READAHEAD; + + if (((u64)block << inode->i_sb->s_blocksize_bits) >= + i_size_read(inode)) { + BUG_ON(!reada); + return NULL; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, + NULL); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (tmperr < 0) { + mlog_errno(tmperr); + goto fail; + } + + tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags); + if (tmperr < 0) + goto fail; + + tmperr = 0; + + *err = 0; + return bh; + +fail: + brelse(bh); + bh = NULL; + + *err = -EIO; + return NULL; +} + /* * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. @@ -188,8 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name, struct ocfs2_dinode *di; struct ocfs2_inline_data *data; - ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, dir); + ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -260,14 +302,13 @@ restart: } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ + if (ocfs2_read_block(dir, block, &bh)) { + /* read error, skip block & hope for the best. + * ocfs2_read_block() has released the bh. */ ocfs2_error(dir->i_sb, "reading directory %llu, " "offset %lu\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, block); - brelse(bh); goto next; } i = ocfs2_search_dirblock(bh, dir, name, namelen, @@ -417,8 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle, struct ocfs2_dinode *di; struct ocfs2_inline_data *data; - ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, dir); + ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -596,8 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, struct ocfs2_inline_data *data; struct ocfs2_dir_entry *de; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); if (ret) { mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -716,8 +755,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, for (i = ra_sectors >> (sb->s_blocksize_bits - 9); i > 0; i--) { tmp = ocfs2_bread(inode, ++blk, &err, 1); - if (tmp) - brelse(tmp); + brelse(tmp); } last_ra_blk = blk; ra_sectors = 8; @@ -899,10 +937,8 @@ int ocfs2_find_files_on_disk(const char *name, leave: if (status < 0) { *dirent = NULL; - if (*dirent_bh) { - brelse(*dirent_bh); - *dirent_bh = NULL; - } + brelse(*dirent_bh); + *dirent_bh = NULL; } mlog_exit(status); @@ -951,8 +987,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir, ret = 0; bail: - if (dirent_bh) - brelse(dirent_bh); + brelse(dirent_bh); mlog_exit(ret); return ret; @@ -1127,8 +1162,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, status = 0; bail: - if (new_bh) - brelse(new_bh); + brelse(new_bh); mlog_exit(status); return status; @@ -1192,6 +1226,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, struct buffer_head *dirdata_bh = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, dir, di_bh); alloc = ocfs2_clusters_for_bytes(sb, bytes); @@ -1305,8 +1342,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * This should never fail as our extent list is empty and all * related blocks have been journaled already. */ - ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0, - NULL); + ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len, + 0, NULL); if (ret) { mlog_errno(ret); goto out_commit; @@ -1337,8 +1374,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, } blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); - ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno, - len, 0, NULL); + ret = ocfs2_insert_extent(osb, handle, dir, &et, 1, + blkno, len, 0, NULL); if (ret) { mlog_errno(ret); goto out_commit; @@ -1383,9 +1420,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb, if (extend) { u32 offset = OCFS2_I(dir)->ip_clusters; - status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, - 1, 0, parent_fe_bh, handle, - data_ac, meta_ac, NULL); + status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, + 1, 0, parent_fe_bh, handle, + data_ac, meta_ac, NULL); BUG_ON(status == -EAGAIN); if (status < 0) { mlog_errno(status); @@ -1430,12 +1467,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, int credits, num_free_extents, drop_alloc_sem = 0; loff_t dir_i_size; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + struct ocfs2_extent_list *el = &fe->id2.i_list; struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle = NULL; struct buffer_head *new_bh = NULL; struct ocfs2_dir_entry * de; struct super_block *sb = osb->sb; + struct ocfs2_extent_tree et; mlog_entry_void(); @@ -1479,7 +1518,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, spin_lock(&OCFS2_I(dir)->ip_lock); if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { spin_unlock(&OCFS2_I(dir)->ip_lock); - num_free_extents = ocfs2_num_free_extents(osb, dir, fe); + ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh); + num_free_extents = ocfs2_num_free_extents(osb, dir, &et); if (num_free_extents < 0) { status = num_free_extents; mlog_errno(status); @@ -1487,7 +1527,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, } if (!num_free_extents) { - status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); + status = ocfs2_reserve_new_metadata(osb, el, &meta_ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -1502,7 +1542,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, goto bail; } - credits = ocfs2_calc_extend_credits(sb, fe, 1); + credits = ocfs2_calc_extend_credits(sb, el, 1); } else { spin_unlock(&OCFS2_I(dir)->ip_lock); credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; @@ -1568,8 +1608,7 @@ bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); - if (new_bh) - brelse(new_bh); + brelse(new_bh); mlog_exit(status); return status; @@ -1696,8 +1735,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = 0; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -1756,7 +1794,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, *ret_de_bh = bh; bh = NULL; out: - if (bh) - brelse(bh); + brelse(bh); return ret; } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index eae3d64..ec68442 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2024,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode, } else { /* Boo, we have to go to disk. */ /* read bh, cast, ocfs2_refresh_inode */ - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno, - bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, oi->ip_blkno, bh); if (status < 0) { mlog_errno(status); goto bail_refresh; @@ -2086,11 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode, return 0; } - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - ret_bh, - OCFS2_BH_CACHED, - inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh); if (status < 0) mlog_errno(status); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index aed268e..2baedac 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -293,8 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), last_eb_blk, - &eb_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -382,9 +381,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode, if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) goto no_more_extents; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + ret = ocfs2_read_block(inode, le64_to_cpu(eb->h_next_leaf_blk), - &next_eb_bh, OCFS2_BH_CACHED, inode); + &next_eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -551,6 +550,66 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb, *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff; } +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec; + u32 coff; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + ret = -EROFS; + mlog_errno(ret); + goto out; + } else { + rec = &el->l_recs[i]; + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); + + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + coff = v_cluster - le32_to_cpu(rec->e_cpos); + *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; + if (num_clusters) + *num_clusters = ocfs2_rec_clusters(el, rec) - coff; + } +out: + if (eb_bh) + brelse(eb_bh); + return ret; +} + int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, unsigned int *extent_flags) @@ -571,8 +630,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, if (ret == 0) goto out; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index 1b97490..1c4aa8b 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -53,4 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len); +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el); + #endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ed38796..8d3225a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -55,6 +55,7 @@ #include "mmap.h" #include "suballoc.h" #include "super.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -184,7 +185,7 @@ static int ocfs2_sync_file(struct file *file, goto bail; journal = osb->journal->j_journal; - err = journal_force_commit(journal); + err = jbd2_journal_force_commit(journal); bail: mlog_exit(err); @@ -488,7 +489,7 @@ bail: } /* - * extend allocation only here. + * extend file allocation only here. * we'll update all the disk stuff, and oip->alloc_size * * expect stuff to be locked, a transaction started and enough data / @@ -497,189 +498,25 @@ bail: * Will return -EAGAIN, and a reason if a restart is needed. * If passed in, *reason will always be set, even in error. */ -int ocfs2_do_extend_allocation(struct ocfs2_super *osb, - struct inode *inode, - u32 *logical_offset, - u32 clusters_to_add, - int mark_unwritten, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - enum ocfs2_alloc_restarted *reason_ret) +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) { - int status = 0; - int free_extents; - struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; - enum ocfs2_alloc_restarted reason = RESTART_NONE; - u32 bit_off, num_bits; - u64 block; - u8 flags = 0; - - BUG_ON(!clusters_to_add); - - if (mark_unwritten) - flags = OCFS2_EXT_UNWRITTEN; - - free_extents = ocfs2_num_free_extents(osb, inode, fe); - if (free_extents < 0) { - status = free_extents; - mlog_errno(status); - goto leave; - } - - /* there are two cases which could cause us to EAGAIN in the - * we-need-more-metadata case: - * 1) we haven't reserved *any* - * 2) we are so fragmented, we've needed to add metadata too - * many times. */ - if (!free_extents && !meta_ac) { - mlog(0, "we haven't reserved any metadata!\n"); - status = -EAGAIN; - reason = RESTART_META; - goto leave; - } else if ((!free_extents) - && (ocfs2_alloc_context_bits_left(meta_ac) - < ocfs2_extend_meta_needed(fe))) { - mlog(0, "filesystem is really fragmented...\n"); - status = -EAGAIN; - reason = RESTART_META; - goto leave; - } - - status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, - clusters_to_add, &bit_off, &num_bits); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); - goto leave; - } - - BUG_ON(num_bits > clusters_to_add); - - /* reserve our write early -- insert_extent may update the inode */ - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - block = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "Allocating %u clusters at block %u for inode %llu\n", - num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_insert_extent(osb, handle, inode, fe_bh, - *logical_offset, block, num_bits, - flags, meta_ac); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - clusters_to_add -= num_bits; - *logical_offset += num_bits; - - if (clusters_to_add) { - mlog(0, "need to alloc once more, clusters = %u, wanted = " - "%u\n", fe->i_clusters, clusters_to_add); - status = -EAGAIN; - reason = RESTART_TRANS; - } - -leave: - mlog_exit(status); - if (reason_ret) - *reason_ret = reason; - return status; -} - -/* - * For a given allocation, determine which allocators will need to be - * accessed, and lock them, reserving the appropriate number of bits. - * - * Sparse file systems call this from ocfs2_write_begin_nolock() - * and ocfs2_allocate_unwritten_extents(). - * - * File systems which don't support holes call this from - * ocfs2_extend_allocation(). - */ -int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, - u32 clusters_to_add, u32 extents_to_split, - struct ocfs2_alloc_context **data_ac, - struct ocfs2_alloc_context **meta_ac) -{ - int ret = 0, num_free_extents; - unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - *meta_ac = NULL; - if (data_ac) - *data_ac = NULL; - - BUG_ON(clusters_to_add != 0 && data_ac == NULL); - - mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " - "clusters_to_add = %u, extents_to_split = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), - le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); - - num_free_extents = ocfs2_num_free_extents(osb, inode, di); - if (num_free_extents < 0) { - ret = num_free_extents; - mlog_errno(ret); - goto out; - } - - /* - * Sparse allocation file systems need to be more conservative - * with reserving room for expansion - the actual allocation - * happens while we've got a journal handle open so re-taking - * a cluster lock (because we ran out of room for another - * extent) will violate ordering rules. - * - * Most of the time we'll only be seeing this 1 cluster at a time - * anyway. - * - * Always lock for any unwritten extents - we might want to - * add blocks during a split. - */ - if (!num_free_extents || - (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { - ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - } - - if (clusters_to_add == 0) - goto out; - - ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - -out: - if (ret) { - if (*meta_ac) { - ocfs2_free_alloc_context(*meta_ac); - *meta_ac = NULL; - } + int ret; + struct ocfs2_extent_tree et; - /* - * We cannot have an error and a non null *data_ac. - */ - } + ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); + ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, + clusters_to_add, mark_unwritten, + &et, handle, + data_ac, meta_ac, reason_ret); return ret; } @@ -698,6 +535,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, struct ocfs2_alloc_context *meta_ac = NULL; enum ocfs2_alloc_restarted why; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); @@ -707,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, */ BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); if (status < 0) { mlog_errno(status); goto leave; @@ -724,14 +561,21 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); - status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac, - &meta_ac); + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " + "clusters_to_add = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), + clusters_to_add); + ocfs2_init_dinode_extent_tree(&et, inode, bh); + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); if (status) { mlog_errno(status); goto leave; } - credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); + credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, + clusters_to_add); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -753,16 +597,16 @@ restarted_transaction: prev_clusters = OCFS2_I(inode)->ip_clusters; - status = ocfs2_do_extend_allocation(osb, - inode, - &logical_start, - clusters_to_add, - mark_unwritten, - bh, - handle, - data_ac, - meta_ac, - &why); + status = ocfs2_add_inode_data(osb, + inode, + &logical_start, + clusters_to_add, + mark_unwritten, + bh, + handle, + data_ac, + meta_ac, + &why); if ((status < 0) && (status != -EAGAIN)) { if (status != -ENOSPC) mlog_errno(status); @@ -789,7 +633,7 @@ restarted_transaction: mlog(0, "restarting transaction.\n"); /* TODO: This can be more intelligent. */ credits = ocfs2_calc_extend_credits(osb->sb, - fe, + &fe->id2.i_list, clusters_to_add); status = ocfs2_extend_trans(handle, credits); if (status < 0) { @@ -826,10 +670,8 @@ leave: restart_func = 0; goto restart_all; } - if (bh) { - brelse(bh); - bh = NULL; - } + brelse(bh); + bh = NULL; mlog_exit(status); return status; @@ -1096,9 +938,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) goto bail_unlock; } - if (i_size_read(inode) > attr->ia_size) + if (i_size_read(inode) > attr->ia_size) { + if (ocfs2_should_order_data(inode)) { + status = ocfs2_begin_ordered_truncate(inode, + attr->ia_size); + if (status) + goto bail_unlock; + } status = ocfs2_truncate_file(inode, bh, attr->ia_size); - else + } else status = ocfs2_extend_file(inode, bh, attr->ia_size); if (status < 0) { if (status != -ENOSPC) @@ -1140,8 +988,7 @@ bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -1284,8 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode) struct buffer_head *bh = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, oi->ip_blkno, &bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1311,9 +1157,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, struct buffer_head *di_bh = NULL; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, &di_bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, + &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -1394,8 +1239,11 @@ static int __ocfs2_remove_inode_range(struct inode *inode, handle_t *handle; struct ocfs2_alloc_context *meta_ac = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_tree et; - ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac); + ocfs2_init_dinode_extent_tree(&et, inode, di_bh); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); if (ret) { mlog_errno(ret); return ret; @@ -1425,7 +1273,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode, goto out; } - ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac, + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, dealloc); if (ret) { mlog_errno(ret); @@ -2040,7 +1888,7 @@ out_dio: */ if (old_size != i_size_read(inode) || old_clusters != OCFS2_I(inode)->ip_clusters) { - ret = journal_force_commit(osb->journal->j_journal); + ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; } @@ -2227,6 +2075,10 @@ const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, .fallocate = ocfs2_fallocate, .fiemap = ocfs2_fiemap, }; @@ -2237,6 +2089,10 @@ const struct inode_operations ocfs2_special_file_iops = { .permission = ocfs2_permission, }; +/* + * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with + * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! + */ const struct file_operations ocfs2_fops = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -2251,6 +2107,7 @@ const struct file_operations ocfs2_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif + .lock = ocfs2_lock, .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, @@ -2267,5 +2124,51 @@ const struct file_operations ocfs2_dops = { #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, +}; + +/* + * POSIX-lockless variants of our file_operations. + * + * These will be used if the underlying cluster stack does not support + * posix file locking, if the user passes the "localflocks" mount + * option, or if we have a local-only fs. + * + * ocfs2_flock is in here because all stacks handle UNIX file locks, + * so we still want it in the case of no stack support for + * plocks. Internally, it will do the right thing when asked to ignore + * the cluster. + */ +const struct file_operations ocfs2_fops_no_plocks = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .aio_read = ocfs2_file_aio_read, + .aio_write = ocfs2_file_aio_write, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, + .splice_read = ocfs2_file_splice_read, + .splice_write = ocfs2_file_splice_write, +}; + +const struct file_operations ocfs2_dops_no_plocks = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ocfs2_readdir, + .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif .flock = ocfs2_flock, }; diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 1e27b4d..e92382c 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -28,9 +28,12 @@ extern const struct file_operations ocfs2_fops; extern const struct file_operations ocfs2_dops; +extern const struct file_operations ocfs2_fops_no_plocks; +extern const struct file_operations ocfs2_dops_no_plocks; extern const struct inode_operations ocfs2_file_iops; extern const struct inode_operations ocfs2_special_file_iops; struct ocfs2_alloc_context; +enum ocfs2_alloc_restarted; struct ocfs2_file_private { struct file *fp_file; @@ -38,27 +41,18 @@ struct ocfs2_file_private { struct ocfs2_lock_res fp_flock; }; -enum ocfs2_alloc_restarted { - RESTART_NONE = 0, - RESTART_TRANS, - RESTART_META -}; -int ocfs2_do_extend_allocation(struct ocfs2_super *osb, - struct inode *inode, - u32 *logical_offset, - u32 clusters_to_add, - int mark_unwritten, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to); -int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, - u32 clusters_to_add, u32 extents_to_split, - struct ocfs2_alloc_context **data_ac, - struct ocfs2_alloc_context **meta_ac); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7e9e4c7..4903688 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -49,6 +49,7 @@ #include "symlink.h" #include "sysfile.h" #include "uptodate.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -219,6 +220,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, struct super_block *sb; struct ocfs2_super *osb; int status = -EINVAL; + int use_plocks = 1; mlog_entry("(0x%p, size:%llu)\n", inode, (unsigned long long)le64_to_cpu(fe->i_size)); @@ -226,6 +228,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, sb = inode->i_sb; osb = OCFS2_SB(sb); + if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || + ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) + use_plocks = 0; + /* this means that read_inode cannot create a superblock inode * today. change if needed. */ if (!OCFS2_IS_VALID_DINODE(fe) || @@ -295,13 +301,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, switch (inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_fop = &ocfs2_fops; + if (use_plocks) + inode->i_fop = &ocfs2_fops; + else + inode->i_fop = &ocfs2_fops_no_plocks; inode->i_op = &ocfs2_file_iops; i_size_write(inode, le64_to_cpu(fe->i_size)); break; case S_IFDIR: inode->i_op = &ocfs2_dir_iops; - inode->i_fop = &ocfs2_dops; + if (use_plocks) + inode->i_fop = &ocfs2_dops; + else + inode->i_fop = &ocfs2_dops_no_plocks; i_size_write(inode, le64_to_cpu(fe->i_size)); break; case S_IFLNK: @@ -448,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode, } } - status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, - can_lock ? inode : NULL); + if (can_lock) + status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, + OCFS2_BH_IGNORE_CACHE); + else + status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -522,6 +537,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, * data and fast symlinks. */ if (fe->i_clusters) { + if (ocfs2_should_order_data(inode)) + ocfs2_begin_ordered_truncate(inode, 0); + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -730,6 +748,13 @@ static int ocfs2_wipe_inode(struct inode *inode, goto bail_unlock_dir; } + /*Free extended attribute resources associated with this inode.*/ + status = ocfs2_xattr_remove(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, orphan_dir_bh); if (status < 0) @@ -1081,6 +1106,8 @@ void ocfs2_clear_inode(struct inode *inode) oi->ip_last_trans = 0; oi->ip_dir_start_lookup = 0; oi->ip_blkno = 0ULL; + jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, + &oi->ip_jinode); bail: mlog_exit_void(); @@ -1107,58 +1134,6 @@ void ocfs2_drop_inode(struct inode *inode) } /* - * TODO: this should probably be merged into ocfs2_get_block - * - * However, you now need to pay attention to the cont_prepare_write() - * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much - * expects never to extend). - */ -struct buffer_head *ocfs2_bread(struct inode *inode, - int block, int *err, int reada) -{ - struct buffer_head *bh = NULL; - int tmperr; - u64 p_blkno; - int readflags = OCFS2_BH_CACHED; - - if (reada) - readflags |= OCFS2_BH_READAHEAD; - - if (((u64)block << inode->i_sb->s_blocksize_bits) >= - i_size_read(inode)) { - BUG_ON(!reada); - return NULL; - } - - down_read(&OCFS2_I(inode)->ip_alloc_sem); - tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, - NULL); - up_read(&OCFS2_I(inode)->ip_alloc_sem); - if (tmperr < 0) { - mlog_errno(tmperr); - goto fail; - } - - tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh, - readflags, inode); - if (tmperr < 0) - goto fail; - - tmperr = 0; - - *err = 0; - return bh; - -fail: - if (bh) { - brelse(bh); - bh = NULL; - } - *err = -EIO; - return NULL; -} - -/* * This is called from our getattr. */ int ocfs2_inode_revalidate(struct dentry *dentry) diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 390a855..2f37af9 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -40,6 +40,9 @@ struct ocfs2_inode_info /* protects allocation changes on this inode. */ struct rw_semaphore ip_alloc_sem; + /* protects extended attribute changes on this inode */ + struct rw_semaphore ip_xattr_sem; + /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; @@ -68,6 +71,7 @@ struct ocfs2_inode_info struct ocfs2_extent_map ip_extent_map; struct inode vfs_inode; + struct jbd2_inode ip_jinode; }; /* @@ -113,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache; extern const struct address_space_operations ocfs2_aops; -struct buffer_head *ocfs2_bread(struct inode *inode, int block, - int *err, int reada); void ocfs2_clear_inode(struct inode *inode); void ocfs2_delete_inode(struct inode *inode); void ocfs2_drop_inode(struct inode *inode); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 7b142f0..9fcd36d 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -102,8 +102,7 @@ bail_unlock: bail: mutex_unlock(&inode->i_mutex); - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index c47bc2a..81e4067 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) goto finally; } - journal_lock_updates(journal->j_journal); - status = journal_flush(journal->j_journal); - journal_unlock_updates(journal->j_journal); + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) { up_write(&journal->j_trans_barrier); mlog_errno(status); @@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) down_read(&osb->journal->j_trans_barrier); - handle = journal_start(journal, max_buffs); + handle = jbd2_journal_start(journal, max_buffs); if (IS_ERR(handle)) { up_read(&osb->journal->j_trans_barrier); @@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, BUG_ON(!handle); - ret = journal_stop(handle); + ret = jbd2_journal_stop(handle); if (ret < 0) mlog_errno(ret); @@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, * transaction. extend_trans will either extend the current handle by * nblocks, or commit it and start a new one with nblocks credits. * - * This might call journal_restart() which will commit dirty buffers + * This might call jbd2_journal_restart() which will commit dirty buffers * and then restart the transaction. Before calling * ocfs2_extend_trans(), any changed blocks should have been * dirtied. After calling it, all blocks which need to be changed must @@ -332,7 +332,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) #ifdef CONFIG_OCFS2_DEBUG_FS status = 1; #else - status = journal_extend(handle, nblocks); + status = jbd2_journal_extend(handle, nblocks); if (status < 0) { mlog_errno(status); goto bail; @@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) #endif if (status > 0) { - mlog(0, "journal_extend failed, trying journal_restart\n"); - status = journal_restart(handle, nblocks); + mlog(0, + "jbd2_journal_extend failed, trying " + "jbd2_journal_restart\n"); + status = jbd2_journal_restart(handle, nblocks); if (status < 0) { mlog_errno(status); goto bail; @@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle, switch (type) { case OCFS2_JOURNAL_ACCESS_CREATE: case OCFS2_JOURNAL_ACCESS_WRITE: - status = journal_get_write_access(handle, bh); + status = jbd2_journal_get_write_access(handle, bh); break; case OCFS2_JOURNAL_ACCESS_UNDO: - status = journal_get_undo_access(handle, bh); + status = jbd2_journal_get_undo_access(handle, bh); break; default: @@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle, mlog_entry("(bh->b_blocknr=%llu)\n", (unsigned long long)bh->b_blocknr); - status = journal_dirty_metadata(handle, bh); + status = jbd2_journal_dirty_metadata(handle, bh); if (status < 0) mlog(ML_ERROR, "Could not dirty metadata buffer. " "(bh->b_blocknr=%llu)\n", @@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle, return status; } +#ifdef CONFIG_OCFS2_COMPAT_JBD int ocfs2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) { @@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle, return err; } +#endif -#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE) +#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) void ocfs2_set_journal_params(struct ocfs2_super *osb) { @@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) spin_lock(&journal->j_state_lock); journal->j_commit_interval = commit_interval; if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) - journal->j_flags |= JFS_BARRIER; + journal->j_flags |= JBD2_BARRIER; else - journal->j_flags &= ~JFS_BARRIER; + journal->j_flags &= ~JBD2_BARRIER; spin_unlock(&journal->j_state_lock); } @@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); /* call the kernels journal init function now */ - j_journal = journal_init_inode(inode); + j_journal = jbd2_journal_init_inode(inode); if (j_journal == NULL) { mlog(ML_ERROR, "Linux journal layer error\n"); status = -EINVAL; goto done; } - mlog(0, "Returned from journal_init_inode\n"); + mlog(0, "Returned from jbd2_journal_init_inode\n"); mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & @@ -550,8 +554,7 @@ done: if (status < 0) { if (inode_lock) ocfs2_inode_unlock(inode, 1); - if (bh != NULL) - brelse(bh); + brelse(bh); if (inode) { OCFS2_I(inode)->ip_open_count--; iput(inode); @@ -639,7 +642,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) if (journal->j_state != OCFS2_JOURNAL_LOADED) goto done; - /* need to inc inode use count as journal_destroy will iput. */ + /* need to inc inode use count - jbd2_journal_destroy will iput. */ if (!igrab(inode)) BUG(); @@ -668,9 +671,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); if (ocfs2_mount_local(osb)) { - journal_lock_updates(journal->j_journal); - status = journal_flush(journal->j_journal); - journal_unlock_updates(journal->j_journal); + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) mlog_errno(status); } @@ -686,7 +689,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) } /* Shutdown the kernel journal system */ - journal_destroy(journal->j_journal); + jbd2_journal_destroy(journal->j_journal); OCFS2_I(inode)->ip_open_count--; @@ -711,15 +714,15 @@ static void ocfs2_clear_journal_error(struct super_block *sb, { int olderr; - olderr = journal_errno(journal); + olderr = jbd2_journal_errno(journal); if (olderr) { mlog(ML_ERROR, "File system error %d recorded in " "journal %u.\n", olderr, slot); mlog(ML_ERROR, "File system on device %s needs checking.\n", sb->s_id); - journal_ack_err(journal); - journal_clear_err(journal); + jbd2_journal_ack_err(journal); + jbd2_journal_clear_err(journal); } } @@ -734,7 +737,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) osb = journal->j_osb; - status = journal_load(journal->j_journal); + status = jbd2_journal_load(journal->j_journal); if (status < 0) { mlog(ML_ERROR, "Failed to load journal!\n"); goto done; @@ -778,7 +781,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) BUG_ON(!journal); - status = journal_wipe(journal->j_journal, full); + status = jbd2_journal_wipe(journal->j_journal, full); if (status < 0) { mlog_errno(status); goto bail; @@ -847,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode) /* We are reading journal data which should not * be put in the uptodate cache */ - status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), - p_blkno, p_blocks, bhs, 0, - NULL); + status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), + p_blkno, p_blocks, bhs); if (status < 0) { mlog_errno(status); goto bail; @@ -865,8 +867,7 @@ static int ocfs2_force_read_journal(struct inode *inode) bail: for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); mlog_exit(status); return status; } @@ -1133,7 +1134,8 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb, } SET_INODE_JOURNAL(inode); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode); + status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, + OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -1229,19 +1231,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, } mlog(0, "calling journal_init_inode\n"); - journal = journal_init_inode(inode); + journal = jbd2_journal_init_inode(inode); if (journal == NULL) { mlog(ML_ERROR, "Linux journal layer error\n"); status = -EIO; goto done; } - status = journal_load(journal); + status = jbd2_journal_load(journal); if (status < 0) { mlog_errno(status); if (!igrab(inode)) BUG(); - journal_destroy(journal); + jbd2_journal_destroy(journal); goto done; } @@ -1249,9 +1251,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, /* wipe the journal */ mlog(0, "flushing the journal.\n"); - journal_lock_updates(journal); - status = journal_flush(journal); - journal_unlock_updates(journal); + jbd2_journal_lock_updates(journal); + status = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); if (status < 0) mlog_errno(status); @@ -1272,7 +1274,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, if (!igrab(inode)) BUG(); - journal_destroy(journal); + jbd2_journal_destroy(journal); done: /* drop the lock on this nodes journal */ @@ -1282,8 +1284,7 @@ done: if (inode) iput(inode); - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 2178ebf..d4d14e9 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -27,7 +27,12 @@ #define OCFS2_JOURNAL_H #include <linux/fs.h> -#include <linux/jbd.h> +#ifndef CONFIG_OCFS2_COMPAT_JBD +# include <linux/jbd2.h> +#else +# include <linux/jbd.h> +# include "ocfs2_jbd_compat.h" +#endif enum ocfs2_journal_state { OCFS2_JOURNAL_FREE = 0, @@ -215,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) * buffer. Will have to call ocfs2_journal_dirty once * we've actually dirtied it. Type is one of . or . * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. - * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before - * the current handle commits. + * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before + * the current handle commits. */ /* You must always start_trans with a number of buffs > 0, but it's @@ -268,8 +273,10 @@ int ocfs2_journal_access(handle_t *handle, */ int ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh); +#ifdef CONFIG_OCFS2_COMPAT_JBD int ocfs2_journal_dirty_data(handle_t *handle, struct buffer_head *bh); +#endif /* * Credit Macros: @@ -283,6 +290,9 @@ int ocfs2_journal_dirty_data(handle_t *handle, /* simple file updates like chmod, etc. */ #define OCFS2_INODE_UPDATE_CREDITS 1 +/* extended attribute block update */ +#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 + /* group extend. inode update and last group update. */ #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) @@ -340,11 +350,23 @@ int ocfs2_journal_dirty_data(handle_t *handle, #define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ + OCFS2_UNLINK_CREDITS) +/* global bitmap dinode, group desc., relinked group, + * suballocator dinode, group desc., relinked group, + * dinode, xattr block */ +#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \ + + OCFS2_INODE_UPDATE_CREDITS \ + + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) + +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ static inline int ocfs2_calc_extend_credits(struct super_block *sb, - struct ocfs2_dinode *fe, + struct ocfs2_extent_list *root_el, u32 bits_wanted) { - int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks; + int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks; /* bitmap dinode, group desc. + relinked group. */ bitmap_blocks = OCFS2_SUBALLOC_ALLOC; @@ -355,16 +377,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, * however many metadata chunks needed * a remaining suballoc * alloc. */ sysfile_bitmap_blocks = 1 + - (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe); + (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el); /* this does not include *new* metadata blocks, which are - * accounted for in sysfile_bitmap_blocks. fe + + * accounted for in sysfile_bitmap_blocks. root_el + * prev. last_eb_blk + blocks along edge of tree. * calc_symlink_credits passes because we just need 1 * credit for the dinode there. */ - dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth); + extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); - return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks; + return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks; } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) @@ -415,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, return credits; } +static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); +} + +static inline int ocfs2_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, + new_size); +} + #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 28e492e..687b287 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/bitops.h> +#include <linux/debugfs.h> #define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> @@ -47,8 +48,6 @@ #define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) -static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb); - static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, @@ -75,24 +74,129 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, struct inode *local_alloc_inode); -static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) +#ifdef CONFIG_OCFS2_FS_STATS + +static int ocfs2_la_debug_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE +#define LA_DEBUG_VER 1 +static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + static DEFINE_MUTEX(la_debug_mutex); + struct ocfs2_super *osb = file->private_data; + int written, ret; + char *buf = osb->local_alloc_debug_buf; + + mutex_lock(&la_debug_mutex); + memset(buf, 0, LA_DEBUG_BUF_SZ); + + written = snprintf(buf, LA_DEBUG_BUF_SZ, + "0x%x\t0x%llx\t%u\t%u\t0x%x\n", + LA_DEBUG_VER, + (unsigned long long)osb->la_last_gd, + osb->local_alloc_default_bits, + osb->local_alloc_bits, osb->local_alloc_state); + + ret = simple_read_from_buffer(userbuf, count, ppos, buf, written); + + mutex_unlock(&la_debug_mutex); + return ret; +} + +static const struct file_operations ocfs2_la_debug_fops = { + .open = ocfs2_la_debug_open, + .read = ocfs2_la_debug_read, +}; + +static void ocfs2_init_la_debug(struct ocfs2_super *osb) +{ + osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS); + if (!osb->local_alloc_debug_buf) + return; + + osb->local_alloc_debug = debugfs_create_file("local_alloc_stats", + S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_la_debug_fops); + if (!osb->local_alloc_debug) { + kfree(osb->local_alloc_debug_buf); + osb->local_alloc_debug_buf = NULL; + } +} + +static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb) +{ + if (osb->local_alloc_debug) + debugfs_remove(osb->local_alloc_debug); + + if (osb->local_alloc_debug_buf) + kfree(osb->local_alloc_debug_buf); + + osb->local_alloc_debug_buf = NULL; + osb->local_alloc_debug = NULL; +} +#else /* CONFIG_OCFS2_FS_STATS */ +static void ocfs2_init_la_debug(struct ocfs2_super *osb) +{ + return; +} +static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb) +{ + return; +} +#endif + +static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) { - BUG_ON(osb->s_clustersize_bits > 20); + return (osb->local_alloc_state == OCFS2_LA_THROTTLED || + osb->local_alloc_state == OCFS2_LA_ENABLED); +} - /* Size local alloc windows by the megabyte */ - return osb->local_alloc_size << (20 - osb->s_clustersize_bits); +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters) +{ + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED || + osb->local_alloc_state == OCFS2_LA_THROTTLED) + if (num_clusters >= osb->local_alloc_default_bits) { + cancel_delayed_work(&osb->la_enable_wq); + osb->local_alloc_state = OCFS2_LA_ENABLED; + } + spin_unlock(&osb->osb_lock); +} + +void ocfs2_la_enable_worker(struct work_struct *work) +{ + struct ocfs2_super *osb = + container_of(work, struct ocfs2_super, + la_enable_wq.work); + spin_lock(&osb->osb_lock); + osb->local_alloc_state = OCFS2_LA_ENABLED; + spin_unlock(&osb->osb_lock); } /* * Tell us whether a given allocation should use the local alloc * file. Otherwise, it has to go to the main bitmap. + * + * This function does semi-dirty reads of local alloc size and state! + * This is ok however, as the values are re-checked once under mutex. */ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) { - int la_bits = ocfs2_local_alloc_window_bits(osb); int ret = 0; + int la_bits; + + spin_lock(&osb->osb_lock); + la_bits = osb->local_alloc_bits; - if (osb->local_alloc_state != OCFS2_LA_ENABLED) + if (!ocfs2_la_state_enabled(osb)) goto bail; /* la_bits should be at least twice the size (in clusters) of @@ -106,6 +210,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) bail: mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n", osb->local_alloc_state, (unsigned long long)bits, la_bits, ret); + spin_unlock(&osb->osb_lock); return ret; } @@ -120,14 +225,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) mlog_entry_void(); - if (osb->local_alloc_size == 0) + ocfs2_init_la_debug(osb); + + if (osb->local_alloc_bits == 0) goto bail; - if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) { + if (osb->local_alloc_bits >= osb->bitmap_cpg) { mlog(ML_NOTICE, "Requested local alloc window %d is larger " "than max possible %u. Using defaults.\n", - ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1)); - osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; + osb->local_alloc_bits, (osb->bitmap_cpg - 1)); + osb->local_alloc_bits = + ocfs2_megabytes_to_clusters(osb->sb, + OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); } /* read the alloc off disk */ @@ -139,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) goto bail; } - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, - &alloc_bh, 0, inode); + status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, + &alloc_bh, OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -185,13 +294,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) bail: if (status < 0) - if (alloc_bh) - brelse(alloc_bh); + brelse(alloc_bh); if (inode) iput(inode); - mlog(0, "Local alloc window bits = %d\n", - ocfs2_local_alloc_window_bits(osb)); + if (status < 0) + ocfs2_shutdown_la_debug(osb); + + mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); mlog_exit(status); return status; @@ -217,6 +327,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) mlog_entry_void(); + cancel_delayed_work(&osb->la_enable_wq); + flush_workqueue(ocfs2_wq); + + ocfs2_shutdown_la_debug(osb); + if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out; @@ -295,8 +410,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out_unlock: - if (main_bm_bh) - brelse(main_bm_bh); + brelse(main_bm_bh); ocfs2_inode_unlock(main_bm_inode, 1); @@ -345,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, mutex_lock(&inode->i_mutex); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, - &alloc_bh, 0, inode); + status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, + &alloc_bh, OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -372,8 +486,7 @@ bail: *alloc_copy = NULL; } - if (alloc_bh) - brelse(alloc_bh); + brelse(alloc_bh); if (inode) { mutex_unlock(&inode->i_mutex); @@ -441,8 +554,7 @@ out_unlock: out_mutex: mutex_unlock(&main_bm_inode->i_mutex); - if (main_bm_bh) - brelse(main_bm_bh); + brelse(main_bm_bh); iput(main_bm_inode); @@ -453,8 +565,48 @@ out: return status; } +/* Check to see if the local alloc window is within ac->ac_max_block */ +static int ocfs2_local_alloc_in_range(struct inode *inode, + struct ocfs2_alloc_context *ac, + u32 bits_wanted) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + int start; + u64 block_off; + + if (!ac->ac_max_block) + return 1; + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + if (start == -1) { + mlog_errno(-ENOSPC); + return 0; + } + + /* + * Converting (bm_off + start + bits_wanted) to blocks gives us + * the blkno just past our actual allocation. This is perfect + * to compare with ac_max_block. + */ + block_off = ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(la->la_bm_off) + + start + bits_wanted); + mlog(0, "Checking %llu against %llu\n", + (unsigned long long)block_off, + (unsigned long long)ac->ac_max_block); + if (block_off > ac->ac_max_block) + return 0; + + return 1; +} + /* - * make sure we've got at least bitswanted contiguous bits in the + * make sure we've got at least bits_wanted contiguous bits in the * local alloc. You lose them when you drop i_mutex. * * We will add ourselves to the transaction passed in, but may start @@ -485,16 +637,18 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, mutex_lock(&local_alloc_inode->i_mutex); - if (osb->local_alloc_state != OCFS2_LA_ENABLED) { - status = -ENOSPC; - goto bail; - } - - if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) { - mlog(0, "Asking for more than my max window size!\n"); + /* + * We must double check state and allocator bits because + * another process may have changed them while holding i_mutex. + */ + spin_lock(&osb->osb_lock); + if (!ocfs2_la_state_enabled(osb) || + (bits_wanted > osb->local_alloc_bits)) { + spin_unlock(&osb->osb_lock); status = -ENOSPC; goto bail; } + spin_unlock(&osb->osb_lock); alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; @@ -522,6 +676,36 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, mlog_errno(status); goto bail; } + + /* + * Under certain conditions, the window slide code + * might have reduced the number of bits available or + * disabled the the local alloc entirely. Re-check + * here and return -ENOSPC if necessary. + */ + status = -ENOSPC; + if (!ocfs2_la_state_enabled(osb)) + goto bail; + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) + goto bail; + } + + if (ac->ac_max_block) + mlog(0, "Calling in_range for max block %llu\n", + (unsigned long long)ac->ac_max_block); + + if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac, + bits_wanted)) { + /* + * The window is outside ac->ac_max_block. + * This errno tells the caller to keep localalloc enabled + * but to get the allocation from the main bitmap. + */ + status = -EFBIG; + goto bail; } ac->ac_inode = local_alloc_inode; @@ -789,6 +973,85 @@ bail: return status; } +enum ocfs2_la_event { + OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */ + OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has + * enough bits theoretically + * free, but a contiguous + * allocation could not be + * found. */ + OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have + * enough bits free to satisfy + * our request. */ +}; +#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ) +/* + * Given an event, calculate the size of our next local alloc window. + * + * This should always be called under i_mutex of the local alloc inode + * so that local alloc disabling doesn't race with processes trying to + * use the allocator. + * + * Returns the state which the local alloc was left in. This value can + * be ignored by some paths. + */ +static int ocfs2_recalc_la_window(struct ocfs2_super *osb, + enum ocfs2_la_event event) +{ + unsigned int bits; + int state; + + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED) { + WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED); + goto out_unlock; + } + + /* + * ENOSPC and fragmentation are treated similarly for now. + */ + if (event == OCFS2_LA_EVENT_ENOSPC || + event == OCFS2_LA_EVENT_FRAGMENTED) { + /* + * We ran out of contiguous space in the primary + * bitmap. Drastically reduce the number of bits used + * by local alloc until we have to disable it. + */ + bits = osb->local_alloc_bits >> 1; + if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) { + /* + * By setting state to THROTTLED, we'll keep + * the number of local alloc bits used down + * until an event occurs which would give us + * reason to assume the bitmap situation might + * have changed. + */ + osb->local_alloc_state = OCFS2_LA_THROTTLED; + osb->local_alloc_bits = bits; + } else { + osb->local_alloc_state = OCFS2_LA_DISABLED; + } + queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, + OCFS2_LA_ENABLE_INTERVAL); + goto out_unlock; + } + + /* + * Don't increase the size of the local alloc window until we + * know we might be able to fulfill the request. Otherwise, we + * risk bouncing around the global bitmap during periods of + * low space. + */ + if (osb->local_alloc_state != OCFS2_LA_THROTTLED) + osb->local_alloc_bits = osb->local_alloc_default_bits; + +out_unlock: + state = osb->local_alloc_state; + spin_unlock(&osb->osb_lock); + + return state; +} + static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, struct ocfs2_alloc_context **ac, struct inode **bitmap_inode, @@ -803,12 +1066,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, goto bail; } - (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb); +retry_enospc: + (*ac)->ac_bits_wanted = osb->local_alloc_bits; status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status == -ENOSPC) { + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == + OCFS2_LA_DISABLED) + goto bail; + + ocfs2_free_ac_resource(*ac); + memset(*ac, 0, sizeof(struct ocfs2_alloc_context)); + goto retry_enospc; + } if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); + mlog_errno(status); goto bail; } @@ -849,7 +1121,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, "one\n"); mlog(0, "Allocating %u clusters for a new window.\n", - ocfs2_local_alloc_window_bits(osb)); + osb->local_alloc_bits); /* Instruct the allocation code to try the most recently used * cluster group. We'll re-record the group used this pass @@ -859,9 +1131,36 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, /* we used the generic suballoc reserve function, but we set * everything up nicely, so there's no reason why we can't use * the more specific cluster api to claim bits. */ - status = ocfs2_claim_clusters(osb, handle, ac, - ocfs2_local_alloc_window_bits(osb), + status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, &cluster_off, &cluster_count); + if (status == -ENOSPC) { +retry_enospc: + /* + * Note: We could also try syncing the journal here to + * allow use of any free bits which the current + * transaction can't give us access to. --Mark + */ + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) == + OCFS2_LA_DISABLED) + goto bail; + + status = ocfs2_claim_clusters(osb, handle, ac, + osb->local_alloc_bits, + &cluster_off, + &cluster_count); + if (status == -ENOSPC) + goto retry_enospc; + /* + * We only shrunk the *minimum* number of in our + * request - it's entirely possible that the allocator + * might give us more than we asked for. + */ + if (status == 0) { + spin_lock(&osb->osb_lock); + osb->local_alloc_bits = cluster_count; + spin_unlock(&osb->osb_lock); + } + } if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -905,6 +1204,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, mlog_entry_void(); + ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE); + /* This will lock the main bitmap for us. */ status = ocfs2_local_alloc_reserve_for_window(osb, &ac, @@ -976,8 +1277,7 @@ bail: if (handle) ocfs2_commit_trans(osb, handle); - if (main_bm_bh) - brelse(main_bm_bh); + brelse(main_bm_bh); if (main_bm_inode) iput(main_bm_inode); diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index 3f76631..ac5ea9f8 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, u32 *bit_off, u32 *num_bits); +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters); +void ocfs2_la_enable_worker(struct work_struct *work); + #endif /* OCFS2_LOCALALLOC_H */ diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 203f871..544ac62 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -24,6 +24,7 @@ */ #include <linux/fs.h> +#include <linux/fcntl.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -32,6 +33,7 @@ #include "dlmglue.h" #include "file.h" +#include "inode.h" #include "locks.h" static int ocfs2_do_flock(struct file *file, struct inode *inode, @@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) else return ocfs2_do_flock(file, inode, cmd, fl); } + +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(inode)) + return -ENOLCK; + + return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); +} diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h index 9743ef2..496d488 100644 --- a/fs/ocfs2/locks.h +++ b/fs/ocfs2/locks.h @@ -27,5 +27,6 @@ #define OCFS2_LOCKS_H int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl); +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl); #endif /* OCFS2_LOCKS_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index d5d808f..485a6aa 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -60,6 +60,7 @@ #include "symlink.h" #include "sysfile.h" #include "uptodate.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -327,14 +328,9 @@ leave: if (status == -ENOSPC) mlog(0, "Disk is full\n"); - if (new_fe_bh) - brelse(new_fe_bh); - - if (de_bh) - brelse(de_bh); - - if (parent_fe_bh) - brelse(parent_fe_bh); + brelse(new_fe_bh); + brelse(de_bh); + brelse(parent_fe_bh); if ((status < 0) && inode) iput(inode); @@ -647,12 +643,9 @@ out_unlock_inode: out: ocfs2_inode_unlock(dir, 1); - if (de_bh) - brelse(de_bh); - if (fe_bh) - brelse(fe_bh); - if (parent_fe_bh) - brelse(parent_fe_bh); + brelse(de_bh); + brelse(fe_bh); + brelse(parent_fe_bh); mlog_exit(err); @@ -851,17 +844,10 @@ leave: iput(orphan_dir); } - if (fe_bh) - brelse(fe_bh); - - if (dirent_bh) - brelse(dirent_bh); - - if (parent_node_bh) - brelse(parent_node_bh); - - if (orphan_entry_bh) - brelse(orphan_entry_bh); + brelse(fe_bh); + brelse(dirent_bh); + brelse(parent_node_bh); + brelse(orphan_entry_bh); mlog_exit(status); @@ -1372,24 +1358,15 @@ bail: if (new_inode) iput(new_inode); - if (newfe_bh) - brelse(newfe_bh); - if (old_inode_bh) - brelse(old_inode_bh); - if (old_dir_bh) - brelse(old_dir_bh); - if (new_dir_bh) - brelse(new_dir_bh); - if (new_de_bh) - brelse(new_de_bh); - if (old_de_bh) - brelse(old_de_bh); - if (old_inode_de_bh) - brelse(old_inode_de_bh); - if (orphan_entry_bh) - brelse(orphan_entry_bh); - if (insert_entry_bh) - brelse(insert_entry_bh); + brelse(newfe_bh); + brelse(old_inode_bh); + brelse(old_dir_bh); + brelse(new_dir_bh); + brelse(new_de_bh); + brelse(old_de_bh); + brelse(old_inode_de_bh); + brelse(orphan_entry_bh); + brelse(insert_entry_bh); mlog_exit(status); @@ -1492,8 +1469,7 @@ bail: if (bhs) { for(i = 0; i < blocks; i++) - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); kfree(bhs); } @@ -1598,10 +1574,10 @@ static int ocfs2_symlink(struct inode *dir, u32 offset = 0; inode->i_op = &ocfs2_symlink_inode_operations; - status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0, - new_fe_bh, - handle, data_ac, NULL, - NULL); + status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, + new_fe_bh, + handle, data_ac, NULL, + NULL); if (status < 0) { if (status != -ENOSPC && status != -EINTR) { mlog(ML_ERROR, @@ -1659,12 +1635,9 @@ bail: ocfs2_inode_unlock(dir, 1); - if (new_fe_bh) - brelse(new_fe_bh); - if (parent_fe_bh) - brelse(parent_fe_bh); - if (de_bh) - brelse(de_bh); + brelse(new_fe_bh); + brelse(parent_fe_bh); + brelse(de_bh); if (inode_ac) ocfs2_free_alloc_context(inode_ac); if (data_ac) @@ -1759,8 +1732,7 @@ leave: iput(orphan_dir_inode); } - if (orphan_dir_bh) - brelse(orphan_dir_bh); + brelse(orphan_dir_bh); mlog_exit(status); return status; @@ -1780,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); - status = ocfs2_read_block(osb, + status = ocfs2_read_block(orphan_dir_inode, OCFS2_I(orphan_dir_inode)->ip_blkno, - &orphan_dir_bh, OCFS2_BH_CACHED, - orphan_dir_inode); + &orphan_dir_bh); if (status < 0) { mlog_errno(status); goto leave; @@ -1829,8 +1800,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); leave: - if (orphan_dir_bh) - brelse(orphan_dir_bh); + brelse(orphan_dir_bh); mlog_exit(status); return status; @@ -1898,8 +1868,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, } leave: - if (target_de_bh) - brelse(target_de_bh); + brelse(target_de_bh); mlog_exit(status); return status; @@ -1918,4 +1887,8 @@ const struct inode_operations ocfs2_dir_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7f625f2..a21a465 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -34,7 +34,12 @@ #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/mutex.h> -#include <linux/jbd.h> +#ifndef CONFIG_OCFS2_COMPAT_JBD +# include <linux/jbd2.h> +#else +# include <linux/jbd.h> +# include "ocfs2_jbd_compat.h" +#endif /* For union ocfs2_dlm_lksb */ #include "stackglue.h" @@ -171,9 +176,13 @@ struct ocfs2_alloc_stats enum ocfs2_local_alloc_state { - OCFS2_LA_UNUSED = 0, - OCFS2_LA_ENABLED, - OCFS2_LA_DISABLED + OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for + * this mountpoint. */ + OCFS2_LA_ENABLED, /* Local alloc is in use. */ + OCFS2_LA_THROTTLED, /* Local alloc is in use, but number + * of bits has been reduced. */ + OCFS2_LA_DISABLED /* Local alloc has temporarily been + * disabled. */ }; enum ocfs2_mount_options @@ -184,6 +193,8 @@ enum ocfs2_mount_options OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ + OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ + OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -214,6 +225,7 @@ struct ocfs2_super u32 bitmap_cpg; u8 *uuid; char *uuid_str; + u32 uuid_hash; u8 *vol_label; u64 first_cluster_group_blkno; u32 fs_generation; @@ -241,6 +253,7 @@ struct ocfs2_super int s_sectsize_bits; int s_clustersize; int s_clustersize_bits; + unsigned int s_xattr_inline_size; atomic_t vol_state; struct mutex recovery_lock; @@ -252,11 +265,27 @@ struct ocfs2_super struct ocfs2_journal *journal; unsigned long osb_commit_interval; - int local_alloc_size; - enum ocfs2_local_alloc_state local_alloc_state; + struct delayed_work la_enable_wq; + + /* + * Must hold local alloc i_mutex and osb->osb_lock to change + * local_alloc_bits. Reads can be done under either lock. + */ + unsigned int local_alloc_bits; + unsigned int local_alloc_default_bits; + + enum ocfs2_local_alloc_state local_alloc_state; /* protected + * by osb_lock */ + struct buffer_head *local_alloc_bh; + u64 la_last_gd; +#ifdef CONFIG_OCFS2_FS_STATS + struct dentry *local_alloc_debug; + char *local_alloc_debug_buf; +#endif + /* Next two fields are for local node slot recovery during * mount. */ int dirty; @@ -340,6 +369,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock @@ -554,6 +590,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) return pages_per_cluster; } +static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, + unsigned int megs) +{ + BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576); + + return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); +} + static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) { spin_lock(&osb->osb_lock); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 4f61985..f24ce3d 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -64,6 +64,7 @@ #define OCFS2_INODE_SIGNATURE "INODE01" #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" +#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ @@ -90,7 +91,8 @@ | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ - | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK) + | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ + | OCFS2_FEATURE_INCOMPAT_XATTR) #define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN /* @@ -127,10 +129,6 @@ /* Support for data packed into inode blocks */ #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 -/* Support for the extended slot map */ -#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 - - /* * Support for alternate, userspace cluster stacks. If set, the superblock * field s_cluster_info contains a tag for the alternate stack in use as @@ -142,6 +140,12 @@ */ #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 +/* Support for the extended slot map */ +#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 + +/* Support for extended attributes */ +#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -299,6 +303,12 @@ struct ocfs2_new_group_input { */ #define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8 +/* + * Inline extended attribute size (in bytes) + * The value chosen should be aligned to 16 byte boundaries. + */ +#define OCFS2_MIN_XATTR_INLINE_SIZE 256 + struct ocfs2_system_inode_info { char *si_name; int si_iflags; @@ -563,7 +573,7 @@ struct ocfs2_super_block { /*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts before tunefs required */ __le16 s_tunefs_flag; - __le32 s_reserved1; + __le32 s_uuid_hash; /* hash value of uuid */ __le64 s_first_cluster_group; /* Block offset of 1st cluster * group header */ /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ @@ -571,7 +581,11 @@ struct ocfs2_super_block { /*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace stack. Only valid with INCOMPAT flag. */ -/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */ +/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size + for this fs*/ + __le16 s_reserved0; + __le32 s_reserved1; +/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ /*140*/ /* @@ -621,7 +635,8 @@ struct ocfs2_dinode { belongs to */ __le16 i_suballoc_bit; /* Bit offset in suballocator block group */ -/*10*/ __le32 i_reserved0; +/*10*/ __le16 i_reserved0; + __le16 i_xattr_inline_size; __le32 i_clusters; /* Cluster count */ __le32 i_uid; /* Owner UID */ __le32 i_gid; /* Owning GID */ @@ -640,11 +655,12 @@ struct ocfs2_dinode { __le32 i_atime_nsec; __le32 i_ctime_nsec; __le32 i_mtime_nsec; - __le32 i_attr; +/*70*/ __le32 i_attr; __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL was set in i_flags */ __le16 i_dyn_features; -/*70*/ __le64 i_reserved2[8]; + __le64 i_xattr_loc; +/*80*/ __le64 i_reserved2[7]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -715,6 +731,136 @@ struct ocfs2_group_desc /*40*/ __u8 bg_bitmap[0]; }; +/* + * On disk extended attribute structure for OCFS2. + */ + +/* + * ocfs2_xattr_entry indicates one extend attribute. + * + * Note that it can be stored in inode, one block or one xattr bucket. + */ +struct ocfs2_xattr_entry { + __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */ + __le16 xe_name_offset; /* byte offset from the 1st etnry in the local + local xattr storage(inode, xattr block or + xattr bucket). */ + __u8 xe_name_len; /* xattr name len, does't include prefix. */ + __u8 xe_type; /* the low 7 bits indicates the name prefix's + * type and the highest 1 bits indicate whether + * the EA is stored in the local storage. */ + __le64 xe_value_size; /* real xattr value length. */ +}; + +/* + * On disk structure for xattr header. + * + * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in + * the local xattr storage. + */ +struct ocfs2_xattr_header { + __le16 xh_count; /* contains the count of how + many records are in the + local xattr storage. */ + __le16 xh_free_start; /* current offset for storing + xattr. */ + __le16 xh_name_value_len; /* total length of name/value + length in this bucket. */ + __le16 xh_num_buckets; /* bucket nums in one extent + record, only valid in the + first bucket. */ + __le64 xh_csum; + struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ +}; + +/* + * On disk structure for xattr value root. + * + * It is used when one extended attribute's size is larger, and we will save it + * in an outside cluster. It will stored in a b-tree like file content. + */ +struct ocfs2_xattr_value_root { +/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */ + __le32 xr_reserved0; + __le64 xr_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */ +}; + +/* + * On disk structure for xattr tree root. + * + * It is used when there are too many extended attributes for one file. These + * attributes will be organized and stored in an indexed-btree. + */ +struct ocfs2_xattr_tree_root { +/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */ + __le32 xt_reserved0; + __le64 xt_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */ +}; + +#define OCFS2_XATTR_INDEXED 0x1 +#define OCFS2_HASH_SHIFT 5 +#define OCFS2_XATTR_ROUND 3 +#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \ + ~(OCFS2_XATTR_ROUND)) + +#define OCFS2_XATTR_BUCKET_SIZE 4096 +#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \ + / OCFS2_MIN_BLOCKSIZE) + +/* + * On disk structure for xattr block. + */ +struct ocfs2_xattr_block { +/*00*/ __u8 xb_signature[8]; /* Signature for verification */ + __le16 xb_suballoc_slot; /* Slot suballocator this + block belongs to. */ + __le16 xb_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 xb_fs_generation; /* Must match super block */ +/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ + __le64 xb_csum; +/*20*/ __le16 xb_flags; /* Indicates whether this block contains + real xattr or a xattr tree. */ + __le16 xb_reserved0; + __le32 xb_reserved1; + __le64 xb_reserved2; +/*30*/ union { + struct ocfs2_xattr_header xb_header; /* xattr header if this + block contains xattr */ + struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this + block cotains xattr + tree. */ + } xb_attrs; +}; + +#define OCFS2_XATTR_ENTRY_LOCAL 0x80 +#define OCFS2_XATTR_TYPE_MASK 0x7F +static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe, + int local) +{ + if (local) + xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL; + else + xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type) +{ + xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK; +} + +static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_TYPE_MASK; +} + #ifdef __KERNEL__ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) { @@ -728,6 +874,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb) offsetof(struct ocfs2_dinode, id2.i_data.id_data); } +static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, + struct ocfs2_dinode *di) +{ + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data) - + xattrsize; + else + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data); +} + static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) { int size; @@ -738,6 +898,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) return size / sizeof(struct ocfs2_extent_rec); } +static inline int ocfs2_extent_recs_per_inode_with_xattr( + struct super_block *sb, + struct ocfs2_dinode *di) +{ + int size; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs) - + xattrsize; + else + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) { int size; @@ -801,6 +979,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index) return 0; } + +static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} #else static inline int ocfs2_fast_symlink_chars(int blocksize) { @@ -884,6 +1073,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index) return 0; } + +static inline int ocfs2_xattr_recs_per_xb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} #endif /* __KERNEL__ */ diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h new file mode 100644 index 0000000..b91c78f --- /dev/null +++ b/fs/ocfs2/ocfs2_jbd_compat.h @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_jbd_compat.h + * + * Compatibility defines for JBD. + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_JBD_COMPAT_H +#define OCFS2_JBD_COMPAT_H + +#ifndef CONFIG_OCFS2_COMPAT_JBD +# error Should not have been included +#endif + +struct jbd2_inode { + unsigned int dummy; +}; + +#define JBD2_BARRIER JFS_BARRIER +#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE + +#define jbd2_journal_ack_err journal_ack_err +#define jbd2_journal_clear_err journal_clear_err +#define jbd2_journal_destroy journal_destroy +#define jbd2_journal_dirty_metadata journal_dirty_metadata +#define jbd2_journal_errno journal_errno +#define jbd2_journal_extend journal_extend +#define jbd2_journal_flush journal_flush +#define jbd2_journal_force_commit journal_force_commit +#define jbd2_journal_get_write_access journal_get_write_access +#define jbd2_journal_get_undo_access journal_get_undo_access +#define jbd2_journal_init_inode journal_init_inode +#define jbd2_journal_invalidatepage journal_invalidatepage +#define jbd2_journal_load journal_load +#define jbd2_journal_lock_updates journal_lock_updates +#define jbd2_journal_restart journal_restart +#define jbd2_journal_start journal_start +#define jbd2_journal_start_commit journal_start_commit +#define jbd2_journal_stop journal_stop +#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers +#define jbd2_journal_unlock_updates journal_unlock_updates +#define jbd2_journal_wipe journal_wipe +#define jbd2_log_wait_commit log_wait_commit + +static inline int jbd2_journal_file_inode(handle_t *handle, + struct jbd2_inode *inode) +{ + return 0; +} + +static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, + loff_t new_size) +{ + return 0; +} + +static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, + struct inode *inode) +{ + return; +} + +static inline void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + return; +} + + +#endif /* OCFS2_JBD_COMPAT_H */ diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 8166968..ffd48db 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data) if (cluster > clusters) break; - ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL); + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); if (ret < 0) { mlog_errno(ret); break; @@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode, * update the superblock last. * It doesn't matter if the write failed. */ - ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO, - &super_bh, 0, NULL); + ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1, + &super_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, first_new_cluster - 1); - ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED, - main_bm_inode); + ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); goto out_unlock; @@ -540,7 +539,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out_unlock; } - ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL); + ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh); if (ret < 0) { mlog(ML_ERROR, "Can't read the group descriptor # %llu " "from the device.", (unsigned long long)input->group); diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index bb5ff89..bdda2d8 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If * this is not true, the read of -1 (UINT64_MAX) will fail. */ - ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0, - si->si_inode); + ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, + OCFS2_BH_IGNORE_CACHE); if (ret == 0) { spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); @@ -404,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, (unsigned long long)blkno); bh = NULL; /* Acquire a fresh bh */ - status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode); + status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, + OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 353fc35..faec2d8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -28,6 +28,7 @@ #include "ocfs2.h" /* For struct ocfs2_lock_res */ #include "stackglue.h" +#include <linux/dlm_plock.h> /* * The control protocol starts with a handshake. Until the handshake @@ -746,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) { } +static int user_plock(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl) +{ + /* + * This more or less just demuxes the plock request into any + * one of three dlm calls. + * + * Internally, fs/dlm will pass these to a misc device, which + * a userspace daemon will read and write to. + * + * For now, cancel requests (which happen internally only), + * are turned into unlocks. Most of this function taken from + * gfs2_lock. + */ + + if (cmd == F_CANCELLK) { + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + + if (IS_GETLK(cmd)) + return dlm_posix_get(conn->cc_lockspace, ino, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); + else + return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl); +} + /* * Compare a requested locking protocol version against the current one. * @@ -839,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = { .dlm_unlock = user_dlm_unlock, .lock_status = user_dlm_lock_status, .lock_lvb = user_dlm_lvb, + .plock = user_plock, .dump_lksb = user_dlm_dump_lksb, }; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 07f348b..68b668b 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -288,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) } EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); +int ocfs2_stack_supports_plocks(void) +{ + return active_stack && active_stack->sp_ops->plock; +} +EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks); + +/* + * ocfs2_plock() can only be safely called if + * ocfs2_stack_supports_plocks() returned true + */ +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl) +{ + WARN_ON_ONCE(active_stack->sp_ops->plock == NULL); + if (active_stack->sp_ops->plock) + return active_stack->sp_ops->plock(conn, ino, file, cmd, fl); + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(ocfs2_plock); + int ocfs2_cluster_connect(const char *stack_name, const char *group, int grouplen, diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index db56281..c571af3 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -28,6 +28,10 @@ #include "dlm/dlmapi.h" #include <linux/dlm.h> +/* Needed for plock-related prototypes */ +struct file; +struct file_lock; + /* * dlmconstants.h does not have a LOCAL flag. We hope to remove it * some day, but right now we need it. Let's fake it. This value is larger @@ -187,6 +191,17 @@ struct ocfs2_stack_operations { void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); /* + * Cluster-aware posix locks + * + * This is NULL for stacks which do not support posix locks. + */ + int (*plock)(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl); + + /* * This is an optoinal debugging hook. If provided, the * stack can dump debugging information about this lock. */ @@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); +int ocfs2_stack_supports_plocks(void); +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl); + void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index d2d278f..c5ff18b 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle, struct ocfs2_chain_list *cl); static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, - struct buffer_head *bh); + struct buffer_head *bh, + u64 max_block); static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found); static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found); static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac, @@ -110,8 +113,11 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, u64 data_blkno, u64 *bg_blkno, u16 *bg_bit_off); +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + struct ocfs2_alloc_context **ac); -static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) { struct inode *inode = ac->ac_inode; @@ -124,10 +130,8 @@ static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) iput(inode); ac->ac_inode = NULL; } - if (ac->ac_bh) { - brelse(ac->ac_bh); - ac->ac_bh = NULL; - } + brelse(ac->ac_bh); + ac->ac_bh = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -276,7 +280,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) */ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, - struct buffer_head *bh) + struct buffer_head *bh, + u64 max_block) { int status, credits; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; @@ -294,9 +299,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, mlog_entry_void(); cl = &fe->id2.i_chain; - status = ocfs2_reserve_clusters(osb, - le16_to_cpu(cl->cl_cpg), - &ac); + status = ocfs2_reserve_clusters_with_limit(osb, + le16_to_cpu(cl->cl_cpg), + max_block, &ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -394,8 +399,7 @@ bail: if (ac) ocfs2_free_alloc_context(ac); - if (bg_bh) - brelse(bg_bh); + brelse(bg_bh); mlog_exit(status); return status; @@ -469,7 +473,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, goto bail; } - status = ocfs2_block_group_alloc(osb, alloc_inode, bh); + status = ocfs2_block_group_alloc(osb, alloc_inode, bh, + ac->ac_max_block); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -486,16 +491,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, get_bh(bh); ac->ac_bh = bh; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; } -int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, - struct ocfs2_dinode *fe, - struct ocfs2_alloc_context **ac) +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac) { int status; u32 slot; @@ -507,7 +511,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, goto bail; } - (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); + (*ac)->ac_bits_wanted = blocks; (*ac)->ac_which = OCFS2_AC_USE_META; slot = osb->slot_num; (*ac)->ac_group_search = ocfs2_block_group_search; @@ -532,6 +536,15 @@ bail: return status; } +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_extent_list *root_el, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_new_metadata_blocks(osb, + ocfs2_extend_meta_needed(root_el), + ac); +} + static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac) { @@ -582,6 +595,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, (*ac)->ac_group_search = ocfs2_block_group_search; /* + * stat(2) can't handle i_ino > 32bits, so we tell the + * lower levels not to allocate us a block group past that + * limit. The 'inode64' mount option avoids this behavior. + */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64)) + (*ac)->ac_max_block = (u32)~0U; + + /* * slot is set when we successfully steal inode from other nodes. * It is reset in 3 places: * 1. when we flush the truncate log @@ -661,9 +682,9 @@ bail: /* Callers don't need to care which bitmap (local alloc or main) to * use so we figure it out for them, but unfortunately this clutters * things a bit. */ -int ocfs2_reserve_clusters(struct ocfs2_super *osb, - u32 bits_wanted, - struct ocfs2_alloc_context **ac) +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + struct ocfs2_alloc_context **ac) { int status; @@ -677,24 +698,20 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, } (*ac)->ac_bits_wanted = bits_wanted; + (*ac)->ac_max_block = max_block; status = -ENOSPC; if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { status = ocfs2_reserve_local_alloc_bits(osb, bits_wanted, *ac); - if ((status < 0) && (status != -ENOSPC)) { + if (status == -EFBIG) { + /* The local alloc window is outside ac_max_block. + * use the main bitmap. */ + status = -ENOSPC; + } else if ((status < 0) && (status != -ENOSPC)) { mlog_errno(status); goto bail; - } else if (status == -ENOSPC) { - /* reserve_local_bits will return enospc with - * the local alloc inode still locked, so we - * can change this safely here. */ - mlog(0, "Disabling local alloc\n"); - /* We set to OCFS2_LA_DISABLED so that umount - * can clean up what's left of the local - * allocation */ - osb->local_alloc_state = OCFS2_LA_DISABLED; } } @@ -718,6 +735,13 @@ bail: return status; } +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); +} + /* * More or less lifted from ext3. I'll leave their description below: * @@ -1000,11 +1024,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found) { int search = -ENOSPC; int ret; + u64 blkoff; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u16 tmp_off, tmp_found; unsigned int max_bits, gd_cluster_off; @@ -1037,6 +1064,17 @@ static int ocfs2_cluster_group_search(struct inode *inode, if (ret) return ret; + if (max_block) { + blkoff = ocfs2_clusters_to_blocks(inode->i_sb, + gd_cluster_off + + tmp_off + tmp_found); + mlog(0, "Checking %llu against %llu\n", + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + return -ENOSPC; + } + /* ocfs2_block_group_find_clear_bits() might * return success, but we still want to return * -ENOSPC unless it found the minimum number @@ -1045,6 +1083,12 @@ static int ocfs2_cluster_group_search(struct inode *inode, *bit_off = tmp_off; *bits_found = tmp_found; search = 0; /* success */ + } else if (tmp_found) { + /* + * Don't show bits which we'll be returning + * for allocation to the local alloc bitmap. + */ + ocfs2_local_alloc_seen_free_bits(osb, tmp_found); } } @@ -1054,19 +1098,31 @@ static int ocfs2_cluster_group_search(struct inode *inode, static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found) { int ret = -ENOSPC; + u64 blkoff; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; BUG_ON(min_bits != 1); BUG_ON(ocfs2_is_cluster_bitmap(inode)); - if (bg->bg_free_bits_count) + if (bg->bg_free_bits_count) { ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, le16_to_cpu(bg->bg_bits), bit_off, bits_found); + if (!ret && max_block) { + blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + + *bits_found; + mlog(0, "Checking %llu against %llu\n", + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + ret = -ENOSPC; + } + } return ret; } @@ -1116,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, struct ocfs2_group_desc *gd; struct inode *alloc_inode = ac->ac_inode; - ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno, - &group_bh, OCFS2_BH_CACHED, alloc_inode); + ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); return ret; @@ -1131,7 +1186,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, } ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, - bit_off, &found); + ac->ac_max_block, bit_off, &found); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); @@ -1186,9 +1241,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, bits_wanted, chain, (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); - status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), + status = ocfs2_read_block(alloc_inode, le64_to_cpu(cl->cl_recs[chain].c_blkno), - &group_bh, OCFS2_BH_CACHED, alloc_inode); + &group_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1204,21 +1259,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, /* for now, the chain search is a bit simplistic. We just use * the 1st group with any empty bits. */ while ((status = ac->ac_group_search(alloc_inode, group_bh, - bits_wanted, min_bits, bit_off, + bits_wanted, min_bits, + ac->ac_max_block, bit_off, &tmp_bits)) == -ENOSPC) { if (!bg->bg_next_group) break; - if (prev_group_bh) { - brelse(prev_group_bh); - prev_group_bh = NULL; - } + brelse(prev_group_bh); + prev_group_bh = NULL; + next_group = le64_to_cpu(bg->bg_next_group); prev_group_bh = group_bh; group_bh = NULL; - status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), - next_group, &group_bh, - OCFS2_BH_CACHED, alloc_inode); + status = ocfs2_read_block(alloc_inode, + next_group, &group_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1307,10 +1361,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, *bg_blkno = le64_to_cpu(bg->bg_blkno); *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: - if (group_bh) - brelse(group_bh); - if (prev_group_bh) - brelse(prev_group_bh); + brelse(group_bh); + brelse(prev_group_bh); mlog_exit(status); return status; @@ -1723,7 +1775,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle, { int status = 0; u32 tmp_used; - struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; struct ocfs2_chain_list *cl = &fe->id2.i_chain; struct buffer_head *group_bh = NULL; @@ -1742,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle, (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, (unsigned long long)bg_blkno, start_bit); - status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, - alloc_inode); + status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1784,8 +1834,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle, } bail: - if (group_bh) - brelse(group_bh); + brelse(group_bh); mlog_exit(status); return status; @@ -1838,9 +1887,15 @@ int ocfs2_free_clusters(handle_t *handle, status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, bg_start_bit, bg_blkno, num_clusters); - if (status < 0) + if (status < 0) { mlog_errno(status); + goto out; + } + ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), + num_clusters); + +out: mlog_exit(status); return status; } @@ -1891,3 +1946,84 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); } } + +/* + * For a given allocation, determine which allocators will need to be + * accessed, and lock them, reserving the appropriate number of bits. + * + * Sparse file systems call this from ocfs2_write_begin_nolock() + * and ocfs2_allocate_unwritten_extents(). + * + * File systems which don't support holes call this from + * ocfs2_extend_allocation(). + */ +int ocfs2_lock_allocators(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *meta_ac = NULL; + if (data_ac) + *data_ac = NULL; + + BUG_ON(clusters_to_add != 0 && data_ac == NULL); + + num_free_extents = ocfs2_num_free_extents(osb, inode, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + /* + * Sparse allocation file systems need to be more conservative + * with reserving room for expansion - the actual allocation + * happens while we've got a journal handle open so re-taking + * a cluster lock (because we ran out of room for another + * extent) will violate ordering rules. + * + * Most of the time we'll only be seeing this 1 cluster at a time + * anyway. + * + * Always lock for any unwritten extents - we might want to + * add blocks during a split. + */ + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { + ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + if (clusters_to_add == 0) + goto out; + + ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + + /* + * We cannot have an error and a non null *data_ac. + */ + } + + return ret; +} diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 544c600..4df159d 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -28,10 +28,11 @@ typedef int (group_search_t)(struct inode *, struct buffer_head *, - u32, - u32, - u16 *, - u16 *); + u32, /* bits_wanted */ + u32, /* min_bits */ + u64, /* max_block */ + u16 *, /* *bit_off */ + u16 *); /* *bits_found */ struct ocfs2_alloc_context { struct inode *ac_inode; /* which bitmap are we allocating from? */ @@ -51,6 +52,8 @@ struct ocfs2_alloc_context { group_search_t *ac_group_search; u64 ac_last_group; + u64 ac_max_block; /* Highest block number to allocate. 0 is + is the same as ~0 - unlimited */ }; void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); @@ -59,9 +62,17 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) return ac->ac_bits_wanted - ac->ac_bits_given; } +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, - struct ocfs2_dinode *fe, + struct ocfs2_extent_list *root_el, struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac); int ocfs2_reserve_new_inode(struct ocfs2_super *osb, struct ocfs2_alloc_context **ac); int ocfs2_reserve_clusters(struct ocfs2_super *osb, @@ -147,6 +158,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode) * apis above. */ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac); +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac); /* given a cluster offset, calculate which block group it belongs to * and return that block offset. */ @@ -156,4 +168,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); int ocfs2_check_group_descriptor(struct super_block *sb, struct ocfs2_dinode *di, struct ocfs2_group_desc *gd); +int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac); #endif /* _CHAINALLOC_H_ */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 88255d3f..304b63a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -64,6 +64,7 @@ #include "sysfile.h" #include "uptodate.h" #include "ver.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -154,10 +155,13 @@ enum { Opt_localalloc, Opt_localflocks, Opt_stack, + Opt_user_xattr, + Opt_nouser_xattr, + Opt_inode64, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_barrier, "barrier=%u"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, @@ -173,6 +177,9 @@ static match_table_t tokens = { {Opt_localalloc, "localalloc=%d"}, {Opt_localflocks, "localflocks"}, {Opt_stack, "cluster_stack=%s"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_inode64, "inode64"}, {Opt_err, NULL} }; @@ -205,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait) ocfs2_schedule_truncate_log_flush(osb, 0); } - if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) { + if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, + &target)) { if (wait) - log_wait_commit(OCFS2_SB(sb)->journal->j_journal, - target); + jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + target); } return 0; } @@ -325,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) if (!oi) return NULL; + jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); return &oi->vfs_inode; } @@ -406,6 +415,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } + /* Probably don't want this on remount; it might + * mess with other nodes */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && + (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); + goto out; + } + /* We're going to/from readonly mode. */ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { /* Lock here so the check of HARD_RO and the potential @@ -637,7 +655,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; osb->osb_commit_interval = parsed_options.commit_interval; - osb->local_alloc_size = parsed_options.localalloc_opt; + osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); + osb->local_alloc_bits = osb->local_alloc_default_bits; status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -743,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) return status; read_super_error: - if (bh != NULL) - brelse(bh); + brelse(bh); if (inode) iput(inode); @@ -847,6 +865,12 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_data_writeback: mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; break; + case Opt_user_xattr: + mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_nouser_xattr: + mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; + break; case Opt_atime_quantum: if (match_int(&args[0], &option)) { status = 0; @@ -873,7 +897,7 @@ static int ocfs2_parse_options(struct super_block *sb, if (option < 0) return 0; if (option == 0) - option = JBD_DEFAULT_MAX_COMMIT_AGE; + option = JBD2_DEFAULT_MAX_COMMIT_AGE; mopt->commit_interval = HZ * option; break; case Opt_localalloc: @@ -918,6 +942,9 @@ static int ocfs2_parse_options(struct super_block *sb, OCFS2_STACK_LABEL_LEN); mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; break; + case Opt_inode64: + mopt->mount_opt |= OCFS2_MOUNT_INODE64; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -938,6 +965,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) { struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb); unsigned long opts = osb->s_mount_opt; + unsigned int local_alloc_megs; if (opts & OCFS2_MOUNT_HB_LOCAL) seq_printf(s, ",_netdev,heartbeat=local"); @@ -970,8 +998,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",commit=%u", (unsigned) (osb->osb_commit_interval / HZ)); - if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) - seq_printf(s, ",localalloc=%d", osb->local_alloc_size); + local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); + if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) + seq_printf(s, ",localalloc=%d", local_alloc_megs); if (opts & OCFS2_MOUNT_LOCALFLOCKS) seq_printf(s, ",localflocks,"); @@ -980,6 +1009,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, osb->osb_cluster_stack); + if (opts & OCFS2_MOUNT_NOUSERXATTR) + seq_printf(s, ",nouser_xattr"); + else + seq_printf(s, ",user_xattr"); + + if (opts & OCFS2_MOUNT_INODE64) + seq_printf(s, ",inode64"); + return 0; } @@ -1132,6 +1169,7 @@ static void ocfs2_inode_init_once(void *data) oi->ip_dir_start_lookup = 0; init_rwsem(&oi->ip_alloc_sem); + init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); oi->ip_blkno = 0ULL; @@ -1375,6 +1413,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; sb->s_export_op = &ocfs2_export_ops; + sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; sb->s_flags |= MS_NOATIME; /* this is needed to support O_LARGEFILE */ @@ -1421,8 +1460,12 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->slot_num = OCFS2_INVALID_SLOT; + osb->s_xattr_inline_size = le16_to_cpu( + di->id2.i_super.s_xattr_inline_size); + osb->local_alloc_state = OCFS2_LA_UNUSED; osb->local_alloc_bh = NULL; + INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker); init_waitqueue_head(&osb->osb_mount_event); @@ -1568,6 +1611,7 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->first_cluster_group_blkno = le64_to_cpu(di->id2.i_super.s_first_cluster_group); osb->fs_generation = le32_to_cpu(di->i_fs_generation); + osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); mlog(0, "vol_label: %s\n", osb->vol_label); mlog(0, "uuid: %s\n", osb->uuid_str); mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n", diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index ba9dbb5..cbd03df 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -50,6 +50,7 @@ #include "inode.h" #include "journal.h" #include "symlink.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -83,11 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode, mlog_entry_void(); - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - bh, - OCFS2_BH_CACHED, - inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh); if (status < 0) { mlog_errno(status); link = ERR_PTR(status); @@ -157,8 +154,7 @@ bail: kunmap(page); page_cache_release(page); } - if (bh) - brelse(bh); + brelse(bh); return ERR_PTR(status); } @@ -168,10 +164,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = { .follow_link = ocfs2_follow_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, }; const struct inode_operations ocfs2_fast_symlink_inode_operations = { .readlink = ocfs2_readlink, .follow_link = ocfs2_follow_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, }; diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 4da8851..187b99f 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -53,7 +53,11 @@ #include <linux/highmem.h> #include <linux/buffer_head.h> #include <linux/rbtree.h> -#include <linux/jbd.h> +#ifndef CONFIG_OCFS2_COMPAT_JBD +# include <linux/jbd2.h> +#else +# include <linux/jbd.h> +#endif #define MLOG_MASK_PREFIX ML_UPTODATE @@ -511,14 +515,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, ci->ci_num_cached--; } -/* Called when we remove a chunk of metadata from an inode. We don't - * bother reverting things to an inlined array in the case of a remove - * which moves us back under the limit. */ -void ocfs2_remove_from_cache(struct inode *inode, - struct buffer_head *bh) +static void ocfs2_remove_block_from_cache(struct inode *inode, + sector_t block) { int index; - sector_t block = bh->b_blocknr; struct ocfs2_meta_cache_item *item = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; @@ -544,6 +544,30 @@ void ocfs2_remove_from_cache(struct inode *inode, kmem_cache_free(ocfs2_uptodate_cachep, item); } +/* + * Called when we remove a chunk of metadata from an inode. We don't + * bother reverting things to an inlined array in the case of a remove + * which moves us back under the limit. + */ +void ocfs2_remove_from_cache(struct inode *inode, + struct buffer_head *bh) +{ + sector_t block = bh->b_blocknr; + + ocfs2_remove_block_from_cache(inode, block); +} + +/* Called when we remove xattr clusters from an inode. */ +void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, + sector_t block, + u32 c_len) +{ + unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len; + + for (i = 0; i < b_len; i++, block++) + ocfs2_remove_block_from_cache(inode, block); +} + int __init init_ocfs2_uptodate_cache(void) { ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index 2e73206..531b4b3 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h @@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode, struct buffer_head *bh); void ocfs2_remove_from_cache(struct inode *inode, struct buffer_head *bh); +void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, + sector_t block, + u32 c_len); int ocfs2_buffer_read_ahead(struct inode *inode, struct buffer_head *bh); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c new file mode 100644 index 0000000..802c414 --- /dev/null +++ b/fs/ocfs2/xattr.c @@ -0,0 +1,4832 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.c + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * CREDITS: + * Lots of code in this file is taken from ext3. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/capability.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/sched.h> +#include <linux/splice.h> +#include <linux/mount.h> +#include <linux/writeback.h> +#include <linux/falloc.h> +#include <linux/sort.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> + +#define MLOG_MASK_PREFIX ML_XATTR +#include <cluster/masklog.h> + +#include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" +#include "file.h" +#include "symlink.h" +#include "sysfile.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_fs.h" +#include "suballoc.h" +#include "uptodate.h" +#include "buffer_head_io.h" +#include "super.h" +#include "xattr.h" + + +struct ocfs2_xattr_def_value_root { + struct ocfs2_xattr_value_root xv; + struct ocfs2_extent_rec er; +}; + +struct ocfs2_xattr_bucket { + struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; + struct ocfs2_xattr_header *xh; +}; + +#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) +#define OCFS2_XATTR_INLINE_SIZE 80 + +static struct ocfs2_xattr_def_value_root def_xv = { + .xv.xr_list.l_count = cpu_to_le16(1), +}; + +struct xattr_handler *ocfs2_xattr_handlers[] = { + &ocfs2_xattr_user_handler, + &ocfs2_xattr_trusted_handler, + NULL +}; + +static struct xattr_handler *ocfs2_xattr_handler_map[] = { + [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, + [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, +}; + +struct ocfs2_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ocfs2_xattr_search { + struct buffer_head *inode_bh; + /* + * xattr_bh point to the block buffer head which has extended attribute + * when extended attribute in inode, xattr_bh is equal to inode_bh. + */ + struct buffer_head *xattr_bh; + struct ocfs2_xattr_header *header; + struct ocfs2_xattr_bucket bucket; + void *base; + void *end; + struct ocfs2_xattr_entry *here; + int not_found; +}; + +static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset); + +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs); + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct ocfs2_xattr_tree_root *xt, + char *buffer, + size_t buffer_size); + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs); + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs); + +static int ocfs2_delete_xattr_index_block(struct inode *inode, + struct buffer_head *xb_bh); + +static inline const char *ocfs2_xattr_prefix(int name_index) +{ + struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < OCFS2_XATTR_MAX) + handler = ocfs2_xattr_handler_map[name_index]; + + return handler ? handler->prefix : NULL; +} + +static u32 ocfs2_xattr_name_hash(struct inode *inode, + const char *name, + int name_len) +{ + /* Get hash value of uuid from super block */ + u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash; + int i; + + /* hash extended attribute name */ + for (i = 0; i < name_len; i++) { + hash = (hash << OCFS2_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^ + *name++; + } + + return hash; +} + +/* + * ocfs2_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static void ocfs2_xattr_hash_entry(struct inode *inode, + struct ocfs2_xattr_header *header, + struct ocfs2_xattr_entry *entry) +{ + u32 hash = 0; + char *name = (char *)header + le16_to_cpu(entry->xe_name_offset); + + hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len); + entry->xe_name_hash = cpu_to_le32(hash); + + return; +} + +static int ocfs2_xattr_extend_allocation(struct inode *inode, + u32 clusters_to_add, + struct buffer_head *xattr_bh, + struct ocfs2_xattr_value_root *xv) +{ + int status = 0; + int restart_func = 0; + int credits = 0; + handle_t *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + enum ocfs2_alloc_restarted why; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters); + struct ocfs2_extent_tree et; + + mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); + + ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv); + +restart_all: + + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); + if (status) { + mlog_errno(status); + goto leave; + } + + credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, + clusters_to_add); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + +restarted_transaction: + status = ocfs2_journal_access(handle, inode, xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + prev_clusters = le32_to_cpu(xv->xr_clusters); + status = ocfs2_add_clusters_in_btree(osb, + inode, + &logical_start, + clusters_to_add, + 0, + &et, + handle, + data_ac, + meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_dirty(handle, xattr_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters; + + if (why != RESTART_NONE && clusters_to_add) { + if (why == RESTART_META) { + mlog(0, "restarting function.\n"); + restart_func = 1; + } else { + BUG_ON(why != RESTART_TRANS); + + mlog(0, "restarting transaction.\n"); + /* TODO: This can be more intelligent. */ + credits = ocfs2_calc_extend_credits(osb->sb, + et.et_root_el, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + /* handle still has to be committed at + * this point. */ + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + goto restarted_transaction; + } + } + +leave: + if (handle) { + ocfs2_commit_trans(osb, handle); + handle = NULL; + } + if (data_ac) { + ocfs2_free_alloc_context(data_ac); + data_ac = NULL; + } + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + if ((!status) && restart_func) { + restart_func = 0; + goto restart_all; + } + + return status; +} + +static int __ocfs2_remove_xattr_range(struct inode *inode, + struct buffer_head *root_bh, + struct ocfs2_xattr_value_root *xv, + u32 cpos, u32 phys_cpos, u32 len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + le32_add_cpu(&xv->xr_clusters, -len); + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +static int ocfs2_xattr_shrink_size(struct inode *inode, + u32 old_clusters, + u32 new_clusters, + struct buffer_head *root_bh, + struct ocfs2_xattr_value_root *xv) +{ + int ret = 0; + u32 trunc_len, cpos, phys_cpos, alloc_size; + u64 block; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_cached_dealloc_ctxt dealloc; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (old_clusters <= new_clusters) + return 0; + + cpos = new_clusters; + trunc_len = old_clusters - new_clusters; + while (trunc_len) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, &xv->xr_list); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > trunc_len) + alloc_size = trunc_len; + + ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos, + phys_cpos, alloc_size, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + ocfs2_remove_xattr_clusters_from_cache(inode, block, + alloc_size); + cpos += alloc_size; + trunc_len -= alloc_size; + } + +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +static int ocfs2_xattr_value_truncate(struct inode *inode, + struct buffer_head *root_bh, + struct ocfs2_xattr_value_root *xv, + int len) +{ + int ret; + u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); + u32 old_clusters = le32_to_cpu(xv->xr_clusters); + + if (new_clusters == old_clusters) + return 0; + + if (new_clusters > old_clusters) + ret = ocfs2_xattr_extend_allocation(inode, + new_clusters - old_clusters, + root_bh, xv); + else + ret = ocfs2_xattr_shrink_size(inode, + old_clusters, new_clusters, + root_bh, xv); + + return ret; +} + +static int ocfs2_xattr_list_entry(char *buffer, size_t size, + size_t *result, const char *prefix, + const char *name, int name_len) +{ + char *p = buffer + *result; + int prefix_len = strlen(prefix); + int total_len = prefix_len + name_len + 1; + + *result += total_len; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + return 0; + + if (*result > size) + return -ERANGE; + + memcpy(p, prefix, prefix_len); + memcpy(p + prefix_len, name, name_len); + p[prefix_len + name_len] = '\0'; + + return 0; +} + +static int ocfs2_xattr_list_entries(struct inode *inode, + struct ocfs2_xattr_header *header, + char *buffer, size_t buffer_size) +{ + size_t result = 0; + int i, type, ret; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + name = (const char *)header + + le16_to_cpu(entry->xe_name_offset); + + ret = ocfs2_xattr_list_entry(buffer, buffer_size, + &result, prefix, name, + entry->xe_name_len); + if (ret) + return ret; + } + } + + return result; +} + +static int ocfs2_xattr_ibody_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct ocfs2_xattr_header *header = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return ret; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size); + + return ret; +} + +static int ocfs2_xattr_block_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto cleanup; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + ret = ocfs2_xattr_list_entries(inode, header, + buffer, buffer_size); + } else { + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + ret = ocfs2_xattr_tree_list_index_block(inode, xt, + buffer, buffer_size); + } +cleanup: + brelse(blk_bh); + + return ret; +} + +ssize_t ocfs2_listxattr(struct dentry *dentry, + char *buffer, + size_t size) +{ + int ret = 0, i_ret = 0, b_ret = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode); + + if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return ret; + + ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_read(&oi->ip_xattr_sem); + i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size); + if (i_ret < 0) + b_ret = 0; + else { + if (buffer) { + buffer += i_ret; + size -= i_ret; + } + b_ret = ocfs2_xattr_block_list(dentry->d_inode, di, + buffer, size); + if (b_ret < 0) + i_ret = 0; + } + up_read(&oi->ip_xattr_sem); + ocfs2_inode_unlock(dentry->d_inode, 0); + + brelse(di_bh); + + return i_ret + b_ret; +} + +static int ocfs2_xattr_find_entry(int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_entry *entry; + size_t name_len; + int i, cmp = 1; + + if (name == NULL) + return -EINVAL; + + name_len = strlen(name); + entry = xs->here; + for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + cmp = name_index - ocfs2_xattr_get_type(entry); + if (!cmp) + cmp = name_len - entry->xe_name_len; + if (!cmp) + cmp = memcmp(name, (xs->base + + le16_to_cpu(entry->xe_name_offset)), + name_len); + if (cmp == 0) + break; + entry += 1; + } + xs->here = entry; + + return cmp ? -ENODATA : 0; +} + +static int ocfs2_xattr_get_value_outside(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + void *buffer, + size_t len) +{ + u32 cpos, p_cluster, num_clusters, bpc, clusters; + u64 blkno; + int i, ret = 0; + size_t cplen, blocksize; + struct buffer_head *bh = NULL; + struct ocfs2_extent_list *el; + + el = &xv->xr_list; + clusters = le32_to_cpu(xv->xr_clusters); + bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + blocksize = inode->i_sb->s_blocksize; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + /* Copy ocfs2_xattr_value */ + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(inode, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + cplen = len >= blocksize ? blocksize : len; + memcpy(buffer, bh->b_data, cplen); + len -= cplen; + buffer += cplen; + + brelse(bh); + bh = NULL; + if (len == 0) + break; + } + cpos += num_clusters; + } +out: + return ret; +} + +static int ocfs2_xattr_ibody_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return -ENODATA; + + xs->end = (void *)di + inode->i_sb->s_blocksize; + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret) + return ret; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + if (size > buffer_size) + return -ERANGE; + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + le16_to_cpu(xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len), size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + le16_to_cpu( + xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len)); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + } + } + + return size; +} + +static int ocfs2_xattr_block_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = -ENODATA, name_offset, name_len, block_off, i; + + if (!di->i_xattr_loc) + return ret; + + memset(&xs->bucket, 0, sizeof(xs->bucket)); + + ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto cleanup; + } + + xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + xs->header = &xb->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + } else + ret = ocfs2_xattr_index_block_find(inode, blk_bh, + name_index, + name, xs); + + if (ret) + goto cleanup; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + ret = -ERANGE; + if (size > buffer_size) + goto cleanup; + + name_offset = le16_to_cpu(xs->here->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len); + i = xs->here - xs->header->xh_entries; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode, + xs->bucket.xh, + i, + &block_off, + &name_offset); + xs->base = xs->bucket.bhs[block_off]->b_data; + } + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + name_offset + name_len, size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + name_offset + name_len); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + goto cleanup; + } + } + } + ret = size; +cleanup: + for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++) + brelse(xs->bucket.bhs[i]); + memset(&xs->bucket, 0, sizeof(xs->bucket)); + + brelse(blk_bh); + return ret; +} + +/* ocfs2_xattr_get() + * + * Copy an extended attribute into the buffer provided. + * Buffer is NULL to compute the size of buffer required. + */ +int ocfs2_xattr_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_dinode *di = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + ret = -ENODATA; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_read(&oi->ip_xattr_sem); + ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size, &xis); + if (ret == -ENODATA) + ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, + buffer_size, &xbs); + up_read(&oi->ip_xattr_sem); + ocfs2_inode_unlock(inode, 0); + + brelse(di_bh); + + return ret; +} + +static int __ocfs2_xattr_set_value_outside(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + const void *value, + int value_len) +{ + int ret = 0, i, cp_len, credits; + u16 blocksize = inode->i_sb->s_blocksize; + u32 p_cluster, num_clusters; + u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); + u64 blkno; + struct buffer_head *bh = NULL; + handle_t *handle; + + BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); + + credits = clusters * bpc; + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &xv->xr_list); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(inode, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access(handle, + inode, + bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + cp_len = value_len > blocksize ? blocksize : value_len; + memcpy(bh->b_data, value, cp_len); + value_len -= cp_len; + value += cp_len; + if (cp_len < blocksize) + memset(bh->b_data + cp_len, 0, + blocksize - cp_len); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + brelse(bh); + bh = NULL; + + /* + * XXX: do we need to empty all the following + * blocks in this cluster? + */ + if (!value_len) + break; + } + cpos += num_clusters; + } +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + brelse(bh); + + return ret; +} + +static int ocfs2_xattr_cleanup(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + size_t offs) +{ + handle_t *handle = NULL; + int ret = 0; + size_t name_len = strlen(xi->name); + void *val = xs->base + offs; + size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + /* Decrease xattr count */ + le16_add_cpu(&xs->header->xh_count, -1); + /* Remove the xattr entry and tree root which has already be set*/ + memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); + memset(val, 0, size); + + ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_xattr_update_entry(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + size_t offs) +{ + handle_t *handle = NULL; + int ret = 0; + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + xs->here->xe_name_offset = cpu_to_le16(offs); + xs->here->xe_value_size = cpu_to_le64(xi->value_len); + if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE) + ocfs2_xattr_set_local(xs->here, 1); + else + ocfs2_xattr_set_local(xs->here, 0); + ocfs2_xattr_hash_entry(inode, xs->header, xs->here); + + ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +/* + * ocfs2_xattr_set_value_outside() + * + * Set large size value in B tree. + */ +static int ocfs2_xattr_set_value_outside(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + size_t offs) +{ + size_t name_len = strlen(xi->name); + void *val = xs->base + offs; + struct ocfs2_xattr_value_root *xv = NULL; + size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + int ret = 0; + + memset(val, 0, size); + memcpy(val, xi->name, name_len); + xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(name_len)); + xv->xr_clusters = 0; + xv->xr_last_eb_blk = 0; + xv->xr_list.l_tree_depth = 0; + xv->xr_list.l_count = cpu_to_le16(1); + xv->xr_list.l_next_free_rec = 0; + + ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + ret = ocfs2_xattr_update_entry(inode, xi, xs, offs); + if (ret < 0) + mlog_errno(ret); + + return ret; +} + +/* + * ocfs2_xattr_set_entry_local() + * + * Set, replace or remove extended attribute in local. + */ +static void ocfs2_xattr_set_entry_local(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_entry *last, + size_t min_offs) +{ + size_t name_len = strlen(xi->name); + int i; + + if (xi->value && xs->not_found) { + /* Insert the new xattr entry. */ + le16_add_cpu(&xs->header->xh_count, 1); + ocfs2_xattr_set_type(last, xi->name_index); + ocfs2_xattr_set_local(last, 1); + last->xe_name_len = name_len; + } else { + void *first_val; + void *val; + size_t offs, size; + + first_val = xs->base + min_offs; + offs = le16_to_cpu(xs->here->xe_name_offset); + val = xs->base + offs; + + if (le64_to_cpu(xs->here->xe_value_size) > + OCFS2_XATTR_INLINE_SIZE) + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_ROOT_SIZE; + else + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + + if (xi->value && size == OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len)) { + /* The old and the new value have the + same size. Just replace the value. */ + ocfs2_xattr_set_local(xs->here, 1); + xs->here->xe_value_size = cpu_to_le64(xi->value_len); + /* Clear value bytes. */ + memset(val + OCFS2_XATTR_SIZE(name_len), + 0, + OCFS2_XATTR_SIZE(xi->value_len)); + memcpy(val + OCFS2_XATTR_SIZE(name_len), + xi->value, + xi->value_len); + return; + } + /* Remove the old name+value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + xs->here->xe_name_hash = 0; + xs->here->xe_name_offset = 0; + ocfs2_xattr_set_local(xs->here, 1); + xs->here->xe_value_size = 0; + + min_offs += size; + + /* Adjust all value offsets. */ + last = xs->header->xh_entries; + for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t o = le16_to_cpu(last->xe_name_offset); + + if (o < offs) + last->xe_name_offset = cpu_to_le16(o + size); + last += 1; + } + + if (!xi->value) { + /* Remove the old entry. */ + last -= 1; + memmove(xs->here, xs->here + 1, + (void *)last - (void *)xs->here); + memset(last, 0, sizeof(struct ocfs2_xattr_entry)); + le16_add_cpu(&xs->header->xh_count, -1); + } + } + if (xi->value) { + /* Insert the new name+value. */ + size_t size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len); + void *val = xs->base + min_offs - size; + + xs->here->xe_name_offset = cpu_to_le16(min_offs - size); + memset(val, 0, size); + memcpy(val, xi->name, name_len); + memcpy(val + OCFS2_XATTR_SIZE(name_len), + xi->value, + xi->value_len); + xs->here->xe_value_size = cpu_to_le64(xi->value_len); + ocfs2_xattr_set_local(xs->here, 1); + ocfs2_xattr_hash_entry(inode, xs->header, xs->here); + } + + return; +} + +/* + * ocfs2_xattr_set_entry() + * + * Set extended attribute entry into inode or block. + * + * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, + * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(), + * then set value in B tree with set_value_outside(). + */ +static int ocfs2_xattr_set_entry(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + int flag) +{ + struct ocfs2_xattr_entry *last; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); + size_t size_l = 0; + handle_t *handle = NULL; + int free, i, ret; + struct ocfs2_xattr_info xi_l = { + .name_index = xi->name_index, + .name = xi->name, + .value = xi->value, + .value_len = xi->value_len, + }; + + /* Compute min_offs, last and free space. */ + last = xs->header->xh_entries; + + for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t offs = le16_to_cpu(last->xe_name_offset); + if (offs < min_offs) + min_offs = offs; + last += 1; + } + + free = min_offs - ((void *)last - xs->base) - sizeof(__u32); + if (free < 0) + return -EFAULT; + + if (!xs->not_found) { + size_t size = 0; + if (ocfs2_xattr_is_local(xs->here)) + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + else + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_ROOT_SIZE; + free += (size + sizeof(struct ocfs2_xattr_entry)); + } + /* Check free space in inode or block */ + if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + if (free < sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_ROOT_SIZE) { + ret = -ENOSPC; + goto out; + } + size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + xi_l.value = (void *)&def_xv; + xi_l.value_len = OCFS2_XATTR_ROOT_SIZE; + } else if (xi->value) { + if (free < sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len)) { + ret = -ENOSPC; + goto out; + } + } + + if (!xs->not_found) { + /* For existing extended attribute */ + size_t size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + size_t offs = le16_to_cpu(xs->here->xe_name_offset); + void *val = xs->base + offs; + + if (ocfs2_xattr_is_local(xs->here) && size == size_l) { + /* Replace existing local xattr with tree root */ + ret = ocfs2_xattr_set_value_outside(inode, xi, xs, + offs); + if (ret < 0) + mlog_errno(ret); + goto out; + } else if (!ocfs2_xattr_is_local(xs->here)) { + /* For existing xattr which has value outside */ + struct ocfs2_xattr_value_root *xv = NULL; + xv = (struct ocfs2_xattr_value_root *)(val + + OCFS2_XATTR_SIZE(name_len)); + + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + /* + * If new value need set outside also, + * first truncate old value to new value, + * then set new value with set_value_outside(). + */ + ret = ocfs2_xattr_value_truncate(inode, + xs->xattr_bh, + xv, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_xattr_set_value_outside(inode, + xv, + xi->value, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_update_entry(inode, + xi, + xs, + offs); + if (ret < 0) + mlog_errno(ret); + goto out; + } else { + /* + * If new value need set in local, + * just trucate old value to zero. + */ + ret = ocfs2_xattr_value_truncate(inode, + xs->xattr_bh, + xv, + 0); + if (ret < 0) + mlog_errno(ret); + } + } + } + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, xs->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (!(flag & OCFS2_INLINE_XATTR_FL)) { + /* set extended attribute in external block. */ + ret = ocfs2_extend_trans(handle, + OCFS2_INODE_UPDATE_CREDITS + + OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Set value in local, include set tree root in local. + * This is the first step for value size >INLINE_SIZE. + */ + ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs); + + if (!(flag & OCFS2_INLINE_XATTR_FL)) { + ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + } + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && + (flag & OCFS2_INLINE_XATTR_FL)) { + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int xattrsize = osb->s_xattr_inline_size; + + /* + * Adjust extent record count or inline data size + * to reserve space for extended attribute. + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + le16_add_cpu(&idata->id_count, -xattrsize); + } else if (!(ocfs2_inode_is_fast_symlink(inode))) { + struct ocfs2_extent_list *el = &di->id2.i_list; + le16_add_cpu(&el->l_count, -(xattrsize / + sizeof(struct ocfs2_extent_rec))); + } + di->i_xattr_inline_size = cpu_to_le16(xattrsize); + } + /* Update xattr flag */ + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= flag; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + /* Update inode ctime */ + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ret = ocfs2_journal_dirty(handle, xs->inode_bh); + if (ret < 0) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + + if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + /* + * Set value outside in B tree. + * This is the second step for value size > INLINE_SIZE. + */ + size_t offs = le16_to_cpu(xs->here->xe_name_offset); + ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs); + if (ret < 0) { + int ret2; + + mlog_errno(ret); + /* + * If set value outside failed, we have to clean + * the junk tree root we have already set in local. + */ + ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs); + if (ret2 < 0) + mlog_errno(ret2); + } + } +out: + return ret; + +} + +static int ocfs2_remove_value_outside(struct inode*inode, + struct buffer_head *bh, + struct ocfs2_xattr_header *header) +{ + int ret = 0, i; + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + + if (!ocfs2_xattr_is_local(entry)) { + struct ocfs2_xattr_value_root *xv; + void *val; + + val = (void *)header + + le16_to_cpu(entry->xe_name_offset); + xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); + ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + } + } + + return ret; +} + +static int ocfs2_xattr_ibody_remove(struct inode *inode, + struct buffer_head *di_bh) +{ + + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_xattr_header *header; + int ret; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_remove_value_outside(inode, di_bh, header); + + return ret; +} + +static int ocfs2_xattr_block_remove(struct inode *inode, + struct buffer_head *blk_bh) +{ + struct ocfs2_xattr_block *xb; + int ret = 0; + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); + ret = ocfs2_remove_value_outside(inode, blk_bh, header); + } else + ret = ocfs2_delete_xattr_index_block(inode, blk_bh); + + return ret; +} + +static int ocfs2_xattr_free_block(struct inode *inode, + u64 block) +{ + struct inode *xb_alloc_inode; + struct buffer_head *xb_alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle; + int ret = 0; + u64 blk, bg_blkno; + u16 bit; + + ret = ocfs2_read_block(inode, block, &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto out; + } + + ret = ocfs2_xattr_block_remove(inode, blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + blk = le64_to_cpu(xb->xb_blkno); + bit = le16_to_cpu(xb->xb_suballoc_bit); + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + xb_alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(xb->xb_suballoc_slot)); + if (!xb_alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&xb_alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh, + bit, bg_blkno, 1); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); +out_unlock: + ocfs2_inode_unlock(xb_alloc_inode, 1); + brelse(xb_alloc_bh); +out_mutex: + mutex_unlock(&xb_alloc_inode->i_mutex); + iput(xb_alloc_inode); +out: + brelse(blk_bh); + return ret; +} + +/* + * ocfs2_xattr_remove() + * + * Free extended attribute resources associated with this inode. + */ +int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + handle_t *handle; + int ret; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return 0; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_ibody_remove(inode, di_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (di->i_xattr_loc) { + ret = ocfs2_xattr_free_block(inode, + le64_to_cpu(di->i_xattr_loc)); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + di->i_xattr_loc = 0; + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_xattr_has_space_inline(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size; + int free; + + if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE) + return 0; + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size); + } else if (ocfs2_inode_is_fast_symlink(inode)) { + free = ocfs2_fast_symlink_chars(inode->i_sb) - + le64_to_cpu(di->i_size); + } else { + struct ocfs2_extent_list *el = &di->id2.i_list; + free = (le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec)) * + sizeof(struct ocfs2_extent_rec); + } + if (free >= xattrsize) + return 1; + + return 0; +} + +/* + * ocfs2_xattr_ibody_find() + * + * Find extended attribute in inode block and + * fill search info into struct ocfs2_xattr_search. + */ +static int ocfs2_xattr_ibody_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + int ret; + int has_space = 0; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + down_read(&oi->ip_alloc_sem); + has_space = ocfs2_xattr_has_space_inline(inode, di); + up_read(&oi->ip_alloc_sem); + if (!has_space) + return 0; + } + + xs->xattr_bh = xs->inode_bh; + xs->end = (void *)di + inode->i_sb->s_blocksize; + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + else + xs->header = (struct ocfs2_xattr_header *) + (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + /* Find the named attribute. */ + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret && ret != -ENODATA) + return ret; + xs->not_found = ret; + } + + return 0; +} + +/* + * ocfs2_xattr_ibody_set() + * + * Set, replace or remove an extended attribute into inode block. + * + */ +static int ocfs2_xattr_ibody_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + int ret; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return -ENOSPC; + + down_write(&oi->ip_alloc_sem); + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + if (!ocfs2_xattr_has_space_inline(inode, di)) { + ret = -ENOSPC; + goto out; + } + } + + ret = ocfs2_xattr_set_entry(inode, xi, xs, + (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); +out: + up_write(&oi->ip_alloc_sem); + + return ret; +} + +/* + * ocfs2_xattr_block_find() + * + * Find extended attribute in external block and + * fill search info into struct ocfs2_xattr_search. + */ +static int ocfs2_xattr_block_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto cleanup; + } + + xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + xs->header = &xb->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + } else + ret = ocfs2_xattr_index_block_find(inode, blk_bh, + name_index, + name, xs); + + if (ret && ret != -ENODATA) { + xs->xattr_bh = NULL; + goto cleanup; + } + xs->not_found = ret; + return 0; +cleanup: + brelse(blk_bh); + + return ret; +} + +/* + * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree + * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header + * re-initialized. + */ +static int ocfs2_restore_xattr_block(struct inode *inode, + struct ocfs2_xattr_search *xs) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; + u16 xb_flags = le16_to_cpu(xb->xb_flags); + + BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) || + le16_to_cpu(el->l_next_free_rec) != 0); + + handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, xb_attrs)); + + xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED); + + ocfs2_journal_dirty(handle, xs->xattr_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * ocfs2_xattr_block_set() + * + * Set, replace or remove an extended attribute into external block. + * + */ +static int ocfs2_xattr_block_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct buffer_head *new_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = NULL; + struct ocfs2_xattr_block *xblk = NULL; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + int ret; + + if (!xs->xattr_bh) { + /* + * Alloc one external block for extended attribute + * outside of inode. + */ + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + handle = ocfs2_start_trans(osb, + OCFS2_XATTR_BLOCK_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, xs->inode_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &first_blkno); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(inode, new_bh); + + ret = ocfs2_journal_access(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + /* Initialize ocfs2_xattr_block */ + xs->xattr_bh = new_bh; + xblk = (struct ocfs2_xattr_block *)new_bh->b_data; + memset(xblk, 0, inode->i_sb->s_blocksize); + strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); + xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); + xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); + xblk->xb_blkno = cpu_to_le64(first_blkno); + + xs->header = &xblk->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)xblk + inode->i_sb->s_blocksize; + xs->here = xs->header->xh_entries; + + + ret = ocfs2_journal_dirty(handle, new_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + di->i_xattr_loc = cpu_to_le64(first_blkno); + ret = ocfs2_journal_dirty(handle, xs->inode_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (ret < 0) + return ret; + } else + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + + if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { + /* Set extended attribute into external block */ + ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL); + if (!ret || ret != -ENOSPC) + goto end; + + ret = ocfs2_xattr_create_index_block(inode, xs); + if (ret) + goto end; + } + + ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs); + if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0) + ret = ocfs2_restore_xattr_block(inode, xs); + +end: + + return ret; +} + +/* + * ocfs2_xattr_set() + * + * Set, replace or remove an extended attribute for this inode. + * value is NULL to remove an existing extended attribute, else either + * create or replace an extended attribute. + */ +int ocfs2_xattr_set(struct inode *inode, + int name_index, + const char *name, + const void *value, + size_t value_len, + int flags) +{ + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + int ret; + u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + struct ocfs2_xattr_info xi = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + }; + + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + /* + * Scan inode and external block to find the same name + * extended attribute and collect search infomation. + */ + ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); + if (ret) + goto cleanup; + if (xis.not_found) { + ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); + if (ret) + goto cleanup; + } + + if (xis.not_found && xbs.not_found) { + ret = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + ret = 0; + if (!value) + goto cleanup; + } else { + ret = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + + if (!value) { + /* Remove existing extended attribute */ + if (!xis.not_found) + ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); + else if (!xbs.not_found) + ret = ocfs2_xattr_block_set(inode, &xi, &xbs); + } else { + /* We always try to set extended attribute into inode first*/ + ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); + if (!ret && !xbs.not_found) { + /* + * If succeed and that extended attribute existing in + * external block, then we will remove it. + */ + xi.value = NULL; + xi.value_len = 0; + ret = ocfs2_xattr_block_set(inode, &xi, &xbs); + } else if (ret == -ENOSPC) { + if (di->i_xattr_loc && !xbs.xattr_bh) { + ret = ocfs2_xattr_block_find(inode, name_index, + name, &xbs); + if (ret) + goto cleanup; + } + /* + * If no space in inode, we will set extended attribute + * into external block. + */ + ret = ocfs2_xattr_block_set(inode, &xi, &xbs); + if (ret) + goto cleanup; + if (!xis.not_found) { + /* + * If succeed and that extended attribute + * existing in inode, we will remove it. + */ + xi.value = NULL; + xi.value_len = 0; + ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); + } + } + } +cleanup: + up_write(&OCFS2_I(inode)->ip_xattr_sem); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + brelse(xbs.xattr_bh); + for (i = 0; i < blk_per_bucket; i++) + brelse(xbs.bucket.bhs[i]); + + return ret; +} + +/* + * Find the xattr extent rec which may contains name_hash. + * e_cpos will be the first name hash of the xattr rec. + * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list. + */ +static int ocfs2_xattr_get_rec(struct inode *inode, + u32 name_hash, + u64 *p_blkno, + u32 *e_cpos, + u32 *num_clusters, + struct ocfs2_extent_list *el) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + u64 e_blkno = 0; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr tree block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= name_hash) { + e_blkno = le64_to_cpu(rec->e_blkno); + break; + } + } + + if (!e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + *p_blkno = le64_to_cpu(rec->e_blkno); + *num_clusters = le16_to_cpu(rec->e_leaf_clusters); + if (e_cpos) + *e_cpos = le32_to_cpu(rec->e_cpos); +out: + brelse(eb_bh); + return ret; +} + +typedef int (xattr_bucket_func)(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); + +static int ocfs2_find_xe_in_bucket(struct inode *inode, + struct buffer_head *header_bh, + int name_index, + const char *name, + u32 name_hash, + u16 *xe_index, + int *found) +{ + int i, ret = 0, cmp = 1, block_off, new_offset; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)header_bh->b_data; + size_t name_len = strlen(name); + struct ocfs2_xattr_entry *xe = NULL; + struct buffer_head *name_bh = NULL; + char *xe_name; + + /* + * We don't use binary search in the bucket because there + * may be multiple entries with the same name hash. + */ + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) + continue; + else if (name_hash < le32_to_cpu(xe->xe_name_hash)) + break; + + cmp = name_index - ocfs2_xattr_get_type(xe); + if (!cmp) + cmp = name_len - xe->xe_name_len; + if (cmp) + continue; + + ret = ocfs2_xattr_bucket_get_name_value(inode, + xh, + i, + &block_off, + &new_offset); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off, + &name_bh); + if (ret) { + mlog_errno(ret); + break; + } + xe_name = name_bh->b_data + new_offset; + + cmp = memcmp(name, xe_name, name_len); + brelse(name_bh); + name_bh = NULL; + + if (cmp == 0) { + *xe_index = i; + *found = 1; + ret = 0; + break; + } + } + + return ret; +} + +/* + * Find the specified xattr entry in a series of buckets. + * This series start from p_blkno and last for num_clusters. + * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains + * the num of the valid buckets. + * + * Return the buffer_head this xattr should reside in. And if the xattr's + * hash is in the gap of 2 buckets, return the lower bucket. + */ +static int ocfs2_xattr_bucket_find(struct inode *inode, + int name_index, + const char *name, + u32 name_hash, + u64 p_blkno, + u32 first_hash, + u32 num_clusters, + struct ocfs2_xattr_search *xs) +{ + int ret, found = 0; + struct buffer_head *bh = NULL; + struct buffer_head *lower_bh = NULL; + struct ocfs2_xattr_header *xh = NULL; + struct ocfs2_xattr_entry *xe = NULL; + u16 index = 0; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int low_bucket = 0, bucket, high_bucket; + u32 last_hash; + u64 blkno; + + ret = ocfs2_read_block(inode, p_blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bh->b_data; + high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; + + while (low_bucket <= high_bucket) { + brelse(bh); + bh = NULL; + bucket = (low_bucket + high_bucket) / 2; + + blkno = p_blkno + bucket * blk_per_bucket; + + ret = ocfs2_read_block(inode, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bh->b_data; + xe = &xh->xh_entries[0]; + if (name_hash < le32_to_cpu(xe->xe_name_hash)) { + high_bucket = bucket - 1; + continue; + } + + /* + * Check whether the hash of the last entry in our + * bucket is larger than the search one. for an empty + * bucket, the last one is also the first one. + */ + if (xh->xh_count) + xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1]; + + last_hash = le32_to_cpu(xe->xe_name_hash); + + /* record lower_bh which may be the insert place. */ + brelse(lower_bh); + lower_bh = bh; + bh = NULL; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) { + low_bucket = bucket + 1; + continue; + } + + /* the searched xattr should reside in this bucket if exists. */ + ret = ocfs2_find_xe_in_bucket(inode, lower_bh, + name_index, name, name_hash, + &index, &found); + if (ret) { + mlog_errno(ret); + goto out; + } + break; + } + + /* + * Record the bucket we have found. + * When the xattr's hash value is in the gap of 2 buckets, we will + * always set it to the previous bucket. + */ + if (!lower_bh) { + /* + * We can't find any bucket whose first name_hash is less + * than the find name_hash. + */ + BUG_ON(bh->b_blocknr != p_blkno); + lower_bh = bh; + bh = NULL; + } + xs->bucket.bhs[0] = lower_bh; + xs->bucket.xh = (struct ocfs2_xattr_header *) + xs->bucket.bhs[0]->b_data; + lower_bh = NULL; + + xs->header = xs->bucket.xh; + xs->base = xs->bucket.bhs[0]->b_data; + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (found) { + /* + * If we have found the xattr enty, read all the blocks in + * this bucket. + */ + ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1, + blk_per_bucket - 1, &xs->bucket.bhs[1], + 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + xs->here = &xs->header->xh_entries[index]; + mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, + (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index); + } else + ret = -ENODATA; + +out: + brelse(bh); + brelse(lower_bh); + return ret; +} + +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + int ret; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + u64 p_blkno = 0; + u32 first_hash, num_clusters = 0; + u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return -ENODATA; + + mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n", + name, name_hash, name_index); + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash); + + mlog(0, "find xattr extent rec %u clusters from %llu, the first hash " + "in the rec is %u\n", num_clusters, p_blkno, first_hash); + + ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash, + p_blkno, first_hash, num_clusters, xs); + +out: + return ret; +} + +static int ocfs2_iterate_xattr_buckets(struct inode *inode, + u64 blkno, + u32 clusters, + xattr_bucket_func *func, + void *para) +{ + int i, j, ret = 0; + int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + u32 num_buckets = clusters * bpc; + struct ocfs2_xattr_bucket bucket; + + memset(&bucket, 0, sizeof(bucket)); + + mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", + clusters, blkno); + + for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) { + ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, + bucket.bhs, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data; + /* + * The real bucket num in this series of blocks is stored + * in the 1st bucket. + */ + if (i == 0) + num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets); + + mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno, + le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash)); + if (func) { + ret = func(inode, &bucket, para); + if (ret) { + mlog_errno(ret); + break; + } + } + + for (j = 0; j < blk_per_bucket; j++) + brelse(bucket.bhs[j]); + memset(&bucket, 0, sizeof(bucket)); + } + +out: + for (j = 0; j < blk_per_bucket; j++) + brelse(bucket.bhs[j]); + + return ret; +} + +struct ocfs2_xattr_tree_list { + char *buffer; + size_t buffer_size; + size_t result; +}; + +static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset) +{ + u16 name_offset; + + if (index < 0 || index >= le16_to_cpu(xh->xh_count)) + return -EINVAL; + + name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); + + *block_off = name_offset >> inode->i_sb->s_blocksize_bits; + *new_offset = name_offset % inode->i_sb->s_blocksize; + + return 0; +} + +static int ocfs2_list_xattr_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0, type; + struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para; + int i, block_off, new_offset; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + ret = ocfs2_xattr_bucket_get_name_value(inode, + bucket->xh, + i, + &block_off, + &new_offset); + if (ret) + break; + + name = (const char *)bucket->bhs[block_off]->b_data + + new_offset; + ret = ocfs2_xattr_list_entry(xl->buffer, + xl->buffer_size, + &xl->result, + prefix, name, + entry->xe_name_len); + if (ret) + break; + } + } + + return ret; +} + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct ocfs2_xattr_tree_root *xt, + char *buffer, + size_t buffer_size) +{ + struct ocfs2_extent_list *el = &xt->xt_list; + int ret = 0; + u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; + u64 p_blkno = 0; + struct ocfs2_xattr_tree_list xl = { + .buffer = buffer, + .buffer_size = buffer_size, + .result = 0, + }; + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return 0; + + while (name_hash > 0) { + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, + &e_cpos, &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, + ocfs2_list_xattr_bucket, + &xl); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (e_cpos == 0) + break; + + name_hash = e_cpos - 1; + } + + ret = xl.result; +out: + return ret; +} + +static int cmp_xe(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_hash = le32_to_cpu(l->xe_name_hash); + u32 r_hash = le32_to_cpu(r->xe_name_hash); + + if (l_hash > r_hash) + return 1; + if (l_hash < r_hash) + return -1; + return 0; +} + +static void swap_xe(void *a, void *b, int size) +{ + struct ocfs2_xattr_entry *l = a, *r = b, tmp; + + tmp = *l; + memcpy(l, r, sizeof(struct ocfs2_xattr_entry)); + memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry)); +} + +/* + * When the ocfs2_xattr_block is filled up, new bucket will be created + * and all the xattr entries will be moved to the new bucket. + * Note: we need to sort the entries since they are not saved in order + * in the ocfs2_xattr_block. + */ +static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct buffer_head *xh_bh, + struct buffer_head *data_bh) +{ + int i, blocksize = inode->i_sb->s_blocksize; + u16 offset, size, off_change; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)xh_bh->b_data; + u16 count = le16_to_cpu(xb_xh->xh_count); + char *target = xh_bh->b_data, *src = xb_bh->b_data; + + mlog(0, "cp xattr from block %llu to bucket %llu\n", + (unsigned long long)xb_bh->b_blocknr, + (unsigned long long)xh_bh->b_blocknr); + + memset(xh_bh->b_data, 0, blocksize); + if (data_bh) + memset(data_bh->b_data, 0, blocksize); + /* + * Since the xe_name_offset is based on ocfs2_xattr_header, + * there is a offset change corresponding to the change of + * ocfs2_xattr_header's position. + */ + off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + xe = &xb_xh->xh_entries[count - 1]; + offset = le16_to_cpu(xe->xe_name_offset) + off_change; + size = blocksize - offset; + + /* copy all the names and values. */ + if (data_bh) + target = data_bh->b_data; + memcpy(target + offset, src + offset, size); + + /* Init new header now. */ + xh->xh_count = xb_xh->xh_count; + xh->xh_num_buckets = cpu_to_le16(1); + xh->xh_name_value_len = cpu_to_le16(size); + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); + + /* copy all the entries. */ + target = xh_bh->b_data; + offset = offsetof(struct ocfs2_xattr_header, xh_entries); + size = count * sizeof(struct ocfs2_xattr_entry); + memcpy(target + offset, (char *)xb_xh + offset, size); + + /* Change the xe offset for all the xe because of the move. */ + off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize + + offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + for (i = 0; i < count; i++) + le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change); + + mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n", + offset, size, off_change); + + sort(target + offset, count, sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); +} + +/* + * After we move xattr from block to index btree, we have to + * update ocfs2_xattr_search to the new xe and base. + * + * When the entry is in xattr block, xattr_bh indicates the storage place. + * While if the entry is in index b-tree, "bucket" indicates the + * real place of the xattr. + */ +static int ocfs2_xattr_update_xattr_search(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct buffer_head *old_bh, + struct buffer_head *new_bh) +{ + int ret = 0; + char *buf = old_bh->b_data; + struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; + struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; + int i, blocksize = inode->i_sb->s_blocksize; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + xs->bucket.bhs[0] = new_bh; + get_bh(new_bh); + xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data; + xs->header = xs->bucket.xh; + + xs->base = new_bh->b_data; + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (!xs->not_found) { + if (OCFS2_XATTR_BUCKET_SIZE != blocksize) { + ret = ocfs2_read_blocks(inode, + xs->bucket.bhs[0]->b_blocknr + 1, + blk_per_bucket - 1, &xs->bucket.bhs[1], + 0); + if (ret) { + mlog_errno(ret); + return ret; + } + + i = xs->here - old_xh->xh_entries; + xs->here = &xs->header->xh_entries[i]; + } + } + + return ret; +} + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs) +{ + int ret, credits = OCFS2_SUBALLOC_ALLOC; + u32 bit_off, len; + u64 blkno; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_alloc_context *data_ac; + struct buffer_head *xh_bh = NULL, *data_bh = NULL; + struct buffer_head *xb_bh = xs->xattr_bh; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xr; + u16 xb_flags = le16_to_cpu(xb->xb_flags); + u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + mlog(0, "create xattr index block for %llu\n", + (unsigned long long)xb_bh->b_blocknr); + + BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); + + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * XXX: + * We can use this lock for now, and maybe move to a dedicated mutex + * if performance becomes a problem later. + */ + down_write(&oi->ip_alloc_sem); + + /* + * 3 more credits, one for xattr block update, one for the 1st block + * of the new xattr bucket and one for the value/data. + */ + credits += 3; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_sem; + } + + ret = ocfs2_journal_access(handle, inode, xb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * The bucket may spread in many blocks, and + * we will only touch the 1st block and the last block + * in the whole bucket(one for entry and one for data). + */ + blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); + + mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno); + + xh_bh = sb_getblk(inode->i_sb, blkno); + if (!xh_bh) { + ret = -EIO; + mlog_errno(ret); + goto out_commit; + } + + ocfs2_set_new_buffer_uptodate(inode, xh_bh); + + ret = ocfs2_journal_access(handle, inode, xh_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (bpb > 1) { + data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1); + if (!data_bh) { + ret = -EIO; + mlog_errno(ret); + goto out_commit; + } + + ocfs2_set_new_buffer_uptodate(inode, data_bh); + + ret = ocfs2_journal_access(handle, inode, data_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh); + + ocfs2_journal_dirty(handle, xh_bh); + if (data_bh) + ocfs2_journal_dirty(handle, data_bh); + + ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh); + + /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ + memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, xb_attrs)); + + xr = &xb->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + + xr->xt_list.l_recs[0].e_cpos = 0; + xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno); + xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); + + xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); + + ret = ocfs2_journal_dirty(handle, xb_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_sem: + up_write(&oi->ip_alloc_sem); + +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + brelse(xh_bh); + brelse(data_bh); + + return ret; +} + +static int cmp_xe_offset(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_name_offset = le16_to_cpu(l->xe_name_offset); + u32 r_name_offset = le16_to_cpu(r->xe_name_offset); + + if (l_name_offset < r_name_offset) + return 1; + if (l_name_offset > r_name_offset) + return -1; + return 0; +} + +/* + * defrag a xattr bucket if we find that the bucket has some + * holes beteen name/value pairs. + * We will move all the name/value pairs to the end of the bucket + * so that we can spare some space for insertion. + */ +static int ocfs2_defrag_xattr_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket) +{ + int ret, i; + size_t end, offset, len, value_len; + struct ocfs2_xattr_header *xh; + char *entries, *buf, *bucket_buf = NULL; + u64 blkno = bucket->bhs[0]->b_blocknr; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u16 xh_free_start; + size_t blocksize = inode->i_sb->s_blocksize; + handle_t *handle; + struct buffer_head **bhs; + struct ocfs2_xattr_entry *xe; + + bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, + GFP_NOFS); + if (!bhs) + return -ENOMEM; + + ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0); + if (ret) + goto out; + + /* + * In order to make the operation more efficient and generic, + * we copy all the blocks into a contiguous memory and do the + * defragment there, so if anything is error, we will not touch + * the real block. + */ + bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS); + if (!bucket_buf) { + ret = -EIO; + goto out; + } + + buf = bucket_buf; + for (i = 0; i < blk_per_bucket; i++, buf += blocksize) + memcpy(buf, bhs[i]->b_data, blocksize); + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto commit; + } + } + + xh = (struct ocfs2_xattr_header *)bucket_buf; + entries = (char *)xh->xh_entries; + xh_free_start = le16_to_cpu(xh->xh_free_start); + + mlog(0, "adjust xattr bucket in %llu, count = %u, " + "xh_free_start = %u, xh_name_value_len = %u.\n", + blkno, le16_to_cpu(xh->xh_count), xh_free_start, + le16_to_cpu(xh->xh_name_value_len)); + + /* + * sort all the entries by their offset. + * the largest will be the first, so that we can + * move them to the end one by one. + */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe_offset, swap_xe); + + /* Move all name/values to the end of the bucket. */ + xe = xh->xh_entries; + end = OCFS2_XATTR_BUCKET_SIZE; + for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { + offset = le16_to_cpu(xe->xe_name_offset); + if (ocfs2_xattr_is_local(xe)) + value_len = OCFS2_XATTR_SIZE( + le64_to_cpu(xe->xe_value_size)); + else + value_len = OCFS2_XATTR_ROOT_SIZE; + len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len; + + /* + * We must make sure that the name/value pair + * exist in the same block. So adjust end to + * the previous block end if needed. + */ + if (((end - len) / blocksize != + (end - 1) / blocksize)) + end = end - end % blocksize; + + if (end > offset + len) { + memmove(bucket_buf + end - len, + bucket_buf + offset, len); + xe->xe_name_offset = cpu_to_le16(end - len); + } + + mlog_bug_on_msg(end < offset + len, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + end -= len; + } + + mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + if (xh_free_start == end) + goto commit; + + memset(bucket_buf + xh_free_start, 0, end - xh_free_start); + xh->xh_free_start = cpu_to_le16(end); + + /* sort the entries by their name_hash. */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); + + buf = bucket_buf; + for (i = 0; i < blk_per_bucket; i++, buf += blocksize) { + memcpy(bhs[i]->b_data, buf, blocksize); + ocfs2_journal_dirty(handle, bhs[i]); + } + +commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + + if (bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(bhs[i]); + } + kfree(bhs); + + kfree(bucket_buf); + return ret; +} + +/* + * Move half nums of the xattr bucket in the previous cluster to this new + * cluster. We only touch the last cluster of the previous extend record. + * + * first_bh is the first buffer_head of a series of bucket in the same + * extent rec and header_bh is the header of one bucket in this cluster. + * They will be updated if we move the data header_bh contains to the new + * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster. + */ +static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, + handle_t *handle, + struct buffer_head **first_bh, + struct buffer_head **header_bh, + u64 new_blkno, + u64 prev_blkno, + u32 num_clusters, + u32 *first_hash) +{ + int i, ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); + int blocksize = inode->i_sb->s_blocksize; + struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL; + struct ocfs2_xattr_header *new_xh; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)((*first_bh)->b_data); + + BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets); + BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize); + + prev_bh = *first_bh; + get_bh(prev_bh); + xh = (struct ocfs2_xattr_header *)prev_bh->b_data; + + prev_blkno += (num_clusters - 1) * bpc + bpc / 2; + + mlog(0, "move half of xattrs in cluster %llu to %llu\n", + prev_blkno, new_blkno); + + /* + * We need to update the 1st half of the new cluster and + * 1 more for the update of the 1st bucket of the previous + * extent record. + */ + credits = bpc / 2 + 1; + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, prev_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) { + old_bh = new_bh = NULL; + new_bh = sb_getblk(inode->i_sb, new_blkno); + if (!new_bh) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + ocfs2_set_new_buffer_uptodate(inode, new_bh); + + ret = ocfs2_journal_access(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + brelse(new_bh); + goto out; + } + + ret = ocfs2_read_block(inode, prev_blkno, &old_bh); + if (ret < 0) { + mlog_errno(ret); + brelse(new_bh); + goto out; + } + + memcpy(new_bh->b_data, old_bh->b_data, blocksize); + + if (i == 0) { + new_xh = (struct ocfs2_xattr_header *)new_bh->b_data; + new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2); + + if (first_hash) + *first_hash = le32_to_cpu( + new_xh->xh_entries[0].xe_name_hash); + new_first_bh = new_bh; + get_bh(new_first_bh); + } + + ocfs2_journal_dirty(handle, new_bh); + + if (*header_bh == old_bh) { + brelse(*header_bh); + *header_bh = new_bh; + get_bh(*header_bh); + + brelse(*first_bh); + *first_bh = new_first_bh; + get_bh(*first_bh); + } + brelse(new_bh); + brelse(old_bh); + } + + le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2)); + + ocfs2_journal_dirty(handle, prev_bh); +out: + brelse(prev_bh); + brelse(new_first_bh); + return ret; +} + +static int ocfs2_read_xattr_bucket(struct inode *inode, + u64 blkno, + struct buffer_head **bhs, + int new) +{ + int ret = 0; + u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + if (!new) + return ocfs2_read_blocks(inode, blkno, + blk_per_bucket, bhs, 0); + + for (i = 0; i < blk_per_bucket; i++) { + bhs[i] = sb_getblk(inode->i_sb, blkno + i); + if (bhs[i] == NULL) { + ret = -EIO; + mlog_errno(ret); + break; + } + ocfs2_set_new_buffer_uptodate(inode, bhs[i]); + } + + return ret; +} + +/* + * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk). + * first_hash will record the 1st hash of the new bucket. + */ +static int ocfs2_half_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 blk, + u64 new_blk, + u32 *first_hash, + int new_bucket_head) +{ + int ret, i; + u16 count, start, len, name_value_len, xe_len, name_offset; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + struct buffer_head **s_bhs, **t_bhs = NULL; + struct ocfs2_xattr_header *xh; + struct ocfs2_xattr_entry *xe; + int blocksize = inode->i_sb->s_blocksize; + + mlog(0, "move half of xattrs from bucket %llu to %llu\n", + blk, new_blk); + + s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); + if (!s_bhs) + return -ENOMEM; + + ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, s_bhs[0], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); + if (!t_bhs) { + ret = -ENOMEM; + goto out; + } + + ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, t_bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* copy the whole bucket to the new first. */ + for (i = 0; i < blk_per_bucket; i++) + memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); + + /* update the new bucket. */ + xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; + count = le16_to_cpu(xh->xh_count); + start = count / 2; + + /* + * Calculate the total name/value len and xh_free_start for + * the old bucket first. + */ + name_offset = OCFS2_XATTR_BUCKET_SIZE; + name_value_len = 0; + for (i = 0; i < start; i++) { + xe = &xh->xh_entries[i]; + xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + if (ocfs2_xattr_is_local(xe)) + xe_len += + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + xe_len += OCFS2_XATTR_ROOT_SIZE; + name_value_len += xe_len; + if (le16_to_cpu(xe->xe_name_offset) < name_offset) + name_offset = le16_to_cpu(xe->xe_name_offset); + } + + /* + * Now begin the modification to the new bucket. + * + * In the new bucket, We just move the xattr entry to the beginning + * and don't touch the name/value. So there will be some holes in the + * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is + * called. + */ + xe = &xh->xh_entries[start]; + len = sizeof(struct ocfs2_xattr_entry) * (count - start); + mlog(0, "mv xattr entry len %d from %d to %d\n", len, + (int)((char *)xe - (char *)xh), + (int)((char *)xh->xh_entries - (char *)xh)); + memmove((char *)xh->xh_entries, (char *)xe, len); + xe = &xh->xh_entries[count - start]; + len = sizeof(struct ocfs2_xattr_entry) * start; + memset((char *)xe, 0, len); + + le16_add_cpu(&xh->xh_count, -start); + le16_add_cpu(&xh->xh_name_value_len, -name_value_len); + + /* Calculate xh_free_start for the new bucket. */ + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + if (ocfs2_xattr_is_local(xe)) + xe_len += + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + xe_len += OCFS2_XATTR_ROOT_SIZE; + if (le16_to_cpu(xe->xe_name_offset) < + le16_to_cpu(xh->xh_free_start)) + xh->xh_free_start = xe->xe_name_offset; + } + + /* set xh->xh_num_buckets for the new xh. */ + if (new_bucket_head) + xh->xh_num_buckets = cpu_to_le16(1); + else + xh->xh_num_buckets = 0; + + for (i = 0; i < blk_per_bucket; i++) { + ocfs2_journal_dirty(handle, t_bhs[i]); + if (ret) + mlog_errno(ret); + } + + /* store the first_hash of the new bucket. */ + if (first_hash) + *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + + /* + * Now only update the 1st block of the old bucket. + * Please note that the entry has been sorted already above. + */ + xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; + memset(&xh->xh_entries[start], 0, + sizeof(struct ocfs2_xattr_entry) * (count - start)); + xh->xh_count = cpu_to_le16(start); + xh->xh_free_start = cpu_to_le16(name_offset); + xh->xh_name_value_len = cpu_to_le16(name_value_len); + + ocfs2_journal_dirty(handle, s_bhs[0]); + if (ret) + mlog_errno(ret); + +out: + if (s_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(s_bhs[i]); + } + kfree(s_bhs); + + if (t_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(t_bhs[i]); + } + kfree(t_bhs); + + return ret; +} + +/* + * Copy xattr from one bucket to another bucket. + * + * The caller must make sure that the journal transaction + * has enough space for journaling. + */ +static int ocfs2_cp_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 s_blkno, + u64 t_blkno, + int t_is_new) +{ + int ret, i; + int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int blocksize = inode->i_sb->s_blocksize; + struct buffer_head **s_bhs, **t_bhs = NULL; + + BUG_ON(s_blkno == t_blkno); + + mlog(0, "cp bucket %llu to %llu, target is %d\n", + s_blkno, t_blkno, t_is_new); + + s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, + GFP_NOFS); + if (!s_bhs) + return -ENOMEM; + + ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0); + if (ret) + goto out; + + t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, + GFP_NOFS); + if (!t_bhs) { + ret = -ENOMEM; + goto out; + } + + ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new); + if (ret) + goto out; + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, t_bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); + ocfs2_journal_dirty(handle, t_bhs[i]); + } + +out: + if (s_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(s_bhs[i]); + } + kfree(s_bhs); + + if (t_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(t_bhs[i]); + } + kfree(t_bhs); + + return ret; +} + +/* + * Copy one xattr cluster from src_blk to to_blk. + * The to_blk will become the first bucket header of the cluster, so its + * xh_num_buckets will be initialized as the bucket num in the cluster. + */ +static int ocfs2_cp_xattr_cluster(struct inode *inode, + handle_t *handle, + struct buffer_head *first_bh, + u64 src_blk, + u64 to_blk, + u32 *first_hash) +{ + int i, ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); + struct buffer_head *bh = NULL; + struct ocfs2_xattr_header *xh; + u64 to_blk_start = to_blk; + + mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk); + + /* + * We need to update the new cluster and 1 more for the update of + * the 1st bucket of the previous extent rec. + */ + credits = bpc + 1; + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, first_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < num_buckets; i++) { + ret = ocfs2_cp_xattr_bucket(inode, handle, + src_blk, to_blk, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + } + + /* update the old bucket header. */ + xh = (struct ocfs2_xattr_header *)first_bh->b_data; + le16_add_cpu(&xh->xh_num_buckets, -num_buckets); + + ocfs2_journal_dirty(handle, first_bh); + + /* update the new bucket header. */ + ret = ocfs2_read_block(inode, to_blk_start, &bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bh->b_data; + xh->xh_num_buckets = cpu_to_le16(num_buckets); + + ocfs2_journal_dirty(handle, bh); + + if (first_hash) + *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); +out: + brelse(bh); + return ret; +} + +/* + * Move half of the xattrs in this cluster to the new cluster. + * This function should only be called when bucket size == cluster size. + * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead. + */ +static int ocfs2_half_xattr_cluster(struct inode *inode, + handle_t *handle, + u64 prev_blk, + u64 new_blk, + u32 *first_hash) +{ + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int ret, credits = 2 * blk_per_bucket; + + BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); + + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* Move half of the xattr in start_blk to the next bucket. */ + return ocfs2_half_xattr_bucket(inode, handle, prev_blk, + new_blk, first_hash, 1); +} + +/* + * Move some xattrs from the old cluster to the new one since they are not + * contiguous in ocfs2 xattr tree. + * + * new_blk starts a new separate cluster, and we will move some xattrs from + * prev_blk to it. v_start will be set as the first name hash value in this + * new cluster so that it can be used as e_cpos during tree insertion and + * don't collide with our original b-tree operations. first_bh and header_bh + * will also be updated since they will be used in ocfs2_extend_xattr_bucket + * to extend the insert bucket. + * + * The problem is how much xattr should we move to the new one and when should + * we update first_bh and header_bh? + * 1. If cluster size > bucket size, that means the previous cluster has more + * than 1 bucket, so just move half nums of bucket into the new cluster and + * update the first_bh and header_bh if the insert bucket has been moved + * to the new cluster. + * 2. If cluster_size == bucket_size: + * a) If the previous extent rec has more than one cluster and the insert + * place isn't in the last cluster, copy the entire last cluster to the + * new one. This time, we don't need to upate the first_bh and header_bh + * since they will not be moved into the new cluster. + * b) Otherwise, move the bottom half of the xattrs in the last cluster into + * the new one. And we set the extend flag to zero if the insert place is + * moved into the new allocated cluster since no extend is needed. + */ +static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, + handle_t *handle, + struct buffer_head **first_bh, + struct buffer_head **header_bh, + u64 new_blk, + u64 prev_blk, + u32 prev_clusters, + u32 *v_start, + int *extend) +{ + int ret = 0; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + + mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", + prev_blk, prev_clusters, new_blk); + + if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) + ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, + handle, + first_bh, + header_bh, + new_blk, + prev_blk, + prev_clusters, + v_start); + else { + u64 last_blk = prev_blk + bpc * (prev_clusters - 1); + + if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk) + ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh, + last_blk, new_blk, + v_start); + else { + ret = ocfs2_half_xattr_cluster(inode, handle, + last_blk, new_blk, + v_start); + + if ((*header_bh)->b_blocknr == last_blk && extend) + *extend = 0; + } + } + + return ret; +} + +/* + * Add a new cluster for xattr storage. + * + * If the new cluster is contiguous with the previous one, it will be + * appended to the same extent record, and num_clusters will be updated. + * If not, we will insert a new extent for it and move some xattrs in + * the last cluster into the new allocated one. + * We also need to limit the maximum size of a btree leaf, otherwise we'll + * lose the benefits of hashing because we'll have to search large leaves. + * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize, + * if it's bigger). + * + * first_bh is the first block of the previous extent rec and header_bh + * indicates the bucket we will insert the new xattrs. They will be updated + * when the header_bh is moved into the new cluster. + */ +static int ocfs2_add_new_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + struct buffer_head **first_bh, + struct buffer_head **header_bh, + u32 *num_clusters, + u32 prev_cpos, + u64 prev_blkno, + int *extend) +{ + int ret, credits; + u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 prev_clusters = *num_clusters; + u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; + u64 block; + handle_t *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; + + mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " + "previous xattr blkno = %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + prev_cpos, prev_blkno); + + ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); + + ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto leave; + } + + credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, + clusters_to_add); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", + num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (prev_blkno + prev_clusters * bpc == block && + (prev_clusters + num_bits) << osb->s_clustersize_bits <= + OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { + /* + * If this cluster is contiguous with the old one and + * adding this new cluster, we don't surpass the limit of + * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be + * initialized and used like other buckets in the previous + * cluster. + * So add it as a contiguous one. The caller will handle + * its init process. + */ + v_start = prev_cpos + prev_clusters; + *num_clusters = prev_clusters + num_bits; + mlog(0, "Add contiguous %u clusters to previous extent rec.\n", + num_bits); + } else { + ret = ocfs2_adjust_xattr_cross_cluster(inode, + handle, + first_bh, + header_bh, + block, + prev_blkno, + prev_clusters, + &v_start, + extend); + if (ret) { + mlog_errno(ret); + goto leave; + } + } + + if (handle->h_buffer_credits < credits) { + /* + * The journal has been restarted before, and don't + * have enough space for the insertion, so extend it + * here. + */ + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto leave; + } + } + mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", + num_bits, block, v_start); + ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, + num_bits, 0, meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + +leave: + if (handle) + ocfs2_commit_trans(osb, handle); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +/* + * Extend a new xattr bucket and move xattrs to the end one by one until + * We meet with start_bh. Only move half of the xattrs to the bucket after it. + */ +static int ocfs2_extend_xattr_bucket(struct inode *inode, + struct buffer_head *first_bh, + struct buffer_head *start_bh, + u32 num_clusters) +{ + int ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u64 start_blk = start_bh->b_blocknr, end_blk; + u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb); + handle_t *handle; + struct ocfs2_xattr_header *first_xh = + (struct ocfs2_xattr_header *)first_bh->b_data; + u16 bucket = le16_to_cpu(first_xh->xh_num_buckets); + + mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " + "from %llu, len = %u\n", start_blk, + (unsigned long long)first_bh->b_blocknr, num_clusters); + + BUG_ON(bucket >= num_buckets); + + end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket; + + /* + * We will touch all the buckets after the start_bh(include it). + * Add one more bucket and modify the first_bh. + */ + credits = end_blk - start_blk + 2 * blk_per_bucket + 1; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, first_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto commit; + } + + while (end_blk != start_blk) { + ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, + end_blk + blk_per_bucket, 0); + if (ret) + goto commit; + end_blk -= blk_per_bucket; + } + + /* Move half of the xattr in start_blk to the next bucket. */ + ret = ocfs2_half_xattr_bucket(inode, handle, start_blk, + start_blk + blk_per_bucket, NULL, 0); + + le16_add_cpu(&first_xh->xh_num_buckets, 1); + ocfs2_journal_dirty(handle, first_bh); + +commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * Add new xattr bucket in an extent record and adjust the buckets accordingly. + * xb_bh is the ocfs2_xattr_block. + * We will move all the buckets starting from header_bh to the next place. As + * for this one, half num of its xattrs will be moved to the next one. + * + * We will allocate a new cluster if current cluster is full and adjust + * header_bh and first_bh if the insert place is moved to the new cluster. + */ +static int ocfs2_add_new_xattr_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct buffer_head *header_bh) +{ + struct ocfs2_xattr_header *first_xh = NULL; + struct buffer_head *first_bh = NULL; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)header_bh->b_data; + u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + struct super_block *sb = inode->i_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + int ret, num_buckets, extend = 1; + u64 p_blkno; + u32 e_cpos, num_clusters; + + mlog(0, "Add new xattr bucket starting form %llu\n", + (unsigned long long)header_bh->b_blocknr); + + /* + * Add refrence for header_bh here because it may be + * changed in ocfs2_add_new_xattr_cluster and we need + * to free it in the end. + */ + get_bh(header_bh); + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_block(inode, p_blkno, &first_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; + first_xh = (struct ocfs2_xattr_header *)first_bh->b_data; + + if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) { + ret = ocfs2_add_new_xattr_cluster(inode, + xb_bh, + &first_bh, + &header_bh, + &num_clusters, + e_cpos, + p_blkno, + &extend); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (extend) + ret = ocfs2_extend_xattr_bucket(inode, + first_bh, + header_bh, + num_clusters); + if (ret) + mlog_errno(ret); +out: + brelse(first_bh); + brelse(header_bh); + return ret; +} + +static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + int offs) +{ + int block_off = offs >> inode->i_sb->s_blocksize_bits; + + offs = offs % inode->i_sb->s_blocksize; + return bucket->bhs[block_off]->b_data + offs; +} + +/* + * Handle the normal xattr set, including replace, delete and new. + * + * Note: "local" indicates the real data's locality. So we can't + * just its bucket locality by its length. + */ +static void ocfs2_xattr_set_entry_normal(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + u32 name_hash, + int local) +{ + struct ocfs2_xattr_entry *last, *xe; + int name_len = strlen(xi->name); + struct ocfs2_xattr_header *xh = xs->header; + u16 count = le16_to_cpu(xh->xh_count), start; + size_t blocksize = inode->i_sb->s_blocksize; + char *val; + size_t offs, size, new_size; + + last = &xh->xh_entries[count]; + if (!xs->not_found) { + xe = xs->here; + offs = le16_to_cpu(xe->xe_name_offset); + if (ocfs2_xattr_is_local(xe)) + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE); + + /* + * If the new value will be stored outside, xi->value has been + * initalized as an empty ocfs2_xattr_value_root, and the same + * goes with xi->value_len, so we can set new_size safely here. + * See ocfs2_xattr_set_in_bucket. + */ + new_size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len); + + le16_add_cpu(&xh->xh_name_value_len, -size); + if (xi->value) { + if (new_size > size) + goto set_new_name_value; + + /* Now replace the old value with new one. */ + if (local) + xe->xe_value_size = cpu_to_le64(xi->value_len); + else + xe->xe_value_size = 0; + + val = ocfs2_xattr_bucket_get_val(inode, + &xs->bucket, offs); + memset(val + OCFS2_XATTR_SIZE(name_len), 0, + size - OCFS2_XATTR_SIZE(name_len)); + if (OCFS2_XATTR_SIZE(xi->value_len) > 0) + memcpy(val + OCFS2_XATTR_SIZE(name_len), + xi->value, xi->value_len); + + le16_add_cpu(&xh->xh_name_value_len, new_size); + ocfs2_xattr_set_local(xe, local); + return; + } else { + /* + * Remove the old entry if there is more than one. + * We don't remove the last entry so that we can + * use it to indicate the hash value of the empty + * bucket. + */ + last -= 1; + le16_add_cpu(&xh->xh_count, -1); + if (xh->xh_count) { + memmove(xe, xe + 1, + (void *)last - (void *)xe); + memset(last, 0, + sizeof(struct ocfs2_xattr_entry)); + } else + xh->xh_free_start = + cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); + + return; + } + } else { + /* find a new entry for insert. */ + int low = 0, high = count - 1, tmp; + struct ocfs2_xattr_entry *tmp_xe; + + while (low <= high && count) { + tmp = (low + high) / 2; + tmp_xe = &xh->xh_entries[tmp]; + + if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash)) + low = tmp + 1; + else if (name_hash < + le32_to_cpu(tmp_xe->xe_name_hash)) + high = tmp - 1; + else { + low = tmp; + break; + } + } + + xe = &xh->xh_entries[low]; + if (low != count) + memmove(xe + 1, xe, (void *)last - (void *)xe); + + le16_add_cpu(&xh->xh_count, 1); + memset(xe, 0, sizeof(struct ocfs2_xattr_entry)); + xe->xe_name_hash = cpu_to_le32(name_hash); + xe->xe_name_len = name_len; + ocfs2_xattr_set_type(xe, xi->name_index); + } + +set_new_name_value: + /* Insert the new name+value. */ + size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len); + + /* + * We must make sure that the name/value pair + * exists in the same block. + */ + offs = le16_to_cpu(xh->xh_free_start); + start = offs - size; + + if (start >> inode->i_sb->s_blocksize_bits != + (offs - 1) >> inode->i_sb->s_blocksize_bits) { + offs = offs - offs % blocksize; + xh->xh_free_start = cpu_to_le16(offs); + } + + val = ocfs2_xattr_bucket_get_val(inode, + &xs->bucket, offs - size); + xe->xe_name_offset = cpu_to_le16(offs - size); + + memset(val, 0, size); + memcpy(val, xi->name, name_len); + memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len); + + xe->xe_value_size = cpu_to_le64(xi->value_len); + ocfs2_xattr_set_local(xe, local); + xs->here = xe; + le16_add_cpu(&xh->xh_free_start, -size); + le16_add_cpu(&xh->xh_name_value_len, size); + + return; +} + +static int ocfs2_xattr_bucket_handle_journal(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_search *xs, + struct buffer_head **bhs, + u16 bh_num) +{ + int ret = 0, off, block_off; + struct ocfs2_xattr_entry *xe = xs->here; + + /* + * First calculate all the blocks we should journal_access + * and journal_dirty. The first block should always be touched. + */ + ret = ocfs2_journal_dirty(handle, bhs[0]); + if (ret) + mlog_errno(ret); + + /* calc the data. */ + off = le16_to_cpu(xe->xe_name_offset); + block_off = off >> inode->i_sb->s_blocksize_bits; + ret = ocfs2_journal_dirty(handle, bhs[block_off]); + if (ret) + mlog_errno(ret); + + return ret; +} + +/* + * Set the xattr entry in the specified bucket. + * The bucket is indicated by xs->bucket and it should have the enough + * space for the xattr insertion. + */ +static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + u32 name_hash, + int local) +{ + int i, ret; + handle_t *handle = NULL; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", + (unsigned long)xi->value_len, xi->name_index, + (unsigned long long)xs->bucket.bhs[0]->b_blocknr); + + if (!xs->bucket.bhs[1]) { + ret = ocfs2_read_blocks(inode, + xs->bucket.bhs[0]->b_blocknr + 1, + blk_per_bucket - 1, &xs->bucket.bhs[1], + 0); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, blk_per_bucket); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); + + /*Only dirty the blocks we have touched in set xattr. */ + ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs, + xs->bucket.bhs, blk_per_bucket); + if (ret) + mlog_errno(ret); +out: + ocfs2_commit_trans(osb, handle); + + return ret; +} + +static int ocfs2_xattr_value_update_size(struct inode *inode, + struct buffer_head *xe_bh, + struct ocfs2_xattr_entry *xe, + u64 new_size) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + + handle = ocfs2_start_trans(osb, 1); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, xe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + xe->xe_value_size = cpu_to_le64(new_size); + + ret = ocfs2_journal_dirty(handle, xe_bh); + if (ret < 0) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * Truncate the specified xe_off entry in xattr bucket. + * bucket is indicated by header_bh and len is the new length. + * Both the ocfs2_xattr_value_root and the entry will be updated here. + * + * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. + */ +static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, + struct buffer_head *header_bh, + int xe_off, + int len) +{ + int ret, offset; + u64 value_blk; + struct buffer_head *value_bh = NULL; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)header_bh->b_data; + size_t blocksize = inode->i_sb->s_blocksize; + + xe = &xh->xh_entries[xe_off]; + + BUG_ON(!xe || ocfs2_xattr_is_local(xe)); + + offset = le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len); + + value_blk = offset / blocksize; + + /* We don't allow ocfs2_xattr_value to be stored in different block. */ + BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); + value_blk += header_bh->b_blocknr; + + ret = ocfs2_read_block(inode, value_blk, &value_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xv = (struct ocfs2_xattr_value_root *) + (value_bh->b_data + offset % blocksize); + + mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", + xe_off, (unsigned long long)header_bh->b_blocknr, len); + ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + brelse(value_bh); + return ret; +} + +static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, + struct ocfs2_xattr_search *xs, + int len) +{ + int ret, offset; + struct ocfs2_xattr_entry *xe = xs->here; + struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; + + BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe)); + + offset = xe - xh->xh_entries; + ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0], + offset, len); + if (ret) + mlog_errno(ret); + + return ret; +} + +static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, + struct ocfs2_xattr_search *xs, + char *val, + int value_len) +{ + int offset; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe = xs->here; + + BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); + + offset = le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len); + + xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); + + return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len); +} + +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); + + ocfs2_init_dealloc_ctxt(&dealloc); + + mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n", + cpos, len, (unsigned long long)blkno); + + ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_truncate_log_append(osb, handle, blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, + struct ocfs2_xattr_search *xs) +{ + handle_t *handle = NULL; + struct ocfs2_xattr_header *xh = xs->bucket.xh; + struct ocfs2_xattr_entry *last = &xh->xh_entries[ + le16_to_cpu(xh->xh_count) - 1]; + int ret = 0; + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return; + } + + ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* Remove the old entry. */ + memmove(xs->here, xs->here + 1, + (void *)last - (void *)xs->here); + memset(last, 0, sizeof(struct ocfs2_xattr_entry)); + le16_add_cpu(&xh->xh_count, -1); + + ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +} + +/* + * Set the xattr name/value in the bucket specified in xs. + * + * As the new value in xi may be stored in the bucket or in an outside cluster, + * we divide the whole process into 3 steps: + * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket) + * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs) + * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside) + * 4. If the clusters for the new outside value can't be allocated, we need + * to free the xattr we allocated in set. + */ +static int ocfs2_xattr_set_in_bucket(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + int ret, local = 1; + size_t value_len; + char *val = (char *)xi->value; + struct ocfs2_xattr_entry *xe = xs->here; + u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name, + strlen(xi->name)); + + if (!xs->not_found && !ocfs2_xattr_is_local(xe)) { + /* + * We need to truncate the xattr storage first. + * + * If both the old and new value are stored to + * outside block, we only need to truncate + * the storage and then set the value outside. + * + * If the new value should be stored within block, + * we should free all the outside block first and + * the modification to the xattr block will be done + * by following steps. + */ + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) + value_len = xi->value_len; + else + value_len = 0; + + ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, + value_len); + if (ret) + goto out; + + if (value_len) + goto set_value_outside; + } + + value_len = xi->value_len; + /* So we have to handle the inside block change now. */ + if (value_len > OCFS2_XATTR_INLINE_SIZE) { + /* + * If the new value will be stored outside of block, + * initalize a new empty value root and insert it first. + */ + local = 0; + xi->value = &def_xv; + xi->value_len = OCFS2_XATTR_ROOT_SIZE; + } + + ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (value_len <= OCFS2_XATTR_INLINE_SIZE) + goto out; + + /* allocate the space now for the outside block storage. */ + ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, + value_len); + if (ret) { + mlog_errno(ret); + + if (xs->not_found) { + /* + * We can't allocate enough clusters for outside + * storage and we have allocated xattr already, + * so need to remove it. + */ + ocfs2_xattr_bucket_remove_xs(inode, xs); + } + goto out; + } + +set_value_outside: + ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len); +out: + return ret; +} + +/* check whether the xattr bucket is filled up with the same hash value. */ +static int ocfs2_check_xattr_bucket_collision(struct inode *inode, + struct ocfs2_xattr_bucket *bucket) +{ + struct ocfs2_xattr_header *xh = bucket->xh; + + if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash == + xh->xh_entries[0].xe_name_hash) { + mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " + "hash = %u\n", + (unsigned long long)bucket->bhs[0]->b_blocknr, + le32_to_cpu(xh->xh_entries[0].xe_name_hash)); + return -ENOSPC; + } + + return 0; +} + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_header *xh; + struct ocfs2_xattr_entry *xe; + u16 count, header_size, xh_free_start; + int i, free, max_free, need, old; + size_t value_size = 0, name_len = strlen(xi->name); + size_t blocksize = inode->i_sb->s_blocksize; + int ret, allocation = 0; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + mlog_entry("Set xattr %s in xattr index block\n", xi->name); + +try_again: + xh = xs->header; + count = le16_to_cpu(xh->xh_count); + xh_free_start = le16_to_cpu(xh->xh_free_start); + header_size = sizeof(struct ocfs2_xattr_header) + + count * sizeof(struct ocfs2_xattr_entry); + max_free = OCFS2_XATTR_BUCKET_SIZE - + le16_to_cpu(xh->xh_name_value_len) - header_size; + + mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " + "of %u which exceed block size\n", + (unsigned long long)xs->bucket.bhs[0]->b_blocknr, + header_size); + + if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) + value_size = OCFS2_XATTR_ROOT_SIZE; + else if (xi->value) + value_size = OCFS2_XATTR_SIZE(xi->value_len); + + if (xs->not_found) + need = sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(name_len) + value_size; + else { + need = value_size + OCFS2_XATTR_SIZE(name_len); + + /* + * We only replace the old value if the new length is smaller + * than the old one. Otherwise we will allocate new space in the + * bucket to store it. + */ + xe = xs->here; + if (ocfs2_xattr_is_local(xe)) + old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE); + + if (old >= value_size) + need = 0; + } + + free = xh_free_start - header_size; + /* + * We need to make sure the new name/value pair + * can exist in the same block. + */ + if (xh_free_start % blocksize < need) + free -= xh_free_start % blocksize; + + mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " + "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" + " %u\n", xs->not_found, + (unsigned long long)xs->bucket.bhs[0]->b_blocknr, + free, need, max_free, le16_to_cpu(xh->xh_free_start), + le16_to_cpu(xh->xh_name_value_len)); + + if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { + if (need <= max_free && + count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { + /* + * We can create the space by defragment. Since only the + * name/value will be moved, the xe shouldn't be changed + * in xs. + */ + ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh_free_start = le16_to_cpu(xh->xh_free_start); + free = xh_free_start - header_size; + if (xh_free_start % blocksize < need) + free -= xh_free_start % blocksize; + + if (free >= need) + goto xattr_set; + + mlog(0, "Can't get enough space for xattr insert by " + "defragment. Need %u bytes, but we have %d, so " + "allocate new bucket for it.\n", need, free); + } + + /* + * We have to add new buckets or clusters and one + * allocation should leave us enough space for insert. + */ + BUG_ON(allocation); + + /* + * We do not allow for overlapping ranges between buckets. And + * the maximum number of collisions we will allow for then is + * one bucket's worth, so check it here whether we need to + * add a new bucket for the insert. + */ + ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_add_new_xattr_bucket(inode, + xs->xattr_bh, + xs->bucket.bhs[0]); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) + brelse(xs->bucket.bhs[i]); + + memset(&xs->bucket, 0, sizeof(xs->bucket)); + + ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, + xi->name_index, + xi->name, xs); + if (ret && ret != -ENODATA) + goto out; + xs->not_found = ret; + allocation = 1; + goto try_again; + } + +xattr_set: + ret = ocfs2_xattr_set_in_bucket(inode, xi, xs); +out: + mlog_exit(ret); + return ret; +} + +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0; + struct ocfs2_xattr_header *xh = bucket->xh; + u16 i; + struct ocfs2_xattr_entry *xe; + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_xattr_bucket_value_truncate(inode, + bucket->bhs[0], + i, 0); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static int ocfs2_delete_xattr_index_block(struct inode *inode, + struct buffer_head *xb_bh) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; + int ret = 0; + u32 name_hash = UINT_MAX, e_cpos, num_clusters; + u64 p_blkno; + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return 0; + + while (name_hash > 0) { + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, + &e_cpos, &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, + ocfs2_delete_xattr_in_bucket, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rm_xattr_cluster(inode, xb_bh, + p_blkno, e_cpos, num_clusters); + if (ret) { + mlog_errno(ret); + break; + } + + if (e_cpos == 0) + break; + + name_hash = e_cpos - 1; + } + +out: + return ret; +} + +/* + * 'trusted' attributes support + */ + +#define XATTR_TRUSTED_PREFIX "trusted." + +static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name, + buffer, size); +} + +static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, + size, flags); +} + +struct xattr_handler ocfs2_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ocfs2_xattr_trusted_list, + .get = ocfs2_xattr_trusted_get, + .set = ocfs2_xattr_trusted_set, +}; + + +/* + * 'user' attributes support + */ + +#define XATTR_USER_PREFIX "user." + +static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1; + const size_t total_len = prefix_len + name_len + 1; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_user_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name, + buffer, size); +} + +static int ocfs2_xattr_user_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, + size, flags); +} + +struct xattr_handler ocfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ocfs2_xattr_user_list, + .get = ocfs2_xattr_user_get, + .set = ocfs2_xattr_user_set, +}; diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h new file mode 100644 index 0000000..c25c7c6 --- /dev/null +++ b/fs/ocfs2/xattr.h @@ -0,0 +1,68 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.h + * + * Function prototypes + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_XATTR_H +#define OCFS2_XATTR_H + +#include <linux/init.h> +#include <linux/xattr.h> + +enum ocfs2_xattr_type { + OCFS2_XATTR_INDEX_USER = 1, + OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS, + OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, + OCFS2_XATTR_INDEX_TRUSTED, + OCFS2_XATTR_INDEX_SECURITY, + OCFS2_XATTR_MAX +}; + +extern struct xattr_handler ocfs2_xattr_user_handler; +extern struct xattr_handler ocfs2_xattr_trusted_handler; + +extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); +extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *, + size_t, int); +extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh); +extern struct xattr_handler *ocfs2_xattr_handlers[]; + +static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) +{ + return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE; +} + +static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb) +{ + return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); +} + +static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb) +{ + u16 len = sb->s_blocksize - + offsetof(struct ocfs2_xattr_header, xh_entries); + + return len / sizeof(struct ocfs2_xattr_entry); +} +#endif /* OCFS2_XATTR_H */ diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index d29047b..cbf047a 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -346,7 +346,7 @@ enum { Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_umask, "umask=%o"}, diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c index 3d3e166..a97b477 100644 --- a/fs/partitions/acorn.c +++ b/fs/partitions/acorn.c @@ -275,16 +275,6 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) id = data[0x1fc] & 15; put_dev_sector(sect); -#ifdef CONFIG_BLK_DEV_MFM - if (MAJOR(bdev->bd_dev) == MFM_ACORN_MAJOR) { - extern void xd_set_geometry(struct block_device *, - unsigned char, unsigned char, unsigned int); - xd_set_geometry(bdev, dr->secspertrack, heads, 1); - invalidate_bh_lrus(); - truncate_inode_pages(bdev->bd_inode->i_mapping, 0); - } -#endif - /* * Work out start of non-adfs partition. */ diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7408227..cfb0c80 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -195,6 +195,14 @@ check_partition(struct gendisk *hd, struct block_device *bdev) return ERR_PTR(res); } +static ssize_t part_partition_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->partno); +} + static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -260,6 +268,7 @@ ssize_t part_fail_store(struct device *dev, } #endif +static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); @@ -269,6 +278,7 @@ static struct device_attribute dev_attr_fail = #endif static struct attribute *part_attrs[] = { + &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_stat.attr, @@ -538,10 +548,23 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) sector_t from = state->parts[p].from; if (!size) continue; + if (from >= get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d ignored, start %llu is behind the end of the disk\n", + disk->disk_name, p, (unsigned long long) from); + continue; + } if (from + size > get_capacity(disk)) { + /* + * we can not ignore partitions of broken tables + * created by for example camera firmware, but we + * limit them to the end of the disk to avoid + * creating invalid block devices + */ printk(KERN_WARNING - "%s: p%d exceeds device capacity\n", - disk->disk_name, p); + "%s: p%d size %llu limited to end of disk\n", + disk->disk_name, p, (unsigned long long) size); + size = get_capacity(disk) - from; } res = add_partition(disk, p, from, size, state->parts[p].flags); if (res) { diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 73cd7a4..50f8f06 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -57,3 +57,13 @@ config PROC_SYSCTL As it is generally a good thing, you should say Y here unless building a kernel for install/rescue disks or your system is very limited in memory. + +config PROC_PAGE_MONITOR + default y + depends on PROC_FS && MMU + bool "Enable /proc page monitoring" if EMBEDDED + help + Various /proc files exist to monitor process memory utilization: + /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, + /proc/kpagecount, and /proc/kpageflags. Disabling these + interfaces will reduce the size of the kernel by approximately 4kb. diff --git a/fs/proc/array.c b/fs/proc/array.c index 71c9be5..f4bc0e7 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -86,11 +86,6 @@ #include <asm/processor.h> #include "internal.h" -/* Gcc optimizes away "strlen(x)" for constant x */ -#define ADDBUF(buffer, string) \ -do { memcpy(buffer, string, strlen(string)); \ - buffer += strlen(string); } while (0) - static inline void task_name(struct seq_file *m, struct task_struct *p) { int i; @@ -261,7 +256,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) sigemptyset(&ignored); sigemptyset(&caught); - rcu_read_lock(); if (lock_task_sighand(p, &flags)) { pending = p->pending.signal; shpending = p->signal->shared_pending.signal; @@ -272,7 +266,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; unlock_task_sighand(p, &flags); } - rcu_read_unlock(); seq_printf(m, "Threads:\t%d\n", num_threads); seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); diff --git a/fs/proc/base.c b/fs/proc/base.c index a28840b..b5918ae 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -148,9 +148,6 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, return count; } -int maps_protect; -EXPORT_SYMBOL(maps_protect); - static struct fs_struct *get_fs_struct(struct task_struct *task) { struct fs_struct *fs; @@ -164,7 +161,6 @@ static struct fs_struct *get_fs_struct(struct task_struct *task) static int get_nr_threads(struct task_struct *tsk) { - /* Must be called with the rcu_read_lock held */ unsigned long flags; int count = 0; @@ -471,14 +467,10 @@ static int proc_pid_limits(struct task_struct *task, char *buffer) struct rlimit rlim[RLIM_NLIMITS]; - rcu_read_lock(); - if (!lock_task_sighand(task,&flags)) { - rcu_read_unlock(); + if (!lock_task_sighand(task, &flags)) return 0; - } memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); unlock_task_sighand(task, &flags); - rcu_read_unlock(); /* * print the file header @@ -2443,6 +2435,13 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) } #endif /* CONFIG_TASK_IO_ACCOUNTING */ +static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + seq_printf(m, "%08x\n", task->personality); + return 0; +} + /* * Thread groups */ @@ -2459,6 +2458,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("environ", S_IRUSR, environ), INF("auxv", S_IRUSR, pid_auxv), ONE("status", S_IRUGO, pid_status), + ONE("personality", S_IRUSR, pid_personality), INF("limits", S_IRUSR, pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, pid_sched), @@ -2794,6 +2794,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("environ", S_IRUSR, environ), INF("auxv", S_IRUSR, pid_auxv), ONE("status", S_IRUGO, pid_status), + ONE("personality", S_IRUSR, pid_personality), INF("limits", S_IRUSR, pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, pid_sched), @@ -3088,9 +3089,7 @@ static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct generic_fillattr(inode, stat); if (p) { - rcu_read_lock(); stat->nlink += get_nr_threads(p); - rcu_read_unlock(); put_task_struct(p); } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8bb03f0..c6b4fa7 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -342,7 +342,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (!pde->proc_fops) { spin_unlock(&pde->pde_unload_lock); kfree(pdeo); - return rv; + return -EINVAL; } pde->pde_users++; open = pde->proc_fops->open; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 4422023..3bfb7b8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -45,8 +45,6 @@ do { \ extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); #endif -extern int maps_protect; - extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 29e20c6..59ea42e 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -45,7 +45,6 @@ #include <linux/blkdev.h> #include <linux/hugetlb.h> #include <linux/jiffies.h> -#include <linux/sysrq.h> #include <linux/vmalloc.h> #include <linux/crash_dump.h> #include <linux/pid_namespace.h> @@ -68,7 +67,6 @@ extern int get_hardware_list(char *); extern int get_stram_list(char *); extern int get_exec_domain_list(char *); -extern int get_dma_list(char *); static int proc_calc_metrics(char *page, char **start, off_t off, int count, int *eof, int len) @@ -684,6 +682,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_FILE_LOCKING static int locks_open(struct inode *inode, struct file *filp) { return seq_open(filp, &locks_seq_operations); @@ -695,6 +694,7 @@ static const struct file_operations proc_locks_operations = { .llseek = seq_lseek, .release = seq_release, }; +#endif /* CONFIG_FILE_LOCKING */ static int execdomains_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -703,28 +703,6 @@ static int execdomains_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } -#ifdef CONFIG_MAGIC_SYSRQ -/* - * writing 'C' to /proc/sysrq-trigger is like sysrq-C - */ -static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - if (count) { - char c; - - if (get_user(c, buf)) - return -EFAULT; - __handle_sysrq(c, NULL, 0); - } - return count; -} - -static const struct file_operations proc_sysrq_trigger_operations = { - .write = write_sysrq_trigger, -}; -#endif - #ifdef CONFIG_PROC_PAGE_MONITOR #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) @@ -888,7 +866,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_PRINTK proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); #endif +#ifdef CONFIG_FILE_LOCKING proc_create("locks", 0, NULL, &proc_locks_operations); +#endif proc_create("devices", 0, NULL, &proc_devinfo_operations); proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); #ifdef CONFIG_BLOCK @@ -931,7 +911,4 @@ void __init proc_misc_init(void) #ifdef CONFIG_PROC_VMCORE proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); #endif -#ifdef CONFIG_MAGIC_SYSRQ - proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations); -#endif } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f9a8b89..945a810 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -66,7 +66,7 @@ static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) return NULL; } -struct ctl_table_header *grab_header(struct inode *inode) +static struct ctl_table_header *grab_header(struct inode *inode) { if (PROC_I(inode)->sysctl) return sysctl_head_grab(PROC_I(inode)->sysctl); @@ -395,10 +395,10 @@ static struct dentry_operations proc_sys_dentry_operations = { .d_compare = proc_sys_compare, }; -static struct proc_dir_entry *proc_sys_root; - int proc_sys_init(void) { + struct proc_dir_entry *proc_sys_root; + proc_sys_root = proc_mkdir("sys", NULL); proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_fops = &proc_sys_dir_file_operations; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 73d1891..4806830 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -210,9 +210,6 @@ static int show_map(struct seq_file *m, void *v) dev_t dev = 0; int len; - if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) - return -EACCES; - if (file) { struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; @@ -742,22 +739,11 @@ const struct file_operations proc_pagemap_operations = { #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); -static int show_numa_map_checked(struct seq_file *m, void *v) -{ - struct proc_maps_private *priv = m->private; - struct task_struct *task = priv->task; - - if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) - return -EACCES; - - return show_numa_map(m, v); -} - static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_numa_map_checked + .show = show_numa_map, }; static int numa_maps_open(struct inode *inode, struct file *file) diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 5d84e71..219bd79e 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -110,11 +110,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, static int show_map(struct seq_file *m, void *_vml) { struct vm_list_struct *vml = _vml; - struct proc_maps_private *priv = m->private; - struct task_struct *task = priv->task; - - if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) - return -EACCES; return nommu_vma_show(m, vml->vma); } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 9ac0f5e..841368b 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -165,14 +165,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } -static int open_vmcore(struct inode *inode, struct file *filp) -{ - return 0; -} - const struct file_operations proc_vmcore_operations = { .read = read_vmcore, - .open = open_vmcore, }; static struct vmcore* __init get_new_element(void) diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index b9dbeec..37173fa 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -8,8 +8,6 @@ /* proc info support a la one created by Sizif@Botik.RU for PGC */ -/* $Id: procfs.c,v 1.1.8.2 2001/07/15 17:08:42 god Exp $ */ - #include <linux/module.h> #include <linux/time.h> #include <linux/seq_file.h> @@ -621,7 +619,6 @@ int reiserfs_global_version_in_proc(char *buffer, char **start, #endif /* - * $Log: procfs.c,v $ * Revision 1.1.8.2 2001/07/15 17:08:42 god * . use get_super() in procfs.c * . remove remove_save_link() from reiserfs_do_truncate() diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index bb3cb5b..ad92461 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -155,7 +155,7 @@ static struct dentry *get_xa_file_dentry(const struct inode *inode, xadir = open_xa_dir(inode, flags); if (IS_ERR(xadir)) { return ERR_CAST(xadir); - } else if (xadir && !xadir->d_inode) { + } else if (!xadir->d_inode) { dput(xadir); return ERR_PTR(-ENODATA); } diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 006fc64..66f6e58 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -61,6 +61,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) int size = dentry->d_inode->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); + char *temp; if (size) { if (offs > size) @@ -69,23 +70,33 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) count = size - offs; } + temp = kmalloc(count, GFP_KERNEL); + if (!temp) + return -ENOMEM; + mutex_lock(&bb->mutex); count = fill_read(dentry, bb->buffer, offs, count); - if (count < 0) - goto out_unlock; + if (count < 0) { + mutex_unlock(&bb->mutex); + goto out_free; + } - if (copy_to_user(userbuf, bb->buffer, count)) { + memcpy(temp, bb->buffer, count); + + mutex_unlock(&bb->mutex); + + if (copy_to_user(userbuf, temp, count)) { count = -EFAULT; - goto out_unlock; + goto out_free; } pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); *off = offs + count; - out_unlock: - mutex_unlock(&bb->mutex); + out_free: + kfree(temp); return count; } @@ -118,6 +129,7 @@ static ssize_t write(struct file *file, const char __user *userbuf, int size = dentry->d_inode->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); + char *temp; if (size) { if (offs > size) @@ -126,19 +138,27 @@ static ssize_t write(struct file *file, const char __user *userbuf, count = size - offs; } - mutex_lock(&bb->mutex); + temp = kmalloc(count, GFP_KERNEL); + if (!temp) + return -ENOMEM; - if (copy_from_user(bb->buffer, userbuf, count)) { + if (copy_from_user(temp, userbuf, count)) { count = -EFAULT; - goto out_unlock; + goto out_free; } + mutex_lock(&bb->mutex); + + memcpy(bb->buffer, temp, count); + count = flush_write(dentry, bb->buffer, offs, count); + mutex_unlock(&bb->mutex); + if (count > 0) *off = offs + count; - out_unlock: - mutex_unlock(&bb->mutex); +out_free: + kfree(temp); return count; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index aedaeba..3a05a59 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -370,17 +370,17 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, memset(acxt, 0, sizeof(*acxt)); acxt->parent_sd = parent_sd; - /* Lookup parent inode. inode initialization and I_NEW - * clearing are protected by sysfs_mutex. By grabbing it and - * looking up with _nowait variant, inode state can be - * determined reliably. + /* Lookup parent inode. inode initialization is protected by + * sysfs_mutex, so inode existence can be determined by + * looking up inode while holding sysfs_mutex. */ mutex_lock(&sysfs_mutex); - inode = ilookup5_nowait(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test, - parent_sd); + inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test, + parent_sd); + if (inode) { + WARN_ON(inode->i_state & I_NEW); - if (inode && !(inode->i_state & I_NEW)) { /* parent inode available */ acxt->parent_inode = inode; @@ -393,8 +393,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, mutex_lock(&inode->i_mutex); mutex_lock(&sysfs_mutex); } - } else - iput(inode); + } } /** @@ -636,6 +635,7 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, return sd; } +EXPORT_SYMBOL_GPL(sysfs_get_dirent); static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, const char *name, struct sysfs_dirent **p_sd) @@ -829,16 +829,12 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) if (!new_dentry) goto out_unlock; - /* rename kobject and sysfs_dirent */ + /* rename sysfs_dirent */ error = -ENOMEM; new_name = dup_name = kstrdup(new_name, GFP_KERNEL); if (!new_name) goto out_unlock; - error = kobject_set_name(kobj, "%s", new_name); - if (error) - goto out_unlock; - dup_name = sd->s_name; sd->s_name = new_name; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index c9e4e50..1f4a3f8 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -19,10 +19,18 @@ #include <linux/poll.h> #include <linux/list.h> #include <linux/mutex.h> +#include <linux/limits.h> #include <asm/uaccess.h> #include "sysfs.h" +/* used in crash dumps to help with debugging */ +static char last_sysfs_file[PATH_MAX]; +void sysfs_printk_last_file(void) +{ + printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file); +} + /* * There's one sysfs_buffer for each open file and one * sysfs_open_dirent for each sysfs_dirent with one or more open @@ -328,6 +336,11 @@ static int sysfs_open_file(struct inode *inode, struct file *file) struct sysfs_buffer *buffer; struct sysfs_ops *ops; int error = -EACCES; + char *p; + + p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); + if (p) + memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ if (!sysfs_get_active_two(attr_sd)) @@ -440,7 +453,23 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) return POLLERR|POLLPRI; } -void sysfs_notify(struct kobject *k, char *dir, char *attr) +void sysfs_notify_dirent(struct sysfs_dirent *sd) +{ + struct sysfs_open_dirent *od; + + spin_lock(&sysfs_open_dirent_lock); + + od = sd->s_attr.open; + if (od) { + atomic_inc(&od->event); + wake_up_interruptible(&od->poll); + } + + spin_unlock(&sysfs_open_dirent_lock); +} +EXPORT_SYMBOL_GPL(sysfs_notify_dirent); + +void sysfs_notify(struct kobject *k, const char *dir, const char *attr) { struct sysfs_dirent *sd = k->sd; @@ -450,19 +479,8 @@ void sysfs_notify(struct kobject *k, char *dir, char *attr) sd = sysfs_find_dirent(sd, dir); if (sd && attr) sd = sysfs_find_dirent(sd, attr); - if (sd) { - struct sysfs_open_dirent *od; - - spin_lock(&sysfs_open_dirent_lock); - - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->event); - wake_up_interruptible(&od->poll); - } - - spin_unlock(&sysfs_open_dirent_lock); - } + if (sd) + sysfs_notify_dirent(sd); mutex_unlock(&sysfs_mutex); } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 14f0023..ab343e3 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -16,6 +16,7 @@ #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/init.h> +#include <linux/module.h> #include "sysfs.h" @@ -115,3 +116,17 @@ out_err: sysfs_dir_cachep = NULL; goto out; } + +#undef sysfs_get +struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) +{ + return __sysfs_get(sd); +} +EXPORT_SYMBOL_GPL(sysfs_get); + +#undef sysfs_put +void sysfs_put(struct sysfs_dirent *sd) +{ + __sysfs_put(sd); +} +EXPORT_SYMBOL_GPL(sysfs_put); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index a5db496..93c6d6b 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -124,7 +124,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd); void sysfs_remove_subdir(struct sysfs_dirent *sd); -static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) +static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) { if (sd) { WARN_ON(!atomic_read(&sd->s_count)); @@ -132,12 +132,14 @@ static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) } return sd; } +#define sysfs_get(sd) __sysfs_get(sd) -static inline void sysfs_put(struct sysfs_dirent *sd) +static inline void __sysfs_put(struct sysfs_dirent *sd) { if (sd && atomic_dec_and_test(&sd->s_count)) release_sysfs_dirent(sd); } +#define sysfs_put(sd) __sysfs_put(sd) /* * inode.c diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 3f49020..9a92203 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -848,7 +848,7 @@ enum { Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_fast_unmount, "fast_unmount"}, {Opt_norm_unmount, "norm_unmount"}, {Opt_err, NULL}, diff --git a/fs/udf/super.c b/fs/udf/super.c index 5698bbf..e25e701 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -369,7 +369,7 @@ enum { Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_novrs, "novrs"}, {Opt_nostrict, "nostrict"}, {Opt_bs, "bs=%u"}, diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 3141969..e65212d 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -309,7 +309,7 @@ enum { Opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_type_old, "ufstype=old"}, {Opt_type_sunx86, "ufstype=sunx86"}, {Opt_type_sun, "ufstype=sun"}, @@ -1233,7 +1233,7 @@ static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs) { struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; - struct match_token *tp = tokens; + const struct match_token *tp = tokens; while (tp->token != Opt_onerror_panic && tp->token != mval) ++tp; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 18d3c84..e390136 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -158,7 +158,7 @@ enum { Opt_barrier, Opt_nobarrier, Opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_err, NULL} @@ -1323,7 +1323,7 @@ xfs_fs_remount( "XFS: mount option \"%s\" not supported for remount\n", p); return -EINVAL; #else - return 0; + break; #endif } } |