diff options
author | Steve French <sfrench@us.ibm.com> | 2008-04-24 15:26:50 +0000 |
---|---|---|
committer | Steve French <sfrench@us.ibm.com> | 2008-04-24 15:26:50 +0000 |
commit | 36d99df2fb474222ab47fbe8ae7385661033223b (patch) | |
tree | 962e068491b752a944f61c454fad3f8619a1ea3f /fs | |
parent | 076d8423a98659a92837b07aa494cb74bfefe77c (diff) | |
parent | 3dc5063786b273f1aee545844f6bd4e9651ebffe (diff) | |
download | op-kernel-dev-36d99df2fb474222ab47fbe8ae7385661033223b.zip op-kernel-dev-36d99df2fb474222ab47fbe8ae7385661033223b.tar.gz |
Merge branch 'master' of /pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'fs')
238 files changed, 12005 insertions, 6571 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index dfebdbe..3031e32 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -26,7 +26,6 @@ #include <linux/fs.h> #include <linux/sched.h> #include <linux/idr.h> -#include <asm/semaphore.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -444,6 +444,32 @@ config OCFS2_FS For more information on OCFS2, see the file <file:Documentation/filesystems/ocfs2.txt>. +config OCFS2_FS_O2CB + tristate "O2CB Kernelspace Clustering" + depends on OCFS2_FS + default y + help + OCFS2 includes a simple kernelspace clustering package, the OCFS2 + Cluster Base. It only requires a very small userspace component + to configure it. This comes with the standard ocfs2-tools package. + O2CB is limited to maintaining a cluster for OCFS2 file systems. + It cannot manage any other cluster applications. + + It is always safe to say Y here, as the clustering method is + run-time selectable. + +config OCFS2_FS_USERSPACE_CLUSTER + tristate "OCFS2 Userspace Clustering" + depends on OCFS2_FS && DLM + default y + help + This option will allow OCFS2 to use userspace clustering services + in conjunction with the DLM in fs/dlm. If you are using a + userspace cluster manager, say Y here. + + It is safe to say Y, as the clustering method is run-time + selectable. + config OCFS2_DEBUG_MASKLOG bool "OCFS2 logging support" depends on OCFS2_FS @@ -663,6 +689,7 @@ config ZISOFS config UDF_FS tristate "UDF file system support" + select CRC_ITU_T help This is the new file system used on some CD-ROMs and DVDs. Say Y if you intend to mount DVD discs or CDRW's written in packet mode, or diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index b5c3b61..853845a 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -62,7 +62,7 @@ config BINFMT_SHARED_FLAT config BINFMT_AOUT tristate "Kernel support for a.out and ECOFF binaries" depends on ARCH_SUPPORTS_AOUT && \ - (X86_32 || ALPHA || ARM || M68K || SPARC32) + (X86_32 || ALPHA || ARM || M68K) ---help--- A.out (Assembler.OUTput) is a set of formats for libraries and executables used in the earliest versions of UNIX. Linux used @@ -444,22 +444,27 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, struct bio_map_data { struct bio_vec *iovecs; - void __user *userptr; + int nr_sgvecs; + struct sg_iovec *sgvecs; }; -static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio) +static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, + struct sg_iovec *iov, int iov_count) { memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); + memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); + bmd->nr_sgvecs = iov_count; bio->bi_private = bmd; } static void bio_free_map_data(struct bio_map_data *bmd) { kfree(bmd->iovecs); + kfree(bmd->sgvecs); kfree(bmd); } -static struct bio_map_data *bio_alloc_map_data(int nr_segs) +static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) { struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL); @@ -467,13 +472,71 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs) return NULL; bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL); - if (bmd->iovecs) + if (!bmd->iovecs) { + kfree(bmd); + return NULL; + } + + bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL); + if (bmd->sgvecs) return bmd; + kfree(bmd->iovecs); kfree(bmd); return NULL; } +static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, + int uncopy) +{ + int ret = 0, i; + struct bio_vec *bvec; + int iov_idx = 0; + unsigned int iov_off = 0; + int read = bio_data_dir(bio) == READ; + + __bio_for_each_segment(bvec, bio, i, 0) { + char *bv_addr = page_address(bvec->bv_page); + unsigned int bv_len = bvec->bv_len; + + while (bv_len && iov_idx < iov_count) { + unsigned int bytes; + char *iov_addr; + + bytes = min_t(unsigned int, + iov[iov_idx].iov_len - iov_off, bv_len); + iov_addr = iov[iov_idx].iov_base + iov_off; + + if (!ret) { + if (!read && !uncopy) + ret = copy_from_user(bv_addr, iov_addr, + bytes); + if (read && uncopy) + ret = copy_to_user(iov_addr, bv_addr, + bytes); + + if (ret) + ret = -EFAULT; + } + + bv_len -= bytes; + bv_addr += bytes; + iov_addr += bytes; + iov_off += bytes; + + if (iov[iov_idx].iov_len == iov_off) { + iov_idx++; + iov_off = 0; + } + } + + if (uncopy) + __free_page(bvec->bv_page); + } + + return ret; +} + /** * bio_uncopy_user - finish previously mapped bio * @bio: bio being terminated @@ -484,55 +547,56 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs) int bio_uncopy_user(struct bio *bio) { struct bio_map_data *bmd = bio->bi_private; - const int read = bio_data_dir(bio) == READ; - struct bio_vec *bvec; - int i, ret = 0; + int ret; - __bio_for_each_segment(bvec, bio, i, 0) { - char *addr = page_address(bvec->bv_page); - unsigned int len = bmd->iovecs[i].bv_len; + ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1); - if (read && !ret && copy_to_user(bmd->userptr, addr, len)) - ret = -EFAULT; - - __free_page(bvec->bv_page); - bmd->userptr += len; - } bio_free_map_data(bmd); bio_put(bio); return ret; } /** - * bio_copy_user - copy user data to bio + * bio_copy_user_iov - copy user data to bio * @q: destination block queue - * @uaddr: start of user address - * @len: length in bytes + * @iov: the iovec. + * @iov_count: number of elements in the iovec * @write_to_vm: bool indicating writing to pages or not * * Prepares and returns a bio for indirect user io, bouncing data * to/from kernel pages as necessary. Must be paired with * call bio_uncopy_user() on io completion. */ -struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, - unsigned int len, int write_to_vm) +struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, + int iov_count, int write_to_vm) { - unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = uaddr >> PAGE_SHIFT; struct bio_map_data *bmd; struct bio_vec *bvec; struct page *page; struct bio *bio; int i, ret; + int nr_pages = 0; + unsigned int len = 0; - bmd = bio_alloc_map_data(end - start); + for (i = 0; i < iov_count; i++) { + unsigned long uaddr; + unsigned long end; + unsigned long start; + + uaddr = (unsigned long)iov[i].iov_base; + end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + + nr_pages += end - start; + len += iov[i].iov_len; + } + + bmd = bio_alloc_map_data(nr_pages, iov_count); if (!bmd) return ERR_PTR(-ENOMEM); - bmd->userptr = (void __user *) uaddr; - ret = -ENOMEM; - bio = bio_alloc(GFP_KERNEL, end - start); + bio = bio_alloc(GFP_KERNEL, nr_pages); if (!bio) goto out_bmd; @@ -564,22 +628,12 @@ struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, * success */ if (!write_to_vm) { - char __user *p = (char __user *) uaddr; - - /* - * for a write, copy in data to kernel pages - */ - ret = -EFAULT; - bio_for_each_segment(bvec, bio, i) { - char *addr = page_address(bvec->bv_page); - - if (copy_from_user(addr, p, bvec->bv_len)) - goto cleanup; - p += bvec->bv_len; - } + ret = __bio_copy_iov(bio, iov, iov_count, 0); + if (ret) + goto cleanup; } - bio_set_map_data(bmd, bio); + bio_set_map_data(bmd, bio, iov, iov_count); return bio; cleanup: bio_for_each_segment(bvec, bio, i) @@ -591,6 +645,28 @@ out_bmd: return ERR_PTR(ret); } +/** + * bio_copy_user - copy user data to bio + * @q: destination block queue + * @uaddr: start of user address + * @len: length in bytes + * @write_to_vm: bool indicating writing to pages or not + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. + */ +struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, + unsigned int len, int write_to_vm) +{ + struct sg_iovec iov; + + iov.iov_base = (void __user *)uaddr; + iov.iov_len = len; + + return bio_copy_user_iov(q, &iov, 1, write_to_vm); +} + static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, struct sg_iovec *iov, int iov_count, diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 350680f..0c3b618 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -23,7 +23,6 @@ #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/mutex.h> -#include <asm/semaphore.h> #include <asm/uaccess.h> diff --git a/fs/dcache.c b/fs/dcache.c index 4345577..3ee588d5 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1746,12 +1746,21 @@ shouldnt_be_hashed: goto shouldnt_be_hashed; } +static int prepend(char **buffer, int *buflen, const char *str, + int namelen) +{ + *buflen -= namelen; + if (*buflen < 0) + return -ENAMETOOLONG; + *buffer -= namelen; + memcpy(*buffer, str, namelen); + return 0; +} + /** * d_path - return the path of a dentry - * @dentry: dentry to report - * @vfsmnt: vfsmnt to which the dentry belongs - * @root: root dentry - * @rootmnt: vfsmnt to which the root dentry belongs + * @path: the dentry/vfsmount to report + * @root: root vfsmnt/dentry (may be modified by this function) * @buffer: buffer to return value in * @buflen: buffer length * @@ -1761,23 +1770,22 @@ shouldnt_be_hashed: * Returns the buffer or an error code if the path was too long. * * "buflen" should be positive. Caller holds the dcache_lock. + * + * If path is not reachable from the supplied root, then the value of + * root is changed (without modifying refcounts). */ -static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt, - struct path *root, char *buffer, int buflen) +char *__d_path(const struct path *path, struct path *root, + char *buffer, int buflen) { + struct dentry *dentry = path->dentry; + struct vfsmount *vfsmnt = path->mnt; char * end = buffer+buflen; char * retval; - int namelen; - - *--end = '\0'; - buflen--; - if (!IS_ROOT(dentry) && d_unhashed(dentry)) { - buflen -= 10; - end -= 10; - if (buflen < 0) + + prepend(&end, &buflen, "\0", 1); + if (!IS_ROOT(dentry) && d_unhashed(dentry) && + (prepend(&end, &buflen, " (deleted)", 10) != 0)) goto Elong; - memcpy(end, " (deleted)", 10); - } if (buflen < 1) goto Elong; @@ -1804,13 +1812,10 @@ static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt, } parent = dentry->d_parent; prefetch(parent); - namelen = dentry->d_name.len; - buflen -= namelen + 1; - if (buflen < 0) + if ((prepend(&end, &buflen, dentry->d_name.name, + dentry->d_name.len) != 0) || + (prepend(&end, &buflen, "/", 1) != 0)) goto Elong; - end -= namelen; - memcpy(end, dentry->d_name.name, namelen); - *--end = '/'; retval = end; dentry = parent; } @@ -1818,12 +1823,12 @@ static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt, return retval; global_root: - namelen = dentry->d_name.len; - buflen -= namelen; - if (buflen < 0) + retval += 1; /* hit the slash */ + if (prepend(&retval, &buflen, dentry->d_name.name, + dentry->d_name.len) != 0) goto Elong; - retval -= namelen-1; /* hit the slash */ - memcpy(retval, dentry->d_name.name, namelen); + root->mnt = vfsmnt; + root->dentry = dentry; return retval; Elong: return ERR_PTR(-ENAMETOOLONG); @@ -1846,6 +1851,7 @@ char *d_path(struct path *path, char *buf, int buflen) { char *res; struct path root; + struct path tmp; /* * We have various synthetic filesystems that never get mounted. On @@ -1859,10 +1865,11 @@ char *d_path(struct path *path, char *buf, int buflen) read_lock(¤t->fs->lock); root = current->fs->root; - path_get(¤t->fs->root); + path_get(&root); read_unlock(¤t->fs->lock); spin_lock(&dcache_lock); - res = __d_path(path->dentry, path->mnt, &root, buf, buflen); + tmp = root; + res = __d_path(path, &tmp, buf, buflen); spin_unlock(&dcache_lock); path_put(&root); return res; @@ -1890,6 +1897,48 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, } /* + * Write full pathname from the root of the filesystem into the buffer. + */ +char *dentry_path(struct dentry *dentry, char *buf, int buflen) +{ + char *end = buf + buflen; + char *retval; + + spin_lock(&dcache_lock); + prepend(&end, &buflen, "\0", 1); + if (!IS_ROOT(dentry) && d_unhashed(dentry) && + (prepend(&end, &buflen, "//deleted", 9) != 0)) + goto Elong; + if (buflen < 1) + goto Elong; + /* Get '/' right */ + retval = end-1; + *retval = '/'; + + for (;;) { + struct dentry *parent; + if (IS_ROOT(dentry)) + break; + + parent = dentry->d_parent; + prefetch(parent); + + if ((prepend(&end, &buflen, dentry->d_name.name, + dentry->d_name.len) != 0) || + (prepend(&end, &buflen, "/", 1) != 0)) + goto Elong; + + retval = end; + dentry = parent; + } + spin_unlock(&dcache_lock); + return retval; +Elong: + spin_unlock(&dcache_lock); + return ERR_PTR(-ENAMETOOLONG); +} + +/* * NOTE! The user-level library version returns a * character pointer. The kernel system call just * returns the length of the buffer filled (which @@ -1918,9 +1967,9 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size) read_lock(¤t->fs->lock); pwd = current->fs->pwd; - path_get(¤t->fs->pwd); + path_get(&pwd); root = current->fs->root; - path_get(¤t->fs->root); + path_get(&root); read_unlock(¤t->fs->lock); error = -ENOENT; @@ -1928,9 +1977,10 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size) spin_lock(&dcache_lock); if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) { unsigned long len; + struct path tmp = root; char * cwd; - cwd = __d_path(pwd.dentry, pwd.mnt, &root, page, PAGE_SIZE); + cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE); spin_unlock(&dcache_lock); error = PTR_ERR(cwd); diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile index d248e60..ca1c912 100644 --- a/fs/dlm/Makefile +++ b/fs/dlm/Makefile @@ -10,6 +10,7 @@ dlm-y := ast.o \ midcomms.o \ netlink.o \ lowcomms.o \ + plock.o \ rcom.o \ recover.o \ recoverd.o \ diff --git a/fs/dlm/config.c b/fs/dlm/config.c index c3ad1df..eac23bd 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -114,7 +114,7 @@ struct cluster_attribute { }; static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field, - unsigned int *info_field, int check_zero, + int *info_field, int check_zero, const char *buf, size_t len) { unsigned int x; @@ -284,6 +284,7 @@ struct node { struct list_head list; /* space->members */ int nodeid; int weight; + int new; }; static struct configfs_group_operations clusters_ops = { @@ -565,6 +566,7 @@ static struct config_item *make_node(struct config_group *g, const char *name) config_item_init_type_name(&nd->item, name, &node_type); nd->nodeid = -1; nd->weight = 1; /* default weight of 1 if none is set */ + nd->new = 1; /* set to 0 once it's been read by dlm_nodeid_list() */ mutex_lock(&sp->members_lock); list_add(&nd->list, &sp->members); @@ -805,12 +807,13 @@ static void put_comm(struct comm *cm) } /* caller must free mem */ -int dlm_nodeid_list(char *lsname, int **ids_out) +int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, + int **new_out, int *new_count_out) { struct space *sp; struct node *nd; - int i = 0, rv = 0; - int *ids; + int i = 0, rv = 0, ids_count = 0, new_count = 0; + int *ids, *new; sp = get_space(lsname); if (!sp) @@ -818,23 +821,50 @@ int dlm_nodeid_list(char *lsname, int **ids_out) mutex_lock(&sp->members_lock); if (!sp->members_count) { - rv = 0; + rv = -EINVAL; + printk(KERN_ERR "dlm: zero members_count\n"); goto out; } - ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL); + ids_count = sp->members_count; + + ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL); if (!ids) { rv = -ENOMEM; goto out; } - rv = sp->members_count; - list_for_each_entry(nd, &sp->members, list) + list_for_each_entry(nd, &sp->members, list) { ids[i++] = nd->nodeid; + if (nd->new) + new_count++; + } + + if (ids_count != i) + printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i); + + if (!new_count) + goto out_ids; + + new = kcalloc(new_count, sizeof(int), GFP_KERNEL); + if (!new) { + kfree(ids); + rv = -ENOMEM; + goto out; + } - if (rv != i) - printk("bad nodeid count %d %d\n", rv, i); + i = 0; + list_for_each_entry(nd, &sp->members, list) { + if (nd->new) { + new[i++] = nd->nodeid; + nd->new = 0; + } + } + *new_count_out = new_count; + *new_out = new; + out_ids: + *ids_count_out = ids_count; *ids_out = ids; out: mutex_unlock(&sp->members_lock); diff --git a/fs/dlm/config.h b/fs/dlm/config.h index a3170fe..4f1d6fc 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -35,7 +35,8 @@ extern struct dlm_config_info dlm_config; int dlm_config_init(void); void dlm_config_exit(void); int dlm_node_weight(char *lsname, int nodeid); -int dlm_nodeid_list(char *lsname, int **ids_out); +int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, + int **new_out, int *new_count_out); int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr); int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid); int dlm_our_nodeid(void); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index d30ea8b..5a7ac33 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -37,14 +37,11 @@ #include <linux/jhash.h> #include <linux/miscdevice.h> #include <linux/mutex.h> -#include <asm/semaphore.h> #include <asm/uaccess.h> #include <linux/dlm.h> #include "config.h" -#define DLM_LOCKSPACE_LEN 64 - /* Size of the temp buffer midcomms allocates on the stack. We try to make this large enough so most messages fit. FIXME: should sctp make this unnecessary? */ @@ -133,8 +130,10 @@ struct dlm_member { struct dlm_recover { struct list_head list; - int *nodeids; + int *nodeids; /* nodeids of all members */ int node_count; + int *new; /* nodeids of new members */ + int new_count; uint64_t seq; }; @@ -580,6 +579,8 @@ static inline int dlm_no_directory(struct dlm_ls *ls) int dlm_netlink_init(void); void dlm_netlink_exit(void); void dlm_timeout_warn(struct dlm_lkb *lkb); +int dlm_plock_init(void); +void dlm_plock_exit(void); #ifdef CONFIG_DLM_DEBUG int dlm_register_debugfs(void); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 8f250ac..2d3d102 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -165,7 +165,7 @@ void dlm_print_lkb(struct dlm_lkb *lkb) lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type); } -void dlm_print_rsb(struct dlm_rsb *r) +static void dlm_print_rsb(struct dlm_rsb *r) { printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n", r->res_nodeid, r->res_flags, r->res_first_lkid, @@ -1956,8 +1956,7 @@ static void confirm_master(struct dlm_rsb *r, int error) list_del_init(&lkb->lkb_rsb_lookup); r->res_first_lkid = lkb->lkb_id; _request_lock(r, lkb); - } else - r->res_nodeid = -1; + } break; default: diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 05d9c82..88e93c8 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -13,7 +13,6 @@ #ifndef __LOCK_DOT_H__ #define __LOCK_DOT_H__ -void dlm_print_rsb(struct dlm_rsb *r); void dlm_dump_rsb(struct dlm_rsb *r); void dlm_print_lkb(struct dlm_lkb *lkb); void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 58487fb..b80e0aa 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -46,10 +46,16 @@ static int __init init_dlm(void) if (error) goto out_user; + error = dlm_plock_init(); + if (error) + goto out_netlink; + printk("DLM (built %s %s) installed\n", __DATE__, __TIME__); return 0; + out_netlink: + dlm_netlink_exit(); out_user: dlm_user_exit(); out_debug: @@ -66,6 +72,7 @@ static int __init init_dlm(void) static void __exit exit_dlm(void) { + dlm_plock_exit(); dlm_netlink_exit(); dlm_user_exit(); dlm_config_exit(); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index fa17f5a..26133f0 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -210,6 +210,23 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) } } + /* Add an entry to ls_nodes_gone for members that were removed and + then added again, so that previous state for these nodes will be + cleared during recovery. */ + + for (i = 0; i < rv->new_count; i++) { + if (!dlm_is_member(ls, rv->new[i])) + continue; + log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); + + memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL); + if (!memb) + return -ENOMEM; + memb->nodeid = rv->new[i]; + list_add_tail(&memb->list, &ls->ls_nodes_gone); + neg++; + } + /* add new members to ls_nodes */ for (i = 0; i < rv->node_count; i++) { @@ -314,15 +331,16 @@ int dlm_ls_stop(struct dlm_ls *ls) int dlm_ls_start(struct dlm_ls *ls) { struct dlm_recover *rv = NULL, *rv_old; - int *ids = NULL; - int error, count; + int *ids = NULL, *new = NULL; + int error, ids_count = 0, new_count = 0; rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL); if (!rv) return -ENOMEM; - error = count = dlm_nodeid_list(ls->ls_name, &ids); - if (error <= 0) + error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count, + &new, &new_count); + if (error < 0) goto fail; spin_lock(&ls->ls_recover_lock); @@ -337,14 +355,19 @@ int dlm_ls_start(struct dlm_ls *ls) } rv->nodeids = ids; - rv->node_count = count; + rv->node_count = ids_count; + rv->new = new; + rv->new_count = new_count; rv->seq = ++ls->ls_recover_seq; rv_old = ls->ls_recover_args; ls->ls_recover_args = rv; spin_unlock(&ls->ls_recover_lock); if (rv_old) { + log_error(ls, "unused recovery %llx %d", + (unsigned long long)rv_old->seq, rv_old->node_count); kfree(rv_old->nodeids); + kfree(rv_old->new); kfree(rv_old); } @@ -354,6 +377,7 @@ int dlm_ls_start(struct dlm_ls *ls) fail: kfree(rv); kfree(ids); + kfree(new); return error; } diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/dlm/plock.c index 2ebd374..d6d6e37 100644 --- a/fs/gfs2/locking/dlm/plock.c +++ b/fs/dlm/plock.c @@ -1,17 +1,19 @@ /* - * Copyright (C) 2005 Red Hat, Inc. All rights reserved. + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License version 2. */ +#include <linux/fs.h> #include <linux/miscdevice.h> -#include <linux/lock_dlm_plock.h> #include <linux/poll.h> +#include <linux/dlm.h> +#include <linux/dlm_plock.h> -#include "lock_dlm.h" - +#include "dlm_internal.h" +#include "lockspace.h" static spinlock_t ops_lock; static struct list_head send_list; @@ -22,7 +24,7 @@ static wait_queue_head_t recv_wq; struct plock_op { struct list_head list; int done; - struct gdlm_plock_info info; + struct dlm_plock_info info; }; struct plock_xop { @@ -34,22 +36,22 @@ struct plock_xop { }; -static inline void set_version(struct gdlm_plock_info *info) +static inline void set_version(struct dlm_plock_info *info) { - info->version[0] = GDLM_PLOCK_VERSION_MAJOR; - info->version[1] = GDLM_PLOCK_VERSION_MINOR; - info->version[2] = GDLM_PLOCK_VERSION_PATCH; + info->version[0] = DLM_PLOCK_VERSION_MAJOR; + info->version[1] = DLM_PLOCK_VERSION_MINOR; + info->version[2] = DLM_PLOCK_VERSION_PATCH; } -static int check_version(struct gdlm_plock_info *info) +static int check_version(struct dlm_plock_info *info) { - if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) || - (GDLM_PLOCK_VERSION_MINOR < info->version[1])) { - log_error("plock device version mismatch: " + if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) || + (DLM_PLOCK_VERSION_MINOR < info->version[1])) { + log_print("plock device version mismatch: " "kernel (%u.%u.%u), user (%u.%u.%u)", - GDLM_PLOCK_VERSION_MAJOR, - GDLM_PLOCK_VERSION_MINOR, - GDLM_PLOCK_VERSION_PATCH, + DLM_PLOCK_VERSION_MAJOR, + DLM_PLOCK_VERSION_MINOR, + DLM_PLOCK_VERSION_PATCH, info->version[0], info->version[1], info->version[2]); @@ -68,25 +70,31 @@ static void send_op(struct plock_op *op) wake_up(&send_wq); } -int gdlm_plock(void *lockspace, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl) +int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, + int cmd, struct file_lock *fl) { - struct gdlm_ls *ls = lockspace; + struct dlm_ls *ls; struct plock_op *op; struct plock_xop *xop; int rv; + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + xop = kzalloc(sizeof(*xop), GFP_KERNEL); - if (!xop) - return -ENOMEM; + if (!xop) { + rv = -ENOMEM; + goto out; + } op = &xop->xop; - op->info.optype = GDLM_PLOCK_OP_LOCK; + op->info.optype = DLM_PLOCK_OP_LOCK; op->info.pid = fl->fl_pid; op->info.ex = (fl->fl_type == F_WRLCK); op->info.wait = IS_SETLKW(cmd); - op->info.fsid = ls->id; - op->info.number = name->ln_number; + op->info.fsid = ls->ls_global_id; + op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; if (fl->fl_lmops && fl->fl_lmops->fl_grant) { @@ -107,12 +115,15 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name, if (xop->callback == NULL) wait_event(recv_wq, (op->done != 0)); - else - return -EINPROGRESS; + else { + rv = -EINPROGRESS; + goto out; + } spin_lock(&ops_lock); if (!list_empty(&op->list)) { - printk(KERN_INFO "plock op on list\n"); + log_error(ls, "dlm_posix_lock: op on list %llx", + (unsigned long long)number); list_del(&op->list); } spin_unlock(&ops_lock); @@ -121,17 +132,19 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name, if (!rv) { if (posix_lock_file_wait(file, fl) < 0) - log_error("gdlm_plock: vfs lock error %x,%llx", - name->ln_type, - (unsigned long long)name->ln_number); + log_error(ls, "dlm_posix_lock: vfs lock error %llx", + (unsigned long long)number); } kfree(xop); +out: + dlm_put_lockspace(ls); return rv; } +EXPORT_SYMBOL_GPL(dlm_posix_lock); /* Returns failure iff a succesful lock operation should be canceled */ -static int gdlm_plock_callback(struct plock_op *op) +static int dlm_plock_callback(struct plock_op *op) { struct file *file; struct file_lock *fl; @@ -142,7 +155,8 @@ static int gdlm_plock_callback(struct plock_op *op) spin_lock(&ops_lock); if (!list_empty(&op->list)) { - printk(KERN_INFO "plock op on list\n"); + log_print("dlm_plock_callback: op on list %llx", + (unsigned long long)op->info.number); list_del(&op->list); } spin_unlock(&ops_lock); @@ -165,19 +179,19 @@ static int gdlm_plock_callback(struct plock_op *op) * This can only happen in the case of kmalloc() failure. * The filesystem's own lock is the authoritative lock, * so a failure to get the lock locally is not a disaster. - * As long as GFS cannot reliably cancel locks (especially + * As long as the fs cannot reliably cancel locks (especially * in a low-memory situation), we're better off ignoring * this failure than trying to recover. */ - log_error("gdlm_plock: vfs lock error file %p fl %p", - file, fl); + log_print("dlm_plock_callback: vfs lock error %llx file %p fl %p", + (unsigned long long)op->info.number, file, fl); } rv = notify(flc, NULL, 0); if (rv) { /* XXX: We need to cancel the fs lock here: */ - printk("gfs2 lock granted after lock request failed;" - " dangling lock!\n"); + log_print("dlm_plock_callback: lock granted after lock request " + "failed; dangling lock!\n"); goto out; } @@ -186,25 +200,31 @@ out: return rv; } -int gdlm_punlock(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) +int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) { - struct gdlm_ls *ls = lockspace; + struct dlm_ls *ls; struct plock_op *op; int rv; + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) - return -ENOMEM; + if (!op) { + rv = -ENOMEM; + goto out; + } if (posix_lock_file_wait(file, fl) < 0) - log_error("gdlm_punlock: vfs unlock error %x,%llx", - name->ln_type, (unsigned long long)name->ln_number); + log_error(ls, "dlm_posix_unlock: vfs unlock error %llx", + (unsigned long long)number); - op->info.optype = GDLM_PLOCK_OP_UNLOCK; + op->info.optype = DLM_PLOCK_OP_UNLOCK; op->info.pid = fl->fl_pid; - op->info.fsid = ls->id; - op->info.number = name->ln_number; + op->info.fsid = ls->ls_global_id; + op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; if (fl->fl_lmops && fl->fl_lmops->fl_grant) @@ -217,7 +237,8 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name, spin_lock(&ops_lock); if (!list_empty(&op->list)) { - printk(KERN_INFO "punlock op on list\n"); + log_error(ls, "dlm_posix_unlock: op on list %llx", + (unsigned long long)number); list_del(&op->list); } spin_unlock(&ops_lock); @@ -228,25 +249,34 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name, rv = 0; kfree(op); +out: + dlm_put_lockspace(ls); return rv; } +EXPORT_SYMBOL_GPL(dlm_posix_unlock); -int gdlm_plock_get(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) +int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) { - struct gdlm_ls *ls = lockspace; + struct dlm_ls *ls; struct plock_op *op; int rv; + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) - return -ENOMEM; + if (!op) { + rv = -ENOMEM; + goto out; + } - op->info.optype = GDLM_PLOCK_OP_GET; + op->info.optype = DLM_PLOCK_OP_GET; op->info.pid = fl->fl_pid; op->info.ex = (fl->fl_type == F_WRLCK); - op->info.fsid = ls->id; - op->info.number = name->ln_number; + op->info.fsid = ls->ls_global_id; + op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; if (fl->fl_lmops && fl->fl_lmops->fl_grant) @@ -259,7 +289,8 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name, spin_lock(&ops_lock); if (!list_empty(&op->list)) { - printk(KERN_INFO "plock_get op on list\n"); + log_error(ls, "dlm_posix_get: op on list %llx", + (unsigned long long)number); list_del(&op->list); } spin_unlock(&ops_lock); @@ -281,14 +312,17 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name, } kfree(op); +out: + dlm_put_lockspace(ls); return rv; } +EXPORT_SYMBOL_GPL(dlm_posix_get); /* a read copies out one plock request from the send list */ static ssize_t dev_read(struct file *file, char __user *u, size_t count, loff_t *ppos) { - struct gdlm_plock_info info; + struct dlm_plock_info info; struct plock_op *op = NULL; if (count < sizeof(info)) @@ -315,7 +349,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, static ssize_t dev_write(struct file *file, const char __user *u, size_t count, loff_t *ppos) { - struct gdlm_plock_info info; + struct dlm_plock_info info; struct plock_op *op; int found = 0; @@ -345,12 +379,12 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, struct plock_xop *xop; xop = (struct plock_xop *)op; if (xop->callback) - count = gdlm_plock_callback(op); + count = dlm_plock_callback(op); else wake_up(&recv_wq); } else - printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid, - (unsigned long long)info.number); + log_print("dev_write no op %x %llx", info.fsid, + (unsigned long long)info.number); return count; } @@ -377,11 +411,11 @@ static const struct file_operations dev_fops = { static struct miscdevice plock_dev_misc = { .minor = MISC_DYNAMIC_MINOR, - .name = GDLM_PLOCK_MISC_NAME, + .name = DLM_PLOCK_MISC_NAME, .fops = &dev_fops }; -int gdlm_plock_init(void) +int dlm_plock_init(void) { int rv; @@ -393,14 +427,13 @@ int gdlm_plock_init(void) rv = misc_register(&plock_dev_misc); if (rv) - printk(KERN_INFO "gdlm_plock_init: misc_register failed %d", - rv); + log_print("dlm_plock_init: misc_register failed %d", rv); return rv; } -void gdlm_plock_exit(void) +void dlm_plock_exit(void) { if (misc_deregister(&plock_dev_misc) < 0) - printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed"); + log_print("dlm_plock_exit: misc_deregister failed"); } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 997f953..fd677c8 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -257,6 +257,7 @@ static void do_ls_recovery(struct dlm_ls *ls) if (rv) { ls_recover(ls, rv); kfree(rv->nodeids); + kfree(rv->new); kfree(rv); } } diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 5deb8b7..08f647d 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -253,7 +253,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) * it has too few free inodes left (min_inodes) or * it has too few free blocks left (min_blocks) or * it's already running too large debt (max_debt). - * Parent's group is prefered, if it doesn't satisfy these + * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c620068..b8a2990 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -239,7 +239,7 @@ no_block: * @inode: owner * @ind: descriptor of indirect block. * - * This function returns the prefered place for block allocation. + * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. @@ -283,7 +283,7 @@ static unsigned long ext2_find_near(struct inode *inode, Indirect *ind) } /** - * ext2_find_goal - find a prefered place for allocation. + * ext2_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index b8ea11f..de876fa 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -12,6 +12,7 @@ #include <linux/time.h> #include <linux/sched.h> #include <linux/compat.h> +#include <linux/mount.h> #include <linux/smp_lock.h> #include <asm/current.h> #include <asm/uaccess.h> @@ -23,6 +24,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct ext2_inode_info *ei = EXT2_I(inode); unsigned int flags; unsigned short rsv_window_size; + int ret; ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg); @@ -34,14 +36,19 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case EXT2_IOC_SETFLAGS: { unsigned int oldflags; - if (IS_RDONLY(inode)) - return -EROFS; + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; - if (!is_owner_or_cap(inode)) - return -EACCES; + if (!is_owner_or_cap(inode)) { + ret = -EACCES; + goto setflags_out; + } - if (get_user(flags, (int __user *) arg)) - return -EFAULT; + if (get_user(flags, (int __user *) arg)) { + ret = -EFAULT; + goto setflags_out; + } if (!S_ISDIR(inode->i_mode)) flags &= ~EXT2_DIRSYNC_FL; @@ -50,7 +57,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + ret = -EPERM; + goto setflags_out; } oldflags = ei->i_flags; @@ -63,7 +71,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + ret = -EPERM; + goto setflags_out; } } @@ -75,20 +84,26 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ext2_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - return 0; +setflags_out: + mnt_drop_write(filp->f_path.mnt); + return ret; } case EXT2_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *) arg); case EXT2_IOC_SETVERSION: if (!is_owner_or_cap(inode)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (get_user(inode->i_generation, (int __user *) arg)) - return -EFAULT; - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - return 0; + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + if (get_user(inode->i_generation, (int __user *) arg)) { + ret = -EFAULT; + } else { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + } + mnt_drop_write(filp->f_path.mnt); + return ret; case EXT2_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode) @@ -102,15 +117,16 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (IS_RDONLY(inode)) - return -EROFS; - - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(rsv_window_size, (int __user *)arg)) return -EFAULT; + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + if (rsv_window_size > EXT2_MAX_RESERVE_BLOCKS) rsv_window_size = EXT2_MAX_RESERVE_BLOCKS; @@ -131,6 +147,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) rsv->rsv_goal_size = rsv_window_size; } mutex_unlock(&ei->truncate_mutex); + mnt_drop_write(filp->f_path.mnt); return 0; } default: diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 4f4020c..96dd557 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -239,7 +239,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) * it has too few free inodes left (min_inodes) or * it has too few free blocks left (min_blocks) or * it's already running too large debt (max_debt). - * Parent's group is prefered, if it doesn't satisfy these + * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index eb95670..c683609 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -392,7 +392,7 @@ no_block: * @inode: owner * @ind: descriptor of indirect block. * - * This function returns the prefered place for block allocation. + * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. @@ -436,12 +436,12 @@ static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind) } /** - * ext3_find_goal - find a prefered place for allocation. + * ext3_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain * - * Normally this function find the prefered place for block allocation, + * Normally this function find the preferred place for block allocation, * returns it. */ diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 023a070..0d0c701 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -12,6 +12,7 @@ #include <linux/capability.h> #include <linux/ext3_fs.h> #include <linux/ext3_jbd.h> +#include <linux/mount.h> #include <linux/time.h> #include <linux/compat.h> #include <linux/smp_lock.h> @@ -38,14 +39,19 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, unsigned int oldflags; unsigned int jflag; - if (IS_RDONLY(inode)) - return -EROFS; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; - if (!is_owner_or_cap(inode)) - return -EACCES; + if (!is_owner_or_cap(inode)) { + err = -EACCES; + goto flags_out; + } - if (get_user(flags, (int __user *) arg)) - return -EFAULT; + if (get_user(flags, (int __user *) arg)) { + err = -EFAULT; + goto flags_out; + } if (!S_ISDIR(inode->i_mode)) flags &= ~EXT3_DIRSYNC_FL; @@ -54,7 +60,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + err = -EPERM; + goto flags_out; } oldflags = ei->i_flags; @@ -70,7 +77,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + err = -EPERM; + goto flags_out; } } @@ -81,7 +89,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { if (!capable(CAP_SYS_RESOURCE)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + err = -EPERM; + goto flags_out; } } @@ -89,7 +98,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { mutex_unlock(&inode->i_mutex); - return PTR_ERR(handle); + err = PTR_ERR(handle); + goto flags_out; } if (IS_SYNC(inode)) handle->h_sync = 1; @@ -115,6 +125,8 @@ flags_err: if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) err = ext3_change_inode_journal_flag(inode, jflag); mutex_unlock(&inode->i_mutex); +flags_out: + mnt_drop_write(filp->f_path.mnt); return err; } case EXT3_IOC_GETVERSION: @@ -129,14 +141,18 @@ flags_err: if (!is_owner_or_cap(inode)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (get_user(generation, (int __user *) arg)) - return -EFAULT; - + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto setversion_out; + } err = ext3_reserve_inode_write(handle, inode, &iloc); if (err == 0) { inode->i_ctime = CURRENT_TIME_SEC; @@ -144,6 +160,8 @@ flags_err: err = ext3_mark_iloc_dirty(handle, inode, &iloc); } ext3_journal_stop(handle); +setversion_out: + mnt_drop_write(filp->f_path.mnt); return err; } #ifdef CONFIG_JBD_DEBUG @@ -179,18 +197,24 @@ flags_err: } return -ENOTTY; case EXT3_IOC_SETRSVSZ: { + int err; if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (IS_RDONLY(inode)) - return -EROFS; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; - if (!is_owner_or_cap(inode)) - return -EACCES; + if (!is_owner_or_cap(inode)) { + err = -EACCES; + goto setrsvsz_out; + } - if (get_user(rsv_window_size, (int __user *)arg)) - return -EFAULT; + if (get_user(rsv_window_size, (int __user *)arg)) { + err = -EFAULT; + goto setrsvsz_out; + } if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS) rsv_window_size = EXT3_MAX_RESERVE_BLOCKS; @@ -208,7 +232,9 @@ flags_err: rsv->rsv_goal_size = rsv_window_size; } mutex_unlock(&ei->truncate_mutex); - return 0; +setrsvsz_out: + mnt_drop_write(filp->f_path.mnt); + return err; } case EXT3_IOC_GROUP_EXTEND: { ext3_fsblk_t n_blocks_count; @@ -218,17 +244,20 @@ flags_err: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - - if (get_user(n_blocks_count, (__u32 __user *)arg)) - return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); journal_lock_updates(EXT3_SB(sb)->s_journal); journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); - +group_extend_out: + mnt_drop_write(filp->f_path.mnt); return err; } case EXT3_IOC_GROUP_ADD: { @@ -239,18 +268,22 @@ flags_err: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg, - sizeof(input))) - return -EFAULT; + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } err = ext3_group_add(sb, &input); journal_lock_updates(EXT3_SB(sb)->s_journal); journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); - +group_add_out: + mnt_drop_write(filp->f_path.mnt); return err; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8036b9b..486e46a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -305,7 +305,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, * it has too few free inodes left (min_inodes) or * it has too few free blocks left (min_blocks) or * it's already running too large debt (max_debt). - * Parent's group is prefered, if it doesn't satisfy these + * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 945cbf6..8fab233 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -382,7 +382,7 @@ no_block: * @inode: owner * @ind: descriptor of indirect block. * - * This function returns the prefered place for block allocation. + * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. @@ -432,12 +432,12 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) } /** - * ext4_find_goal - find a prefered place for allocation. + * ext4_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain * - * Normally this function find the prefered place for block allocation, + * Normally this function find the preferred place for block allocation, * returns it. */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2ed7c37..25b13ed 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -15,6 +15,7 @@ #include <linux/time.h> #include <linux/compat.h> #include <linux/smp_lock.h> +#include <linux/mount.h> #include <asm/uaccess.h> int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, @@ -38,24 +39,25 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, unsigned int oldflags; unsigned int jflag; - if (IS_RDONLY(inode)) - return -EROFS; - if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (!S_ISDIR(inode->i_mode)) flags &= ~EXT4_DIRSYNC_FL; + err = -EPERM; mutex_lock(&inode->i_mutex); /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; - } + if (IS_NOQUOTA(inode)) + goto flags_out; + oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ @@ -68,10 +70,8 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * This test looks nicer. Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; - } + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; } /* @@ -79,17 +79,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. */ if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; - } + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; } - handle = ext4_journal_start(inode, 1); if (IS_ERR(handle)) { - mutex_unlock(&inode->i_mutex); - return PTR_ERR(handle); + err = PTR_ERR(handle); + goto flags_out; } if (IS_SYNC(inode)) handle->h_sync = 1; @@ -107,14 +104,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext4_journal_stop(handle); - if (err) { - mutex_unlock(&inode->i_mutex); - return err; - } + if (err) + goto flags_out; if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) err = ext4_change_inode_journal_flag(inode, jflag); +flags_out: mutex_unlock(&inode->i_mutex); + mnt_drop_write(filp->f_path.mnt); return err; } case EXT4_IOC_GETVERSION: @@ -129,14 +126,20 @@ flags_err: if (!is_owner_or_cap(inode)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (get_user(generation, (int __user *) arg)) - return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto setversion_out; + } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { inode->i_ctime = ext4_current_time(inode); @@ -144,6 +147,8 @@ flags_err: err = ext4_mark_iloc_dirty(handle, inode, &iloc); } ext4_journal_stop(handle); +setversion_out: + mnt_drop_write(filp->f_path.mnt); return err; } #ifdef CONFIG_JBD2_DEBUG @@ -179,19 +184,21 @@ flags_err: } return -ENOTTY; case EXT4_IOC_SETRSVSZ: { + int err; if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (IS_RDONLY(inode)) - return -EROFS; - if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(rsv_window_size, (int __user *)arg)) return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS) rsv_window_size = EXT4_MAX_RESERVE_BLOCKS; @@ -208,6 +215,7 @@ flags_err: rsv->rsv_goal_size = rsv_window_size; } up_write(&ei->i_data_sem); + mnt_drop_write(filp->f_path.mnt); return 0; } case EXT4_IOC_GROUP_EXTEND: { @@ -218,16 +226,18 @@ flags_err: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (get_user(n_blocks_count, (__u32 __user *)arg)) return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + mnt_drop_write(filp->f_path.mnt); return err; } @@ -239,17 +249,19 @@ flags_err: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, sizeof(input))) return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + err = ext4_group_add(sb, &input); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + mnt_drop_write(filp->f_path.mnt); return err; } diff --git a/fs/fat/file.c b/fs/fat/file.c index c614175..2a3bed9 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -8,6 +8,7 @@ #include <linux/capability.h> #include <linux/module.h> +#include <linux/mount.h> #include <linux/time.h> #include <linux/msdos_fs.h> #include <linux/smp_lock.h> @@ -46,10 +47,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, mutex_lock(&inode->i_mutex); - if (IS_RDONLY(inode)) { - err = -EROFS; - goto up; - } + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto up_no_drop_write; /* * ATTR_VOLUME and ATTR_DIR cannot be changed; this also @@ -105,7 +105,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED; mark_inode_dirty(inode); - up: +up: + mnt_drop_write(filp->f_path.mnt); +up_no_drop_write: mutex_unlock(&inode->i_mutex); return err; } diff --git a/fs/file_table.c b/fs/file_table.c index 986ff4e..7a0a9b8 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -42,6 +42,7 @@ static inline void file_free_rcu(struct rcu_head *head) static inline void file_free(struct file *f) { percpu_counter_dec(&nr_files); + file_check_state(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -199,6 +200,18 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, file->f_mapping = dentry->d_inode->i_mapping; file->f_mode = mode; file->f_op = fop; + + /* + * These mounts don't really matter in practice + * for r/o bind mounts. They aren't userspace- + * visible. We do this for consistency, and so + * that we can do debugging checks at __fput() + */ + if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { + file_take_write(file); + error = mnt_want_write(mnt); + WARN_ON(error); + } return error; } EXPORT_SYMBOL(init_file); @@ -211,6 +224,31 @@ void fput(struct file *file) EXPORT_SYMBOL(fput); +/** + * drop_file_write_access - give up ability to write to a file + * @file: the file to which we will stop writing + * + * This is a central place which will give up the ability + * to write to @file, along with access to write through + * its vfsmount. + */ +void drop_file_write_access(struct file *file) +{ + struct vfsmount *mnt = file->f_path.mnt; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + + put_write_access(inode); + + if (special_file(inode->i_mode)) + return; + if (file_check_writeable(file) != 0) + return; + mnt_drop_write(mnt); + file_release_write(file); +} +EXPORT_SYMBOL_GPL(drop_file_write_access); + /* __fput is called from task context when aio completion releases the last * last use of a struct file *. Do not use otherwise. */ @@ -236,10 +274,10 @@ void __fput(struct file *file) if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) cdev_put(inode->i_cdev); fops_put(file->f_op); - if (file->f_mode & FMODE_WRITE) - put_write_access(inode); put_pid(file->f_owner.pid); file_kill(file); + if (file->f_mode & FMODE_WRITE) + drop_file_write_access(file); file->f_path.dentry = NULL; file->f_path.mnt = NULL; file_free(file); diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index de8e64c..7f7947e 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,6 @@ config GFS2_FS tristate "GFS2 file system support" - depends on EXPERIMENTAL + depends on EXPERIMENTAL && (64BIT || (LSF && LBD)) select FS_POSIX_ACL select CRC32 help diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 8fff110..e2350df 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ - glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \ + glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ ops_fstype.o ops_inode.o ops_super.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 1047a8c..3e9bd46 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -116,7 +116,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, goto out; er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); - er.er_data = kmalloc(er.er_data_len, GFP_KERNEL); + er.er_data = kmalloc(er.er_data_len, GFP_NOFS); error = -ENOMEM; if (!er.er_data) goto out; @@ -222,7 +222,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) return error; } - clone = posix_acl_clone(acl, GFP_KERNEL); + clone = posix_acl_clone(acl, GFP_NOFS); error = -ENOMEM; if (!clone) goto out; @@ -272,7 +272,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) if (!acl) return gfs2_setattr_simple(ip, attr); - clone = posix_acl_clone(acl, GFP_KERNEL); + clone = posix_acl_clone(acl, GFP_NOFS); error = -ENOMEM; if (!clone) goto out; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e9456eb..c19184f 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -33,6 +33,7 @@ * keep it small. */ struct metapath { + struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; __u16 mp_list[GFS2_MAX_META_HEIGHT]; }; @@ -135,9 +136,10 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) /* Get a free block, fill it with the stuffed data, and write it out to disk */ + unsigned int n = 1; + block = gfs2_alloc_block(ip, &n); if (isdir) { - block = gfs2_alloc_meta(ip); - + gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); error = gfs2_dir_get_new_buffer(ip, block, &bh); if (error) goto out_brelse; @@ -145,8 +147,6 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) dibh, sizeof(struct gfs2_dinode)); brelse(bh); } else { - block = gfs2_alloc_data(ip); - error = gfs2_unstuffer_page(ip, dibh, block, page); if (error) goto out_brelse; @@ -161,12 +161,11 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) if (ip->i_di.di_size) { *(__be64 *)(di + 1) = cpu_to_be64(block); - ip->i_di.di_blocks++; - gfs2_set_inode_blocks(&ip->i_inode); - di->di_blocks = cpu_to_be64(ip->i_di.di_blocks); + gfs2_add_inode_blocks(&ip->i_inode, 1); + di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); } - ip->i_di.di_height = 1; + ip->i_height = 1; di->di_height = cpu_to_be16(1); out_brelse: @@ -176,114 +175,13 @@ out: return error; } -/** - * calc_tree_height - Calculate the height of a metadata tree - * @ip: The GFS2 inode - * @size: The proposed size of the file - * - * Work out how tall a metadata tree needs to be in order to accommodate a - * file of a particular size. If size is less than the current size of - * the inode, then the current size of the inode is used instead of the - * supplied one. - * - * Returns: the height the tree should be - */ - -static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - u64 *arr; - unsigned int max, height; - - if (ip->i_di.di_size > size) - size = ip->i_di.di_size; - - if (gfs2_is_dir(ip)) { - arr = sdp->sd_jheightsize; - max = sdp->sd_max_jheight; - } else { - arr = sdp->sd_heightsize; - max = sdp->sd_max_height; - } - - for (height = 0; height < max; height++) - if (arr[height] >= size) - break; - - return height; -} - -/** - * build_height - Build a metadata tree of the requested height - * @ip: The GFS2 inode - * @height: The height to build to - * - * - * Returns: errno - */ - -static int build_height(struct inode *inode, unsigned height) -{ - struct gfs2_inode *ip = GFS2_I(inode); - unsigned new_height = height - ip->i_di.di_height; - struct buffer_head *dibh; - struct buffer_head *blocks[GFS2_MAX_META_HEIGHT]; - struct gfs2_dinode *di; - int error; - __be64 *bp; - u64 bn; - unsigned n; - - if (height <= ip->i_di.di_height) - return 0; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - - for(n = 0; n < new_height; n++) { - bn = gfs2_alloc_meta(ip); - blocks[n] = gfs2_meta_new(ip->i_gl, bn); - gfs2_trans_add_bh(ip->i_gl, blocks[n], 1); - } - - n = 0; - bn = blocks[0]->b_blocknr; - if (new_height > 1) { - for(; n < new_height-1; n++) { - gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, - GFS2_FORMAT_IN); - gfs2_buffer_clear_tail(blocks[n], - sizeof(struct gfs2_meta_header)); - bp = (__be64 *)(blocks[n]->b_data + - sizeof(struct gfs2_meta_header)); - *bp = cpu_to_be64(blocks[n+1]->b_blocknr); - brelse(blocks[n]); - blocks[n] = NULL; - } - } - gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN); - gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header), - dibh, sizeof(struct gfs2_dinode)); - brelse(blocks[n]); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - di = (struct gfs2_dinode *)dibh->b_data; - gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); - *(__be64 *)(di + 1) = cpu_to_be64(bn); - ip->i_di.di_height += new_height; - ip->i_di.di_blocks += new_height; - gfs2_set_inode_blocks(&ip->i_inode); - di->di_height = cpu_to_be16(ip->i_di.di_height); - di->di_blocks = cpu_to_be64(ip->i_di.di_blocks); - brelse(dibh); - return error; -} /** * find_metapath - Find path through the metadata tree - * @ip: The inode pointer + * @sdp: The superblock * @mp: The metapath to return the result in * @block: The disk block to look up + * @height: The pre-calculated height of the metadata tree * * This routine returns a struct metapath structure that defines a path * through the metadata of inode "ip" to get to block "block". @@ -338,21 +236,29 @@ static int build_height(struct inode *inode, unsigned height) * */ -static void find_metapath(struct gfs2_inode *ip, u64 block, - struct metapath *mp) +static void find_metapath(const struct gfs2_sbd *sdp, u64 block, + struct metapath *mp, unsigned int height) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - u64 b = block; unsigned int i; - for (i = ip->i_di.di_height; i--;) - mp->mp_list[i] = do_div(b, sdp->sd_inptrs); + for (i = height; i--;) + mp->mp_list[i] = do_div(block, sdp->sd_inptrs); + +} +static inline unsigned int zero_metapath_length(const struct metapath *mp, + unsigned height) +{ + unsigned int i; + for (i = 0; i < height - 1; i++) { + if (mp->mp_list[i] != 0) + return i; + } + return height; } /** * metapointer - Return pointer to start of metadata in a buffer - * @bh: The buffer * @height: The metadata height (0 = dinode) * @mp: The metapath * @@ -361,93 +267,302 @@ static void find_metapath(struct gfs2_inode *ip, u64 block, * metadata tree. */ -static inline __be64 *metapointer(struct buffer_head *bh, int *boundary, - unsigned int height, const struct metapath *mp) +static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) { + struct buffer_head *bh = mp->mp_bh[height]; unsigned int head_size = (height > 0) ? sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); - __be64 *ptr; - *boundary = 0; - ptr = ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; - if (ptr + 1 == (__be64 *)(bh->b_data + bh->b_size)) - *boundary = 1; - return ptr; + return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; } /** - * lookup_block - Get the next metadata block in metadata tree - * @ip: The GFS2 inode - * @bh: Buffer containing the pointers to metadata blocks - * @height: The height of the tree (0 = dinode) + * lookup_metapath - Walk the metadata tree to a specific point + * @ip: The inode * @mp: The metapath - * @create: Non-zero if we may create a new meatdata block - * @new: Used to indicate if we did create a new metadata block - * @block: the returned disk block number * - * Given a metatree, complete to a particular height, checks to see if the next - * height of the tree exists. If not the next height of the tree is created. - * The block number of the next height of the metadata tree is returned. + * Assumes that the inode's buffer has already been looked up and + * hooked onto mp->mp_bh[0] and that the metapath has been initialised + * by find_metapath(). + * + * If this function encounters part of the tree which has not been + * allocated, it returns the current height of the tree at the point + * at which it found the unallocated block. Blocks which are found are + * added to the mp->mp_bh[] list. * + * Returns: error or height of metadata tree */ -static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh, - unsigned int height, struct metapath *mp, int create, - int *new, u64 *block) +static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) { - int boundary; - __be64 *ptr = metapointer(bh, &boundary, height, mp); + unsigned int end_of_metadata = ip->i_height - 1; + unsigned int x; + __be64 *ptr; + u64 dblock; + int ret; - if (*ptr) { - *block = be64_to_cpu(*ptr); - return boundary; - } + for (x = 0; x < end_of_metadata; x++) { + ptr = metapointer(x, mp); + dblock = be64_to_cpu(*ptr); + if (!dblock) + return x + 1; - *block = 0; + ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]); + if (ret) + return ret; + } - if (!create) - return 0; + return ip->i_height; +} - if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip)) - *block = gfs2_alloc_data(ip); - else - *block = gfs2_alloc_meta(ip); +static inline void release_metapath(struct metapath *mp) +{ + int i; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) { + if (mp->mp_bh[i] == NULL) + break; + brelse(mp->mp_bh[i]); + } +} - *ptr = cpu_to_be64(*block); - ip->i_di.di_blocks++; - gfs2_set_inode_blocks(&ip->i_inode); +/** + * gfs2_extent_length - Returns length of an extent of blocks + * @start: Start of the buffer + * @len: Length of the buffer in bytes + * @ptr: Current position in the buffer + * @limit: Max extent length to return (0 = unlimited) + * @eob: Set to 1 if we hit "end of block" + * + * If the first block is zero (unallocated) it will return the number of + * unallocated blocks in the extent, otherwise it will return the number + * of contiguous blocks in the extent. + * + * Returns: The length of the extent (minimum of one block) + */ - *new = 1; - return 0; +static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) +{ + const __be64 *end = (start + len); + const __be64 *first = ptr; + u64 d = be64_to_cpu(*ptr); + + *eob = 0; + do { + ptr++; + if (ptr >= end) + break; + if (limit && --limit == 0) + break; + if (d) + d++; + } while(be64_to_cpu(*ptr) == d); + if (ptr >= end) + *eob = 1; + return (ptr - first); } -static inline void bmap_lock(struct inode *inode, int create) +static inline void bmap_lock(struct gfs2_inode *ip, int create) { - struct gfs2_inode *ip = GFS2_I(inode); if (create) down_write(&ip->i_rw_mutex); else down_read(&ip->i_rw_mutex); } -static inline void bmap_unlock(struct inode *inode, int create) +static inline void bmap_unlock(struct gfs2_inode *ip, int create) { - struct gfs2_inode *ip = GFS2_I(inode); if (create) up_write(&ip->i_rw_mutex); else up_read(&ip->i_rw_mutex); } +static inline __be64 *gfs2_indirect_init(struct metapath *mp, + struct gfs2_glock *gl, unsigned int i, + unsigned offset, u64 bn) +{ + __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + + ((i > 1) ? sizeof(struct gfs2_meta_header) : + sizeof(struct gfs2_dinode))); + BUG_ON(i < 1); + BUG_ON(mp->mp_bh[i] != NULL); + mp->mp_bh[i] = gfs2_meta_new(gl, bn); + gfs2_trans_add_bh(gl, mp->mp_bh[i], 1); + gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); + ptr += offset; + *ptr = cpu_to_be64(bn); + return ptr; +} + +enum alloc_state { + ALLOC_DATA = 0, + ALLOC_GROW_DEPTH = 1, + ALLOC_GROW_HEIGHT = 2, + /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ +}; + +/** + * gfs2_bmap_alloc - Build a metadata tree of the requested height + * @inode: The GFS2 inode + * @lblock: The logical starting block of the extent + * @bh_map: This is used to return the mapping details + * @mp: The metapath + * @sheight: The starting height (i.e. whats already mapped) + * @height: The height to build to + * @maxlen: The max number of data blocks to alloc + * + * In this routine we may have to alloc: + * i) Indirect blocks to grow the metadata tree height + * ii) Indirect blocks to fill in lower part of the metadata tree + * iii) Data blocks + * + * The function is in two parts. The first part works out the total + * number of blocks which we need. The second part does the actual + * allocation asking for an extent at a time (if enough contiguous free + * blocks are available, there will only be one request per bmap call) + * and uses the state machine to initialise the blocks in order. + * + * Returns: errno on error + */ + +static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, + struct buffer_head *bh_map, struct metapath *mp, + const unsigned int sheight, + const unsigned int height, + const unsigned int maxlen) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *dibh = mp->mp_bh[0]; + u64 bn, dblock = 0; + unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0; + unsigned dblks = 0; + unsigned ptrs_per_blk; + const unsigned end_of_metadata = height - 1; + int eob = 0; + enum alloc_state state; + __be64 *ptr; + __be64 zero_bn = 0; + + BUG_ON(sheight < 1); + BUG_ON(dibh == NULL); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (height == sheight) { + struct buffer_head *bh; + /* Bottom indirect block exists, find unalloced extent size */ + ptr = metapointer(end_of_metadata, mp); + bh = mp->mp_bh[end_of_metadata]; + dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, + &eob); + BUG_ON(dblks < 1); + state = ALLOC_DATA; + } else { + /* Need to allocate indirect blocks */ + ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; + dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); + if (height == ip->i_height) { + /* Writing into existing tree, extend tree down */ + iblks = height - sheight; + state = ALLOC_GROW_DEPTH; + } else { + /* Building up tree height */ + state = ALLOC_GROW_HEIGHT; + iblks = height - ip->i_height; + zmpl = zero_metapath_length(mp, height); + iblks -= zmpl; + iblks += height; + } + } + + /* start of the second part of the function (state machine) */ + + blks = dblks + iblks; + i = sheight; + do { + n = blks - alloced; + bn = gfs2_alloc_block(ip, &n); + alloced += n; + if (state != ALLOC_DATA || gfs2_is_jdata(ip)) + gfs2_trans_add_unrevoke(sdp, bn, n); + switch (state) { + /* Growing height of tree */ + case ALLOC_GROW_HEIGHT: + if (i == 1) { + ptr = (__be64 *)(dibh->b_data + + sizeof(struct gfs2_dinode)); + zero_bn = *ptr; + } + for (; i - 1 < height - ip->i_height && n > 0; i++, n--) + gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++); + if (i - 1 == height - ip->i_height) { + i--; + gfs2_buffer_copy_tail(mp->mp_bh[i], + sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + gfs2_buffer_clear_tail(dibh, + sizeof(struct gfs2_dinode) + + sizeof(__be64)); + ptr = (__be64 *)(mp->mp_bh[i]->b_data + + sizeof(struct gfs2_meta_header)); + *ptr = zero_bn; + state = ALLOC_GROW_DEPTH; + for(i = zmpl; i < height; i++) { + if (mp->mp_bh[i] == NULL) + break; + brelse(mp->mp_bh[i]); + mp->mp_bh[i] = NULL; + } + i = zmpl; + } + if (n == 0) + break; + /* Branching from existing tree */ + case ALLOC_GROW_DEPTH: + if (i > 1 && i < height) + gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1); + for (; i < height && n > 0; i++, n--) + gfs2_indirect_init(mp, ip->i_gl, i, + mp->mp_list[i-1], bn++); + if (i == height) + state = ALLOC_DATA; + if (n == 0) + break; + /* Tree complete, adding data blocks */ + case ALLOC_DATA: + BUG_ON(n > dblks); + BUG_ON(mp->mp_bh[end_of_metadata] == NULL); + gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1); + dblks = n; + ptr = metapointer(end_of_metadata, mp); + dblock = bn; + while (n-- > 0) + *ptr++ = cpu_to_be64(bn++); + break; + } + } while (state != ALLOC_DATA); + + ip->i_height = height; + gfs2_add_inode_blocks(&ip->i_inode, alloced); + gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); + map_bh(bh_map, inode->i_sb, dblock); + bh_map->b_size = dblks << inode->i_blkbits; + set_buffer_new(bh_map); + return 0; +} + /** * gfs2_block_map - Map a block from an inode to a disk block * @inode: The inode * @lblock: The logical block number * @bh_map: The bh to be mapped + * @create: True if its ok to alloc blocks to satify the request * - * Find the block number on the current device which corresponds to an - * inode's block. If the block had to be created, "new" will be set. + * Sets buffer_mapped() if successful, sets buffer_boundary() if a + * read of metadata will be required before the next block can be + * mapped. Sets buffer_new() if new blocks were allocated. * * Returns: errno */ @@ -457,97 +572,78 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *bh; - unsigned int bsize; - unsigned int height; - unsigned int end_of_metadata; - unsigned int x; - int error = 0; - int new = 0; - u64 dblock = 0; - int boundary; - unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; - struct metapath mp; + unsigned int bsize = sdp->sd_sb.sb_bsize; + const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; + const u64 *arr = sdp->sd_heightsize; + __be64 *ptr; u64 size; - struct buffer_head *dibh = NULL; + struct metapath mp; + int ret; + int eob; + unsigned int len; + struct buffer_head *bh; + u8 height; BUG_ON(maxlen == 0); - if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip))) - return 0; - - bmap_lock(inode, create); + memset(mp.mp_bh, 0, sizeof(mp.mp_bh)); + bmap_lock(ip, create); clear_buffer_mapped(bh_map); clear_buffer_new(bh_map); clear_buffer_boundary(bh_map); - bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize; - size = (lblock + 1) * bsize; - - if (size > ip->i_di.di_size) { - height = calc_tree_height(ip, size); - if (ip->i_di.di_height < height) { - if (!create) - goto out_ok; - - error = build_height(inode, height); - if (error) - goto out_fail; - } + if (gfs2_is_dir(ip)) { + bsize = sdp->sd_jbsize; + arr = sdp->sd_jheightsize; } - find_metapath(ip, lblock, &mp); - end_of_metadata = ip->i_di.di_height - 1; - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - goto out_fail; - dibh = bh; - get_bh(dibh); + ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]); + if (ret) + goto out; - for (x = 0; x < end_of_metadata; x++) { - lookup_block(ip, bh, x, &mp, create, &new, &dblock); - brelse(bh); - if (!dblock) - goto out_ok; + height = ip->i_height; + size = (lblock + 1) * bsize; + while (size > arr[height]) + height++; + find_metapath(sdp, lblock, &mp, height); + ret = 1; + if (height > ip->i_height || gfs2_is_stuffed(ip)) + goto do_alloc; + ret = lookup_metapath(ip, &mp); + if (ret < 0) + goto out; + if (ret != ip->i_height) + goto do_alloc; + ptr = metapointer(ip->i_height - 1, &mp); + if (*ptr == 0) + goto do_alloc; + map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr)); + bh = mp.mp_bh[ip->i_height - 1]; + len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob); + bh_map->b_size = (len << inode->i_blkbits); + if (eob) + set_buffer_boundary(bh_map); + ret = 0; +out: + release_metapath(&mp); + bmap_unlock(ip, create); + return ret; - error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh); - if (error) - goto out_fail; +do_alloc: + /* All allocations are done here, firstly check create flag */ + if (!create) { + BUG_ON(gfs2_is_stuffed(ip)); + ret = 0; + goto out; } - boundary = lookup_block(ip, bh, end_of_metadata, &mp, create, &new, &dblock); - if (dblock) { - map_bh(bh_map, inode->i_sb, dblock); - if (boundary) - set_buffer_boundary(bh_map); - if (new) { - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - set_buffer_new(bh_map); - goto out_brelse; - } - while(--maxlen && !buffer_boundary(bh_map)) { - u64 eblock; - - mp.mp_list[end_of_metadata]++; - boundary = lookup_block(ip, bh, end_of_metadata, &mp, 0, &new, &eblock); - if (eblock != ++dblock) - break; - bh_map->b_size += (1 << inode->i_blkbits); - if (boundary) - set_buffer_boundary(bh_map); - } - } -out_brelse: - brelse(bh); -out_ok: - error = 0; -out_fail: - if (dibh) - brelse(dibh); - bmap_unlock(inode, create); - return error; + /* At this point ret is the tree depth of already allocated blocks */ + ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen); + goto out; } +/* + * Deprecated: do not use in new code + */ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen) { struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 }; @@ -558,7 +654,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi BUG_ON(!dblock); BUG_ON(!new); - bh.b_size = 1 << (inode->i_blkbits + 5); + bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5)); ret = gfs2_block_map(inode, lblock, &bh, create); *extlen = bh.b_size >> inode->i_blkbits; *dblock = bh.b_blocknr; @@ -621,7 +717,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, if (error) goto out; - if (height < ip->i_di.di_height - 1) + if (height < ip->i_height - 1) for (; top < bottom; top++, first = 0) { if (!*top) continue; @@ -679,7 +775,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, sm->sm_first = 0; } - metadata = (height != ip->i_di.di_height - 1); + metadata = (height != ip->i_height - 1); if (metadata) revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; @@ -713,7 +809,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, else goto out; /* Nothing to do */ - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd; @@ -760,10 +856,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, } *p = 0; - if (!ip->i_di.di_blocks) - gfs2_consist_inode(ip); - ip->i_di.di_blocks--; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, -1); } if (bstart) { if (metadata) @@ -804,19 +897,16 @@ static int do_grow(struct gfs2_inode *ip, u64 size) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al; struct buffer_head *dibh; - unsigned int h; int error; al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; - error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_lock_check(ip); if (error) goto out; - error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); - if (error) - goto out_gunlock_q; - al->al_requested = sdp->sd_max_height + RES_DATA; error = gfs2_inplace_reserve(ip); @@ -829,34 +919,25 @@ static int do_grow(struct gfs2_inode *ip, u64 size) if (error) goto out_ipres; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { if (gfs2_is_stuffed(ip)) { error = gfs2_unstuff_dinode(ip, NULL); if (error) - goto out_end_trans; - } - - h = calc_tree_height(ip, size); - if (ip->i_di.di_height < h) { - down_write(&ip->i_rw_mutex); - error = build_height(&ip->i_inode, h); - up_write(&ip->i_rw_mutex); - if (error) - goto out_end_trans; + goto out_brelse; } } ip->i_di.di_size = size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_end_trans; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); +out_brelse: + brelse(dibh); out_end_trans: gfs2_trans_end(sdp); out_ipres: @@ -986,7 +1067,8 @@ out: static int trunc_dealloc(struct gfs2_inode *ip, u64 size) { - unsigned int height = ip->i_di.di_height; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int height = ip->i_height; u64 lblock; struct metapath mp; int error; @@ -994,10 +1076,11 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) if (!size) lblock = 0; else - lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift; + lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; - find_metapath(ip, lblock, &mp); - gfs2_alloc_get(ip); + find_metapath(sdp, lblock, &mp, ip->i_height); + if (!gfs2_alloc_get(ip)) + return -ENOMEM; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) @@ -1037,10 +1120,8 @@ static int trunc_end(struct gfs2_inode *ip) goto out; if (!ip->i_di.di_size) { - ip->i_di.di_height = 0; - ip->i_di.di_goal_meta = - ip->i_di.di_goal_data = - ip->i_no_addr; + ip->i_height = 0; + ip->i_goal = ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); } ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; @@ -1197,10 +1278,9 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, unsigned int len, int *alloc_required) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - u64 lblock, lblock_stop, dblock; - u32 extlen; - int new = 0; - int error = 0; + struct buffer_head bh; + unsigned int shift; + u64 lblock, lblock_stop, size; *alloc_required = 0; @@ -1214,6 +1294,8 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, return 0; } + *alloc_required = 1; + shift = sdp->sd_sb.sb_bsize_shift; if (gfs2_is_dir(ip)) { unsigned int bsize = sdp->sd_jbsize; lblock = offset; @@ -1221,27 +1303,25 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, lblock_stop = offset + len + bsize - 1; do_div(lblock_stop, bsize); } else { - unsigned int shift = sdp->sd_sb.sb_bsize_shift; u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift; lblock = offset >> shift; lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; - if (lblock_stop > end_of_file) { - *alloc_required = 1; + if (lblock_stop > end_of_file) return 0; - } } - for (; lblock < lblock_stop; lblock += extlen) { - error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen); - if (error) - return error; - - if (!dblock) { - *alloc_required = 1; + size = (lblock_stop - lblock) << shift; + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(&ip->i_inode, lblock, &bh, 0); + if (!buffer_mapped(&bh)) return 0; - } - } + size -= bh.b_size; + lblock += (bh.b_size >> ip->i_inode.i_blkbits); + } while(size > 0); + *alloc_required = 0; return 0; } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c347095..eed040d 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -159,6 +159,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, unsigned int o; int copied = 0; int error = 0; + int new = 0; if (!size) return 0; @@ -183,7 +184,6 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, while (copied < size) { unsigned int amount; struct buffer_head *bh; - int new = 0; amount = size - copied; if (amount > sdp->sd_sb.sb_bsize - o) @@ -757,7 +757,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf; - unsigned hsize = 1 << ip->i_di.di_depth; + unsigned hsize = 1 << ip->i_depth; unsigned index; u64 ln; if (hsize * sizeof(u64) != ip->i_di.di_size) { @@ -765,7 +765,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, return ERR_PTR(-EIO); } - index = name->hash >> (32 - ip->i_di.di_depth); + index = name->hash >> (32 - ip->i_depth); error = get_first_leaf(ip, index, &bh); if (error) return ERR_PTR(error); @@ -803,14 +803,15 @@ got_dent: static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth) { struct gfs2_inode *ip = GFS2_I(inode); - u64 bn = gfs2_alloc_meta(ip); + unsigned int n = 1; + u64 bn = gfs2_alloc_block(ip, &n); struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); struct gfs2_leaf *leaf; struct gfs2_dirent *dent; struct qstr name = { .name = "", .len = 0, .hash = 0 }; if (!bh) return NULL; - + gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); gfs2_trans_add_bh(ip->i_gl, bh, 1); gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); leaf = (struct gfs2_leaf *)bh->b_data; @@ -905,12 +906,11 @@ static int dir_make_exhash(struct inode *inode) *lp = cpu_to_be64(bn); dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; - dip->i_di.di_blocks++; - gfs2_set_inode_blocks(&dip->i_inode); + gfs2_add_inode_blocks(&dip->i_inode, 1); dip->i_di.di_flags |= GFS2_DIF_EXHASH; for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; - dip->i_di.di_depth = y; + dip->i_depth = y; gfs2_dinode_out(dip, dibh->b_data); @@ -941,7 +941,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) int x, moved = 0; int error; - index = name->hash >> (32 - dip->i_di.di_depth); + index = name->hash >> (32 - dip->i_depth); error = get_leaf_nr(dip, index, &leaf_no); if (error) return error; @@ -952,7 +952,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) return error; oleaf = (struct gfs2_leaf *)obh->b_data; - if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) { + if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) { brelse(obh); return 1; /* can't split */ } @@ -967,10 +967,10 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) bn = nbh->b_blocknr; /* Compute the start and len of leaf pointers in the hash table. */ - len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth)); + len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); half_len = len >> 1; if (!half_len) { - printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index); + printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); gfs2_consist_inode(dip); error = -EIO; goto fail_brelse; @@ -997,7 +997,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) kfree(lp); /* Compute the divider */ - divider = (start + half_len) << (32 - dip->i_di.di_depth); + divider = (start + half_len) << (32 - dip->i_depth); /* Copy the entries */ dirent_first(dip, obh, &dent); @@ -1021,13 +1021,13 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) new->de_inum = dent->de_inum; /* No endian worries */ new->de_type = dent->de_type; /* No endian worries */ - nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1); + be16_add_cpu(&nleaf->lf_entries, 1); dirent_del(dip, obh, prev, dent); if (!oleaf->lf_entries) gfs2_consist_inode(dip); - oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1); + be16_add_cpu(&oleaf->lf_entries, -1); if (!prev) prev = dent; @@ -1044,8 +1044,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) error = gfs2_meta_inode_buffer(dip, &dibh); if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { gfs2_trans_add_bh(dip->i_gl, dibh, 1); - dip->i_di.di_blocks++; - gfs2_set_inode_blocks(&dip->i_inode); + gfs2_add_inode_blocks(&dip->i_inode, 1); gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); } @@ -1082,7 +1081,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) int x; int error = 0; - hsize = 1 << dip->i_di.di_depth; + hsize = 1 << dip->i_depth; if (hsize * sizeof(u64) != dip->i_di.di_size) { gfs2_consist_inode(dip); return -EIO; @@ -1090,7 +1089,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) /* Allocate both the "from" and "to" buffers in one big chunk */ - buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL); + buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { error = gfs2_dir_read_data(dip, (char *)buf, @@ -1125,7 +1124,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) error = gfs2_meta_inode_buffer(dip, &dibh); if (!gfs2_assert_withdraw(sdp, !error)) { - dip->i_di.di_depth++; + dip->i_depth++; gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); } @@ -1370,16 +1369,16 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, int error = 0; unsigned depth = 0; - hsize = 1 << dip->i_di.di_depth; + hsize = 1 << dip->i_depth; if (hsize * sizeof(u64) != dip->i_di.di_size) { gfs2_consist_inode(dip); return -EIO; } hash = gfs2_dir_offset2hash(*offset); - index = hash >> (32 - dip->i_di.di_depth); + index = hash >> (32 - dip->i_depth); - lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL); + lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); if (!lp) return -ENOMEM; @@ -1405,7 +1404,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, if (error) break; - len = 1 << (dip->i_di.di_depth - depth); + len = 1 << (dip->i_depth - depth); index = (index & ~(len - 1)) + len; } @@ -1444,7 +1443,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, error = -ENOMEM; /* 96 is max number of dirents which can be stuffed into an inode */ - darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_KERNEL); + darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS); if (darr) { g.pdent = darr; g.offset = 0; @@ -1549,7 +1548,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) u32 index; u64 bn; - index = name->hash >> (32 - ip->i_di.di_depth); + index = name->hash >> (32 - ip->i_depth); error = get_first_leaf(ip, index, &obh); if (error) return error; @@ -1579,8 +1578,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) if (error) return error; gfs2_trans_add_bh(ip->i_gl, bh, 1); - ip->i_di.di_blocks++; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, 1); gfs2_dinode_out(ip, bh->b_data); brelse(bh); return 0; @@ -1616,7 +1614,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, dent->de_type = cpu_to_be16(type); if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; - leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1); + be16_add_cpu(&leaf->lf_entries, 1); } brelse(bh); error = gfs2_meta_inode_buffer(ip, &bh); @@ -1641,7 +1639,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, continue; if (error < 0) break; - if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) { + if (ip->i_depth < GFS2_DIR_MAX_DEPTH) { error = dir_double_exhash(ip); if (error) break; @@ -1785,13 +1783,13 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) u64 leaf_no; int error = 0; - hsize = 1 << dip->i_di.di_depth; + hsize = 1 << dip->i_depth; if (hsize * sizeof(u64) != dip->i_di.di_size) { gfs2_consist_inode(dip); return -EIO; } - lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL); + lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); if (!lp) return -ENOMEM; @@ -1817,7 +1815,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) if (error) goto out; leaf = (struct gfs2_leaf *)bh->b_data; - len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth)); + len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth)); brelse(bh); error = lc(dip, index, len, leaf_no, data); @@ -1866,15 +1864,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - ht = kzalloc(size, GFP_KERNEL); + ht = kzalloc(size, GFP_NOFS); if (!ht) return -ENOMEM; - gfs2_alloc_get(dip); + if (!gfs2_alloc_get(dip)) { + error = -ENOMEM; + goto out; + } error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) - goto out; + goto out_put; error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); if (error) @@ -1894,7 +1895,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, l_blocks++; } - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd; @@ -1921,11 +1922,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, brelse(bh); gfs2_free_meta(dip, blk, 1); - - if (!dip->i_di.di_blocks) - gfs2_consist_inode(dip); - dip->i_di.di_blocks--; - gfs2_set_inode_blocks(&dip->i_inode); + gfs2_add_inode_blocks(&dip->i_inode, -1); } error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size); @@ -1952,8 +1949,9 @@ out_rlist: gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); out_qs: gfs2_quota_unhold(dip); -out: +out_put: gfs2_alloc_put(dip); +out: kfree(ht); return error; } diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index bee9970..e3f76f4 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -277,10 +277,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, } *dataptrs = 0; - if (!ip->i_di.di_blocks) - gfs2_consist_inode(ip); - ip->i_di.di_blocks--; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, -1); } if (bstart) gfs2_free_meta(ip, bstart, blen); @@ -321,6 +318,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, int error; al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) @@ -449,7 +448,7 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, unsigned int x; int error = 0; - bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL); + bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); if (!bh) return -ENOMEM; @@ -582,10 +581,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_header *ea; + unsigned int n = 1; u64 block; - block = gfs2_alloc_meta(ip); - + block = gfs2_alloc_block(ip, &n); + gfs2_trans_add_unrevoke(sdp, block, 1); *bhp = gfs2_meta_new(ip->i_gl, block); gfs2_trans_add_bh(ip->i_gl, *bhp, 1); gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); @@ -597,8 +597,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) ea->ea_flags = GFS2_EAFLAG_LAST; ea->ea_num_ptrs = 0; - ip->i_di.di_blocks++; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, 1); return 0; } @@ -642,15 +641,15 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, struct buffer_head *bh; u64 block; int mh_size = sizeof(struct gfs2_meta_header); + unsigned int n = 1; - block = gfs2_alloc_meta(ip); - + block = gfs2_alloc_block(ip, &n); + gfs2_trans_add_unrevoke(sdp, block, 1); bh = gfs2_meta_new(ip->i_gl, block); gfs2_trans_add_bh(ip->i_gl, bh, 1); gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); - ip->i_di.di_blocks++; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, 1); copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize : data_len; @@ -684,15 +683,13 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, int error; al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; - error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_lock_check(ip); if (error) goto out; - error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); - if (error) - goto out_gunlock_q; - al->al_requested = blks; error = gfs2_inplace_reserve(ip); @@ -966,9 +963,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, gfs2_trans_add_bh(ip->i_gl, indbh, 1); } else { u64 blk; - - blk = gfs2_alloc_meta(ip); - + unsigned int n = 1; + blk = gfs2_alloc_block(ip, &n); + gfs2_trans_add_unrevoke(sdp, blk, 1); indbh = gfs2_meta_new(ip->i_gl, blk); gfs2_trans_add_bh(ip->i_gl, indbh, 1); gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); @@ -978,8 +975,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, *eablk = cpu_to_be64(ip->i_di.di_eattr); ip->i_di.di_eattr = blk; ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; - ip->i_di.di_blocks++; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, 1); eablk++; } @@ -1210,7 +1206,7 @@ static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, unsigned int x; int error; - bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL); + bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); if (!bh) return -ENOMEM; @@ -1347,7 +1343,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) else goto out; - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd; @@ -1387,10 +1383,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) } *eablk = 0; - if (!ip->i_di.di_blocks) - gfs2_consist_inode(ip); - ip->i_di.di_blocks--; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, -1); } if (bstart) gfs2_free_meta(ip, bstart, blen); @@ -1442,10 +1435,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) gfs2_free_meta(ip, ip->i_di.di_eattr, 1); ip->i_di.di_eattr = 0; - if (!ip->i_di.di_blocks) - gfs2_consist_inode(ip); - ip->i_di.di_blocks--; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, -1); error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { @@ -1474,6 +1464,8 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) int error; al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 7175a4d..d636b3e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -35,7 +35,6 @@ #include "glock.h" #include "glops.h" #include "inode.h" -#include "lm.h" #include "lops.h" #include "meta_io.h" #include "quota.h" @@ -183,7 +182,8 @@ static void glock_free(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_sbd; struct inode *aspace = gl->gl_aspace; - gfs2_lm_put_lock(sdp, gl->gl_lock); + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock); if (aspace) gfs2_aspace_put(aspace); @@ -197,7 +197,7 @@ static void glock_free(struct gfs2_glock *gl) * */ -void gfs2_glock_hold(struct gfs2_glock *gl) +static void gfs2_glock_hold(struct gfs2_glock *gl) { atomic_inc(&gl->gl_ref); } @@ -293,6 +293,16 @@ static void glock_work_func(struct work_struct *work) gfs2_glock_put(gl); } +static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, + void **lockp) +{ + int error = -EIO; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = sdp->sd_lockstruct.ls_ops->lm_get_lock( + sdp->sd_lockstruct.ls_lockspace, name, lockp); + return error; +} + /** * gfs2_glock_get() - Get a glock, or create one if one doesn't exist * @sdp: The GFS2 superblock @@ -338,8 +348,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_ip = 0; gl->gl_ops = glops; gl->gl_req_gh = NULL; - gl->gl_req_bh = NULL; - gl->gl_vn = 0; gl->gl_stamp = jiffies; gl->gl_tchange = jiffies; gl->gl_object = NULL; @@ -595,11 +603,12 @@ static void run_queue(struct gfs2_glock *gl) blocked = rq_mutex(gh); } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { blocked = rq_demote(gl); - if (gl->gl_waiters2 && !blocked) { + if (test_bit(GLF_WAITERS2, &gl->gl_flags) && + !blocked) { set_bit(GLF_DEMOTE, &gl->gl_flags); gl->gl_demote_state = LM_ST_UNLOCKED; } - gl->gl_waiters2 = 0; + clear_bit(GLF_WAITERS2, &gl->gl_flags); } else if (!list_empty(&gl->gl_waiters3)) { gh = list_entry(gl->gl_waiters3.next, struct gfs2_holder, gh_list); @@ -710,7 +719,7 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, } else if (gl->gl_demote_state != LM_ST_UNLOCKED && gl->gl_demote_state != state) { if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) - gl->gl_waiters2 = 1; + set_bit(GLF_WAITERS2, &gl->gl_flags); else gl->gl_demote_state = LM_ST_UNLOCKED; } @@ -743,6 +752,43 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) } /** + * drop_bh - Called after a lock module unlock completes + * @gl: the glock + * @ret: the return status + * + * Doesn't wake up the process waiting on the struct gfs2_holder (if any) + * Doesn't drop the reference on the glock the top half took out + * + */ + +static void drop_bh(struct gfs2_glock *gl, unsigned int ret) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_holder *gh = gl->gl_req_gh; + + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); + gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); + gfs2_assert_warn(sdp, !ret); + + state_change(gl, LM_ST_UNLOCKED); + + if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) { + spin_lock(&gl->gl_spin); + gh->gh_error = 0; + spin_unlock(&gl->gl_spin); + gfs2_glock_xmote_th(gl, gl->gl_req_gh); + gfs2_glock_put(gl); + return; + } + + spin_lock(&gl->gl_spin); + gfs2_demote_wake(gl); + clear_bit(GLF_LOCK, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + gfs2_glock_put(gl); +} + +/** * xmote_bh - Called after the lock module is done acquiring a lock * @gl: The glock in question * @ret: the int returned from the lock module @@ -754,25 +800,19 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) struct gfs2_sbd *sdp = gl->gl_sbd; const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh = gl->gl_req_gh; - int prev_state = gl->gl_state; int op_done = 1; + if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) { + drop_bh(gl, ret); + return; + } + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); state_change(gl, ret & LM_OUT_ST_MASK); - if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) { - if (glops->go_inval) - glops->go_inval(gl, DIO_METADATA); - } else if (gl->gl_state == LM_ST_DEFERRED) { - /* We might not want to do this here. - Look at moving to the inode glops. */ - if (glops->go_inval) - glops->go_inval(gl, 0); - } - /* Deal with each possible exit condition */ if (!gh) { @@ -782,7 +822,6 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) } else { spin_lock(&gl->gl_spin); if (gl->gl_state != gl->gl_demote_state) { - gl->gl_req_bh = NULL; spin_unlock(&gl->gl_spin); gfs2_glock_drop_th(gl); gfs2_glock_put(gl); @@ -793,6 +832,14 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) } } else { spin_lock(&gl->gl_spin); + if (ret & LM_OUT_CONV_DEADLK) { + gh->gh_error = 0; + set_bit(GLF_CONV_DEADLK, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + gfs2_glock_drop_th(gl); + gfs2_glock_put(gl); + return; + } list_del_init(&gh->gh_list); gh->gh_error = -EIO; if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) @@ -824,7 +871,6 @@ out: if (op_done) { spin_lock(&gl->gl_spin); gl->gl_req_gh = NULL; - gl->gl_req_bh = NULL; clear_bit(GLF_LOCK, &gl->gl_flags); spin_unlock(&gl->gl_spin); } @@ -835,6 +881,17 @@ out: gfs2_holder_wake(gh); } +static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, + unsigned int cur_state, unsigned int req_state, + unsigned int flags) +{ + int ret = 0; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, + req_state, flags); + return ret; +} + /** * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock * @gl: The glock in question @@ -856,6 +913,8 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh) if (glops->go_xmote_th) glops->go_xmote_th(gl); + if (state == LM_ST_DEFERRED && glops->go_inval) + glops->go_inval(gl, DIO_METADATA); gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); @@ -863,7 +922,6 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh) gfs2_assert_warn(sdp, state != gl->gl_state); gfs2_glock_hold(gl); - gl->gl_req_bh = xmote_bh; lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags); @@ -876,49 +934,13 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh) xmote_bh(gl, lck_ret); } -/** - * drop_bh - Called after a lock module unlock completes - * @gl: the glock - * @ret: the return status - * - * Doesn't wake up the process waiting on the struct gfs2_holder (if any) - * Doesn't drop the reference on the glock the top half took out - * - */ - -static void drop_bh(struct gfs2_glock *gl, unsigned int ret) +static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, + unsigned int cur_state) { - struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; - struct gfs2_holder *gh = gl->gl_req_gh; - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, !ret); - - state_change(gl, LM_ST_UNLOCKED); - - if (glops->go_inval) - glops->go_inval(gl, DIO_METADATA); - - if (gh) { - spin_lock(&gl->gl_spin); - list_del_init(&gh->gh_list); - gh->gh_error = 0; - spin_unlock(&gl->gl_spin); - } - - spin_lock(&gl->gl_spin); - gfs2_demote_wake(gl); - gl->gl_req_gh = NULL; - gl->gl_req_bh = NULL; - clear_bit(GLF_LOCK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - - gfs2_glock_put(gl); - - if (gh) - gfs2_holder_wake(gh); + int ret = 0; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state); + return ret; } /** @@ -935,13 +957,14 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl) if (glops->go_xmote_th) glops->go_xmote_th(gl); + if (glops->go_inval) + glops->go_inval(gl, DIO_METADATA); gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); gfs2_glock_hold(gl); - gl->gl_req_bh = drop_bh; ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state); @@ -964,16 +987,17 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl) static void do_cancels(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; spin_lock(&gl->gl_spin); while (gl->gl_req_gh != gh && !test_bit(HIF_HOLDER, &gh->gh_iflags) && !list_empty(&gh->gh_list)) { - if (gl->gl_req_bh && !(gl->gl_req_gh && - (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) { + if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) { spin_unlock(&gl->gl_spin); - gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock); + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); msleep(100); spin_lock(&gl->gl_spin); } else { @@ -1041,7 +1065,6 @@ static int glock_wait_internal(struct gfs2_holder *gh) spin_lock(&gl->gl_spin); gl->gl_req_gh = NULL; - gl->gl_req_bh = NULL; clear_bit(GLF_LOCK, &gl->gl_flags); run_queue(gl); spin_unlock(&gl->gl_spin); @@ -1428,6 +1451,14 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) gfs2_glock_dq_uninit(&ghs[x]); } +static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) +{ + int error = -EIO; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); + return error; +} + /** * gfs2_lvb_hold - attach a LVB from a glock * @gl: The glock in question @@ -1463,12 +1494,15 @@ int gfs2_lvb_hold(struct gfs2_glock *gl) void gfs2_lvb_unhold(struct gfs2_glock *gl) { + struct gfs2_sbd *sdp = gl->gl_sbd; + gfs2_glock_hold(gl); gfs2_glmutex_lock(gl); gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); if (atomic_dec_and_test(&gl->gl_lvb_count)) { - gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb); + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb); gl->gl_lvb = NULL; gfs2_glock_put(gl); } @@ -1534,8 +1568,7 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) gl = gfs2_glock_find(sdp, &async->lc_name); if (gfs2_assert_warn(sdp, gl)) return; - if (!gfs2_assert_warn(sdp, gl->gl_req_bh)) - gl->gl_req_bh(gl, async->lc_ret); + xmote_bh(gl, async->lc_ret); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); up_read(&gfs2_umount_flush_sem); @@ -1594,10 +1627,10 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) gfs2_glock_hold(gl); list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); atomic_inc(&sdp->sd_reclaim_count); - } - spin_unlock(&sdp->sd_reclaim_lock); - - wake_up(&sdp->sd_reclaim_wq); + spin_unlock(&sdp->sd_reclaim_lock); + wake_up(&sdp->sd_reclaim_wq); + } else + spin_unlock(&sdp->sd_reclaim_lock); } /** @@ -1897,7 +1930,6 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl) print_dbg(gi, " gl_owner = -1\n"); print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip); print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no"); - print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no"); print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no"); print_dbg(gi, " reclaim = %s\n", diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 2f9c6d1..cdad3e6 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -32,24 +32,23 @@ #define GLR_TRYFAILED 13 #define GLR_CANCELED 14 -static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) +static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { struct gfs2_holder *gh; - int locked = 0; struct pid *pid; /* Look in glock's list of holders for one with current task as owner */ spin_lock(&gl->gl_spin); pid = task_pid(current); list_for_each_entry(gh, &gl->gl_holders, gh_list) { - if (gh->gh_owner_pid == pid) { - locked = 1; - break; - } + if (gh->gh_owner_pid == pid) + goto out; } + gh = NULL; +out: spin_unlock(&gl->gl_spin); - return locked; + return gh; } static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) @@ -79,7 +78,6 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); -void gfs2_glock_hold(struct gfs2_glock *gl); int gfs2_glock_put(struct gfs2_glock *gl); void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, struct gfs2_holder *gh); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c663b7a..d31bada 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -126,7 +126,13 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags) return; gfs2_meta_inval(gl); - gl->gl_vn++; + if (gl->gl_object == GFS2_I(gl->gl_sbd->sd_rindex)) + gl->gl_sbd->sd_rindex_uptodate = 0; + else if (gl->gl_ops == &gfs2_rgrp_glops && gl->gl_object) { + struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; + + rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + } } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 525dcae..9c2c0b9 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -44,7 +44,6 @@ struct gfs2_log_header_host { struct gfs2_log_operations { void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le); - void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); void (*lo_before_commit) (struct gfs2_sbd *sdp); void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); void (*lo_before_scan) (struct gfs2_jdesc *jd, @@ -70,7 +69,6 @@ struct gfs2_bitmap { }; struct gfs2_rgrp_host { - u32 rg_flags; u32 rg_free; u32 rg_dinodes; u64 rg_igeneration; @@ -87,17 +85,17 @@ struct gfs2_rgrpd { u32 rd_data; /* num of data blocks in rgrp */ u32 rd_bitbytes; /* number of bytes in data bitmaps */ struct gfs2_rgrp_host rd_rg; - u64 rd_rg_vn; struct gfs2_bitmap *rd_bits; unsigned int rd_bh_count; struct mutex rd_mutex; u32 rd_free_clone; struct gfs2_log_element rd_le; - u32 rd_last_alloc_data; - u32 rd_last_alloc_meta; + u32 rd_last_alloc; struct gfs2_sbd *rd_sbd; - unsigned long rd_flags; -#define GFS2_RDF_CHECK 0x0001 /* Need to check for unlinked inodes */ + unsigned char rd_flags; +#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ +#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ +#define GFS2_RDF_UPTODATE 0x04 /* rg is up to date */ }; enum gfs2_state_bits { @@ -168,6 +166,8 @@ enum { GLF_DIRTY = 5, GLF_DEMOTE_IN_PROGRESS = 6, GLF_LFLUSH = 7, + GLF_WAITERS2 = 8, + GLF_CONV_DEADLK = 9, }; struct gfs2_glock { @@ -187,18 +187,15 @@ struct gfs2_glock { struct list_head gl_holders; struct list_head gl_waiters1; /* HIF_MUTEX */ struct list_head gl_waiters3; /* HIF_PROMOTE */ - int gl_waiters2; /* GIF_DEMOTE */ const struct gfs2_glock_operations *gl_ops; struct gfs2_holder *gl_req_gh; - gfs2_glop_bh_t gl_req_bh; void *gl_lock; char *gl_lvb; atomic_t gl_lvb_count; - u64 gl_vn; unsigned long gl_stamp; unsigned long gl_tchange; void *gl_object; @@ -213,6 +210,8 @@ struct gfs2_glock { struct delayed_work gl_work; }; +#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ + struct gfs2_alloc { /* Quota stuff */ @@ -241,14 +240,9 @@ enum { struct gfs2_dinode_host { u64 di_size; /* number of bytes in file */ - u64 di_blocks; /* number of blocks in file */ - u64 di_goal_meta; /* rgrp to alloc from next */ - u64 di_goal_data; /* data block goal */ u64 di_generation; /* generation number for NFS */ u32 di_flags; /* GFS2_DIF_... */ - u16 di_height; /* height of metadata */ /* These only apply to directories */ - u16 di_depth; /* Number of bits in the table */ u32 di_entries; /* The number of entries in the directory */ u64 di_eattr; /* extended attribute block number */ }; @@ -265,9 +259,10 @@ struct gfs2_inode { struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ struct gfs2_alloc *i_alloc; - u64 i_last_rg_alloc; - + u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; + u8 i_height; + u8 i_depth; }; /* @@ -490,9 +485,9 @@ struct gfs2_sbd { u32 sd_qc_per_block; u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ u32 sd_max_height; /* Max height of a file's metadata tree */ - u64 sd_heightsize[GFS2_MAX_META_HEIGHT]; + u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; u32 sd_max_jheight; /* Max height of journaled file's meta tree */ - u64 sd_jheightsize[GFS2_MAX_META_HEIGHT]; + u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1]; struct gfs2_args sd_args; /* Mount arguments */ struct gfs2_tune sd_tune; /* Filesystem tuning structure */ @@ -533,7 +528,7 @@ struct gfs2_sbd { /* Resource group stuff */ - u64 sd_rindex_vn; + int sd_rindex_uptodate; spinlock_t sd_rindex_spin; struct mutex sd_rindex_mutex; struct list_head sd_rindex_list; @@ -637,9 +632,6 @@ struct gfs2_sbd { /* Counters */ - atomic_t sd_glock_count; - atomic_t sd_glock_held_count; - atomic_t sd_inode_count; atomic_t sd_reclaimed; char sd_fsname[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 37725ad..3a9ef52 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -149,7 +149,8 @@ void gfs2_set_iop(struct inode *inode) } else if (S_ISLNK(mode)) { inode->i_op = &gfs2_symlink_iops; } else { - inode->i_op = &gfs2_dev_iops; + inode->i_op = &gfs2_file_iops; + init_special_inode(inode, inode->i_mode, inode->i_rdev); } unlock_new_inode(inode); @@ -248,12 +249,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { struct gfs2_dinode_host *di = &ip->i_di; const struct gfs2_dinode *str = buf; + u16 height, depth; - if (ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - return -EIO; - } + if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) + goto corrupt; ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); ip->i_inode.i_mode = be32_to_cpu(str->di_mode); ip->i_inode.i_rdev = 0; @@ -275,8 +274,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); di->di_size = be64_to_cpu(str->di_size); i_size_write(&ip->i_inode, di->di_size); - di->di_blocks = be64_to_cpu(str->di_blocks); - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); @@ -284,15 +282,20 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); - di->di_goal_meta = be64_to_cpu(str->di_goal_meta); - di->di_goal_data = be64_to_cpu(str->di_goal_data); + ip->i_goal = be64_to_cpu(str->di_goal_meta); di->di_generation = be64_to_cpu(str->di_generation); di->di_flags = be32_to_cpu(str->di_flags); gfs2_set_inode_flags(&ip->i_inode); - di->di_height = be16_to_cpu(str->di_height); - - di->di_depth = be16_to_cpu(str->di_depth); + height = be16_to_cpu(str->di_height); + if (unlikely(height > GFS2_MAX_META_HEIGHT)) + goto corrupt; + ip->i_height = (u8)height; + + depth = be16_to_cpu(str->di_depth); + if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) + goto corrupt; + ip->i_depth = (u8)depth; di->di_entries = be32_to_cpu(str->di_entries); di->di_eattr = be64_to_cpu(str->di_eattr); @@ -300,6 +303,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) gfs2_set_aops(&ip->i_inode); return 0; +corrupt: + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(ip); + return -EIO; } /** @@ -337,13 +344,15 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip) struct gfs2_rgrpd *rgd; int error; - if (ip->i_di.di_blocks != 1) { + if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { if (gfs2_consist_inode(ip)) gfs2_dinode_print(ip); return -EIO; } al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) @@ -487,7 +496,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, return dir; } - if (gfs2_glock_is_locked_by_me(dip->i_gl) == 0) { + if (gfs2_glock_is_locked_by_me(dip->i_gl) == NULL) { error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) return ERR_PTR(error); @@ -818,7 +827,8 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, int error; munge_mode_uid_gid(dip, &mode, &uid, &gid); - gfs2_alloc_get(dip); + if (!gfs2_alloc_get(dip)) + return -ENOMEM; error = gfs2_quota_lock(dip, uid, gid); if (error) @@ -853,6 +863,8 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, int error; al = gfs2_alloc_get(dip); + if (!al) + return -ENOMEM; error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) @@ -1219,7 +1231,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) x = ip->i_di.di_size + 1; if (x > *len) { - *buf = kmalloc(x, GFP_KERNEL); + *buf = kmalloc(x, GFP_NOFS); if (!*buf) { error = -ENOMEM; goto out_brelse; @@ -1391,21 +1403,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_gid = cpu_to_be32(ip->i_inode.i_gid); str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); str->di_size = cpu_to_be64(di->di_size); - str->di_blocks = cpu_to_be64(di->di_blocks); + str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); - str->di_goal_meta = cpu_to_be64(di->di_goal_meta); - str->di_goal_data = cpu_to_be64(di->di_goal_data); + str->di_goal_meta = cpu_to_be64(ip->i_goal); + str->di_goal_data = cpu_to_be64(ip->i_goal); str->di_generation = cpu_to_be64(di->di_generation); str->di_flags = cpu_to_be32(di->di_flags); - str->di_height = cpu_to_be16(di->di_height); + str->di_height = cpu_to_be16(ip->i_height); str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? GFS2_FORMAT_DE : 0); - str->di_depth = cpu_to_be16(di->di_depth); + str->di_depth = cpu_to_be16(ip->i_depth); str->di_entries = cpu_to_be32(di->di_entries); str->di_eattr = cpu_to_be64(di->di_eattr); @@ -1423,15 +1435,13 @@ void gfs2_dinode_print(const struct gfs2_inode *ip) printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)ip->i_no_addr); printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); - printk(KERN_INFO " di_blocks = %llu\n", - (unsigned long long)di->di_blocks); - printk(KERN_INFO " di_goal_meta = %llu\n", - (unsigned long long)di->di_goal_meta); - printk(KERN_INFO " di_goal_data = %llu\n", - (unsigned long long)di->di_goal_data); + printk(KERN_INFO " blocks = %llu\n", + (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); + printk(KERN_INFO " i_goal = %llu\n", + (unsigned long long)ip->i_goal); printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); - printk(KERN_INFO " di_height = %u\n", di->di_height); - printk(KERN_INFO " di_depth = %u\n", di->di_depth); + printk(KERN_INFO " i_height = %u\n", ip->i_height); + printk(KERN_INFO " i_depth = %u\n", ip->i_depth); printk(KERN_INFO " di_entries = %u\n", di->di_entries); printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index d446506..580da45 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -10,9 +10,11 @@ #ifndef __INODE_DOT_H__ #define __INODE_DOT_H__ +#include "util.h" + static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) { - return !ip->i_di.di_height; + return !ip->i_height; } static inline int gfs2_is_jdata(const struct gfs2_inode *ip) @@ -37,13 +39,25 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip) return S_ISDIR(ip->i_inode.i_mode); } -static inline void gfs2_set_inode_blocks(struct inode *inode) +static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks) +{ + inode->i_blocks = blocks << + (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); +} + +static inline u64 gfs2_get_inode_blocks(const struct inode *inode) { - struct gfs2_inode *ip = GFS2_I(inode); - inode->i_blocks = ip->i_di.di_blocks << + return inode->i_blocks >> (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); } +static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) +{ + gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change)); + change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK); + inode->i_blocks += change; +} + static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr, u64 no_formal_ino) { diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c deleted file mode 100644 index cfcc39b..0000000 --- a/fs/gfs2/lm.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/delay.h> -#include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> - -#include "gfs2.h" -#include "incore.h" -#include "glock.h" -#include "lm.h" -#include "super.h" -#include "util.h" - -/** - * gfs2_lm_mount - mount a locking protocol - * @sdp: the filesystem - * @args: mount arguements - * @silent: if 1, don't complain if the FS isn't a GFS2 fs - * - * Returns: errno - */ - -int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) -{ - char *proto = sdp->sd_proto_name; - char *table = sdp->sd_table_name; - int flags = 0; - int error; - - if (sdp->sd_args.ar_spectator) - flags |= LM_MFLAG_SPECTATOR; - - fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); - - error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata, - gfs2_glock_cb, sdp, - GFS2_MIN_LVB_SIZE, flags, - &sdp->sd_lockstruct, &sdp->sd_kobj); - if (error) { - fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n", - proto, table, sdp->sd_args.ar_hostdata); - goto out; - } - - if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) || - gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || - gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= - GFS2_MIN_LVB_SIZE)) { - gfs2_unmount_lockproto(&sdp->sd_lockstruct); - goto out; - } - - if (sdp->sd_args.ar_spectator) - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); - else - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, - sdp->sd_lockstruct.ls_jid); - - fs_info(sdp, "Joined cluster. Now mounting FS...\n"); - - if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) && - !sdp->sd_args.ar_ignore_local_fs) { - sdp->sd_args.ar_localflocks = 1; - sdp->sd_args.ar_localcaching = 1; - } - -out: - return error; -} - -void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) -{ - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_others_may_mount( - sdp->sd_lockstruct.ls_lockspace); -} - -void gfs2_lm_unmount(struct gfs2_sbd *sdp) -{ - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - gfs2_unmount_lockproto(&sdp->sd_lockstruct); -} - -int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) -{ - va_list args; - - if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return 0; - - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - - fs_err(sdp, "about to withdraw this file system\n"); - BUG_ON(sdp->sd_args.ar_debug); - - fs_err(sdp, "telling LM to withdraw\n"); - gfs2_withdraw_lockproto(&sdp->sd_lockstruct); - fs_err(sdp, "withdrawn\n"); - dump_stack(); - - return -1; -} - -int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, - void **lockp) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_get_lock( - sdp->sd_lockstruct.ls_lockspace, name, lockp); - return error; -} - -void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock) -{ - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_put_lock(lock); -} - -unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state, unsigned int req_state, - unsigned int flags) -{ - int ret = 0; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, - req_state, flags); - return ret; -} - -unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state) -{ - int ret = 0; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state); - return ret; -} - -void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock) -{ - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_cancel(lock); -} - -int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); - return error; -} - -void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb) -{ - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb); -} - -int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_plock_get( - sdp->sd_lockstruct.ls_lockspace, name, file, fl); - return error; -} - -int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_plock( - sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl); - return error; -} - -int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_punlock( - sdp->sd_lockstruct.ls_lockspace, name, file, fl); - return error; -} - -void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, - unsigned int message) -{ - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_recovery_done( - sdp->sd_lockstruct.ls_lockspace, jid, message); -} - diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h deleted file mode 100644 index 21cdc30..0000000 --- a/fs/gfs2/lm.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __LM_DOT_H__ -#define __LM_DOT_H__ - -struct gfs2_sbd; - -#define GFS2_MIN_LVB_SIZE 32 - -int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent); -void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp); -void gfs2_lm_unmount(struct gfs2_sbd *sdp); -int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) - __attribute__ ((format(printf, 2, 3))); -int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, - void **lockp); -void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock); -unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state, unsigned int req_state, - unsigned int flags); -unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state); -void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock); -int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp); -void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb); -int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, struct file_lock *fl); -int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl); -int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, struct file_lock *fl); -void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, - unsigned int message); - -#endif /* __LM_DOT_H__ */ diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile index 89b93b6b..2609bb6 100644 --- a/fs/gfs2/locking/dlm/Makefile +++ b/fs/gfs2/locking/dlm/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o -lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o +lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c index 542a797..cf7ea8a 100644 --- a/fs/gfs2/locking/dlm/lock.c +++ b/fs/gfs2/locking/dlm/lock.c @@ -137,7 +137,8 @@ static inline unsigned int make_flags(struct gdlm_lock *lp, /* Conversion deadlock avoidance by DLM */ - if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) && + if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) && + !test_bit(LFL_FORCE_PROMOTE, &lp->flags) && !(lkf & DLM_LKF_NOQUEUE) && cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req) lkf |= DLM_LKF_CONVDEADLK; @@ -164,7 +165,7 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, { struct gdlm_lock *lp; - lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL); + lp = kzalloc(sizeof(struct gdlm_lock), GFP_NOFS); if (!lp) return -ENOMEM; @@ -382,7 +383,7 @@ static int gdlm_add_lvb(struct gdlm_lock *lp) { char *lvb; - lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL); + lvb = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); if (!lvb) return -ENOMEM; diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index 9e8265d..a243cf6 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -25,6 +25,7 @@ #include <net/sock.h> #include <linux/dlm.h> +#include <linux/dlm_plock.h> #include <linux/lm_interface.h> /* @@ -173,15 +174,9 @@ void gdlm_cancel(void *); int gdlm_hold_lvb(void *, char **); void gdlm_unhold_lvb(void *, char *); -/* plock.c */ +/* mount.c */ + +extern const struct lm_lockops gdlm_ops; -int gdlm_plock_init(void); -void gdlm_plock_exit(void); -int gdlm_plock(void *, struct lm_lockname *, struct file *, int, - struct file_lock *); -int gdlm_plock_get(void *, struct lm_lockname *, struct file *, - struct file_lock *); -int gdlm_punlock(void *, struct lm_lockname *, struct file *, - struct file_lock *); #endif diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c index a0e7eda..b9a03a7 100644 --- a/fs/gfs2/locking/dlm/main.c +++ b/fs/gfs2/locking/dlm/main.c @@ -11,8 +11,6 @@ #include "lock_dlm.h" -extern struct lm_lockops gdlm_ops; - static int __init init_lock_dlm(void) { int error; @@ -30,13 +28,6 @@ static int __init init_lock_dlm(void) return error; } - error = gdlm_plock_init(); - if (error) { - gdlm_sysfs_exit(); - gfs2_unregister_lockproto(&gdlm_ops); - return error; - } - printk(KERN_INFO "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__); return 0; @@ -44,7 +35,6 @@ static int __init init_lock_dlm(void) static void __exit exit_lock_dlm(void) { - gdlm_plock_exit(); gdlm_sysfs_exit(); gfs2_unregister_lockproto(&gdlm_ops); } diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index f2efff4..470bdf6 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -236,6 +236,27 @@ static void gdlm_withdraw(void *lockspace) gdlm_kobject_release(ls); } +static int gdlm_plock(void *lockspace, struct lm_lockname *name, + struct file *file, int cmd, struct file_lock *fl) +{ + struct gdlm_ls *ls = lockspace; + return dlm_posix_lock(ls->dlm_lockspace, name->ln_number, file, cmd, fl); +} + +static int gdlm_punlock(void *lockspace, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + struct gdlm_ls *ls = lockspace; + return dlm_posix_unlock(ls->dlm_lockspace, name->ln_number, file, fl); +} + +static int gdlm_plock_get(void *lockspace, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + struct gdlm_ls *ls = lockspace; + return dlm_posix_get(ls->dlm_lockspace, name->ln_number, file, fl); +} + const struct lm_lockops gdlm_ops = { .lm_proto_name = "lock_dlm", .lm_mount = gdlm_mount, diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index a87b098..8479da4 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -12,8 +12,6 @@ #include "lock_dlm.h" -extern struct lm_lockops gdlm_ops; - static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf) { return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name); diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c index 521694f..e53db6f 100644 --- a/fs/gfs2/locking/dlm/thread.c +++ b/fs/gfs2/locking/dlm/thread.c @@ -135,7 +135,15 @@ static void process_complete(struct gdlm_lock *lp) lp->lksb.sb_status, lp->lockname.ln_type, (unsigned long long)lp->lockname.ln_number, lp->flags); - return; + if (lp->lksb.sb_status == -EDEADLOCK && + lp->ls->fsflags & LM_MFLAG_CONV_NODROP) { + lp->req = lp->cur; + acb.lc_ret |= LM_OUT_CONV_DEADLK; + if (lp->cur == DLM_LOCK_IV) + lp->lksb.sb_lkid = 0; + goto out; + } else + return; } /* diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c index d3b8ce6..284a5ec 100644 --- a/fs/gfs2/locking/nolock/main.c +++ b/fs/gfs2/locking/nolock/main.c @@ -140,7 +140,7 @@ static int nolock_hold_lvb(void *lock, char **lvbp) struct nolock_lockspace *nl = lock; int error = 0; - *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL); + *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS); if (!*lvbp) error = -ENOMEM; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 161ab6f..548264b 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -769,8 +769,8 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); reserved = calc_reserved(sdp); + gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; - gfs2_assert_withdraw(sdp, unused >= 0); atomic_add(unused, &sdp->sd_log_blks_free); gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); @@ -779,6 +779,21 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) gfs2_log_unlock(sdp); } +static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head = &tr->tr_list_buf; + struct gfs2_bufdata *bd; + + gfs2_log_lock(sdp); + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); + list_del_init(&bd->bd_list_tr); + tr->tr_num_buf--; + } + gfs2_log_unlock(sdp); + gfs2_assert_warn(sdp, !tr->tr_num_buf); +} + /** * gfs2_log_commit - Commit a transaction to the log * @sdp: the filesystem @@ -790,7 +805,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { log_refund(sdp, tr); - lops_incore_commit(sdp, tr); + buf_lo_incore_commit(sdp, tr); sdp->sd_vfs->s_dirt = 1; up_read(&sdp->sd_log_flush_lock); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index fae59d6..4390f6f 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -152,21 +152,6 @@ out: unlock_buffer(bd->bd_bh); } -static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) -{ - struct list_head *head = &tr->tr_list_buf; - struct gfs2_bufdata *bd; - - gfs2_log_lock(sdp); - while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); - list_del_init(&bd->bd_list_tr); - tr->tr_num_buf--; - } - gfs2_log_unlock(sdp); - gfs2_assert_warn(sdp, !tr->tr_num_buf); -} - static void buf_lo_before_commit(struct gfs2_sbd *sdp) { struct buffer_head *bh; @@ -419,8 +404,10 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); error = gfs2_revoke_add(sdp, blkno, start); - if (error < 0) + if (error < 0) { + brelse(bh); return error; + } else if (error) sdp->sd_found_revokes++; @@ -737,7 +724,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) const struct gfs2_log_operations gfs2_buf_lops = { .lo_add = buf_lo_add, - .lo_incore_commit = buf_lo_incore_commit, .lo_before_commit = buf_lo_before_commit, .lo_after_commit = buf_lo_after_commit, .lo_before_scan = buf_lo_before_scan, @@ -763,7 +749,6 @@ const struct gfs2_log_operations gfs2_rg_lops = { const struct gfs2_log_operations gfs2_databuf_lops = { .lo_add = databuf_lo_add, - .lo_incore_commit = buf_lo_incore_commit, .lo_before_commit = databuf_lo_before_commit, .lo_after_commit = databuf_lo_after_commit, .lo_scan_elements = databuf_lo_scan_elements, diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 41a00df..3c0b273 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -57,15 +57,6 @@ static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) le->le_ops->lo_add(sdp, le); } -static inline void lops_incore_commit(struct gfs2_sbd *sdp, - struct gfs2_trans *tr) -{ - int x; - for (x = 0; gfs2_log_ops[x]; x++) - if (gfs2_log_ops[x]->lo_incore_commit) - gfs2_log_ops[x]->lo_incore_commit(sdp, tr); -} - static inline void lops_before_commit(struct gfs2_sbd *sdp) { int x; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 9c7765c..053e2eb 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -89,6 +89,12 @@ static int __init init_gfs2_fs(void) if (!gfs2_bufdata_cachep) goto fail; + gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd", + sizeof(struct gfs2_rgrpd), + 0, 0, NULL); + if (!gfs2_rgrpd_cachep) + goto fail; + error = register_filesystem(&gfs2_fs_type); if (error) goto fail; @@ -108,6 +114,9 @@ fail_unregister: fail: gfs2_glock_exit(); + if (gfs2_rgrpd_cachep) + kmem_cache_destroy(gfs2_rgrpd_cachep); + if (gfs2_bufdata_cachep) kmem_cache_destroy(gfs2_bufdata_cachep); @@ -133,6 +142,7 @@ static void __exit exit_gfs2_fs(void) unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); + kmem_cache_destroy(gfs2_rgrpd_cachep); kmem_cache_destroy(gfs2_bufdata_cachep); kmem_cache_destroy(gfs2_inode_cachep); kmem_cache_destroy(gfs2_glock_cachep); diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index ac772b6..90a04a6 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -21,7 +21,6 @@ #include <linux/gfs2_ondisk.h> #include <linux/lm_interface.h> #include <linux/backing-dev.h> -#include <linux/pagevec.h> #include "gfs2.h" #include "incore.h" @@ -104,11 +103,9 @@ static int gfs2_writepage_common(struct page *page, loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; unsigned offset; - int ret = -EIO; if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) goto out; - ret = 0; if (current->journal_info) goto redirty; /* Is the page fully outside i_size? (truncate in progress) */ @@ -280,7 +277,7 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, int i; int ret; - ret = gfs2_trans_begin(sdp, nrblocks, 0); + ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); if (ret < 0) return ret; @@ -510,23 +507,26 @@ static int __gfs2_readpage(void *file, struct page *page) static int gfs2_readpage(struct file *file, struct page *page) { struct gfs2_inode *ip = GFS2_I(page->mapping->host); - struct gfs2_holder gh; + struct gfs2_holder *gh; int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh); - error = gfs2_glock_nq_atime(&gh); - if (unlikely(error)) { + gh = gfs2_glock_is_locked_by_me(ip->i_gl); + if (!gh) { + gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS); + if (!gh) + return -ENOBUFS; + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh); unlock_page(page); - goto out; + error = gfs2_glock_nq_atime(gh); + if (likely(error != 0)) + goto out; + return AOP_TRUNCATED_PAGE; } error = __gfs2_readpage(file, page); - gfs2_glock_dq(&gh); + gfs2_glock_dq(gh); out: - gfs2_holder_uninit(&gh); - if (error == GLR_TRYFAILED) { - yield(); - return AOP_TRUNCATED_PAGE; - } + gfs2_holder_uninit(gh); + kfree(gh); return error; } @@ -648,15 +648,15 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (alloc_required) { al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } - error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_lock_check(ip); if (error) goto out_alloc_put; - error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); - if (error) - goto out_qunlock; - al->al_requested = data_blocks + ind_blocks; error = gfs2_inplace_reserve(ip); if (error) @@ -828,7 +828,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, unsigned int to = from + len; int ret; - BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == 0); + BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); ret = gfs2_meta_inode_buffer(ip, &dibh); if (unlikely(ret)) { diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c index 793e334..4a5e676 100644 --- a/fs/gfs2/ops_dentry.c +++ b/fs/gfs2/ops_dentry.c @@ -43,7 +43,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; int error; - int had_lock=0; + int had_lock = 0; if (inode) { if (is_bad_inode(inode)) @@ -54,7 +54,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) if (sdp->sd_args.ar_localcaching) goto valid; - had_lock = gfs2_glock_is_locked_by_me(dip->i_gl); + had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); if (!had_lock) { error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index 334c7f8..990d9f4 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -204,8 +204,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); - if (!inode) - goto fail; if (IS_ERR(inode)) { error = PTR_ERR(inode); goto fail; diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index f4842f2..e1b7d52 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -30,7 +30,6 @@ #include "glock.h" #include "glops.h" #include "inode.h" -#include "lm.h" #include "log.h" #include "meta_io.h" #include "quota.h" @@ -39,6 +38,7 @@ #include "util.h" #include "eaops.h" #include "ops_address.h" +#include "ops_inode.h" /** * gfs2_llseek - seek to a location in a file @@ -369,12 +369,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) if (al == NULL) goto out_unlock; - ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + ret = gfs2_quota_lock_check(ip); if (ret) goto out_alloc_put; - ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); - if (ret) - goto out_quota_unlock; al->al_requested = data_blocks + ind_blocks; ret = gfs2_inplace_reserve(ip); if (ret) @@ -596,6 +593,36 @@ static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) return generic_setlease(file, arg, fl); } +static int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + int error = -EIO; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = sdp->sd_lockstruct.ls_ops->lm_plock_get( + sdp->sd_lockstruct.ls_lockspace, name, file, fl); + return error; +} + +static int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, + struct file *file, int cmd, struct file_lock *fl) +{ + int error = -EIO; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = sdp->sd_lockstruct.ls_ops->lm_plock( + sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl); + return error; +} + +static int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + int error = -EIO; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = sdp->sd_lockstruct.ls_ops->lm_punlock( + sdp->sd_lockstruct.ls_lockspace, name, file, fl); + return error; +} + /** * gfs2_lock - acquire/release a posix lock on a file * @file: the file pointer diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 4bee6aa..ef9c6c4 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -26,7 +26,6 @@ #include "glock.h" #include "glops.h" #include "inode.h" -#include "lm.h" #include "mount.h" #include "ops_fstype.h" #include "ops_dentry.h" @@ -363,6 +362,13 @@ static int map_journal_extents(struct gfs2_sbd *sdp) return rc; } +static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_others_may_mount( + sdp->sd_lockstruct.ls_lockspace); +} + static int init_journal(struct gfs2_sbd *sdp, int undo) { struct gfs2_holder ji_gh; @@ -542,7 +548,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) } ip = GFS2_I(sdp->sd_rindex); set_bit(GLF_STICKY, &ip->i_gl->gl_flags); - sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1; + sdp->sd_rindex_uptodate = 0; /* Read in the quota inode */ sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); @@ -705,6 +711,69 @@ fail: } /** + * gfs2_lm_mount - mount a locking protocol + * @sdp: the filesystem + * @args: mount arguements + * @silent: if 1, don't complain if the FS isn't a GFS2 fs + * + * Returns: errno + */ + +static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) +{ + char *proto = sdp->sd_proto_name; + char *table = sdp->sd_table_name; + int flags = LM_MFLAG_CONV_NODROP; + int error; + + if (sdp->sd_args.ar_spectator) + flags |= LM_MFLAG_SPECTATOR; + + fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); + + error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata, + gfs2_glock_cb, sdp, + GFS2_MIN_LVB_SIZE, flags, + &sdp->sd_lockstruct, &sdp->sd_kobj); + if (error) { + fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n", + proto, table, sdp->sd_args.ar_hostdata); + goto out; + } + + if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) || + gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || + gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= + GFS2_MIN_LVB_SIZE)) { + gfs2_unmount_lockproto(&sdp->sd_lockstruct); + goto out; + } + + if (sdp->sd_args.ar_spectator) + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); + else + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, + sdp->sd_lockstruct.ls_jid); + + fs_info(sdp, "Joined cluster. Now mounting FS...\n"); + + if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) && + !sdp->sd_args.ar_ignore_local_fs) { + sdp->sd_args.ar_localflocks = 1; + sdp->sd_args.ar_localcaching = 1; + } + +out: + return error; +} + +void gfs2_lm_unmount(struct gfs2_sbd *sdp) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + gfs2_unmount_lockproto(&sdp->sd_lockstruct); +} + +/** * fill_super - Read in superblock * @sb: The VFS superblock * @data: Mount options @@ -874,7 +943,6 @@ static struct super_block* get_gfs2_sb(const char *dev_name) { struct kstat stat; struct nameidata nd; - struct file_system_type *fstype; struct super_block *sb = NULL, *s; int error; @@ -886,8 +954,7 @@ static struct super_block* get_gfs2_sb(const char *dev_name) } error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat); - fstype = get_fs_type("gfs2"); - list_for_each_entry(s, &fstype->fs_supers, s_instances) { + list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) { if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) || (S_ISDIR(stat.mode) && s == nd.path.dentry->d_inode->i_sb)) { @@ -931,7 +998,6 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, error = PTR_ERR(new); goto error; } - module_put(fs_type->owner); new->s_flags = flags; strlcpy(new->s_id, sb->s_id, sizeof(new->s_id)); sb_set_blocksize(new, sb->s_blocksize); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index e874129..2686ad4 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -200,15 +200,15 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (alloc_required) { struct gfs2_alloc *al = gfs2_alloc_get(dip); + if (!al) { + error = -ENOMEM; + goto out_gunlock; + } - error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_lock_check(dip); if (error) goto out_alloc; - error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid); - if (error) - goto out_gunlock_q; - al->al_requested = sdp->sd_max_dirres; error = gfs2_inplace_reserve(dip); @@ -716,15 +716,15 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (alloc_required) { struct gfs2_alloc *al = gfs2_alloc_get(ndip); + if (!al) { + error = -ENOMEM; + goto out_gunlock; + } - error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_lock_check(ndip); if (error) goto out_alloc; - error = gfs2_quota_check(ndip, ndip->i_inode.i_uid, ndip->i_inode.i_gid); - if (error) - goto out_gunlock_q; - al->al_requested = sdp->sd_max_dirres; error = gfs2_inplace_reserve(ndip); @@ -898,7 +898,7 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) int error; int unlock = 0; - if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) { + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; @@ -953,7 +953,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) ogid = ngid = NO_QUOTA_CHANGE; - gfs2_alloc_get(ip); + if (!gfs2_alloc_get(ip)) + return -ENOMEM; error = gfs2_quota_lock(ip, nuid, ngid); if (error) @@ -981,8 +982,9 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) brelse(dibh); if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { - gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid); - gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid); + u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); + gfs2_quota_change(ip, -blocks, ouid, ogid); + gfs2_quota_change(ip, blocks, nuid, ngid); } out_end_trans: @@ -1064,7 +1066,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, int error; int unlock = 0; - if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) { + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); if (error) return error; @@ -1148,16 +1150,6 @@ const struct inode_operations gfs2_file_iops = { .removexattr = gfs2_removexattr, }; -const struct inode_operations gfs2_dev_iops = { - .permission = gfs2_permission, - .setattr = gfs2_setattr, - .getattr = gfs2_getattr, - .setxattr = gfs2_setxattr, - .getxattr = gfs2_getxattr, - .listxattr = gfs2_listxattr, - .removexattr = gfs2_removexattr, -}; - const struct inode_operations gfs2_dir_iops = { .create = gfs2_create, .lookup = gfs2_lookup, diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h index fd8cee2..14b4b79 100644 --- a/fs/gfs2/ops_inode.h +++ b/fs/gfs2/ops_inode.h @@ -15,7 +15,6 @@ extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; extern const struct inode_operations gfs2_symlink_iops; -extern const struct inode_operations gfs2_dev_iops; extern const struct file_operations gfs2_file_fops; extern const struct file_operations gfs2_dir_fops; extern const struct file_operations gfs2_file_fops_nolock; diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 5e52421..2278c68 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -25,7 +25,6 @@ #include "incore.h" #include "glock.h" #include "inode.h" -#include "lm.h" #include "log.h" #include "mount.h" #include "ops_super.h" diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index a08dabd..56aaf91 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -94,7 +94,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, struct gfs2_quota_data *qd; int error; - qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL); + qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS); if (!qd) return -ENOMEM; @@ -616,16 +616,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, s64 value; int err = -EIO; - if (gfs2_is_stuffed(ip)) { - struct gfs2_alloc *al = NULL; - al = gfs2_alloc_get(ip); - /* just request 1 blk */ - al->al_requested = 1; - gfs2_inplace_reserve(ip); + if (gfs2_is_stuffed(ip)) gfs2_unstuff_dinode(ip, NULL); - gfs2_inplace_release(ip); - gfs2_alloc_put(ip); - } + page = grab_cache_page(mapping, index); if (!page) return -ENOMEM; @@ -690,14 +683,14 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) unsigned int qx, x; struct gfs2_quota_data *qd; loff_t offset; - unsigned int nalloc = 0; + unsigned int nalloc = 0, blocks; struct gfs2_alloc *al = NULL; int error; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); - ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL); + ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); if (!ghs) return -ENOMEM; @@ -727,30 +720,33 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) nalloc++; } - if (nalloc) { - al = gfs2_alloc_get(ip); + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_gunlock; + } + /* + * 1 blk for unstuffing inode if stuffed. We add this extra + * block to the reservation unconditionally. If the inode + * doesn't need unstuffing, the block will be released to the + * rgrp since it won't be allocated during the transaction + */ + al->al_requested = 1; + /* +1 in the end for block requested above for unstuffing */ + blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1; - al->al_requested = nalloc * (data_blocks + ind_blocks); + if (nalloc) + al->al_requested += nalloc * (data_blocks + ind_blocks); + error = gfs2_inplace_reserve(ip); + if (error) + goto out_alloc; - error = gfs2_inplace_reserve(ip); - if (error) - goto out_alloc; - - error = gfs2_trans_begin(sdp, - al->al_rgd->rd_length + - num_qd * data_blocks + - nalloc * ind_blocks + - RES_DINODE + num_qd + - RES_STATFS, 0); - if (error) - goto out_ipres; - } else { - error = gfs2_trans_begin(sdp, - num_qd * data_blocks + - RES_DINODE + num_qd, 0); - if (error) - goto out_gunlock; - } + if (nalloc) + blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS; + + error = gfs2_trans_begin(sdp, blocks, 0); + if (error) + goto out_ipres; for (x = 0; x < num_qd; x++) { qd = qda[x]; @@ -769,11 +765,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) out_end_trans: gfs2_trans_end(sdp); out_ipres: - if (nalloc) - gfs2_inplace_release(ip); + gfs2_inplace_release(ip); out_alloc: - if (nalloc) - gfs2_alloc_put(ip); + gfs2_alloc_put(ip); out_gunlock: gfs2_glock_dq_uninit(&i_gh); out: @@ -1124,12 +1118,12 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) error = -ENOMEM; sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, - sizeof(unsigned char *), GFP_KERNEL); + sizeof(unsigned char *), GFP_NOFS); if (!sdp->sd_quota_bitmap) return error; for (x = 0; x < sdp->sd_quota_chunks; x++) { - sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL); + sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS); if (!sdp->sd_quota_bitmap[x]) goto fail; } diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index a8be141..3b7f4b0 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -32,4 +32,21 @@ int gfs2_quota_init(struct gfs2_sbd *sdp); void gfs2_quota_scan(struct gfs2_sbd *sdp); void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int ret; + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (ret) + return ret; + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + return 0; + ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); + if (ret) + gfs2_quota_unlock(ip); + return ret; +} + #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 6fb07d6..2888e4b 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -20,7 +20,6 @@ #include "bmap.h" #include "glock.h" #include "glops.h" -#include "lm.h" #include "lops.h" #include "meta_io.h" #include "recovery.h" @@ -69,7 +68,7 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) return 0; } - rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL); + rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS); if (!rr) return -ENOMEM; @@ -150,7 +149,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, struct gfs2_log_header_host *head) { struct buffer_head *bh; - struct gfs2_log_header_host lh; + struct gfs2_log_header_host uninitialized_var(lh); const u32 nothing = 0; u32 hash; int error; @@ -425,6 +424,16 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea return error; } + +static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int message) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_recovery_done( + sdp->sd_lockstruct.ls_lockspace, jid, message); +} + + /** * gfs2_recover_journal - recovery a given journal * @jd: the struct gfs2_jdesc describing the journal diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 3552110..7e8f0b1 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -14,6 +14,7 @@ #include <linux/fs.h> #include <linux/gfs2_ondisk.h> #include <linux/lm_interface.h> +#include <linux/prefetch.h> #include "gfs2.h" #include "incore.h" @@ -33,6 +34,16 @@ #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) +#if BITS_PER_LONG == 32 +#define LBITMASK (0x55555555UL) +#define LBITSKIP55 (0x55555555UL) +#define LBITSKIP00 (0x00000000UL) +#else +#define LBITMASK (0x5555555555555555UL) +#define LBITSKIP55 (0x5555555555555555UL) +#define LBITSKIP00 (0x0000000000000000UL) +#endif + /* * These routines are used by the resource group routines (rgrp.c) * to keep track of block allocation. Each block is represented by two @@ -53,7 +64,8 @@ static const char valid_change[16] = { }; static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, - unsigned char old_state, unsigned char new_state); + unsigned char old_state, unsigned char new_state, + unsigned int *n); /** * gfs2_setbit - Set a bit in the bitmaps @@ -64,26 +76,32 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, * */ -static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, - unsigned int buflen, u32 block, - unsigned char new_state) +static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1, + unsigned char *buf2, unsigned int offset, + unsigned int buflen, u32 block, + unsigned char new_state) { - unsigned char *byte, *end, cur_state; - unsigned int bit; + unsigned char *byte1, *byte2, *end, cur_state; + const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; - byte = buffer + (block / GFS2_NBBY); - bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; - end = buffer + buflen; + byte1 = buf1 + offset + (block / GFS2_NBBY); + end = buf1 + offset + buflen; - gfs2_assert(rgd->rd_sbd, byte < end); + BUG_ON(byte1 >= end); - cur_state = (*byte >> bit) & GFS2_BIT_MASK; + cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; - if (valid_change[new_state * 4 + cur_state]) { - *byte ^= cur_state << bit; - *byte |= new_state << bit; - } else + if (unlikely(!valid_change[new_state * 4 + cur_state])) { gfs2_consist_rgrpd(rgd); + return; + } + *byte1 ^= (cur_state ^ new_state) << bit; + + if (buf2) { + byte2 = buf2 + offset + (block / GFS2_NBBY); + cur_state = (*byte2 >> bit) & GFS2_BIT_MASK; + *byte2 ^= (cur_state ^ new_state) << bit; + } } /** @@ -94,10 +112,12 @@ static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, * */ -static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, - unsigned int buflen, u32 block) +static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, + const unsigned char *buffer, + unsigned int buflen, u32 block) { - unsigned char *byte, *end, cur_state; + const unsigned char *byte, *end; + unsigned char cur_state; unsigned int bit; byte = buffer + (block / GFS2_NBBY); @@ -126,47 +146,66 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, * Return: the block number (bitmap buffer scope) that was found */ -static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal, - unsigned char old_state) +static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal, + u8 old_state) { - unsigned char *byte; - u32 blk = goal; - unsigned int bit, bitlong; - unsigned long *plong, plong55; - - byte = buffer + (goal / GFS2_NBBY); - plong = (unsigned long *)(buffer + (goal / GFS2_NBBY)); - bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; - bitlong = bit; -#if BITS_PER_LONG == 32 - plong55 = 0x55555555; -#else - plong55 = 0x5555555555555555; -#endif - while (byte < buffer + buflen) { - - if (bitlong == 0 && old_state == 0 && *plong == plong55) { - plong++; - byte += sizeof(unsigned long); - blk += sizeof(unsigned long) * GFS2_NBBY; - continue; + const u8 *byte, *start, *end; + int bit, startbit; + u32 g1, g2, misaligned; + unsigned long *plong; + unsigned long lskipval; + + lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55; + g1 = (goal / GFS2_NBBY); + start = buffer + g1; + byte = start; + end = buffer + buflen; + g2 = ALIGN(g1, sizeof(unsigned long)); + plong = (unsigned long *)(buffer + g2); + startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; + misaligned = g2 - g1; + if (!misaligned) + goto ulong_aligned; +/* parse the bitmap a byte at a time */ +misaligned: + while (byte < end) { + if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) { + return goal + + (((byte - start) * GFS2_NBBY) + + ((bit - startbit) >> 1)); } - if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) - return blk; bit += GFS2_BIT_SIZE; - if (bit >= 8) { + if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) { bit = 0; byte++; + misaligned--; + if (!misaligned) { + plong = (unsigned long *)byte; + goto ulong_aligned; + } } - bitlong += GFS2_BIT_SIZE; - if (bitlong >= sizeof(unsigned long) * 8) { - bitlong = 0; - plong++; - } - - blk++; } + return BFITNOENT; +/* parse the bitmap a unsigned long at a time */ +ulong_aligned: + /* Stop at "end - 1" or else prefetch can go past the end and segfault. + We could "if" it but we'd lose some of the performance gained. + This way will only slow down searching the very last 4/8 bytes + depending on architecture. I've experimented with several ways + of writing this section such as using an else before the goto + but this one seems to be the fastest. */ + while ((unsigned char *)plong < end - 1) { + prefetch(plong + 1); + if (((*plong) & LBITMASK) != lskipval) + break; + plong++; + } + if ((unsigned char *)plong < end) { + byte = (const u8 *)plong; + misaligned += sizeof(unsigned long) - 1; + goto misaligned; + } return BFITNOENT; } @@ -179,14 +218,14 @@ static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal, * Returns: The number of bits */ -static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer, - unsigned int buflen, unsigned char state) +static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer, + unsigned int buflen, u8 state) { - unsigned char *byte = buffer; - unsigned char *end = buffer + buflen; - unsigned char state1 = state << 2; - unsigned char state2 = state << 4; - unsigned char state3 = state << 6; + const u8 *byte = buffer; + const u8 *end = buffer + buflen; + const u8 state1 = state << 2; + const u8 state2 = state << 4; + const u8 state3 = state << 6; u32 count = 0; for (; byte < end; byte++) { @@ -353,7 +392,7 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp) } kfree(rgd->rd_bits); - kfree(rgd); + kmem_cache_free(gfs2_rgrpd_cachep, rgd); } } @@ -516,7 +555,7 @@ static int read_rindex_entry(struct gfs2_inode *ip, return error; } - rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS); + rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); error = -ENOMEM; if (!rgd) return error; @@ -539,7 +578,7 @@ static int read_rindex_entry(struct gfs2_inode *ip, return error; rgd->rd_gl->gl_object = rgd; - rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1; + rgd->rd_flags &= ~GFS2_RDF_UPTODATE; rgd->rd_flags |= GFS2_RDF_CHECK; return error; } @@ -575,7 +614,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip) } } - sdp->sd_rindex_vn = ip->i_gl->gl_vn; + sdp->sd_rindex_uptodate = 1; return 0; } @@ -609,7 +648,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip) } } - sdp->sd_rindex_vn = ip->i_gl->gl_vn; + sdp->sd_rindex_uptodate = 1; return 0; } @@ -642,9 +681,9 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) return error; /* Read new copy from disk if we don't have the latest */ - if (sdp->sd_rindex_vn != gl->gl_vn) { + if (!sdp->sd_rindex_uptodate) { mutex_lock(&sdp->sd_rindex_mutex); - if (sdp->sd_rindex_vn != gl->gl_vn) { + if (!sdp->sd_rindex_uptodate) { error = gfs2_ri_update(ip); if (error) gfs2_glock_dq_uninit(ri_gh); @@ -655,21 +694,31 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) return error; } -static void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf) +static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) { const struct gfs2_rgrp *str = buf; + struct gfs2_rgrp_host *rg = &rgd->rd_rg; + u32 rg_flags; - rg->rg_flags = be32_to_cpu(str->rg_flags); + rg_flags = be32_to_cpu(str->rg_flags); + if (rg_flags & GFS2_RGF_NOALLOC) + rgd->rd_flags |= GFS2_RDF_NOALLOC; + else + rgd->rd_flags &= ~GFS2_RDF_NOALLOC; rg->rg_free = be32_to_cpu(str->rg_free); rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); } -static void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf) +static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) { struct gfs2_rgrp *str = buf; + struct gfs2_rgrp_host *rg = &rgd->rd_rg; + u32 rg_flags = 0; - str->rg_flags = cpu_to_be32(rg->rg_flags); + if (rgd->rd_flags & GFS2_RDF_NOALLOC) + rg_flags |= GFS2_RGF_NOALLOC; + str->rg_flags = cpu_to_be32(rg_flags); str->rg_free = cpu_to_be32(rg->rg_free); str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); str->__pad = cpu_to_be32(0); @@ -726,9 +775,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) } } - if (rgd->rd_rg_vn != gl->gl_vn) { - gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data); - rgd->rd_rg_vn = gl->gl_vn; + if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { + gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); + rgd->rd_flags |= GFS2_RDF_UPTODATE; } spin_lock(&sdp->sd_rindex_spin); @@ -840,7 +889,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) struct gfs2_sbd *sdp = rgd->rd_sbd; int ret = 0; - if (rgd->rd_rg.rg_flags & GFS2_RGF_NOALLOC) + if (rgd->rd_flags & GFS2_RDF_NOALLOC) return 0; spin_lock(&sdp->sd_rindex_spin); @@ -866,13 +915,15 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) u32 goal = 0, block; u64 no_addr; struct gfs2_sbd *sdp = rgd->rd_sbd; + unsigned int n; for(;;) { if (goal >= rgd->rd_data) break; down_write(&sdp->sd_log_flush_lock); + n = 1; block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, - GFS2_BLKST_UNLINKED); + GFS2_BLKST_UNLINKED, &n); up_write(&sdp->sd_log_flush_lock); if (block == BFITNOENT) break; @@ -904,24 +955,20 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp, u64 rglast) { - struct gfs2_rgrpd *rgd = NULL; + struct gfs2_rgrpd *rgd; spin_lock(&sdp->sd_rindex_spin); - if (list_empty(&sdp->sd_rindex_recent_list)) - goto out; - - if (!rglast) - goto first; - - list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { - if (rgd->rd_addr == rglast) - goto out; + if (rglast) { + list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { + if (rgrp_contains_block(rgd, rglast)) + goto out; + } } - -first: - rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd, - rd_recent); + rgd = NULL; + if (!list_empty(&sdp->sd_rindex_recent_list)) + rgd = list_entry(sdp->sd_rindex_recent_list.next, + struct gfs2_rgrpd, rd_recent); out: spin_unlock(&sdp->sd_rindex_spin); return rgd; @@ -1067,7 +1114,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) /* Try recently successful rgrps */ - rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc); + rgd = recent_rgrp_first(sdp, ip->i_goal); while (rgd) { rg_locked = 0; @@ -1151,8 +1198,6 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) } out: - ip->i_last_rg_alloc = rgd->rd_addr; - if (begin) { recent_rgrp_add(rgd); rgd = gfs2_rgrpd_get_next(rgd); @@ -1275,6 +1320,7 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) * @goal: the goal block within the RG (start here to search for avail block) * @old_state: GFS2_BLKST_XXX the before-allocation state to find * @new_state: GFS2_BLKST_XXX the after-allocation block state + * @n: The extent length * * Walk rgrp's bitmap to find bits that represent a block in @old_state. * Add the found bitmap buffer to the transaction. @@ -1290,13 +1336,17 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) */ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, - unsigned char old_state, unsigned char new_state) + unsigned char old_state, unsigned char new_state, + unsigned int *n) { struct gfs2_bitmap *bi = NULL; - u32 length = rgd->rd_length; + const u32 length = rgd->rd_length; u32 blk = 0; unsigned int buf, x; + const unsigned int elen = *n; + const u8 *buffer; + *n = 0; /* Find bitmap block that contains bits for goal block */ for (buf = 0; buf < length; buf++) { bi = rgd->rd_bits + buf; @@ -1317,12 +1367,11 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, for (x = 0; x <= length; x++) { /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone bitmaps, so we must search the originals for that. */ + buffer = bi->bi_bh->b_data + bi->bi_offset; if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) - blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset, - bi->bi_len, goal, old_state); - else - blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset, - bi->bi_len, goal, old_state); + buffer = bi->bi_clone + bi->bi_offset; + + blk = gfs2_bitfit(buffer, bi->bi_len, goal, old_state); if (blk != BFITNOENT) break; @@ -1333,12 +1382,23 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, } if (blk != BFITNOENT && old_state != new_state) { + *n = 1; gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); - gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, + gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, bi->bi_len, blk, new_state); - if (bi->bi_clone) - gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset, - bi->bi_len, blk, new_state); + goal = blk; + while (*n < elen) { + goal++; + if (goal >= (bi->bi_len * GFS2_NBBY)) + break; + if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != + GFS2_BLKST_FREE) + break; + gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, + bi->bi_offset, bi->bi_len, goal, + new_state); + (*n)++; + } } return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk; @@ -1393,7 +1453,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, bi->bi_len); } gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); - gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, + gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset, bi->bi_len, buf_blk, new_state); } @@ -1401,13 +1461,13 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, } /** - * gfs2_alloc_data - Allocate a data block - * @ip: the inode to allocate the data block for + * gfs2_alloc_block - Allocate a block + * @ip: the inode to allocate the block for * * Returns: the allocated block */ -u64 gfs2_alloc_data(struct gfs2_inode *ip) +u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; @@ -1415,77 +1475,31 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip) u32 goal, blk; u64 block; - if (rgrp_contains_block(rgd, ip->i_di.di_goal_data)) - goal = ip->i_di.di_goal_data - rgd->rd_data0; + if (rgrp_contains_block(rgd, ip->i_goal)) + goal = ip->i_goal - rgd->rd_data0; else - goal = rgd->rd_last_alloc_data; + goal = rgd->rd_last_alloc; - blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); + blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); BUG_ON(blk == BFITNOENT); - rgd->rd_last_alloc_data = blk; + rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; - ip->i_di.di_goal_data = block; + ip->i_goal = block; - gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); - rgd->rd_rg.rg_free--; + gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n); + rgd->rd_rg.rg_free -= *n; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - al->al_alloced++; + al->al_alloced += *n; - gfs2_statfs_change(sdp, 0, -1, 0); - gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid); + gfs2_statfs_change(sdp, 0, -*n, 0); + gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone--; - spin_unlock(&sdp->sd_rindex_spin); - - return block; -} - -/** - * gfs2_alloc_meta - Allocate a metadata block - * @ip: the inode to allocate the metadata block for - * - * Returns: the allocated block - */ - -u64 gfs2_alloc_meta(struct gfs2_inode *ip) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; - struct gfs2_rgrpd *rgd = al->al_rgd; - u32 goal, blk; - u64 block; - - if (rgrp_contains_block(rgd, ip->i_di.di_goal_meta)) - goal = ip->i_di.di_goal_meta - rgd->rd_data0; - else - goal = rgd->rd_last_alloc_meta; - - blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); - BUG_ON(blk == BFITNOENT); - rgd->rd_last_alloc_meta = blk; - - block = rgd->rd_data0 + blk; - ip->i_di.di_goal_meta = block; - - gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); - rgd->rd_rg.rg_free--; - - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); - - al->al_alloced++; - - gfs2_statfs_change(sdp, 0, -1, 0); - gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid); - gfs2_trans_add_unrevoke(sdp, block); - - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone--; + rgd->rd_free_clone -= *n; spin_unlock(&sdp->sd_rindex_spin); return block; @@ -1505,12 +1519,13 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) struct gfs2_rgrpd *rgd = al->al_rgd; u32 blk; u64 block; + unsigned int n = 1; - blk = rgblk_search(rgd, rgd->rd_last_alloc_meta, - GFS2_BLKST_FREE, GFS2_BLKST_DINODE); + blk = rgblk_search(rgd, rgd->rd_last_alloc, + GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); BUG_ON(blk == BFITNOENT); - rgd->rd_last_alloc_meta = blk; + rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; @@ -1519,12 +1534,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) rgd->rd_rg.rg_dinodes++; *generation = rgd->rd_rg.rg_igeneration++; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); al->al_alloced++; gfs2_statfs_change(sdp, 0, -1, +1); - gfs2_trans_add_unrevoke(sdp, block); + gfs2_trans_add_unrevoke(sdp, block, 1); spin_lock(&sdp->sd_rindex_spin); rgd->rd_free_clone--; @@ -1553,7 +1568,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) rgd->rd_rg.rg_free += blen; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_trans_add_rg(rgd); @@ -1581,7 +1596,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) rgd->rd_rg.rg_free += blen; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_trans_add_rg(rgd); @@ -1601,7 +1616,7 @@ void gfs2_unlink_di(struct inode *inode) if (!rgd) return; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_trans_add_rg(rgd); } @@ -1621,7 +1636,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) rgd->rd_rg.rg_free++; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_statfs_change(sdp, 0, +1, -1); gfs2_trans_add_rg(rgd); @@ -1699,8 +1714,7 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, * */ -void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, - int flags) +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) { unsigned int x; @@ -1708,7 +1722,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, GFP_NOFS | __GFP_NOFAIL); for (x = 0; x < rlist->rl_rgrps; x++) gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, - state, flags, + state, 0, &rlist->rl_ghs[x]); } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 149bb16..3181c7e 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -46,8 +46,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip); unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); -u64 gfs2_alloc_data(struct gfs2_inode *ip); -u64 gfs2_alloc_meta(struct gfs2_inode *ip); +u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n); u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); @@ -64,8 +63,7 @@ struct gfs2_rgrp_list { void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, u64 block); -void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, - int flags); +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); u64 gfs2_ri_total(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ef0562c..7aeacbc 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -210,7 +210,7 @@ int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) struct page *page; struct bio *bio; - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_NOFS); if (unlikely(!page)) return -ENOBUFS; @@ -218,7 +218,7 @@ int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) ClearPageDirty(page); lock_page(page); - bio = bio_alloc(GFP_KERNEL, 1); + bio = bio_alloc(GFP_NOFS, 1); if (unlikely(!bio)) { __free_page(page); return -ENOBUFS; @@ -316,6 +316,7 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) sdp->sd_heightsize[x] = space; } sdp->sd_max_height = x; + sdp->sd_heightsize[x] = ~0; gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - @@ -334,6 +335,7 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) sdp->sd_jheightsize[x] = space; } sdp->sd_max_jheight = x; + sdp->sd_jheightsize[x] = ~0; gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); return 0; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 60a870e..44361ec 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -17,6 +17,7 @@ void gfs2_tune_init(struct gfs2_tune *gt); int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent); int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent); int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector); +void gfs2_lm_unmount(struct gfs2_sbd *sdp); static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index eaa3b7b..9ab9fc8 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -20,7 +20,6 @@ #include "gfs2.h" #include "incore.h" -#include "lm.h" #include "sys.h" #include "super.h" #include "glock.h" @@ -328,15 +327,9 @@ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ } \ static struct counters_attr counters_attr_##name = __ATTR_RO(name) -COUNTERS_ATTR(glock_count, "%u\n"); -COUNTERS_ATTR(glock_held_count, "%u\n"); -COUNTERS_ATTR(inode_count, "%u\n"); COUNTERS_ATTR(reclaimed, "%u\n"); static struct attribute *counters_attrs[] = { - &counters_attr_glock_count.attr, - &counters_attr_glock_held_count.attr, - &counters_attr_inode_count.attr, &counters_attr_reclaimed.attr, NULL, }; diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 73e5d92..f677b8a 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -146,30 +146,25 @@ void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) lops_add(sdp, &bd->bd_le); } -void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno) +void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) { - struct gfs2_bufdata *bd; - int found = 0; + struct gfs2_bufdata *bd, *tmp; + struct gfs2_trans *tr = current->journal_info; + unsigned int n = len; gfs2_log_lock(sdp); - - list_for_each_entry(bd, &sdp->sd_log_le_revoke, bd_le.le_list) { - if (bd->bd_blkno == blkno) { + list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_le.le_list) { + if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) { list_del_init(&bd->bd_le.le_list); gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); sdp->sd_log_num_revoke--; - found = 1; - break; + kmem_cache_free(gfs2_bufdata_cachep, bd); + tr->tr_num_revoke_rm++; + if (--n == 0) + break; } } - gfs2_log_unlock(sdp); - - if (found) { - struct gfs2_trans *tr = current->journal_info; - kmem_cache_free(gfs2_bufdata_cachep, bd); - tr->tr_num_revoke_rm++; - } } void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index e826f0d..edf9d4b 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -32,7 +32,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp); void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); -void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno); +void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); #endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 424a077..d31e355 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -19,12 +19,12 @@ #include "gfs2.h" #include "incore.h" #include "glock.h" -#include "lm.h" #include "util.h" struct kmem_cache *gfs2_glock_cachep __read_mostly; struct kmem_cache *gfs2_inode_cachep __read_mostly; struct kmem_cache *gfs2_bufdata_cachep __read_mostly; +struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; void gfs2_assert_i(struct gfs2_sbd *sdp) { @@ -32,6 +32,28 @@ void gfs2_assert_i(struct gfs2_sbd *sdp) sdp->sd_fsname); } +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) +{ + va_list args; + + if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return 0; + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + fs_err(sdp, "about to withdraw this file system\n"); + BUG_ON(sdp->sd_args.ar_debug); + + fs_err(sdp, "telling LM to withdraw\n"); + gfs2_withdraw_lockproto(&sdp->sd_lockstruct); + fs_err(sdp, "withdrawn\n"); + dump_stack(); + + return -1; +} + /** * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false * Returns: -1 if this call withdrew the machine, diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 28938a4..509c5d6 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -147,6 +147,7 @@ gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__); extern struct kmem_cache *gfs2_glock_cachep; extern struct kmem_cache *gfs2_inode_cachep; extern struct kmem_cache *gfs2_bufdata_cachep; +extern struct kmem_cache *gfs2_rgrpd_cachep; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, unsigned int *p) @@ -163,6 +164,7 @@ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, unsigned int bit, int new_value); +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...); #endif /* __UTIL_DOT_H__ */ diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index b60c0af..f457d2c 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -14,6 +14,7 @@ #include <linux/capability.h> #include <linux/fs.h> +#include <linux/mount.h> #include <linux/sched.h> #include <linux/xattr.h> #include <asm/uaccess.h> @@ -35,25 +36,32 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */ return put_user(flags, (int __user *)arg); case HFSPLUS_IOC_EXT2_SETFLAGS: { - if (IS_RDONLY(inode)) - return -EROFS; - - if (!is_owner_or_cap(inode)) - return -EACCES; - - if (get_user(flags, (int __user *)arg)) - return -EFAULT; - + int err = 0; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + if (!is_owner_or_cap(inode)) { + err = -EACCES; + goto setflags_out; + } + if (get_user(flags, (int __user *)arg)) { + err = -EFAULT; + goto setflags_out; + } if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) || HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) { - if (!capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto setflags_out; + } } /* don't silently ignore unsupported ext2 flags */ - if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) - return -EOPNOTSUPP; - + if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { + err = -EOPNOTSUPP; + goto setflags_out; + } if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ inode->i_flags |= S_IMMUTABLE; HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; @@ -75,7 +83,9 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - return 0; +setflags_out: + mnt_drop_write(filp->f_path.mnt); + return err; } default: return -ENOTTY; @@ -1199,42 +1199,37 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct timespec now; - if (inode->i_flags & S_NOATIME) + if (mnt_want_write(mnt)) return; + if (inode->i_flags & S_NOATIME) + goto out; if (IS_NOATIME(inode)) - return; + goto out; if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) - return; + goto out; - /* - * We may have a NULL vfsmount when coming from NFSD - */ - if (mnt) { - if (mnt->mnt_flags & MNT_NOATIME) - return; - if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) - return; - - if (mnt->mnt_flags & MNT_RELATIME) { - /* - * With relative atime, only update atime if the - * previous atime is earlier than either the ctime or - * mtime. - */ - if (timespec_compare(&inode->i_mtime, - &inode->i_atime) < 0 && - timespec_compare(&inode->i_ctime, - &inode->i_atime) < 0) - return; - } + if (mnt->mnt_flags & MNT_NOATIME) + goto out; + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + goto out; + if (mnt->mnt_flags & MNT_RELATIME) { + /* + * With relative atime, only update atime if the previous + * atime is earlier than either the ctime or mtime. + */ + if (timespec_compare(&inode->i_mtime, &inode->i_atime) < 0 && + timespec_compare(&inode->i_ctime, &inode->i_atime) < 0) + goto out; } now = current_fs_time(inode->i_sb); if (timespec_equal(&inode->i_atime, &now)) - return; + goto out; inode->i_atime = now; mark_inode_dirty_sync(inode); +out: + mnt_drop_write(mnt); } EXPORT_SYMBOL(touch_atime); @@ -1255,10 +1250,13 @@ void file_update_time(struct file *file) struct inode *inode = file->f_path.dentry->d_inode; struct timespec now; int sync_it = 0; + int err; if (IS_NOCMTIME(inode)) return; - if (IS_RDONLY(inode)) + + err = mnt_want_write(file->f_path.mnt); + if (err) return; now = current_fs_time(inode->i_sb); @@ -1279,6 +1277,7 @@ void file_update_time(struct file *file) if (sync_it) mark_inode_dirty_sync(inode); + mnt_drop_write(file->f_path.mnt); } EXPORT_SYMBOL(file_update_time); diff --git a/fs/internal.h b/fs/internal.h index 392e8cc..80aa9a0 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -43,3 +43,14 @@ extern void __init chrdev_init(void); * namespace.c */ extern int copy_mount_options(const void __user *, unsigned long *); + +extern void free_vfsmnt(struct vfsmount *); +extern struct vfsmount *alloc_vfsmnt(const char *); +extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); +extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, + struct vfsmount *); +extern void release_mounts(struct list_head *); +extern void umount_tree(struct vfsmount *, int, struct list_head *); +extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); + +extern void __init mnt_init(void); diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index 0b78fdc..a841f49 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h @@ -15,7 +15,7 @@ #include <linux/version.h> #include <linux/rbtree.h> #include <linux/posix_acl.h> -#include <asm/semaphore.h> +#include <linux/semaphore.h> struct jffs2_inode_info { /* We need an internal mutex similar to inode->i_mutex. diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 3a2197f..18fca2b 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -16,7 +16,7 @@ #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/completion.h> -#include <asm/semaphore.h> +#include <linux/semaphore.h> #include <linux/timer.h> #include <linux/wait.h> #include <linux/list.h> diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index a1f8e37..afe222b 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/ctype.h> #include <linux/capability.h> +#include <linux/mount.h> #include <linux/time.h> #include <linux/sched.h> #include <asm/current.h> @@ -65,23 +66,30 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return put_user(flags, (int __user *) arg); case JFS_IOC_SETFLAGS: { unsigned int oldflags; + int err; - if (IS_RDONLY(inode)) - return -EROFS; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; - if (!is_owner_or_cap(inode)) - return -EACCES; - - if (get_user(flags, (int __user *) arg)) - return -EFAULT; + if (!is_owner_or_cap(inode)) { + err = -EACCES; + goto setflags_out; + } + if (get_user(flags, (int __user *) arg)) { + err = -EFAULT; + goto setflags_out; + } flags = jfs_map_ext2(flags, 1); if (!S_ISDIR(inode->i_mode)) flags &= ~JFS_DIRSYNC_FL; /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) - return -EPERM; + if (IS_NOQUOTA(inode)) { + err = -EPERM; + goto setflags_out; + } /* Lock against other parallel changes of flags */ mutex_lock(&inode->i_mutex); @@ -98,7 +106,8 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + err = -EPERM; + goto setflags_out; } } @@ -110,7 +119,9 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) mutex_unlock(&inode->i_mutex); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - return 0; +setflags_out: + mnt_drop_write(filp->f_path.mnt); + return err; } default: return -ENOTTY; diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index e198506..2bc7d8a 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -2172,7 +2172,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, } /* update the free count for this dmap */ - dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); + le32_add_cpu(&dp->nfree, -nblocks); BMAP_LOCK(bmp); @@ -2316,7 +2316,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, /* update the free count for this dmap. */ - dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks); + le32_add_cpu(&dp->nfree, nblocks); BMAP_LOCK(bmp); @@ -3226,7 +3226,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, } /* update the free count for this dmap */ - dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); + le32_add_cpu(&dp->nfree, -nblocks); /* reconstruct summary tree */ dbInitDmapTree(dp); @@ -3660,9 +3660,8 @@ static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) goto initTree; } } else { - dp->nblocks = - cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks); - dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks); + le32_add_cpu(&dp->nblocks, nblocks); + le32_add_cpu(&dp->nfree, nblocks); } /* word number containing start block number */ diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h index 11e6d47..1a6eb41 100644 --- a/fs/jfs/jfs_dmap.h +++ b/fs/jfs/jfs_dmap.h @@ -61,7 +61,7 @@ * determine the maximum free string for four (lower level) nodes * of the tree. */ -static __inline signed char TREEMAX(signed char *cp) +static inline signed char TREEMAX(signed char *cp) { signed char tmp1, tmp2; diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 9bf29f7..734ec91 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -1019,8 +1019,7 @@ int diFree(struct inode *ip) /* update the free inode counts at the iag, ag and * map level. */ - iagp->nfreeinos = - cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1); + le32_add_cpu(&iagp->nfreeinos, 1); imap->im_agctl[agno].numfree += 1; atomic_inc(&imap->im_numfree); @@ -1219,9 +1218,8 @@ int diFree(struct inode *ip) /* update the number of free inodes and number of free extents * for the iag. */ - iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - - (INOSPEREXT - 1)); - iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1); + le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1)); + le32_add_cpu(&iagp->nfreeexts, 1); /* update the number of free inodes and backed inodes * at the ag and inode map level. @@ -2124,7 +2122,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) /* update the free inode count at the iag, ag, inode * map levels. */ - iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1); + le32_add_cpu(&iagp->nfreeinos, -1); imap->im_agctl[agno].numfree -= 1; atomic_dec(&imap->im_numfree); @@ -2378,9 +2376,8 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) /* update the free inode and free extent counts for the * iag. */ - iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + - (INOSPEREXT - 1)); - iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1); + le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1)); + le32_add_cpu(&iagp->nfreeexts, -1); /* update the free and backed inode counts for the ag. */ diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index a000aaa..5a61ebf 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -905,8 +905,7 @@ int xtInsert(tid_t tid, /* transaction id */ XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); /* advance next available entry index */ - p->header.nextindex = - cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + le16_add_cpu(&p->header.nextindex, 1); /* Don't log it if there are no links to the file */ if (!test_cflag(COMMIT_Nolink, ip)) { @@ -997,8 +996,7 @@ xtSplitUp(tid_t tid, split->addr); /* advance next available entry index */ - sp->header.nextindex = - cpu_to_le16(le16_to_cpu(sp->header.nextindex) + 1); + le16_add_cpu(&sp->header.nextindex, 1); /* Don't log it if there are no links to the file */ if (!test_cflag(COMMIT_Nolink, ip)) { @@ -1167,9 +1165,7 @@ xtSplitUp(tid_t tid, JFS_SBI(ip->i_sb)->nbperpage, rcbn); /* advance next available entry index. */ - sp->header.nextindex = - cpu_to_le16(le16_to_cpu(sp->header.nextindex) + - 1); + le16_add_cpu(&sp->header.nextindex, 1); /* Don't log it if there are no links to the file */ if (!test_cflag(COMMIT_Nolink, ip)) { @@ -1738,8 +1734,7 @@ int xtExtend(tid_t tid, /* transaction id */ XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); /* advance next available entry index */ - p->header.nextindex = - cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + le16_add_cpu(&p->header.nextindex, 1); } /* get back old entry */ @@ -1905,8 +1900,7 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); /* advance next available entry index */ - p->header.nextindex = - cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + le16_add_cpu(&p->header.nextindex, 1); } /* get back old XAD */ @@ -2567,8 +2561,7 @@ int xtAppend(tid_t tid, /* transaction id */ XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); /* advance next available entry index */ - p->header.nextindex = - cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + le16_add_cpu(&p->header.nextindex, 1); xtlck->lwm.offset = (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index; @@ -2631,8 +2624,7 @@ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) * delete the entry from the leaf page */ nextindex = le16_to_cpu(p->header.nextindex); - p->header.nextindex = - cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1); + le16_add_cpu(&p->header.nextindex, -1); /* * if the leaf page bocome empty, free the page @@ -2795,9 +2787,7 @@ xtDeleteUp(tid_t tid, struct inode *ip, (nextindex - index - 1) << L2XTSLOTSIZE); - p->header.nextindex = - cpu_to_le16(le16_to_cpu(p->header.nextindex) - - 1); + le16_add_cpu(&p->header.nextindex, -1); jfs_info("xtDeleteUp(entry): 0x%lx[%d]", (ulong) parent->bn, index); } @@ -127,7 +127,6 @@ #include <linux/rcupdate.h> #include <linux/pid_namespace.h> -#include <asm/semaphore.h> #include <asm/uaccess.h> #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) @@ -1623,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) return -EACCES; flag &= ~O_TRUNC; - } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE)) - return -EROFS; + } error = vfs_permission(nd, acc_mode); if (error) @@ -1677,7 +1676,12 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) return 0; } -static int open_namei_create(struct nameidata *nd, struct path *path, +/* + * Be careful about ever adding any more callers of this + * function. Its flags must be in the namei format, not + * what get passed to sys_open(). + */ +static int __open_namei_create(struct nameidata *nd, struct path *path, int flag, int mode) { int error; @@ -1696,26 +1700,56 @@ static int open_namei_create(struct nameidata *nd, struct path *path, } /* - * open_namei() + * Note that while the flag value (low two bits) for sys_open means: + * 00 - read-only + * 01 - write-only + * 10 - read-write + * 11 - special + * it is changed into + * 00 - no permissions needed + * 01 - read-permission + * 10 - write-permission + * 11 - read-write + * for the internal routines (ie open_namei()/follow_link() etc) + * This is more logical, and also allows the 00 "no perm needed" + * to be used for symlinks (where the permissions are checked + * later). * - * namei for open - this is in fact almost the whole open-routine. - * - * Note that the low bits of "flag" aren't the same as in the open - * system call - they are 00 - no permissions needed - * 01 - read permission needed - * 10 - write permission needed - * 11 - read/write permissions needed - * which is a lot more logical, and also allows the "no perm" needed - * for symlinks (where the permissions are checked later). - * SMP-safe +*/ +static inline int open_to_namei_flags(int flag) +{ + if ((flag+1) & O_ACCMODE) + flag++; + return flag; +} + +static int open_will_write_to_fs(int flag, struct inode *inode) +{ + /* + * We'll never write to the fs underlying + * a device file. + */ + if (special_file(inode->i_mode)) + return 0; + return (flag & O_TRUNC); +} + +/* + * Note that the low bits of the passed in "open_flag" + * are not the same as in the local variable "flag". See + * open_to_namei_flags() for more details. */ -int open_namei(int dfd, const char *pathname, int flag, - int mode, struct nameidata *nd) +struct file *do_filp_open(int dfd, const char *pathname, + int open_flag, int mode) { + struct file *filp; + struct nameidata nd; int acc_mode, error; struct path path; struct dentry *dir; int count = 0; + int will_write; + int flag = open_to_namei_flags(open_flag); acc_mode = ACC_MODE(flag); @@ -1733,18 +1767,19 @@ int open_namei(int dfd, const char *pathname, int flag, */ if (!(flag & O_CREAT)) { error = path_lookup_open(dfd, pathname, lookup_flags(flag), - nd, flag); + &nd, flag); if (error) - return error; + return ERR_PTR(error); goto ok; } /* * Create - we need to know the parent. */ - error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode); + error = path_lookup_create(dfd, pathname, LOOKUP_PARENT, + &nd, flag, mode); if (error) - return error; + return ERR_PTR(error); /* * We have the parent and last component. First of all, check @@ -1752,14 +1787,14 @@ int open_namei(int dfd, const char *pathname, int flag, * will not do. */ error = -EISDIR; - if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len]) + if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]) goto exit; - dir = nd->path.dentry; - nd->flags &= ~LOOKUP_PARENT; + dir = nd.path.dentry; + nd.flags &= ~LOOKUP_PARENT; mutex_lock(&dir->d_inode->i_mutex); - path.dentry = lookup_hash(nd); - path.mnt = nd->path.mnt; + path.dentry = lookup_hash(&nd); + path.mnt = nd.path.mnt; do_last: error = PTR_ERR(path.dentry); @@ -1768,18 +1803,31 @@ do_last: goto exit; } - if (IS_ERR(nd->intent.open.file)) { - mutex_unlock(&dir->d_inode->i_mutex); - error = PTR_ERR(nd->intent.open.file); - goto exit_dput; + if (IS_ERR(nd.intent.open.file)) { + error = PTR_ERR(nd.intent.open.file); + goto exit_mutex_unlock; } /* Negative dentry, just create the file */ if (!path.dentry->d_inode) { - error = open_namei_create(nd, &path, flag, mode); + /* + * This write is needed to ensure that a + * ro->rw transition does not occur between + * the time when the file is created and when + * a permanent write count is taken through + * the 'struct file' in nameidata_to_filp(). + */ + error = mnt_want_write(nd.path.mnt); if (error) + goto exit_mutex_unlock; + error = __open_namei_create(&nd, &path, flag, mode); + if (error) { + mnt_drop_write(nd.path.mnt); goto exit; - return 0; + } + filp = nameidata_to_filp(&nd, open_flag); + mnt_drop_write(nd.path.mnt); + return filp; } /* @@ -1804,23 +1852,52 @@ do_last: if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link) goto do_link; - path_to_nameidata(&path, nd); + path_to_nameidata(&path, &nd); error = -EISDIR; if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) goto exit; ok: - error = may_open(nd, acc_mode, flag); - if (error) + /* + * Consider: + * 1. may_open() truncates a file + * 2. a rw->ro mount transition occurs + * 3. nameidata_to_filp() fails due to + * the ro mount. + * That would be inconsistent, and should + * be avoided. Taking this mnt write here + * ensures that (2) can not occur. + */ + will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode); + if (will_write) { + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit; + } + error = may_open(&nd, acc_mode, flag); + if (error) { + if (will_write) + mnt_drop_write(nd.path.mnt); goto exit; - return 0; + } + filp = nameidata_to_filp(&nd, open_flag); + /* + * It is now safe to drop the mnt write + * because the filp has had a write taken + * on its behalf. + */ + if (will_write) + mnt_drop_write(nd.path.mnt); + return filp; +exit_mutex_unlock: + mutex_unlock(&dir->d_inode->i_mutex); exit_dput: - path_put_conditional(&path, nd); + path_put_conditional(&path, &nd); exit: - if (!IS_ERR(nd->intent.open.file)) - release_open_intent(nd); - path_put(&nd->path); - return error; + if (!IS_ERR(nd.intent.open.file)) + release_open_intent(&nd); + path_put(&nd.path); + return ERR_PTR(error); do_link: error = -ELOOP; @@ -1836,43 +1913,60 @@ do_link: * stored in nd->last.name and we will have to putname() it when we * are done. Procfs-like symlinks just set LAST_BIND. */ - nd->flags |= LOOKUP_PARENT; - error = security_inode_follow_link(path.dentry, nd); + nd.flags |= LOOKUP_PARENT; + error = security_inode_follow_link(path.dentry, &nd); if (error) goto exit_dput; - error = __do_follow_link(&path, nd); + error = __do_follow_link(&path, &nd); if (error) { /* Does someone understand code flow here? Or it is only * me so stupid? Anathema to whoever designed this non-sense * with "intent.open". */ - release_open_intent(nd); - return error; + release_open_intent(&nd); + return ERR_PTR(error); } - nd->flags &= ~LOOKUP_PARENT; - if (nd->last_type == LAST_BIND) + nd.flags &= ~LOOKUP_PARENT; + if (nd.last_type == LAST_BIND) goto ok; error = -EISDIR; - if (nd->last_type != LAST_NORM) + if (nd.last_type != LAST_NORM) goto exit; - if (nd->last.name[nd->last.len]) { - __putname(nd->last.name); + if (nd.last.name[nd.last.len]) { + __putname(nd.last.name); goto exit; } error = -ELOOP; if (count++==32) { - __putname(nd->last.name); + __putname(nd.last.name); goto exit; } - dir = nd->path.dentry; + dir = nd.path.dentry; mutex_lock(&dir->d_inode->i_mutex); - path.dentry = lookup_hash(nd); - path.mnt = nd->path.mnt; - __putname(nd->last.name); + path.dentry = lookup_hash(&nd); + path.mnt = nd.path.mnt; + __putname(nd.last.name); goto do_last; } /** + * filp_open - open file and return file pointer + * + * @filename: path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored + * + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. + */ +struct file *filp_open(const char *filename, int flags, int mode) +{ + return do_filp_open(AT_FDCWD, filename, flags, mode); +} +EXPORT_SYMBOL(filp_open); + +/** * lookup_create - lookup a dentry, creating it if it doesn't exist * @nd: nameidata info * @is_dir: directory flag @@ -1945,6 +2039,23 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) return error; } +static int may_mknod(mode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + case 0: /* zero mode translates to S_IFREG */ + return 0; + case S_IFDIR: + return -EPERM; + default: + return -EINVAL; + } +} + asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, unsigned dev) { @@ -1963,12 +2074,19 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, if (error) goto out; dentry = lookup_create(&nd, 0); - error = PTR_ERR(dentry); - + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_unlock; + } if (!IS_POSIXACL(nd.path.dentry->d_inode)) mode &= ~current->fs->umask; - if (!IS_ERR(dentry)) { - switch (mode & S_IFMT) { + error = may_mknod(mode); + if (error) + goto out_dput; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; + switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); break; @@ -1979,14 +2097,11 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, case S_IFIFO: case S_IFSOCK: error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); break; - case S_IFDIR: - error = -EPERM; - break; - default: - error = -EINVAL; - } - dput(dentry); } + mnt_drop_write(nd.path.mnt); +out_dput: + dput(dentry); +out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); path_put(&nd.path); out: @@ -2044,7 +2159,12 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode) if (!IS_POSIXACL(nd.path.dentry->d_inode)) mode &= ~current->fs->umask; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); + mnt_drop_write(nd.path.mnt); +out_dput: dput(dentry); out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); @@ -2151,7 +2271,12 @@ static long do_rmdir(int dfd, const char __user *pathname) error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit2; + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit3; error = vfs_rmdir(nd.path.dentry->d_inode, dentry); + mnt_drop_write(nd.path.mnt); +exit3: dput(dentry); exit2: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); @@ -2232,7 +2357,11 @@ static long do_unlinkat(int dfd, const char __user *pathname) inode = dentry->d_inode; if (inode) atomic_inc(&inode->i_count); + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit2; error = vfs_unlink(nd.path.dentry->d_inode, dentry); + mnt_drop_write(nd.path.mnt); exit2: dput(dentry); } @@ -2313,7 +2442,12 @@ asmlinkage long sys_symlinkat(const char __user *oldname, if (IS_ERR(dentry)) goto out_unlock; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO); + mnt_drop_write(nd.path.mnt); +out_dput: dput(dentry); out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); @@ -2408,7 +2542,12 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out_unlock; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry); + mnt_drop_write(nd.path.mnt); +out_dput: dput(new_dentry); out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); @@ -2634,8 +2773,12 @@ static int do_rename(int olddfd, const char *oldname, if (new_dentry == trap) goto exit5; + error = mnt_want_write(oldnd.path.mnt); + if (error) + goto exit5; error = vfs_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, new_dentry); + mnt_drop_write(oldnd.path.mnt); exit5: dput(new_dentry); exit4: diff --git a/fs/namespace.c b/fs/namespace.c index 94f026e..0505fb6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -17,6 +17,7 @@ #include <linux/quotaops.h> #include <linux/acct.h> #include <linux/capability.h> +#include <linux/cpumask.h> #include <linux/module.h> #include <linux/sysfs.h> #include <linux/seq_file.h> @@ -26,6 +27,7 @@ #include <linux/mount.h> #include <linux/ramfs.h> #include <linux/log2.h> +#include <linux/idr.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include "pnode.h" @@ -38,6 +40,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); static int event; +static DEFINE_IDA(mnt_id_ida); +static DEFINE_IDA(mnt_group_ida); static struct list_head *mount_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; @@ -55,10 +59,65 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) return tmp & (HASH_SIZE - 1); } +#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) + +/* allocation is serialized by namespace_sem */ +static int mnt_alloc_id(struct vfsmount *mnt) +{ + int res; + +retry: + ida_pre_get(&mnt_id_ida, GFP_KERNEL); + spin_lock(&vfsmount_lock); + res = ida_get_new(&mnt_id_ida, &mnt->mnt_id); + spin_unlock(&vfsmount_lock); + if (res == -EAGAIN) + goto retry; + + return res; +} + +static void mnt_free_id(struct vfsmount *mnt) +{ + spin_lock(&vfsmount_lock); + ida_remove(&mnt_id_ida, mnt->mnt_id); + spin_unlock(&vfsmount_lock); +} + +/* + * Allocate a new peer group ID + * + * mnt_group_ida is protected by namespace_sem + */ +static int mnt_alloc_group_id(struct vfsmount *mnt) +{ + if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL)) + return -ENOMEM; + + return ida_get_new_above(&mnt_group_ida, 1, &mnt->mnt_group_id); +} + +/* + * Release a peer group ID + */ +void mnt_release_group_id(struct vfsmount *mnt) +{ + ida_remove(&mnt_group_ida, mnt->mnt_group_id); + mnt->mnt_group_id = 0; +} + struct vfsmount *alloc_vfsmnt(const char *name) { struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { + int err; + + err = mnt_alloc_id(mnt); + if (err) { + kmem_cache_free(mnt_cache, mnt); + return NULL; + } + atomic_set(&mnt->mnt_count, 1); INIT_LIST_HEAD(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); @@ -68,6 +127,7 @@ struct vfsmount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); + atomic_set(&mnt->__mnt_writers, 0); if (name) { int size = strlen(name) + 1; char *newname = kmalloc(size, GFP_KERNEL); @@ -80,6 +140,263 @@ struct vfsmount *alloc_vfsmnt(const char *name) return mnt; } +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). + * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/* + * __mnt_is_readonly: check whether a mount is read-only + * @mnt: the mount to check for its write status + * + * This shouldn't be used directly ouside of the VFS. + * It does not guarantee that the filesystem will stay + * r/w, just that it is right *now*. This can not and + * should not be used in place of IS_RDONLY(inode). + * mnt_want/drop_write() will _keep_ the filesystem + * r/w. + */ +int __mnt_is_readonly(struct vfsmount *mnt) +{ + if (mnt->mnt_flags & MNT_READONLY) + return 1; + if (mnt->mnt_sb->s_flags & MS_RDONLY) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(__mnt_is_readonly); + +struct mnt_writer { + /* + * If holding multiple instances of this lock, they + * must be ordered by cpu number. + */ + spinlock_t lock; + struct lock_class_key lock_class; /* compiles out with !lockdep */ + unsigned long count; + struct vfsmount *mnt; +} ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU(struct mnt_writer, mnt_writers); + +static int __init init_mnt_writers(void) +{ + int cpu; + for_each_possible_cpu(cpu) { + struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); + spin_lock_init(&writer->lock); + lockdep_set_class(&writer->lock, &writer->lock_class); + writer->count = 0; + } + return 0; +} +fs_initcall(init_mnt_writers); + +static void unlock_mnt_writers(void) +{ + int cpu; + struct mnt_writer *cpu_writer; + + for_each_possible_cpu(cpu) { + cpu_writer = &per_cpu(mnt_writers, cpu); + spin_unlock(&cpu_writer->lock); + } +} + +static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) +{ + if (!cpu_writer->mnt) + return; + /* + * This is in case anyone ever leaves an invalid, + * old ->mnt and a count of 0. + */ + if (!cpu_writer->count) + return; + atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers); + cpu_writer->count = 0; +} + /* + * must hold cpu_writer->lock + */ +static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, + struct vfsmount *mnt) +{ + if (cpu_writer->mnt == mnt) + return; + __clear_mnt_count(cpu_writer); + cpu_writer->mnt = mnt; +} + +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). + * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/** + * mnt_want_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This tells the low-level filesystem that a write is + * about to be performed to it, and makes sure that + * writes are allowed before returning success. When + * the write operation is finished, mnt_drop_write() + * must be called. This is effectively a refcount. + */ +int mnt_want_write(struct vfsmount *mnt) +{ + int ret = 0; + struct mnt_writer *cpu_writer; + + cpu_writer = &get_cpu_var(mnt_writers); + spin_lock(&cpu_writer->lock); + if (__mnt_is_readonly(mnt)) { + ret = -EROFS; + goto out; + } + use_cpu_writer_for_mount(cpu_writer, mnt); + cpu_writer->count++; +out: + spin_unlock(&cpu_writer->lock); + put_cpu_var(mnt_writers); + return ret; +} +EXPORT_SYMBOL_GPL(mnt_want_write); + +static void lock_mnt_writers(void) +{ + int cpu; + struct mnt_writer *cpu_writer; + + for_each_possible_cpu(cpu) { + cpu_writer = &per_cpu(mnt_writers, cpu); + spin_lock(&cpu_writer->lock); + __clear_mnt_count(cpu_writer); + cpu_writer->mnt = NULL; + } +} + +/* + * These per-cpu write counts are not guaranteed to have + * matched increments and decrements on any given cpu. + * A file open()ed for write on one cpu and close()d on + * another cpu will imbalance this count. Make sure it + * does not get too far out of whack. + */ +static void handle_write_count_underflow(struct vfsmount *mnt) +{ + if (atomic_read(&mnt->__mnt_writers) >= + MNT_WRITER_UNDERFLOW_LIMIT) + return; + /* + * It isn't necessary to hold all of the locks + * at the same time, but doing it this way makes + * us share a lot more code. + */ + lock_mnt_writers(); + /* + * vfsmount_lock is for mnt_flags. + */ + spin_lock(&vfsmount_lock); + /* + * If coalescing the per-cpu writer counts did not + * get us back to a positive writer count, we have + * a bug. + */ + if ((atomic_read(&mnt->__mnt_writers) < 0) && + !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) { + printk(KERN_DEBUG "leak detected on mount(%p) writers " + "count: %d\n", + mnt, atomic_read(&mnt->__mnt_writers)); + WARN_ON(1); + /* use the flag to keep the dmesg spam down */ + mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT; + } + spin_unlock(&vfsmount_lock); + unlock_mnt_writers(); +} + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done + * performing writes to it. Must be matched with + * mnt_want_write() call above. + */ +void mnt_drop_write(struct vfsmount *mnt) +{ + int must_check_underflow = 0; + struct mnt_writer *cpu_writer; + + cpu_writer = &get_cpu_var(mnt_writers); + spin_lock(&cpu_writer->lock); + + use_cpu_writer_for_mount(cpu_writer, mnt); + if (cpu_writer->count > 0) { + cpu_writer->count--; + } else { + must_check_underflow = 1; + atomic_dec(&mnt->__mnt_writers); + } + + spin_unlock(&cpu_writer->lock); + /* + * Logically, we could call this each time, + * but the __mnt_writers cacheline tends to + * be cold, and makes this expensive. + */ + if (must_check_underflow) + handle_write_count_underflow(mnt); + /* + * This could be done right after the spinlock + * is taken because the spinlock keeps us on + * the cpu, and disables preemption. However, + * putting it here bounds the amount that + * __mnt_writers can underflow. Without it, + * we could theoretically wrap __mnt_writers. + */ + put_cpu_var(mnt_writers); +} +EXPORT_SYMBOL_GPL(mnt_drop_write); + +static int mnt_make_readonly(struct vfsmount *mnt) +{ + int ret = 0; + + lock_mnt_writers(); + /* + * With all the locks held, this value is stable + */ + if (atomic_read(&mnt->__mnt_writers) > 0) { + ret = -EBUSY; + goto out; + } + /* + * nobody can do a successful mnt_want_write() with all + * of the counts in MNT_DENIED_WRITE and the locks held. + */ + spin_lock(&vfsmount_lock); + if (!ret) + mnt->mnt_flags |= MNT_READONLY; + spin_unlock(&vfsmount_lock); +out: + unlock_mnt_writers(); + return ret; +} + +static void __mnt_unmake_readonly(struct vfsmount *mnt) +{ + spin_lock(&vfsmount_lock); + mnt->mnt_flags &= ~MNT_READONLY; + spin_unlock(&vfsmount_lock); +} + int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) { mnt->mnt_sb = sb; @@ -92,6 +409,7 @@ EXPORT_SYMBOL(simple_set_mnt); void free_vfsmnt(struct vfsmount *mnt) { kfree(mnt->mnt_devname); + mnt_free_id(mnt); kmem_cache_free(mnt_cache, mnt); } @@ -238,6 +556,17 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); if (mnt) { + if (flag & (CL_SLAVE | CL_PRIVATE)) + mnt->mnt_group_id = 0; /* not a peer of original */ + else + mnt->mnt_group_id = old->mnt_group_id; + + if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { + int err = mnt_alloc_group_id(mnt); + if (err) + goto out_free; + } + mnt->mnt_flags = old->mnt_flags; atomic_inc(&sb->s_active); mnt->mnt_sb = sb; @@ -267,11 +596,44 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, } } return mnt; + + out_free: + free_vfsmnt(mnt); + return NULL; } static inline void __mntput(struct vfsmount *mnt) { + int cpu; struct super_block *sb = mnt->mnt_sb; + /* + * We don't have to hold all of the locks at the + * same time here because we know that we're the + * last reference to mnt and that no new writers + * can come in. + */ + for_each_possible_cpu(cpu) { + struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); + if (cpu_writer->mnt != mnt) + continue; + spin_lock(&cpu_writer->lock); + atomic_add(cpu_writer->count, &mnt->__mnt_writers); + cpu_writer->count = 0; + /* + * Might as well do this so that no one + * ever sees the pointer and expects + * it to be valid. + */ + cpu_writer->mnt = NULL; + spin_unlock(&cpu_writer->lock); + } + /* + * This probably indicates that somebody messed + * up a mnt_want/drop_write() pair. If this + * happens, the filesystem was probably unable + * to make r/w->r/o transitions. + */ + WARN_ON(atomic_read(&mnt->__mnt_writers)); dput(mnt->mnt_root); free_vfsmnt(mnt); deactivate_super(sb); @@ -362,20 +724,21 @@ void save_mount_options(struct super_block *sb, char *options) } EXPORT_SYMBOL(save_mount_options); +#ifdef CONFIG_PROC_FS /* iterator */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct mnt_namespace *n = m->private; + struct proc_mounts *p = m->private; down_read(&namespace_sem); - return seq_list_start(&n->list, *pos); + return seq_list_start(&p->ns->list, *pos); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct mnt_namespace *n = m->private; + struct proc_mounts *p = m->private; - return seq_list_next(v, &n->list, pos); + return seq_list_next(v, &p->ns->list, pos); } static void m_stop(struct seq_file *m, void *v) @@ -383,20 +746,30 @@ static void m_stop(struct seq_file *m, void *v) up_read(&namespace_sem); } -static int show_vfsmnt(struct seq_file *m, void *v) +struct proc_fs_info { + int flag; + const char *str; +}; + +static void show_sb_opts(struct seq_file *m, struct super_block *sb) { - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - int err = 0; - static struct proc_fs_info { - int flag; - char *str; - } fs_info[] = { + static const struct proc_fs_info fs_info[] = { { MS_SYNCHRONOUS, ",sync" }, { MS_DIRSYNC, ",dirsync" }, { MS_MANDLOCK, ",mand" }, { 0, NULL } }; - static struct proc_fs_info mnt_info[] = { + const struct proc_fs_info *fs_infop; + + for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { + if (sb->s_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } +} + +static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) +{ + static const struct proc_fs_info mnt_info[] = { { MNT_NOSUID, ",nosuid" }, { MNT_NODEV, ",nodev" }, { MNT_NOEXEC, ",noexec" }, @@ -405,40 +778,108 @@ static int show_vfsmnt(struct seq_file *m, void *v) { MNT_RELATIME, ",relatime" }, { 0, NULL } }; - struct proc_fs_info *fs_infop; + const struct proc_fs_info *fs_infop; + + for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { + if (mnt->mnt_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } +} + +static void show_type(struct seq_file *m, struct super_block *sb) +{ + mangle(m, sb->s_type->name); + if (sb->s_subtype && sb->s_subtype[0]) { + seq_putc(m, '.'); + mangle(m, sb->s_subtype); + } +} + +static int show_vfsmnt(struct seq_file *m, void *v) +{ + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); + int err = 0; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); seq_putc(m, ' '); seq_path(m, &mnt_path, " \t\n\\"); seq_putc(m, ' '); - mangle(m, mnt->mnt_sb->s_type->name); - if (mnt->mnt_sb->s_subtype && mnt->mnt_sb->s_subtype[0]) { - seq_putc(m, '.'); - mangle(m, mnt->mnt_sb->s_subtype); - } - seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw"); - for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { - if (mnt->mnt_sb->s_flags & fs_infop->flag) - seq_puts(m, fs_infop->str); - } - for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { - if (mnt->mnt_flags & fs_infop->flag) - seq_puts(m, fs_infop->str); - } + show_type(m, mnt->mnt_sb); + seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); + show_sb_opts(m, mnt->mnt_sb); + show_mnt_opts(m, mnt); if (mnt->mnt_sb->s_op->show_options) err = mnt->mnt_sb->s_op->show_options(m, mnt); seq_puts(m, " 0 0\n"); return err; } -struct seq_operations mounts_op = { +const struct seq_operations mounts_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_vfsmnt }; +static int show_mountinfo(struct seq_file *m, void *v) +{ + struct proc_mounts *p = m->private; + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); + struct super_block *sb = mnt->mnt_sb; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct path root = p->root; + int err = 0; + + seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, + MAJOR(sb->s_dev), MINOR(sb->s_dev)); + seq_dentry(m, mnt->mnt_root, " \t\n\\"); + seq_putc(m, ' '); + seq_path_root(m, &mnt_path, &root, " \t\n\\"); + if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { + /* + * Mountpoint is outside root, discard that one. Ugly, + * but less so than trying to do that in iterator in a + * race-free way (due to renames). + */ + return SEQ_SKIP; + } + seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); + show_mnt_opts(m, mnt); + + /* Tagged fields ("foo:X" or "bar") */ + if (IS_MNT_SHARED(mnt)) + seq_printf(m, " shared:%i", mnt->mnt_group_id); + if (IS_MNT_SLAVE(mnt)) { + int master = mnt->mnt_master->mnt_group_id; + int dom = get_dominating_id(mnt, &p->root); + seq_printf(m, " master:%i", master); + if (dom && dom != master) + seq_printf(m, " propagate_from:%i", dom); + } + if (IS_MNT_UNBINDABLE(mnt)) + seq_puts(m, " unbindable"); + + /* Filesystem specific data */ + seq_puts(m, " - "); + show_type(m, sb); + seq_putc(m, ' '); + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); + show_sb_opts(m, sb); + if (sb->s_op->show_options) + err = sb->s_op->show_options(m, mnt); + seq_putc(m, '\n'); + return err; +} + +const struct seq_operations mountinfo_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_mountinfo, +}; + static int show_vfsstat(struct seq_file *m, void *v) { struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); @@ -459,7 +900,7 @@ static int show_vfsstat(struct seq_file *m, void *v) /* file system type */ seq_puts(m, "with fstype "); - mangle(m, mnt->mnt_sb->s_type->name); + show_type(m, mnt->mnt_sb); /* optional statistics */ if (mnt->mnt_sb->s_op->show_stats) { @@ -471,12 +912,13 @@ static int show_vfsstat(struct seq_file *m, void *v) return err; } -struct seq_operations mountstats_op = { +const struct seq_operations mountstats_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_vfsstat, }; +#endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy @@ -801,23 +1243,50 @@ Enomem: struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry) { struct vfsmount *tree; - down_read(&namespace_sem); + down_write(&namespace_sem); tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE); - up_read(&namespace_sem); + up_write(&namespace_sem); return tree; } void drop_collected_mounts(struct vfsmount *mnt) { LIST_HEAD(umount_list); - down_read(&namespace_sem); + down_write(&namespace_sem); spin_lock(&vfsmount_lock); umount_tree(mnt, 0, &umount_list); spin_unlock(&vfsmount_lock); - up_read(&namespace_sem); + up_write(&namespace_sem); release_mounts(&umount_list); } +static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) +{ + struct vfsmount *p; + + for (p = mnt; p != end; p = next_mnt(p, mnt)) { + if (p->mnt_group_id && !IS_MNT_SHARED(p)) + mnt_release_group_id(p); + } +} + +static int invent_group_ids(struct vfsmount *mnt, bool recurse) +{ + struct vfsmount *p; + + for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { + if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { + int err = mnt_alloc_group_id(p); + if (err) { + cleanup_group_ids(mnt, p); + return err; + } + } + } + + return 0; +} + /* * @source_mnt : mount tree to be attached * @nd : place the mount tree @source_mnt is attached @@ -888,9 +1357,16 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, struct vfsmount *dest_mnt = path->mnt; struct dentry *dest_dentry = path->dentry; struct vfsmount *child, *p; + int err; - if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list)) - return -EINVAL; + if (IS_MNT_SHARED(dest_mnt)) { + err = invent_group_ids(source_mnt, true); + if (err) + goto out; + } + err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list); + if (err) + goto out_cleanup_ids; if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) @@ -913,34 +1389,40 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, } spin_unlock(&vfsmount_lock); return 0; + + out_cleanup_ids: + if (IS_MNT_SHARED(dest_mnt)) + cleanup_group_ids(source_mnt, NULL); + out: + return err; } -static int graft_tree(struct vfsmount *mnt, struct nameidata *nd) +static int graft_tree(struct vfsmount *mnt, struct path *path) { int err; if (mnt->mnt_sb->s_flags & MS_NOUSER) return -EINVAL; - if (S_ISDIR(nd->path.dentry->d_inode->i_mode) != + if (S_ISDIR(path->dentry->d_inode->i_mode) != S_ISDIR(mnt->mnt_root->d_inode->i_mode)) return -ENOTDIR; err = -ENOENT; - mutex_lock(&nd->path.dentry->d_inode->i_mutex); - if (IS_DEADDIR(nd->path.dentry->d_inode)) + mutex_lock(&path->dentry->d_inode->i_mutex); + if (IS_DEADDIR(path->dentry->d_inode)) goto out_unlock; - err = security_sb_check_sb(mnt, nd); + err = security_sb_check_sb(mnt, path); if (err) goto out_unlock; err = -ENOENT; - if (IS_ROOT(nd->path.dentry) || !d_unhashed(nd->path.dentry)) - err = attach_recursive_mnt(mnt, &nd->path, NULL); + if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry)) + err = attach_recursive_mnt(mnt, path, NULL); out_unlock: - mutex_unlock(&nd->path.dentry->d_inode->i_mutex); + mutex_unlock(&path->dentry->d_inode->i_mutex); if (!err) - security_sb_post_addmount(mnt, nd); + security_sb_post_addmount(mnt, path); return err; } @@ -953,6 +1435,7 @@ static noinline int do_change_type(struct nameidata *nd, int flag) struct vfsmount *m, *mnt = nd->path.mnt; int recurse = flag & MS_REC; int type = flag & ~MS_REC; + int err = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -961,12 +1444,20 @@ static noinline int do_change_type(struct nameidata *nd, int flag) return -EINVAL; down_write(&namespace_sem); + if (type == MS_SHARED) { + err = invent_group_ids(mnt, recurse); + if (err) + goto out_unlock; + } + spin_lock(&vfsmount_lock); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); spin_unlock(&vfsmount_lock); + + out_unlock: up_write(&namespace_sem); - return 0; + return err; } /* @@ -1004,7 +1495,7 @@ static noinline int do_loopback(struct nameidata *nd, char *old_name, if (!mnt) goto out; - err = graft_tree(mnt, nd); + err = graft_tree(mnt, &nd->path); if (err) { LIST_HEAD(umount_list); spin_lock(&vfsmount_lock); @@ -1019,6 +1510,23 @@ out: return err; } +static int change_mount_flags(struct vfsmount *mnt, int ms_flags) +{ + int error = 0; + int readonly_request = 0; + + if (ms_flags & MS_RDONLY) + readonly_request = 1; + if (readonly_request == __mnt_is_readonly(mnt)) + return 0; + + if (readonly_request) + error = mnt_make_readonly(mnt); + else + __mnt_unmake_readonly(mnt); + return error; +} + /* * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount @@ -1041,7 +1549,10 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, return -EINVAL; down_write(&sb->s_umount); - err = do_remount_sb(sb, flags, data, 0); + if (flags & MS_BIND) + err = change_mount_flags(nd->path.mnt, flags); + else + err = do_remount_sb(sb, flags, data, 0); if (!err) nd->path.mnt->mnt_flags = mnt_flags; up_write(&sb->s_umount); @@ -1191,7 +1702,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, goto unlock; newmnt->mnt_flags = mnt_flags; - if ((err = graft_tree(newmnt, nd))) + if ((err = graft_tree(newmnt, &nd->path))) goto unlock; if (fslist) /* add to the specified expiration list */ @@ -1425,6 +1936,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_NODIRATIME; if (flags & MS_RELATIME) mnt_flags |= MNT_RELATIME; + if (flags & MS_RDONLY) + mnt_flags |= MNT_READONLY; flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); @@ -1434,7 +1947,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (retval) return retval; - retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page); + retval = security_sb_mount(dev_name, &nd.path, + type_page, flags, data_page); if (retval) goto dput_out; @@ -1674,15 +2188,13 @@ asmlinkage long sys_pivot_root(const char __user * new_root, const char __user * put_old) { struct vfsmount *tmp; - struct nameidata new_nd, old_nd, user_nd; - struct path parent_path, root_parent; + struct nameidata new_nd, old_nd; + struct path parent_path, root_parent, root; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - lock_kernel(); - error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new_nd); if (error) @@ -1695,14 +2207,14 @@ asmlinkage long sys_pivot_root(const char __user * new_root, if (error) goto out1; - error = security_sb_pivotroot(&old_nd, &new_nd); + error = security_sb_pivotroot(&old_nd.path, &new_nd.path); if (error) { path_put(&old_nd.path); goto out1; } read_lock(¤t->fs->lock); - user_nd.path = current->fs->root; + root = current->fs->root; path_get(¤t->fs->root); read_unlock(¤t->fs->lock); down_write(&namespace_sem); @@ -1710,9 +2222,9 @@ asmlinkage long sys_pivot_root(const char __user * new_root, error = -EINVAL; if (IS_MNT_SHARED(old_nd.path.mnt) || IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) || - IS_MNT_SHARED(user_nd.path.mnt->mnt_parent)) + IS_MNT_SHARED(root.mnt->mnt_parent)) goto out2; - if (!check_mnt(user_nd.path.mnt)) + if (!check_mnt(root.mnt)) goto out2; error = -ENOENT; if (IS_DEADDIR(new_nd.path.dentry->d_inode)) @@ -1722,13 +2234,13 @@ asmlinkage long sys_pivot_root(const char __user * new_root, if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry)) goto out2; error = -EBUSY; - if (new_nd.path.mnt == user_nd.path.mnt || - old_nd.path.mnt == user_nd.path.mnt) + if (new_nd.path.mnt == root.mnt || + old_nd.path.mnt == root.mnt) goto out2; /* loop, on the same file system */ error = -EINVAL; - if (user_nd.path.mnt->mnt_root != user_nd.path.dentry) + if (root.mnt->mnt_root != root.dentry) goto out2; /* not a mountpoint */ - if (user_nd.path.mnt->mnt_parent == user_nd.path.mnt) + if (root.mnt->mnt_parent == root.mnt) goto out2; /* not attached */ if (new_nd.path.mnt->mnt_root != new_nd.path.dentry) goto out2; /* not a mountpoint */ @@ -1750,27 +2262,26 @@ asmlinkage long sys_pivot_root(const char __user * new_root, } else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry)) goto out3; detach_mnt(new_nd.path.mnt, &parent_path); - detach_mnt(user_nd.path.mnt, &root_parent); + detach_mnt(root.mnt, &root_parent); /* mount old root on put_old */ - attach_mnt(user_nd.path.mnt, &old_nd.path); + attach_mnt(root.mnt, &old_nd.path); /* mount new_root on / */ attach_mnt(new_nd.path.mnt, &root_parent); touch_mnt_namespace(current->nsproxy->mnt_ns); spin_unlock(&vfsmount_lock); - chroot_fs_refs(&user_nd.path, &new_nd.path); - security_sb_post_pivotroot(&user_nd, &new_nd); + chroot_fs_refs(&root, &new_nd.path); + security_sb_post_pivotroot(&root, &new_nd.path); error = 0; path_put(&root_parent); path_put(&parent_path); out2: mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex); up_write(&namespace_sem); - path_put(&user_nd.path); + path_put(&root); path_put(&old_nd.path); out1: path_put(&new_nd.path); out0: - unlock_kernel(); return error; out3: spin_unlock(&vfsmount_lock); diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index c67b4bd..ad8f167 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -14,6 +14,7 @@ #include <linux/ioctl.h> #include <linux/time.h> #include <linux/mm.h> +#include <linux/mount.h> #include <linux/highuid.h> #include <linux/smp_lock.h> #include <linux/vmalloc.h> @@ -261,7 +262,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) } #endif /* CONFIG_NCPFS_NLS */ -int ncp_ioctl(struct inode *inode, struct file *filp, +static int __ncp_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { struct ncp_server *server = NCP_SERVER(inode); @@ -822,6 +823,57 @@ outrel: return -EINVAL; } +static int ncp_ioctl_need_write(unsigned int cmd) +{ + switch (cmd) { + case NCP_IOC_GET_FS_INFO: + case NCP_IOC_GET_FS_INFO_V2: + case NCP_IOC_NCPREQUEST: + case NCP_IOC_SETDENTRYTTL: + case NCP_IOC_SIGN_INIT: + case NCP_IOC_LOCKUNLOCK: + case NCP_IOC_SET_SIGN_WANTED: + return 1; + case NCP_IOC_GETOBJECTNAME: + case NCP_IOC_SETOBJECTNAME: + case NCP_IOC_GETPRIVATEDATA: + case NCP_IOC_SETPRIVATEDATA: + case NCP_IOC_SETCHARSETS: + case NCP_IOC_GETCHARSETS: + case NCP_IOC_CONN_LOGGED_IN: + case NCP_IOC_GETDENTRYTTL: + case NCP_IOC_GETMOUNTUID2: + case NCP_IOC_SIGN_WANTED: + case NCP_IOC_GETROOT: + case NCP_IOC_SETROOT: + return 0; + default: + /* unkown IOCTL command, assume write */ + return 1; + } +} + +int ncp_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + int ret; + + if (ncp_ioctl_need_write(cmd)) { + /* + * inside the ioctl(), any failures which + * are because of file_permission() are + * -EACCESS, so it seems consistent to keep + * that here. + */ + if (mnt_want_write(filp->f_path.mnt)) + return -EACCES; + } + ret = __ncp_ioctl(inode, filp, cmd, arg); + if (ncp_ioctl_need_write(cmd)) + mnt_drop_write(filp->f_path.mnt); + return ret; +} + #ifdef CONFIG_COMPAT long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 6cea747..d9e30ac 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -967,7 +967,8 @@ static int is_atomic_open(struct inode *dir, struct nameidata *nd) if (nd->flags & LOOKUP_DIRECTORY) return 0; /* Are we trying to write to a read only partition? */ - if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) + if (__mnt_is_readonly(nd->path.mnt) && + (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) return 0; return 1; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index c593db0..c309c88 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -658,14 +658,19 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } } + status = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt); + if (status) + return status; status = nfs_ok; if (setattr->sa_acl != NULL) status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh, setattr->sa_acl); if (status) - return status; + goto out; status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, 0, (time_t)0); +out: + mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt); return status; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 1ff9062..145b3c8 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -46,6 +46,7 @@ #include <linux/scatterlist.h> #include <linux/crypto.h> #include <linux/sched.h> +#include <linux/mount.h> #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -154,7 +155,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); goto out_put; } + status = mnt_want_write(rec_dir.path.mnt); + if (status) + goto out_put; status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU); + mnt_drop_write(rec_dir.path.mnt); out_put: dput(dentry); out_unlock: @@ -313,12 +318,17 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (!rec_dir_init || !clp->cl_firststate) return; + status = mnt_want_write(rec_dir.path.mnt); + if (status) + goto out; clp->cl_firststate = 0; nfs4_save_user(&uid, &gid); status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); nfs4_reset_user(uid, gid); if (status == 0) nfsd4_sync_rec_dir(); + mnt_drop_write(rec_dir.path.mnt); +out: if (status) printk("NFSD: Failed to remove expired client state directory" " %.*s\n", HEXDIR_LEN, clp->cl_recdir); @@ -347,13 +357,17 @@ nfsd4_recdir_purge_old(void) { if (!rec_dir_init) return; + status = mnt_want_write(rec_dir.path.mnt); + if (status) + goto out; status = nfsd4_list_rec_dir(rec_dir.path.dentry, purge_old); if (status == 0) nfsd4_sync_rec_dir(); + mnt_drop_write(rec_dir.path.mnt); +out: if (status) printk("nfsd4: failed to purge old clients from recovery" " directory %s\n", rec_dir.path.dentry->d_name.name); - return; } static int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bcb97d8..81a75f3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -41,6 +41,7 @@ #include <linux/sunrpc/svc.h> #include <linux/nfsd/nfsd.h> #include <linux/nfsd/cache.h> +#include <linux/file.h> #include <linux/mount.h> #include <linux/workqueue.h> #include <linux/smp_lock.h> @@ -1239,7 +1240,7 @@ static inline void nfs4_file_downgrade(struct file *filp, unsigned int share_access) { if (share_access & NFS4_SHARE_ACCESS_WRITE) { - put_write_access(filp->f_path.dentry->d_inode); + drop_file_write_access(filp); filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; } } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 46f59d5..304bf5f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1255,23 +1255,35 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err = 0; switch (type) { case S_IFREG: + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); break; case S_IFDIR: + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); break; default: printk("nfsd: bad file type %o in nfsd_create\n", type); host_err = -EINVAL; + goto out_nfserr; } - if (host_err < 0) + if (host_err < 0) { + mnt_drop_write(fhp->fh_export->ex_path.mnt); goto out_nfserr; + } if (EX_ISSYNC(fhp->fh_export)) { err = nfserrno(nfsd_sync_dir(dentry)); @@ -1282,6 +1294,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err2 = nfsd_create_setattr(rqstp, resfhp, iap); if (err2) err = err2; + mnt_drop_write(fhp->fh_export->ex_path.mnt); /* * Update the file handle to get the new inode info. */ @@ -1359,6 +1372,9 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, v_atime = verifier[1]&0x7fffffff; } + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; if (dchild->d_inode) { err = 0; @@ -1390,12 +1406,15 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, case NFS3_CREATE_GUARDED: err = nfserr_exist; } + mnt_drop_write(fhp->fh_export->ex_path.mnt); goto out; } host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); - if (host_err < 0) + if (host_err < 0) { + mnt_drop_write(fhp->fh_export->ex_path.mnt); goto out_nfserr; + } if (created) *created = 1; @@ -1420,6 +1439,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err2) err = err2; + mnt_drop_write(fhp->fh_export->ex_path.mnt); /* * Update the filehandle to get the new inode info. */ @@ -1522,6 +1542,10 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (iap && (iap->ia_valid & ATTR_MODE)) mode = iap->ia_mode & S_IALLUGO; + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + if (unlikely(path[plen] != 0)) { char *path_alloced = kmalloc(plen+1, GFP_KERNEL); if (path_alloced == NULL) @@ -1542,6 +1566,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserrno(host_err); fh_unlock(fhp); + mnt_drop_write(fhp->fh_export->ex_path.mnt); + cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); dput(dnew); if (err==0) err = cerr; @@ -1592,6 +1618,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, dold = tfhp->fh_dentry; dest = dold->d_inode; + host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt); + if (host_err) { + err = nfserrno(host_err); + goto out_dput; + } host_err = vfs_link(dold, dirp, dnew); if (!host_err) { if (EX_ISSYNC(ffhp->fh_export)) { @@ -1605,7 +1636,8 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, else err = nfserrno(host_err); } - + mnt_drop_write(tfhp->fh_export->ex_path.mnt); +out_dput: dput(dnew); out_unlock: fh_unlock(ffhp); @@ -1678,13 +1710,20 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ndentry == trap) goto out_dput_new; -#ifdef MSNFS - if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) && + if (svc_msnfs(ffhp) && ((atomic_read(&odentry->d_count) > 1) || (atomic_read(&ndentry->d_count) > 1))) { host_err = -EPERM; - } else -#endif + goto out_dput_new; + } + + host_err = -EXDEV; + if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) + goto out_dput_new; + host_err = mnt_want_write(ffhp->fh_export->ex_path.mnt); + if (host_err) + goto out_dput_new; + host_err = vfs_rename(fdir, odentry, tdir, ndentry); if (!host_err && EX_ISSYNC(tfhp->fh_export)) { host_err = nfsd_sync_dir(tdentry); @@ -1692,6 +1731,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, host_err = nfsd_sync_dir(fdentry); } + mnt_drop_write(ffhp->fh_export->ex_path.mnt); + out_dput_new: dput(ndentry); out_dput_old: @@ -1750,6 +1791,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = rdentry->d_inode->i_mode & S_IFMT; + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + if (type != S_IFDIR) { /* It's UNLINK */ #ifdef MSNFS if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && @@ -1765,10 +1810,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, dput(rdentry); if (host_err) - goto out_nfserr; + goto out_drop; if (EX_ISSYNC(fhp->fh_export)) host_err = nfsd_sync_dir(dentry); +out_drop: + mnt_drop_write(fhp->fh_export->ex_path.mnt); out_nfserr: err = nfserrno(host_err); out: @@ -1865,7 +1912,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, inode->i_mode, IS_IMMUTABLE(inode)? " immut" : "", IS_APPEND(inode)? " append" : "", - IS_RDONLY(inode)? " ro" : ""); + __mnt_is_readonly(exp->ex_path.mnt)? " ro" : ""); dprintk(" owner %d/%d user %d/%d\n", inode->i_uid, inode->i_gid, current->fsuid, current->fsgid); #endif @@ -1876,7 +1923,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, */ if (!(acc & MAY_LOCAL_ACCESS)) if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { - if (exp_rdonly(rqstp, exp) || IS_RDONLY(inode)) + if (exp_rdonly(rqstp, exp) || + __mnt_is_readonly(exp->ex_path.mnt)) return nfserr_rofs; if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) return nfserr_perm; @@ -2039,6 +2087,9 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) } else size = 0; + error = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (error) + goto getout; if (size) error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); else { @@ -2050,6 +2101,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) error = 0; } } + mnt_drop_write(fhp->fh_export->ex_path.mnt); getout: kfree(value); diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 4d4ce48..f6956de 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -2,7 +2,12 @@ EXTRA_CFLAGS += -Ifs/ocfs2 EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES -obj-$(CONFIG_OCFS2_FS) += ocfs2.o +obj-$(CONFIG_OCFS2_FS) += \ + ocfs2.o \ + ocfs2_stackglue.o + +obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o +obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o ocfs2-objs := \ alloc.o \ @@ -31,5 +36,10 @@ ocfs2-objs := \ uptodate.o \ ver.o +ocfs2_stackglue-objs := stackglue.o +ocfs2_stack_o2cb-objs := stack_o2cb.o +ocfs2_stack_user-objs := stack_user.o + +# cluster/ is always needed when OCFS2_FS for masklog support obj-$(CONFIG_OCFS2_FS) += cluster/ -obj-$(CONFIG_OCFS2_FS) += dlm/ +obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 447206eb..41f84c9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1029,8 +1029,7 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, BUG_ON(!next_free); /* The tree code before us didn't allow enough room in the leaf. */ - if (el->l_next_free_rec == el->l_count && !has_empty) - BUG(); + BUG_ON(el->l_next_free_rec == el->l_count && !has_empty); /* * The easiest way to approach this is to just remove the @@ -1450,6 +1449,8 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, * - When our insert into the right path leaf is at the leftmost edge * and requires an update of the path immediately to it's left. This * can occur at the end of some types of rotation and appending inserts. + * - When we've adjusted the last extent record in the left path leaf and the + * 1st extent record in the right path leaf during cross extent block merge. */ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, struct ocfs2_path *left_path, @@ -2712,24 +2713,147 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, } } +static int ocfs2_get_right_path(struct inode *inode, + struct ocfs2_path *left_path, + struct ocfs2_path **ret_right_path) +{ + int ret; + u32 right_cpos; + struct ocfs2_path *right_path = NULL; + struct ocfs2_extent_list *left_el; + + *ret_right_path = NULL; + + /* This function shouldn't be called for non-trees. */ + BUG_ON(left_path->p_tree_depth == 0); + + left_el = path_leaf_el(left_path); + BUG_ON(left_el->l_next_free_rec != left_el->l_count); + + ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, + &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This function shouldn't be called for the rightmost leaf. */ + BUG_ON(right_cpos == 0); + + right_path = ocfs2_new_path(path_root_bh(left_path), + path_root_el(left_path)); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(inode, right_path, right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_right_path = right_path; +out: + if (ret) + ocfs2_free_path(right_path); + return ret; +} + /* * Remove split_rec clusters from the record at index and merge them - * onto the beginning of the record at index + 1. + * onto the beginning of the record "next" to it. + * For index < l_count - 1, the next means the extent rec at index + 1. + * For index == l_count - 1, the "next" means the 1st extent rec of the + * next extent block. */ -static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, - handle_t *handle, - struct ocfs2_extent_rec *split_rec, - struct ocfs2_extent_list *el, int index) +static int ocfs2_merge_rec_right(struct inode *inode, + struct ocfs2_path *left_path, + handle_t *handle, + struct ocfs2_extent_rec *split_rec, + int index) { - int ret; + int ret, next_free, i; unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); struct ocfs2_extent_rec *left_rec; struct ocfs2_extent_rec *right_rec; + struct ocfs2_extent_list *right_el; + struct ocfs2_path *right_path = NULL; + int subtree_index = 0; + struct ocfs2_extent_list *el = path_leaf_el(left_path); + struct buffer_head *bh = path_leaf_bh(left_path); + struct buffer_head *root_bh = NULL; BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); - left_rec = &el->l_recs[index]; - right_rec = &el->l_recs[index + 1]; + + if (index == le16_to_cpu(el->l_next_free_rec - 1) && + le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { + /* we meet with a cross extent block merge. */ + ret = ocfs2_get_right_path(inode, left_path, &right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + right_el = path_leaf_el(right_path); + next_free = le16_to_cpu(right_el->l_next_free_rec); + BUG_ON(next_free <= 0); + right_rec = &right_el->l_recs[0]; + if (ocfs2_is_empty_extent(right_rec)) { + BUG_ON(le16_to_cpu(next_free) <= 1); + right_rec = &right_el->l_recs[1]; + } + + BUG_ON(le32_to_cpu(left_rec->e_cpos) + + le16_to_cpu(left_rec->e_leaf_clusters) != + le32_to_cpu(right_rec->e_cpos)); + + subtree_index = ocfs2_find_subtree_root(inode, + left_path, right_path); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_index, + handle->h_buffer_credits, + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = subtree_index + 1; + i < path_num_items(right_path); i++) { + ret = ocfs2_journal_access(handle, inode, + right_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, + left_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + } else { + BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1); + right_rec = &el->l_recs[index + 1]; + } ret = ocfs2_journal_access(handle, inode, bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -2751,30 +2875,156 @@ static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, if (ret) mlog_errno(ret); + if (right_path) { + ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); + if (ret) + mlog_errno(ret); + + ocfs2_complete_edge_insert(inode, handle, left_path, + right_path, subtree_index); + } +out: + if (right_path) + ocfs2_free_path(right_path); + return ret; +} + +static int ocfs2_get_left_path(struct inode *inode, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret; + u32 left_cpos; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + /* This function shouldn't be called for non-trees. */ + BUG_ON(right_path->p_tree_depth == 0); + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, + right_path, &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This function shouldn't be called for the leftmost leaf. */ + BUG_ON(left_cpos == 0); + + left_path = ocfs2_new_path(path_root_bh(right_path), + path_root_el(right_path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(inode, left_path, left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_left_path = left_path; out: + if (ret) + ocfs2_free_path(left_path); return ret; } /* * Remove split_rec clusters from the record at index and merge them - * onto the tail of the record at index - 1. + * onto the tail of the record "before" it. + * For index > 0, the "before" means the extent rec at index - 1. + * + * For index == 0, the "before" means the last record of the previous + * extent block. And there is also a situation that we may need to + * remove the rightmost leaf extent block in the right_path and change + * the right path to indicate the new rightmost path. */ -static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, +static int ocfs2_merge_rec_left(struct inode *inode, + struct ocfs2_path *right_path, handle_t *handle, struct ocfs2_extent_rec *split_rec, - struct ocfs2_extent_list *el, int index) + struct ocfs2_cached_dealloc_ctxt *dealloc, + int index) { - int ret, has_empty_extent = 0; + int ret, i, subtree_index = 0, has_empty_extent = 0; unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); struct ocfs2_extent_rec *left_rec; struct ocfs2_extent_rec *right_rec; + struct ocfs2_extent_list *el = path_leaf_el(right_path); + struct buffer_head *bh = path_leaf_bh(right_path); + struct buffer_head *root_bh = NULL; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *left_el; - BUG_ON(index <= 0); + BUG_ON(index < 0); - left_rec = &el->l_recs[index - 1]; right_rec = &el->l_recs[index]; - if (ocfs2_is_empty_extent(&el->l_recs[0])) - has_empty_extent = 1; + if (index == 0) { + /* we meet with a cross extent block merge. */ + ret = ocfs2_get_left_path(inode, right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + left_el = path_leaf_el(left_path); + BUG_ON(le16_to_cpu(left_el->l_next_free_rec) != + le16_to_cpu(left_el->l_count)); + + left_rec = &left_el->l_recs[ + le16_to_cpu(left_el->l_next_free_rec) - 1]; + BUG_ON(le32_to_cpu(left_rec->e_cpos) + + le16_to_cpu(left_rec->e_leaf_clusters) != + le32_to_cpu(split_rec->e_cpos)); + + subtree_index = ocfs2_find_subtree_root(inode, + left_path, right_path); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_index, + handle->h_buffer_credits, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = subtree_index + 1; + i < path_num_items(right_path); i++) { + ret = ocfs2_journal_access(handle, inode, + right_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, + left_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + } else { + left_rec = &el->l_recs[index - 1]; + if (ocfs2_is_empty_extent(&el->l_recs[0])) + has_empty_extent = 1; + } ret = ocfs2_journal_access(handle, inode, bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -2790,9 +3040,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, *left_rec = *split_rec; has_empty_extent = 0; - } else { + } else le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); - } le32_add_cpu(&right_rec->e_cpos, split_clusters); le64_add_cpu(&right_rec->e_blkno, @@ -2805,13 +3054,44 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, if (ret) mlog_errno(ret); + if (left_path) { + ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + if (ret) + mlog_errno(ret); + + /* + * In the situation that the right_rec is empty and the extent + * block is empty also, ocfs2_complete_edge_insert can't handle + * it and we need to delete the right extent block. + */ + if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && + le16_to_cpu(el->l_next_free_rec) == 1) { + + ret = ocfs2_remove_rightmost_path(inode, handle, + right_path, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Now the rightmost extent block has been deleted. + * So we use the new rightmost path. + */ + ocfs2_mv_path(right_path, left_path); + left_path = NULL; + } else + ocfs2_complete_edge_insert(inode, handle, left_path, + right_path, subtree_index); + } out: + if (left_path) + ocfs2_free_path(left_path); return ret; } static int ocfs2_try_to_merge_extent(struct inode *inode, handle_t *handle, - struct ocfs2_path *left_path, + struct ocfs2_path *path, int split_index, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, @@ -2819,7 +3099,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, { int ret = 0; - struct ocfs2_extent_list *el = path_leaf_el(left_path); + struct ocfs2_extent_list *el = path_leaf_el(path); struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; BUG_ON(ctxt->c_contig_type == CONTIG_NONE); @@ -2832,7 +3112,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * extents - having more than one in a leaf is * illegal. */ - ret = ocfs2_rotate_tree_left(inode, handle, left_path, + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); if (ret) { mlog_errno(ret); @@ -2847,7 +3127,6 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * Left-right contig implies this. */ BUG_ON(!ctxt->c_split_covers_rec); - BUG_ON(split_index == 0); /* * Since the leftright insert always covers the entire @@ -2858,9 +3137,14 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * Since the adding of an empty extent shifts * everything back to the right, there's no need to * update split_index here. + * + * When the split_index is zero, we need to merge it to the + * prevoius extent block. It is more efficient and easier + * if we do merge_right first and merge_left later. */ - ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path), - handle, split_rec, el, split_index); + ret = ocfs2_merge_rec_right(inode, path, + handle, split_rec, + split_index); if (ret) { mlog_errno(ret); goto out; @@ -2871,32 +3155,30 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, */ BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); - /* - * The left merge left us with an empty extent, remove - * it. - */ - ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc); + /* The merge left us with an empty extent, remove it. */ + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); if (ret) { mlog_errno(ret); goto out; } - split_index--; + rec = &el->l_recs[split_index]; /* * Note that we don't pass split_rec here on purpose - - * we've merged it into the left side. + * we've merged it into the rec already. */ - ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path), - handle, rec, el, split_index); + ret = ocfs2_merge_rec_left(inode, path, + handle, rec, + dealloc, + split_index); + if (ret) { mlog_errno(ret); goto out; } - BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); - - ret = ocfs2_rotate_tree_left(inode, handle, left_path, + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); /* * Error from this last rotate is not critical, so @@ -2915,8 +3197,9 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, */ if (ctxt->c_contig_type == CONTIG_RIGHT) { ret = ocfs2_merge_rec_left(inode, - path_leaf_bh(left_path), - handle, split_rec, el, + path, + handle, split_rec, + dealloc, split_index); if (ret) { mlog_errno(ret); @@ -2924,8 +3207,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, } } else { ret = ocfs2_merge_rec_right(inode, - path_leaf_bh(left_path), - handle, split_rec, el, + path, + handle, split_rec, split_index); if (ret) { mlog_errno(ret); @@ -2938,7 +3221,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * The merge may have left an empty extent in * our leaf. Try to rotate it away. */ - ret = ocfs2_rotate_tree_left(inode, handle, left_path, + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); if (ret) mlog_errno(ret); @@ -3498,20 +3781,57 @@ out: } static enum ocfs2_contig_type -ocfs2_figure_merge_contig_type(struct inode *inode, +ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, struct ocfs2_extent_list *el, int index, struct ocfs2_extent_rec *split_rec) { - struct ocfs2_extent_rec *rec; + int status; enum ocfs2_contig_type ret = CONTIG_NONE; + u32 left_cpos, right_cpos; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_list *new_el; + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct buffer_head *bh; + struct ocfs2_extent_block *eb; + + if (index > 0) { + rec = &el->l_recs[index - 1]; + } else if (path->p_tree_depth > 0) { + status = ocfs2_find_cpos_for_left_leaf(inode->i_sb, + path, &left_cpos); + if (status) + goto out; + + if (left_cpos != 0) { + left_path = ocfs2_new_path(path_root_bh(path), + path_root_el(path)); + if (!left_path) + goto out; + + status = ocfs2_find_path(inode, left_path, left_cpos); + if (status) + goto out; + + new_el = path_leaf_el(left_path); + + if (le16_to_cpu(new_el->l_next_free_rec) != + le16_to_cpu(new_el->l_count)) { + bh = path_leaf_bh(left_path); + eb = (struct ocfs2_extent_block *)bh->b_data; + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, + eb); + goto out; + } + rec = &new_el->l_recs[ + le16_to_cpu(new_el->l_next_free_rec) - 1]; + } + } /* * We're careful to check for an empty extent record here - * the merge code will know what to do if it sees one. */ - - if (index > 0) { - rec = &el->l_recs[index - 1]; + if (rec) { if (index == 1 && ocfs2_is_empty_extent(rec)) { if (split_rec->e_cpos == el->l_recs[index].e_cpos) ret = CONTIG_RIGHT; @@ -3520,10 +3840,45 @@ ocfs2_figure_merge_contig_type(struct inode *inode, } } - if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) { + rec = NULL; + if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) + rec = &el->l_recs[index + 1]; + else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && + path->p_tree_depth > 0) { + status = ocfs2_find_cpos_for_right_leaf(inode->i_sb, + path, &right_cpos); + if (status) + goto out; + + if (right_cpos == 0) + goto out; + + right_path = ocfs2_new_path(path_root_bh(path), + path_root_el(path)); + if (!right_path) + goto out; + + status = ocfs2_find_path(inode, right_path, right_cpos); + if (status) + goto out; + + new_el = path_leaf_el(right_path); + rec = &new_el->l_recs[0]; + if (ocfs2_is_empty_extent(rec)) { + if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { + bh = path_leaf_bh(right_path); + eb = (struct ocfs2_extent_block *)bh->b_data; + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, + eb); + goto out; + } + rec = &new_el->l_recs[1]; + } + } + + if (rec) { enum ocfs2_contig_type contig_type; - rec = &el->l_recs[index + 1]; contig_type = ocfs2_extent_contig(inode, rec, split_rec); if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) @@ -3532,6 +3887,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, ret = contig_type; } +out: + if (left_path) + ocfs2_free_path(left_path); + if (right_path) + ocfs2_free_path(right_path); + return ret; } @@ -3994,7 +4355,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, goto out; } - ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el, + ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el, split_index, split_rec); @@ -4788,6 +5149,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work) status = ocfs2_flush_truncate_log(osb); if (status < 0) mlog_errno(status); + else + ocfs2_init_inode_steal_slot(osb); mlog_exit(status); } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 90383ed..17964c0 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -467,11 +467,11 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, unsigned to) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - handle_t *handle = NULL; + handle_t *handle; int ret = 0; handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (!handle) { + if (IS_ERR(handle)) { ret = -ENOMEM; mlog_errno(ret); goto out; @@ -487,7 +487,7 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, } out: if (ret) { - if (handle) + if (!IS_ERR(handle)) ocfs2_commit_trans(osb, handle); handle = ERR_PTR(ret); } diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile index cdd162f..bc8c5e7 100644 --- a/fs/ocfs2/cluster/Makefile +++ b/fs/ocfs2/cluster/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ - quorum.o tcp.o ver.o + quorum.o tcp.o netdebug.o ver.o diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c new file mode 100644 index 0000000..7bf3c0e --- /dev/null +++ b/fs/ocfs2/cluster/netdebug.c @@ -0,0 +1,441 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * netdebug.c + * + * debug functionality for o2net + * + * Copyright (C) 2005, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifdef CONFIG_DEBUG_FS + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/kref.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> + +#include <linux/uaccess.h> + +#include "tcp.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_TCP +#include "masklog.h" + +#include "tcp_internal.h" + +#define O2NET_DEBUG_DIR "o2net" +#define SC_DEBUG_NAME "sock_containers" +#define NST_DEBUG_NAME "send_tracking" + +static struct dentry *o2net_dentry; +static struct dentry *sc_dentry; +static struct dentry *nst_dentry; + +static DEFINE_SPINLOCK(o2net_debug_lock); + +static LIST_HEAD(sock_containers); +static LIST_HEAD(send_tracking); + +void o2net_debug_add_nst(struct o2net_send_tracking *nst) +{ + spin_lock(&o2net_debug_lock); + list_add(&nst->st_net_debug_item, &send_tracking); + spin_unlock(&o2net_debug_lock); +} + +void o2net_debug_del_nst(struct o2net_send_tracking *nst) +{ + spin_lock(&o2net_debug_lock); + if (!list_empty(&nst->st_net_debug_item)) + list_del_init(&nst->st_net_debug_item); + spin_unlock(&o2net_debug_lock); +} + +static struct o2net_send_tracking + *next_nst(struct o2net_send_tracking *nst_start) +{ + struct o2net_send_tracking *nst, *ret = NULL; + + assert_spin_locked(&o2net_debug_lock); + + list_for_each_entry(nst, &nst_start->st_net_debug_item, + st_net_debug_item) { + /* discover the head of the list */ + if (&nst->st_net_debug_item == &send_tracking) + break; + + /* use st_task to detect real nsts in the list */ + if (nst->st_task != NULL) { + ret = nst; + break; + } + } + + return ret; +} + +static void *nst_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct o2net_send_tracking *nst, *dummy_nst = seq->private; + + spin_lock(&o2net_debug_lock); + nst = next_nst(dummy_nst); + spin_unlock(&o2net_debug_lock); + + return nst; +} + +static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct o2net_send_tracking *nst, *dummy_nst = seq->private; + + spin_lock(&o2net_debug_lock); + nst = next_nst(dummy_nst); + list_del_init(&dummy_nst->st_net_debug_item); + if (nst) + list_add(&dummy_nst->st_net_debug_item, + &nst->st_net_debug_item); + spin_unlock(&o2net_debug_lock); + + return nst; /* unused, just needs to be null when done */ +} + +static int nst_seq_show(struct seq_file *seq, void *v) +{ + struct o2net_send_tracking *nst, *dummy_nst = seq->private; + + spin_lock(&o2net_debug_lock); + nst = next_nst(dummy_nst); + + if (nst != NULL) { + /* get_task_comm isn't exported. oh well. */ + seq_printf(seq, "%p:\n" + " pid: %lu\n" + " tgid: %lu\n" + " process name: %s\n" + " node: %u\n" + " sc: %p\n" + " message id: %d\n" + " message type: %u\n" + " message key: 0x%08x\n" + " sock acquiry: %lu.%lu\n" + " send start: %lu.%lu\n" + " wait start: %lu.%lu\n", + nst, (unsigned long)nst->st_task->pid, + (unsigned long)nst->st_task->tgid, + nst->st_task->comm, nst->st_node, + nst->st_sc, nst->st_id, nst->st_msg_type, + nst->st_msg_key, + nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec, + nst->st_send_time.tv_sec, nst->st_send_time.tv_usec, + nst->st_status_time.tv_sec, + nst->st_status_time.tv_usec); + } + + spin_unlock(&o2net_debug_lock); + + return 0; +} + +static void nst_seq_stop(struct seq_file *seq, void *v) +{ +} + +static struct seq_operations nst_seq_ops = { + .start = nst_seq_start, + .next = nst_seq_next, + .stop = nst_seq_stop, + .show = nst_seq_show, +}; + +static int nst_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_send_tracking *dummy_nst; + struct seq_file *seq; + int ret; + + dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL); + if (dummy_nst == NULL) { + ret = -ENOMEM; + goto out; + } + dummy_nst->st_task = NULL; + + ret = seq_open(file, &nst_seq_ops); + if (ret) + goto out; + + seq = file->private_data; + seq->private = dummy_nst; + o2net_debug_add_nst(dummy_nst); + + dummy_nst = NULL; + +out: + kfree(dummy_nst); + return ret; +} + +static int nst_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct o2net_send_tracking *dummy_nst = seq->private; + + o2net_debug_del_nst(dummy_nst); + return seq_release_private(inode, file); +} + +static struct file_operations nst_seq_fops = { + .open = nst_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = nst_fop_release, +}; + +void o2net_debug_add_sc(struct o2net_sock_container *sc) +{ + spin_lock(&o2net_debug_lock); + list_add(&sc->sc_net_debug_item, &sock_containers); + spin_unlock(&o2net_debug_lock); +} + +void o2net_debug_del_sc(struct o2net_sock_container *sc) +{ + spin_lock(&o2net_debug_lock); + list_del_init(&sc->sc_net_debug_item); + spin_unlock(&o2net_debug_lock); +} + +static struct o2net_sock_container + *next_sc(struct o2net_sock_container *sc_start) +{ + struct o2net_sock_container *sc, *ret = NULL; + + assert_spin_locked(&o2net_debug_lock); + + list_for_each_entry(sc, &sc_start->sc_net_debug_item, + sc_net_debug_item) { + /* discover the head of the list miscast as a sc */ + if (&sc->sc_net_debug_item == &sock_containers) + break; + + /* use sc_page to detect real scs in the list */ + if (sc->sc_page != NULL) { + ret = sc; + break; + } + } + + return ret; +} + +static void *sc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct o2net_sock_container *sc, *dummy_sc = seq->private; + + spin_lock(&o2net_debug_lock); + sc = next_sc(dummy_sc); + spin_unlock(&o2net_debug_lock); + + return sc; +} + +static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct o2net_sock_container *sc, *dummy_sc = seq->private; + + spin_lock(&o2net_debug_lock); + sc = next_sc(dummy_sc); + list_del_init(&dummy_sc->sc_net_debug_item); + if (sc) + list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item); + spin_unlock(&o2net_debug_lock); + + return sc; /* unused, just needs to be null when done */ +} + +#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec + +static int sc_seq_show(struct seq_file *seq, void *v) +{ + struct o2net_sock_container *sc, *dummy_sc = seq->private; + + spin_lock(&o2net_debug_lock); + sc = next_sc(dummy_sc); + + if (sc != NULL) { + struct inet_sock *inet = NULL; + + __be32 saddr = 0, daddr = 0; + __be16 sport = 0, dport = 0; + + if (sc->sc_sock) { + inet = inet_sk(sc->sc_sock->sk); + /* the stack's structs aren't sparse endian clean */ + saddr = (__force __be32)inet->saddr; + daddr = (__force __be32)inet->daddr; + sport = (__force __be16)inet->sport; + dport = (__force __be16)inet->dport; + } + + /* XXX sigh, inet-> doesn't have sparse annotation so any + * use of it here generates a warning with -Wbitwise */ + seq_printf(seq, "%p:\n" + " krefs: %d\n" + " sock: %u.%u.%u.%u:%u -> " + "%u.%u.%u.%u:%u\n" + " remote node: %s\n" + " page off: %zu\n" + " handshake ok: %u\n" + " timer: %lu.%lu\n" + " data ready: %lu.%lu\n" + " advance start: %lu.%lu\n" + " advance stop: %lu.%lu\n" + " func start: %lu.%lu\n" + " func stop: %lu.%lu\n" + " func key: %u\n" + " func type: %u\n", + sc, + atomic_read(&sc->sc_kref.refcount), + NIPQUAD(saddr), inet ? ntohs(sport) : 0, + NIPQUAD(daddr), inet ? ntohs(dport) : 0, + sc->sc_node->nd_name, + sc->sc_page_off, + sc->sc_handshake_ok, + TV_SEC_USEC(sc->sc_tv_timer), + TV_SEC_USEC(sc->sc_tv_data_ready), + TV_SEC_USEC(sc->sc_tv_advance_start), + TV_SEC_USEC(sc->sc_tv_advance_stop), + TV_SEC_USEC(sc->sc_tv_func_start), + TV_SEC_USEC(sc->sc_tv_func_stop), + sc->sc_msg_key, + sc->sc_msg_type); + } + + + spin_unlock(&o2net_debug_lock); + + return 0; +} + +static void sc_seq_stop(struct seq_file *seq, void *v) +{ +} + +static struct seq_operations sc_seq_ops = { + .start = sc_seq_start, + .next = sc_seq_next, + .stop = sc_seq_stop, + .show = sc_seq_show, +}; + +static int sc_fop_open(struct inode *inode, struct file *file) +{ + struct o2net_sock_container *dummy_sc; + struct seq_file *seq; + int ret; + + dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL); + if (dummy_sc == NULL) { + ret = -ENOMEM; + goto out; + } + dummy_sc->sc_page = NULL; + + ret = seq_open(file, &sc_seq_ops); + if (ret) + goto out; + + seq = file->private_data; + seq->private = dummy_sc; + o2net_debug_add_sc(dummy_sc); + + dummy_sc = NULL; + +out: + kfree(dummy_sc); + return ret; +} + +static int sc_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct o2net_sock_container *dummy_sc = seq->private; + + o2net_debug_del_sc(dummy_sc); + return seq_release_private(inode, file); +} + +static struct file_operations sc_seq_fops = { + .open = sc_fop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = sc_fop_release, +}; + +int o2net_debugfs_init(void) +{ + o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); + if (!o2net_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, + o2net_dentry, NULL, + &nst_seq_fops); + if (!nst_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, + o2net_dentry, NULL, + &sc_seq_fops); + if (!sc_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + return 0; +bail: + if (sc_dentry) + debugfs_remove(sc_dentry); + if (nst_dentry) + debugfs_remove(nst_dentry); + if (o2net_dentry) + debugfs_remove(o2net_dentry); + return -ENOMEM; +} + +void o2net_debugfs_exit(void) +{ + if (sc_dentry) + debugfs_remove(sc_dentry); + if (nst_dentry) + debugfs_remove(nst_dentry); + if (o2net_dentry) + debugfs_remove(o2net_dentry); +} + +#endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 709fba2..cf9401e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -959,7 +959,10 @@ static int __init init_o2nm(void) cluster_print_version(); o2hb_init(); - o2net_init(); + + ret = o2net_init(); + if (ret) + goto out; ocfs2_table_header = register_sysctl_table(ocfs2_root_table); if (!ocfs2_table_header) { diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index 0c095ce..98429fd 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -57,6 +57,7 @@ static struct kset *o2cb_kset; void o2cb_sys_shutdown(void) { mlog_sys_shutdown(); + sysfs_remove_link(NULL, "o2cb"); kset_unregister(o2cb_kset); } @@ -68,6 +69,14 @@ int o2cb_sys_init(void) if (!o2cb_kset) return -ENOMEM; + /* + * Create this symlink for backwards compatibility with old + * versions of ocfs2-tools which look for things in /sys/o2cb. + */ + ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb"); + if (ret) + goto error; + ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); if (ret) goto error; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index b8057c5..1e44ad1 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -142,23 +142,65 @@ static void o2net_idle_timer(unsigned long data); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); -/* - * FIXME: These should use to_o2nm_cluster_from_node(), but we end up - * losing our parent link to the cluster during shutdown. This can be - * solved by adding a pre-removal callback to configfs, or passing - * around the cluster with the node. -jeffm - */ -static inline int o2net_reconnect_delay(struct o2nm_node *node) +static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) +{ +#ifdef CONFIG_DEBUG_FS + INIT_LIST_HEAD(&nst->st_net_debug_item); + nst->st_task = task; + nst->st_msg_type = msgtype; + nst->st_msg_key = msgkey; + nst->st_node = node; +#endif +} + +static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +{ +#ifdef CONFIG_DEBUG_FS + do_gettimeofday(&nst->st_sock_time); +#endif +} + +static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +{ +#ifdef CONFIG_DEBUG_FS + do_gettimeofday(&nst->st_send_time); +#endif +} + +static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +{ +#ifdef CONFIG_DEBUG_FS + do_gettimeofday(&nst->st_status_time); +#endif +} + +static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ +#ifdef CONFIG_DEBUG_FS + nst->st_sc = sc; +#endif +} + +static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) +{ +#ifdef CONFIG_DEBUG_FS + nst->st_id = msg_id; +#endif +} + +static inline int o2net_reconnect_delay(void) { return o2nm_single_cluster->cl_reconnect_delay_ms; } -static inline int o2net_keepalive_delay(struct o2nm_node *node) +static inline int o2net_keepalive_delay(void) { return o2nm_single_cluster->cl_keepalive_delay_ms; } -static inline int o2net_idle_timeout(struct o2nm_node *node) +static inline int o2net_idle_timeout(void) { return o2nm_single_cluster->cl_idle_timeout_ms; } @@ -296,6 +338,7 @@ static void sc_kref_release(struct kref *kref) o2nm_node_put(sc->sc_node); sc->sc_node = NULL; + o2net_debug_del_sc(sc); kfree(sc); } @@ -336,6 +379,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) ret = sc; sc->sc_page = page; + o2net_debug_add_sc(sc); sc = NULL; page = NULL; @@ -399,8 +443,6 @@ static void o2net_set_nn_state(struct o2net_node *nn, mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); - /* we won't reconnect after our valid conn goes away for - * this hb iteration.. here so it shows up in the logs */ if (was_valid && !valid && err == 0) err = -ENOTCONN; @@ -430,11 +472,6 @@ static void o2net_set_nn_state(struct o2net_node *nn, if (!was_valid && valid) { o2quo_conn_up(o2net_num_from_nn(nn)); - /* this is a bit of a hack. we only try reconnecting - * when heartbeating starts until we get a connection. - * if that connection then dies we don't try reconnecting. - * the only way to start connecting again is to down - * heartbeat and bring it back up. */ cancel_delayed_work(&nn->nn_connect_expired); printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", o2nm_this_node() > sc->sc_node->nd_num ? @@ -451,12 +488,24 @@ static void o2net_set_nn_state(struct o2net_node *nn, /* delay if we're withing a RECONNECT_DELAY of the * last attempt */ delay = (nn->nn_last_connect_attempt + - msecs_to_jiffies(o2net_reconnect_delay(NULL))) + msecs_to_jiffies(o2net_reconnect_delay())) - jiffies; - if (delay > msecs_to_jiffies(o2net_reconnect_delay(NULL))) + if (delay > msecs_to_jiffies(o2net_reconnect_delay())) delay = 0; mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); + + /* + * Delay the expired work after idle timeout. + * + * We might have lots of failed connection attempts that run + * through here but we only cancel the connect_expired work when + * a connection attempt succeeds. So only the first enqueue of + * the connect_expired work will do anything. The rest will see + * that it's already queued and do nothing. + */ + delay += msecs_to_jiffies(o2net_idle_timeout()); + queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay); } /* keep track of the nn's sc ref for the caller */ @@ -914,6 +963,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, struct o2net_status_wait nsw = { .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), }; + struct o2net_send_tracking nst; + + o2net_init_nst(&nst, msg_type, key, current, target_node); if (o2net_wq == NULL) { mlog(0, "attempt to tx without o2netd running\n"); @@ -939,6 +991,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, goto out; } + o2net_debug_add_nst(&nst); + + o2net_set_nst_sock_time(&nst); + ret = wait_event_interruptible(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &error)); if (!ret && error) @@ -946,6 +1002,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, if (ret) goto out; + o2net_set_nst_sock_container(&nst, sc); + veclen = caller_veclen + 1; vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); if (vec == NULL) { @@ -972,6 +1030,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, goto out; msg->msg_num = cpu_to_be32(nsw.ns_id); + o2net_set_nst_msg_id(&nst, nsw.ns_id); + + o2net_set_nst_send_time(&nst); /* finally, convert the message header to network byte-order * and send */ @@ -986,6 +1047,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, } /* wait on other node's handler */ + o2net_set_nst_status_time(&nst); wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); /* Note that we avoid overwriting the callers status return @@ -998,6 +1060,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, mlog(0, "woken, returning system status %d, user status %d\n", ret, nsw.ns_status); out: + o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ if (sc) sc_put(sc); if (vec) @@ -1154,23 +1217,23 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) * but isn't. This can ultimately cause corruption. */ if (be32_to_cpu(hand->o2net_idle_timeout_ms) != - o2net_idle_timeout(sc->sc_node)) { + o2net_idle_timeout()) { mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " "%u ms, but we use %u ms locally. disconnecting\n", SC_NODEF_ARGS(sc), be32_to_cpu(hand->o2net_idle_timeout_ms), - o2net_idle_timeout(sc->sc_node)); + o2net_idle_timeout()); o2net_ensure_shutdown(nn, sc, -ENOTCONN); return -1; } if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != - o2net_keepalive_delay(sc->sc_node)) { + o2net_keepalive_delay()) { mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " "%u ms, but we use %u ms locally. disconnecting\n", SC_NODEF_ARGS(sc), be32_to_cpu(hand->o2net_keepalive_delay_ms), - o2net_keepalive_delay(sc->sc_node)); + o2net_keepalive_delay()); o2net_ensure_shutdown(nn, sc, -ENOTCONN); return -1; } @@ -1193,6 +1256,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) * shut down already */ if (nn->nn_sc == sc) { o2net_sc_reset_idle_timer(sc); + atomic_set(&nn->nn_timeout, 0); o2net_set_nn_state(nn, sc, 1, 0); } spin_unlock(&nn->nn_lock); @@ -1347,12 +1411,11 @@ static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( O2HB_MAX_WRITE_TIMEOUT_MS); - o2net_hand->o2net_idle_timeout_ms = cpu_to_be32( - o2net_idle_timeout(NULL)); + o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout()); o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( - o2net_keepalive_delay(NULL)); + o2net_keepalive_delay()); o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( - o2net_reconnect_delay(NULL)); + o2net_reconnect_delay()); } /* ------------------------------------------------------------ */ @@ -1391,14 +1454,15 @@ static void o2net_sc_send_keep_req(struct work_struct *work) static void o2net_idle_timer(unsigned long data) { struct o2net_sock_container *sc = (struct o2net_sock_container *)data; + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); struct timeval now; do_gettimeofday(&now); printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), - o2net_idle_timeout(sc->sc_node) / 1000, - o2net_idle_timeout(sc->sc_node) % 1000); + o2net_idle_timeout() / 1000, + o2net_idle_timeout() % 1000); mlog(ML_NOTICE, "here are some times that might help debug the " "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", @@ -1413,6 +1477,12 @@ static void o2net_idle_timer(unsigned long data) sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); + /* + * Initialize the nn_timeout so that the next connection attempt + * will continue in o2net_start_connect. + */ + atomic_set(&nn->nn_timeout, 1); + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } @@ -1420,10 +1490,10 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) { o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, - msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node))); + msecs_to_jiffies(o2net_keepalive_delay())); do_gettimeofday(&sc->sc_tv_timer); mod_timer(&sc->sc_idle_timeout, - jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node))); + jiffies + msecs_to_jiffies(o2net_idle_timeout())); } static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) @@ -1447,6 +1517,7 @@ static void o2net_start_connect(struct work_struct *work) struct socket *sock = NULL; struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; int ret = 0, stop; + unsigned int timeout; /* if we're greater we initiate tx, otherwise we accept */ if (o2nm_this_node() <= o2net_num_from_nn(nn)) @@ -1466,8 +1537,17 @@ static void o2net_start_connect(struct work_struct *work) } spin_lock(&nn->nn_lock); - /* see if we already have one pending or have given up */ - stop = (nn->nn_sc || nn->nn_persistent_error); + /* + * see if we already have one pending or have given up. + * For nn_timeout, it is set when we close the connection + * because of the idle time out. So it means that we have + * at least connected to that node successfully once, + * now try to connect to it again. + */ + timeout = atomic_read(&nn->nn_timeout); + stop = (nn->nn_sc || + (nn->nn_persistent_error && + (nn->nn_persistent_error != -ENOTCONN || timeout == 0))); spin_unlock(&nn->nn_lock); if (stop) goto out; @@ -1555,8 +1635,8 @@ static void o2net_connect_expired(struct work_struct *work) mlog(ML_ERROR, "no connection established with node %u after " "%u.%u seconds, giving up and returning errors.\n", o2net_num_from_nn(nn), - o2net_idle_timeout(NULL) / 1000, - o2net_idle_timeout(NULL) % 1000); + o2net_idle_timeout() / 1000, + o2net_idle_timeout() % 1000); o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); } @@ -1579,6 +1659,7 @@ void o2net_disconnect_node(struct o2nm_node *node) /* don't reconnect until it's heartbeating again */ spin_lock(&nn->nn_lock); + atomic_set(&nn->nn_timeout, 0); o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); spin_unlock(&nn->nn_lock); @@ -1610,20 +1691,15 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, /* ensure an immediate connect attempt */ nn->nn_last_connect_attempt = jiffies - - (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1); + (msecs_to_jiffies(o2net_reconnect_delay()) + 1); if (node_num != o2nm_this_node()) { - /* heartbeat doesn't work unless a local node number is - * configured and doing so brings up the o2net_wq, so we can - * use it.. */ - queue_delayed_work(o2net_wq, &nn->nn_connect_expired, - msecs_to_jiffies(o2net_idle_timeout(node))); - /* believe it or not, accept and node hearbeating testing * can succeed for this node before we got here.. so * only use set_nn_state to clear the persistent error * if that hasn't already happened */ spin_lock(&nn->nn_lock); + atomic_set(&nn->nn_timeout, 0); if (nn->nn_persistent_error) o2net_set_nn_state(nn, NULL, 0, 0); spin_unlock(&nn->nn_lock); @@ -1747,6 +1823,7 @@ static int o2net_accept_one(struct socket *sock) new_sock = NULL; spin_lock(&nn->nn_lock); + atomic_set(&nn->nn_timeout, 0); o2net_set_nn_state(nn, sc, 0, 0); spin_unlock(&nn->nn_lock); @@ -1922,6 +1999,9 @@ int o2net_init(void) o2quo_init(); + if (o2net_debugfs_init()) + return -ENOMEM; + o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); @@ -1941,6 +2021,7 @@ int o2net_init(void) for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { struct o2net_node *nn = o2net_nn_from_num(i); + atomic_set(&nn->nn_timeout, 0); spin_lock_init(&nn->nn_lock); INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); INIT_DELAYED_WORK(&nn->nn_connect_expired, @@ -1962,4 +2043,5 @@ void o2net_exit(void) kfree(o2net_hand); kfree(o2net_keep_req); kfree(o2net_keep_resp); + o2net_debugfs_exit(); } diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index f36f66a..a705d5d 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -117,4 +117,36 @@ int o2net_num_connected_peers(void); int o2net_init(void); void o2net_exit(void); +struct o2net_send_tracking; +struct o2net_sock_container; + +#ifdef CONFIG_DEBUG_FS +int o2net_debugfs_init(void); +void o2net_debugfs_exit(void); +void o2net_debug_add_nst(struct o2net_send_tracking *nst); +void o2net_debug_del_nst(struct o2net_send_tracking *nst); +void o2net_debug_add_sc(struct o2net_sock_container *sc); +void o2net_debug_del_sc(struct o2net_sock_container *sc); +#else +static int o2net_debugfs_init(void) +{ + return 0; +} +static void o2net_debugfs_exit(void) +{ +} +static void o2net_debug_add_nst(struct o2net_send_tracking *nst) +{ +} +static void o2net_debug_del_nst(struct o2net_send_tracking *nst) +{ +} +static void o2net_debug_add_sc(struct o2net_sock_container *sc) +{ +} +static void o2net_debug_del_sc(struct o2net_sock_container *sc) +{ +} +#endif /* CONFIG_DEBUG_FS */ + #endif /* O2CLUSTER_TCP_H */ diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index d25b9af..8d58cfe 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -95,6 +95,8 @@ struct o2net_node { unsigned nn_sc_valid:1; /* if this is set tx just returns it */ int nn_persistent_error; + /* It is only set to 1 after the idle time out. */ + atomic_t nn_timeout; /* threads waiting for an sc to arrive wait on the wq for generation * to increase. it is increased when a connecting socket succeeds @@ -164,7 +166,9 @@ struct o2net_sock_container { /* original handlers for the sockets */ void (*sc_state_change)(struct sock *sk); void (*sc_data_ready)(struct sock *sk, int bytes); - +#ifdef CONFIG_DEBUG_FS + struct list_head sc_net_debug_item; +#endif struct timeval sc_tv_timer; struct timeval sc_tv_data_ready; struct timeval sc_tv_advance_start; @@ -206,4 +210,24 @@ struct o2net_status_wait { struct list_head ns_node_item; }; +#ifdef CONFIG_DEBUG_FS +/* just for state dumps */ +struct o2net_send_tracking { + struct list_head st_net_debug_item; + struct task_struct *st_task; + struct o2net_sock_container *st_sc; + u32 st_id; + u32 st_msg_type; + u32 st_msg_key; + u8 st_node; + struct timeval st_sock_time; + struct timeval st_send_time; + struct timeval st_status_time; +}; +#else +struct o2net_send_tracking { + u32 dummy; +}; +#endif /* CONFIG_DEBUG_FS */ + #endif /* O2CLUSTER_TCP_INTERNAL_H */ diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index ce3f7c2..1903613 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -1,6 +1,6 @@ EXTRA_CFLAGS += -Ifs/ocfs2 -obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o +obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index dc8ea66..d5a86fb 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -49,6 +49,41 @@ /* Intended to make it easier for us to switch out hash functions */ #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) +enum dlm_mle_type { + DLM_MLE_BLOCK, + DLM_MLE_MASTER, + DLM_MLE_MIGRATION +}; + +struct dlm_lock_name { + u8 len; + u8 name[DLM_LOCKID_NAME_MAX]; +}; + +struct dlm_master_list_entry { + struct list_head list; + struct list_head hb_events; + struct dlm_ctxt *dlm; + spinlock_t spinlock; + wait_queue_head_t wq; + atomic_t woken; + struct kref mle_refs; + int inuse; + unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + u8 master; + u8 new_master; + enum dlm_mle_type type; + struct o2hb_callback_func mle_hb_up; + struct o2hb_callback_func mle_hb_down; + union { + struct dlm_lock_resource *res; + struct dlm_lock_name name; + } u; +}; + enum dlm_ast_type { DLM_AST = 0, DLM_BAST, @@ -101,6 +136,7 @@ struct dlm_ctxt struct list_head purge_list; struct list_head pending_asts; struct list_head pending_basts; + struct list_head tracking_list; unsigned int purge_count; spinlock_t spinlock; spinlock_t ast_lock; @@ -122,6 +158,9 @@ struct dlm_ctxt atomic_t remote_resources; atomic_t unknown_resources; + struct dlm_debug_ctxt *dlm_debug_ctxt; + struct dentry *dlm_debugfs_subroot; + /* NOTE: Next three are protected by dlm_domain_lock */ struct kref dlm_refs; enum dlm_ctxt_state dlm_state; @@ -270,6 +309,9 @@ struct dlm_lock_resource struct list_head dirty; struct list_head recovering; // dlm_recovery_ctxt.resources list + /* Added during init and removed during release */ + struct list_head tracking; /* dlm->tracking_list */ + /* unused lock resources have their last_used stamped and are * put on a list for the dlm thread to run. */ unsigned long last_used; @@ -963,9 +1005,16 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) DLM_LOCK_RES_MIGRATING)); } +/* create/destroy slab caches */ +int dlm_init_master_caches(void); +void dlm_destroy_master_caches(void); + +int dlm_init_lock_cache(void); +void dlm_destroy_lock_cache(void); int dlm_init_mle_cache(void); void dlm_destroy_mle_cache(void); + void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 64239b3..5f6d8587 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -5,7 +5,7 @@ * * debug functionality for the dlm * - * Copyright (C) 2004 Oracle. All rights reserved. + * Copyright (C) 2004, 2008 Oracle. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -30,6 +30,7 @@ #include <linux/utsname.h> #include <linux/sysctl.h> #include <linux/spinlock.h> +#include <linux/debugfs.h> #include "cluster/heartbeat.h" #include "cluster/nodemanager.h" @@ -37,17 +38,16 @@ #include "dlmapi.h" #include "dlmcommon.h" - #include "dlmdomain.h" +#include "dlmdebug.h" #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" +int stringify_lockname(const char *lockname, int locklen, char *buf, int len); + void dlm_print_one_lock_resource(struct dlm_lock_resource *res) { - mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", - res->lockname.len, res->lockname.name, - res->owner, res->state); spin_lock(&res->spinlock); __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); @@ -58,7 +58,7 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) int bit; assert_spin_locked(&res->spinlock); - mlog(ML_NOTICE, " refmap nodes: [ "); + printk(" refmap nodes: [ "); bit = 0; while (1) { bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); @@ -70,63 +70,66 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) printk("], inflight=%u\n", res->inflight_locks); } +static void __dlm_print_lock(struct dlm_lock *lock) +{ + spin_lock(&lock->spinlock); + + printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, " + "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), " + "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n", + lock->ml.type, lock->ml.convert_type, lock->ml.node, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + atomic_read(&lock->lock_refs.refcount), + (list_empty(&lock->ast_list) ? 'y' : 'n'), + (lock->ast_pending ? 'y' : 'n'), + (list_empty(&lock->bast_list) ? 'y' : 'n'), + (lock->bast_pending ? 'y' : 'n'), + (lock->convert_pending ? 'y' : 'n'), + (lock->lock_pending ? 'y' : 'n'), + (lock->cancel_pending ? 'y' : 'n'), + (lock->unlock_pending ? 'y' : 'n')); + + spin_unlock(&lock->spinlock); +} + void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) { struct list_head *iter2; struct dlm_lock *lock; + char buf[DLM_LOCKID_NAME_MAX]; assert_spin_locked(&res->spinlock); - mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", - res->lockname.len, res->lockname.name, - res->owner, res->state); - mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", - res->last_used, list_empty(&res->purge) ? "no" : "yes"); + stringify_lockname(res->lockname.name, res->lockname.len, + buf, sizeof(buf) - 1); + printk("lockres: %s, owner=%u, state=%u\n", + buf, res->owner, res->state); + printk(" last used: %lu, refcnt: %u, on purge list: %s\n", + res->last_used, atomic_read(&res->refs.refcount), + list_empty(&res->purge) ? "no" : "yes"); + printk(" on dirty list: %s, on reco list: %s, " + "migrating pending: %s\n", + list_empty(&res->dirty) ? "no" : "yes", + list_empty(&res->recovering) ? "no" : "yes", + res->migration_pending ? "yes" : "no"); + printk(" inflight locks: %d, asts reserved: %d\n", + res->inflight_locks, atomic_read(&res->asts_reserved)); dlm_print_lockres_refmap(res); - mlog(ML_NOTICE, " granted queue: \n"); + printk(" granted queue:\n"); list_for_each(iter2, &res->granted) { lock = list_entry(iter2, struct dlm_lock, list); - spin_lock(&lock->spinlock); - mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " - "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", - lock->ml.type, lock->ml.convert_type, lock->ml.node, - dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), - list_empty(&lock->ast_list) ? 'y' : 'n', - lock->ast_pending ? 'y' : 'n', - list_empty(&lock->bast_list) ? 'y' : 'n', - lock->bast_pending ? 'y' : 'n'); - spin_unlock(&lock->spinlock); + __dlm_print_lock(lock); } - mlog(ML_NOTICE, " converting queue: \n"); + printk(" converting queue:\n"); list_for_each(iter2, &res->converting) { lock = list_entry(iter2, struct dlm_lock, list); - spin_lock(&lock->spinlock); - mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " - "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", - lock->ml.type, lock->ml.convert_type, lock->ml.node, - dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), - list_empty(&lock->ast_list) ? 'y' : 'n', - lock->ast_pending ? 'y' : 'n', - list_empty(&lock->bast_list) ? 'y' : 'n', - lock->bast_pending ? 'y' : 'n'); - spin_unlock(&lock->spinlock); + __dlm_print_lock(lock); } - mlog(ML_NOTICE, " blocked queue: \n"); + printk(" blocked queue:\n"); list_for_each(iter2, &res->blocked) { lock = list_entry(iter2, struct dlm_lock, list); - spin_lock(&lock->spinlock); - mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " - "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", - lock->ml.type, lock->ml.convert_type, lock->ml.node, - dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), - dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), - list_empty(&lock->ast_list) ? 'y' : 'n', - lock->ast_pending ? 'y' : 'n', - list_empty(&lock->bast_list) ? 'y' : 'n', - lock->bast_pending ? 'y' : 'n'); - spin_unlock(&lock->spinlock); + __dlm_print_lock(lock); } } @@ -136,31 +139,6 @@ void dlm_print_one_lock(struct dlm_lock *lockid) } EXPORT_SYMBOL_GPL(dlm_print_one_lock); -#if 0 -void dlm_dump_lock_resources(struct dlm_ctxt *dlm) -{ - struct dlm_lock_resource *res; - struct hlist_node *iter; - struct hlist_head *bucket; - int i; - - mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n", - dlm->name, dlm->node_num, dlm->key); - if (!dlm || !dlm->name) { - mlog(ML_ERROR, "dlm=%p\n", dlm); - return; - } - - spin_lock(&dlm->spinlock); - for (i=0; i<DLM_HASH_BUCKETS; i++) { - bucket = dlm_lockres_hash(dlm, i); - hlist_for_each_entry(res, iter, bucket, hash_node) - dlm_print_one_lock_resource(res); - } - spin_unlock(&dlm->spinlock); -} -#endif /* 0 */ - static const char *dlm_errnames[] = { [DLM_NORMAL] = "DLM_NORMAL", [DLM_GRANTED] = "DLM_GRANTED", @@ -266,3 +244,792 @@ const char *dlm_errname(enum dlm_status err) return dlm_errnames[err]; } EXPORT_SYMBOL_GPL(dlm_errname); + +/* NOTE: This function converts a lockname into a string. It uses knowledge + * of the format of the lockname that should be outside the purview of the dlm. + * We are adding only to make dlm debugging slightly easier. + * + * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h. + */ +int stringify_lockname(const char *lockname, int locklen, char *buf, int len) +{ + int out = 0; + __be64 inode_blkno_be; + +#define OCFS2_DENTRY_LOCK_INO_START 18 + if (*lockname == 'N') { + memcpy((__be64 *)&inode_blkno_be, + (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START], + sizeof(__be64)); + out += snprintf(buf + out, len - out, "%.*s%08x", + OCFS2_DENTRY_LOCK_INO_START - 1, lockname, + (unsigned int)be64_to_cpu(inode_blkno_be)); + } else + out += snprintf(buf + out, len - out, "%.*s", + locklen, lockname); + return out; +} + +static int stringify_nodemap(unsigned long *nodemap, int maxnodes, + char *buf, int len) +{ + int out = 0; + int i = -1; + + while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes) + out += snprintf(buf + out, len - out, "%d ", i); + + return out; +} + +static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) +{ + int out = 0; + unsigned int namelen; + const char *name; + char *mle_type; + + if (mle->type != DLM_MLE_MASTER) { + namelen = mle->u.name.len; + name = mle->u.name.name; + } else { + namelen = mle->u.res->lockname.len; + name = mle->u.res->lockname.name; + } + + if (mle->type == DLM_MLE_BLOCK) + mle_type = "BLK"; + else if (mle->type == DLM_MLE_MASTER) + mle_type = "MAS"; + else + mle_type = "MIG"; + + out += stringify_lockname(name, namelen, buf + out, len - out); + out += snprintf(buf + out, len - out, + "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", + mle_type, mle->master, mle->new_master, + !list_empty(&mle->hb_events), + !!mle->inuse, + atomic_read(&mle->mle_refs.refcount)); + + out += snprintf(buf + out, len - out, "Maybe="); + out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "Vote="); + out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "Response="); + out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "Node="); + out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + out += snprintf(buf + out, len - out, "\n"); + + return out; +} + +void dlm_print_one_mle(struct dlm_master_list_entry *mle) +{ + char *buf; + + buf = (char *) get_zeroed_page(GFP_NOFS); + if (buf) { + dump_mle(mle, buf, PAGE_SIZE - 1); + free_page((unsigned long)buf); + } +} + +#ifdef CONFIG_DEBUG_FS + +static struct dentry *dlm_debugfs_root = NULL; + +#define DLM_DEBUGFS_DIR "o2dlm" +#define DLM_DEBUGFS_DLM_STATE "dlm_state" +#define DLM_DEBUGFS_LOCKING_STATE "locking_state" +#define DLM_DEBUGFS_MLE_STATE "mle_state" +#define DLM_DEBUGFS_PURGE_LIST "purge_list" + +/* begin - utils funcs */ +static void dlm_debug_free(struct kref *kref) +{ + struct dlm_debug_ctxt *dc; + + dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt); + + kfree(dc); +} + +void dlm_debug_put(struct dlm_debug_ctxt *dc) +{ + if (dc) + kref_put(&dc->debug_refcnt, dlm_debug_free); +} + +static void dlm_debug_get(struct dlm_debug_ctxt *dc) +{ + kref_get(&dc->debug_refcnt); +} + +static struct debug_buffer *debug_buffer_allocate(void) +{ + struct debug_buffer *db = NULL; + + db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL); + if (!db) + goto bail; + + db->len = PAGE_SIZE; + db->buf = kmalloc(db->len, GFP_KERNEL); + if (!db->buf) + goto bail; + + return db; +bail: + kfree(db); + return NULL; +} + +static ssize_t debug_buffer_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + struct debug_buffer *db = file->private_data; + + return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len); +} + +static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence) +{ + struct debug_buffer *db = file->private_data; + loff_t new = -1; + + switch (whence) { + case 0: + new = off; + break; + case 1: + new = file->f_pos + off; + break; + } + + if (new < 0 || new > db->len) + return -EINVAL; + + return (file->f_pos = new); +} + +static int debug_buffer_release(struct inode *inode, struct file *file) +{ + struct debug_buffer *db = (struct debug_buffer *)file->private_data; + + if (db) + kfree(db->buf); + kfree(db); + + return 0; +} +/* end - util funcs */ + +/* begin - purge list funcs */ +static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) +{ + struct dlm_lock_resource *res; + int out = 0; + unsigned long total = 0; + + out += snprintf(db->buf + out, db->len - out, + "Dumping Purgelist for Domain: %s\n", dlm->name); + + spin_lock(&dlm->spinlock); + list_for_each_entry(res, &dlm->purge_list, purge) { + ++total; + if (db->len - out < 100) + continue; + spin_lock(&res->spinlock); + out += stringify_lockname(res->lockname.name, + res->lockname.len, + db->buf + out, db->len - out); + out += snprintf(db->buf + out, db->len - out, "\t%ld\n", + (jiffies - res->last_used)/HZ); + spin_unlock(&res->spinlock); + } + spin_unlock(&dlm->spinlock); + + out += snprintf(db->buf + out, db->len - out, + "Total on list: %ld\n", total); + + return out; +} + +static int debug_purgelist_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + struct debug_buffer *db; + + db = debug_buffer_allocate(); + if (!db) + goto bail; + + db->len = debug_purgelist_print(dlm, db); + + file->private_data = db; + + return 0; +bail: + return -ENOMEM; +} + +static struct file_operations debug_purgelist_fops = { + .open = debug_purgelist_open, + .release = debug_buffer_release, + .read = debug_buffer_read, + .llseek = debug_buffer_llseek, +}; +/* end - purge list funcs */ + +/* begin - debug mle funcs */ +static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) +{ + struct dlm_master_list_entry *mle; + int out = 0; + unsigned long total = 0; + + out += snprintf(db->buf + out, db->len - out, + "Dumping MLEs for Domain: %s\n", dlm->name); + + spin_lock(&dlm->master_lock); + list_for_each_entry(mle, &dlm->master_list, list) { + ++total; + if (db->len - out < 200) + continue; + out += dump_mle(mle, db->buf + out, db->len - out); + } + spin_unlock(&dlm->master_lock); + + out += snprintf(db->buf + out, db->len - out, + "Total on list: %ld\n", total); + return out; +} + +static int debug_mle_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + struct debug_buffer *db; + + db = debug_buffer_allocate(); + if (!db) + goto bail; + + db->len = debug_mle_print(dlm, db); + + file->private_data = db; + + return 0; +bail: + return -ENOMEM; +} + +static struct file_operations debug_mle_fops = { + .open = debug_mle_open, + .release = debug_buffer_release, + .read = debug_buffer_read, + .llseek = debug_buffer_llseek, +}; + +/* end - debug mle funcs */ + +/* begin - debug lockres funcs */ +static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len) +{ + int out; + +#define DEBUG_LOCK_VERSION 1 + spin_lock(&lock->spinlock); + out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d," + "%d,%d,%d,%d\n", + DEBUG_LOCK_VERSION, + list_type, lock->ml.type, lock->ml.convert_type, + lock->ml.node, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + !list_empty(&lock->ast_list), + !list_empty(&lock->bast_list), + lock->ast_pending, lock->bast_pending, + lock->convert_pending, lock->lock_pending, + lock->cancel_pending, lock->unlock_pending, + atomic_read(&lock->lock_refs.refcount)); + spin_unlock(&lock->spinlock); + + return out; +} + +static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) +{ + struct dlm_lock *lock; + int i; + int out = 0; + + out += snprintf(buf + out, len - out, "NAME:"); + out += stringify_lockname(res->lockname.name, res->lockname.len, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + +#define DEBUG_LRES_VERSION 1 + out += snprintf(buf + out, len - out, + "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n", + DEBUG_LRES_VERSION, + res->owner, res->state, res->last_used, + !list_empty(&res->purge), + !list_empty(&res->dirty), + !list_empty(&res->recovering), + res->inflight_locks, res->migration_pending, + atomic_read(&res->asts_reserved), + atomic_read(&res->refs.refcount)); + + /* refmap */ + out += snprintf(buf + out, len - out, "RMAP:"); + out += stringify_nodemap(res->refmap, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + + /* lvb */ + out += snprintf(buf + out, len - out, "LVBX:"); + for (i = 0; i < DLM_LVB_LEN; i++) + out += snprintf(buf + out, len - out, + "%02x", (unsigned char)res->lvb[i]); + out += snprintf(buf + out, len - out, "\n"); + + /* granted */ + list_for_each_entry(lock, &res->granted, list) + out += dump_lock(lock, 0, buf + out, len - out); + + /* converting */ + list_for_each_entry(lock, &res->converting, list) + out += dump_lock(lock, 1, buf + out, len - out); + + /* blocked */ + list_for_each_entry(lock, &res->blocked, list) + out += dump_lock(lock, 2, buf + out, len - out); + + out += snprintf(buf + out, len - out, "\n"); + + return out; +} + +static void *lockres_seq_start(struct seq_file *m, loff_t *pos) +{ + struct debug_lockres *dl = m->private; + struct dlm_ctxt *dlm = dl->dl_ctxt; + struct dlm_lock_resource *res = NULL; + + spin_lock(&dlm->spinlock); + + if (dl->dl_res) { + list_for_each_entry(res, &dl->dl_res->tracking, tracking) { + if (dl->dl_res) { + dlm_lockres_put(dl->dl_res); + dl->dl_res = NULL; + } + if (&res->tracking == &dlm->tracking_list) { + mlog(0, "End of list found, %p\n", res); + dl = NULL; + break; + } + dlm_lockres_get(res); + dl->dl_res = res; + break; + } + } else { + if (!list_empty(&dlm->tracking_list)) { + list_for_each_entry(res, &dlm->tracking_list, tracking) + break; + dlm_lockres_get(res); + dl->dl_res = res; + } else + dl = NULL; + } + + if (dl) { + spin_lock(&dl->dl_res->spinlock); + dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1); + spin_unlock(&dl->dl_res->spinlock); + } + + spin_unlock(&dlm->spinlock); + + return dl; +} + +static void lockres_seq_stop(struct seq_file *m, void *v) +{ +} + +static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return NULL; +} + +static int lockres_seq_show(struct seq_file *s, void *v) +{ + struct debug_lockres *dl = (struct debug_lockres *)v; + + seq_printf(s, "%s", dl->dl_buf); + + return 0; +} + +static struct seq_operations debug_lockres_ops = { + .start = lockres_seq_start, + .stop = lockres_seq_stop, + .next = lockres_seq_next, + .show = lockres_seq_show, +}; + +static int debug_lockres_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + int ret = -ENOMEM; + struct seq_file *seq; + struct debug_lockres *dl = NULL; + + dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL); + if (!dl) { + mlog_errno(ret); + goto bail; + } + + dl->dl_len = PAGE_SIZE; + dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL); + if (!dl->dl_buf) { + mlog_errno(ret); + goto bail; + } + + ret = seq_open(file, &debug_lockres_ops); + if (ret) { + mlog_errno(ret); + goto bail; + } + + seq = (struct seq_file *) file->private_data; + seq->private = dl; + + dlm_grab(dlm); + dl->dl_ctxt = dlm; + + return 0; +bail: + if (dl) + kfree(dl->dl_buf); + kfree(dl); + return ret; +} + +static int debug_lockres_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = (struct seq_file *)file->private_data; + struct debug_lockres *dl = (struct debug_lockres *)seq->private; + + if (dl->dl_res) + dlm_lockres_put(dl->dl_res); + dlm_put(dl->dl_ctxt); + kfree(dl->dl_buf); + return seq_release_private(inode, file); +} + +static struct file_operations debug_lockres_fops = { + .open = debug_lockres_open, + .release = debug_lockres_release, + .read = seq_read, + .llseek = seq_lseek, +}; +/* end - debug lockres funcs */ + +/* begin - debug state funcs */ +static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) +{ + int out = 0; + struct dlm_reco_node_data *node; + char *state; + int lres, rres, ures, tres; + + lres = atomic_read(&dlm->local_resources); + rres = atomic_read(&dlm->remote_resources); + ures = atomic_read(&dlm->unknown_resources); + tres = lres + rres + ures; + + spin_lock(&dlm->spinlock); + + switch (dlm->dlm_state) { + case DLM_CTXT_NEW: + state = "NEW"; break; + case DLM_CTXT_JOINED: + state = "JOINED"; break; + case DLM_CTXT_IN_SHUTDOWN: + state = "SHUTDOWN"; break; + case DLM_CTXT_LEAVING: + state = "LEAVING"; break; + default: + state = "UNKNOWN"; break; + } + + /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ + out += snprintf(db->buf + out, db->len - out, + "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key); + + /* Thread Pid: xxx Node: xxx State: xxxxx */ + out += snprintf(db->buf + out, db->len - out, + "Thread Pid: %d Node: %d State: %s\n", + dlm->dlm_thread_task->pid, dlm->node_num, state); + + /* Number of Joins: xxx Joining Node: xxx */ + out += snprintf(db->buf + out, db->len - out, + "Number of Joins: %d Joining Node: %d\n", + dlm->num_joins, dlm->joining_node); + + /* Domain Map: xx xx xx */ + out += snprintf(db->buf + out, db->len - out, "Domain Map: "); + out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, + db->buf + out, db->len - out); + out += snprintf(db->buf + out, db->len - out, "\n"); + + /* Live Map: xx xx xx */ + out += snprintf(db->buf + out, db->len - out, "Live Map: "); + out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, + db->buf + out, db->len - out); + out += snprintf(db->buf + out, db->len - out, "\n"); + + /* Mastered Resources Total: xxx Locally: xxx Remotely: ... */ + out += snprintf(db->buf + out, db->len - out, + "Mastered Resources Total: %d Locally: %d " + "Remotely: %d Unknown: %d\n", + tres, lres, rres, ures); + + /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ + out += snprintf(db->buf + out, db->len - out, + "Lists: Dirty=%s Purge=%s PendingASTs=%s " + "PendingBASTs=%s Master=%s\n", + (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), + (list_empty(&dlm->purge_list) ? "Empty" : "InUse"), + (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"), + (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"), + (list_empty(&dlm->master_list) ? "Empty" : "InUse")); + + /* Purge Count: xxx Refs: xxx */ + out += snprintf(db->buf + out, db->len - out, + "Purge Count: %d Refs: %d\n", dlm->purge_count, + atomic_read(&dlm->dlm_refs.refcount)); + + /* Dead Node: xxx */ + out += snprintf(db->buf + out, db->len - out, + "Dead Node: %d\n", dlm->reco.dead_node); + + /* What about DLM_RECO_STATE_FINALIZE? */ + if (dlm->reco.state == DLM_RECO_STATE_ACTIVE) + state = "ACTIVE"; + else + state = "INACTIVE"; + + /* Recovery Pid: xxxx Master: xxx State: xxxx */ + out += snprintf(db->buf + out, db->len - out, + "Recovery Pid: %d Master: %d State: %s\n", + dlm->dlm_reco_thread_task->pid, + dlm->reco.new_master, state); + + /* Recovery Map: xx xx */ + out += snprintf(db->buf + out, db->len - out, "Recovery Map: "); + out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, + db->buf + out, db->len - out); + out += snprintf(db->buf + out, db->len - out, "\n"); + + /* Recovery Node State: */ + out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n"); + list_for_each_entry(node, &dlm->reco.node_data, list) { + switch (node->state) { + case DLM_RECO_NODE_DATA_INIT: + state = "INIT"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + state = "REQUESTING"; + break; + case DLM_RECO_NODE_DATA_DEAD: + state = "DEAD"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + state = "RECEIVING"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + state = "REQUESTED"; + break; + case DLM_RECO_NODE_DATA_DONE: + state = "DONE"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + state = "FINALIZE-SENT"; + break; + default: + state = "BAD"; + break; + } + out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n", + node->node_num, state); + } + + spin_unlock(&dlm->spinlock); + + return out; +} + +static int debug_state_open(struct inode *inode, struct file *file) +{ + struct dlm_ctxt *dlm = inode->i_private; + struct debug_buffer *db = NULL; + + db = debug_buffer_allocate(); + if (!db) + goto bail; + + db->len = debug_state_print(dlm, db); + + file->private_data = db; + + return 0; +bail: + return -ENOMEM; +} + +static struct file_operations debug_state_fops = { + .open = debug_state_open, + .release = debug_buffer_release, + .read = debug_buffer_read, + .llseek = debug_buffer_llseek, +}; +/* end - debug state funcs */ + +/* files in subroot */ +int dlm_debug_init(struct dlm_ctxt *dlm) +{ + struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; + + /* for dumping dlm_ctxt */ + dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_state_fops); + if (!dc->debug_state_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + /* for dumping lockres */ + dc->debug_lockres_dentry = + debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_lockres_fops); + if (!dc->debug_lockres_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + /* for dumping mles */ + dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_mle_fops); + if (!dc->debug_mle_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + /* for dumping lockres on the purge list */ + dc->debug_purgelist_dentry = + debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, + S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, + dlm, &debug_purgelist_fops); + if (!dc->debug_purgelist_dentry) { + mlog_errno(-ENOMEM); + goto bail; + } + + dlm_debug_get(dc); + return 0; + +bail: + dlm_debug_shutdown(dlm); + return -ENOMEM; +} + +void dlm_debug_shutdown(struct dlm_ctxt *dlm) +{ + struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; + + if (dc) { + if (dc->debug_purgelist_dentry) + debugfs_remove(dc->debug_purgelist_dentry); + if (dc->debug_mle_dentry) + debugfs_remove(dc->debug_mle_dentry); + if (dc->debug_lockres_dentry) + debugfs_remove(dc->debug_lockres_dentry); + if (dc->debug_state_dentry) + debugfs_remove(dc->debug_state_dentry); + dlm_debug_put(dc); + } +} + +/* subroot - domain dir */ +int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +{ + dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, + dlm_debugfs_root); + if (!dlm->dlm_debugfs_subroot) { + mlog_errno(-ENOMEM); + goto bail; + } + + dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), + GFP_KERNEL); + if (!dlm->dlm_debug_ctxt) { + mlog_errno(-ENOMEM); + goto bail; + } + kref_init(&dlm->dlm_debug_ctxt->debug_refcnt); + + return 0; +bail: + dlm_destroy_debugfs_subroot(dlm); + return -ENOMEM; +} + +void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_debugfs_subroot) + debugfs_remove(dlm->dlm_debugfs_subroot); +} + +/* debugfs root */ +int dlm_create_debugfs_root(void) +{ + dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL); + if (!dlm_debugfs_root) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + return 0; +} + +void dlm_destroy_debugfs_root(void) +{ + if (dlm_debugfs_root) + debugfs_remove(dlm_debugfs_root); +} +#endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h new file mode 100644 index 0000000..d34a62a --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -0,0 +1,86 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdebug.h + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMDEBUG_H +#define DLMDEBUG_H + +void dlm_print_one_mle(struct dlm_master_list_entry *mle); + +#ifdef CONFIG_DEBUG_FS + +struct dlm_debug_ctxt { + struct kref debug_refcnt; + struct dentry *debug_state_dentry; + struct dentry *debug_lockres_dentry; + struct dentry *debug_mle_dentry; + struct dentry *debug_purgelist_dentry; +}; + +struct debug_buffer { + int len; + char *buf; +}; + +struct debug_lockres { + int dl_len; + char *dl_buf; + struct dlm_ctxt *dl_ctxt; + struct dlm_lock_resource *dl_res; +}; + +int dlm_debug_init(struct dlm_ctxt *dlm); +void dlm_debug_shutdown(struct dlm_ctxt *dlm); + +int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); +void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); + +int dlm_create_debugfs_root(void); +void dlm_destroy_debugfs_root(void); + +#else + +static int dlm_debug_init(struct dlm_ctxt *dlm) +{ + return 0; +} +static void dlm_debug_shutdown(struct dlm_ctxt *dlm) +{ +} +static int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +{ + return 0; +} +static void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) +{ +} +static int dlm_create_debugfs_root(void) +{ + return 0; +} +static void dlm_destroy_debugfs_root(void) +{ +} + +#endif /* CONFIG_DEBUG_FS */ +#endif /* DLMDEBUG_H */ diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 0879d86..63f8125 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -33,6 +33,7 @@ #include <linux/spinlock.h> #include <linux/delay.h> #include <linux/err.h> +#include <linux/debugfs.h> #include "cluster/heartbeat.h" #include "cluster/nodemanager.h" @@ -40,8 +41,8 @@ #include "dlmapi.h" #include "dlmcommon.h" - #include "dlmdomain.h" +#include "dlmdebug.h" #include "dlmver.h" @@ -298,6 +299,8 @@ static int dlm_wait_on_domain_helper(const char *domain) static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) { + dlm_destroy_debugfs_subroot(dlm); + if (dlm->lockres_hash) dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); @@ -395,6 +398,7 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) { dlm_unregister_domain_handlers(dlm); + dlm_debug_shutdown(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); dlm_destroy_dlm_worker(dlm); @@ -644,6 +648,7 @@ int dlm_shutting_down(struct dlm_ctxt *dlm) void dlm_unregister_domain(struct dlm_ctxt *dlm) { int leave = 0; + struct dlm_lock_resource *res; spin_lock(&dlm_domain_lock); BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); @@ -673,6 +678,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) msleep(500); mlog(0, "%s: more migration to do\n", dlm->name); } + + /* This list should be empty. If not, print remaining lockres */ + if (!list_empty(&dlm->tracking_list)) { + mlog(ML_ERROR, "Following lockres' are still on the " + "tracking list:\n"); + list_for_each_entry(res, &dlm->tracking_list, tracking) + dlm_print_one_lock_resource(res); + } + dlm_mark_domain_leaving(dlm); dlm_leave_domain(dlm); dlm_complete_dlm_shutdown(dlm); @@ -1405,6 +1419,12 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } + status = dlm_debug_init(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = dlm_launch_thread(dlm); if (status < 0) { mlog_errno(status); @@ -1472,6 +1492,7 @@ bail: if (status) { dlm_unregister_domain_handlers(dlm); + dlm_debug_shutdown(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); dlm_destroy_dlm_worker(dlm); @@ -1484,6 +1505,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, u32 key) { int i; + int ret; struct dlm_ctxt *dlm = NULL; dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); @@ -1516,6 +1538,15 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->key = key; dlm->node_num = o2nm_this_node(); + ret = dlm_create_debugfs_subroot(dlm); + if (ret < 0) { + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + goto leave; + } + spin_lock_init(&dlm->spinlock); spin_lock_init(&dlm->master_lock); spin_lock_init(&dlm->ast_lock); @@ -1526,6 +1557,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, INIT_LIST_HEAD(&dlm->reco.node_data); INIT_LIST_HEAD(&dlm->purge_list); INIT_LIST_HEAD(&dlm->dlm_domain_handlers); + INIT_LIST_HEAD(&dlm->tracking_list); dlm->reco.state = 0; INIT_LIST_HEAD(&dlm->pending_asts); @@ -1816,21 +1848,49 @@ static int __init dlm_init(void) dlm_print_version(); status = dlm_init_mle_cache(); - if (status) - return -1; + if (status) { + mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); + goto error; + } + + status = dlm_init_master_caches(); + if (status) { + mlog(ML_ERROR, "Could not create o2dlm_lockres and " + "o2dlm_lockname slabcaches\n"); + goto error; + } + + status = dlm_init_lock_cache(); + if (status) { + mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n"); + goto error; + } status = dlm_register_net_handlers(); if (status) { - dlm_destroy_mle_cache(); - return -1; + mlog(ML_ERROR, "Unable to register network handlers\n"); + goto error; } + status = dlm_create_debugfs_root(); + if (status) + goto error; + return 0; +error: + dlm_unregister_net_handlers(); + dlm_destroy_lock_cache(); + dlm_destroy_master_caches(); + dlm_destroy_mle_cache(); + return -1; } static void __exit dlm_exit (void) { + dlm_destroy_debugfs_root(); dlm_unregister_net_handlers(); + dlm_destroy_lock_cache(); + dlm_destroy_master_caches(); dlm_destroy_mle_cache(); } diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 52578d9..83a9f29 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -53,6 +53,8 @@ #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" +static struct kmem_cache *dlm_lock_cache = NULL; + static DEFINE_SPINLOCK(dlm_cookie_lock); static u64 dlm_next_cookie = 1; @@ -64,6 +66,22 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type, static void dlm_lock_release(struct kref *kref); static void dlm_lock_detach_lockres(struct dlm_lock *lock); +int dlm_init_lock_cache(void) +{ + dlm_lock_cache = kmem_cache_create("o2dlm_lock", + sizeof(struct dlm_lock), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (dlm_lock_cache == NULL) + return -ENOMEM; + return 0; +} + +void dlm_destroy_lock_cache(void) +{ + if (dlm_lock_cache) + kmem_cache_destroy(dlm_lock_cache); +} + /* Tell us whether we can grant a new lock request. * locking: * caller needs: res->spinlock @@ -353,7 +371,7 @@ static void dlm_lock_release(struct kref *kref) mlog(0, "freeing kernel-allocated lksb\n"); kfree(lock->lksb); } - kfree(lock); + kmem_cache_free(dlm_lock_cache, lock); } /* associate a lock with it's lockres, getting a ref on the lockres */ @@ -412,7 +430,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, struct dlm_lock *lock; int kernel_allocated = 0; - lock = kzalloc(sizeof(*lock), GFP_NOFS); + lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); if (!lock) return NULL; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index ea6b895..efc015c 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -48,47 +48,11 @@ #include "dlmapi.h" #include "dlmcommon.h" #include "dlmdomain.h" +#include "dlmdebug.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) #include "cluster/masklog.h" -enum dlm_mle_type { - DLM_MLE_BLOCK, - DLM_MLE_MASTER, - DLM_MLE_MIGRATION -}; - -struct dlm_lock_name -{ - u8 len; - u8 name[DLM_LOCKID_NAME_MAX]; -}; - -struct dlm_master_list_entry -{ - struct list_head list; - struct list_head hb_events; - struct dlm_ctxt *dlm; - spinlock_t spinlock; - wait_queue_head_t wq; - atomic_t woken; - struct kref mle_refs; - int inuse; - unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - u8 master; - u8 new_master; - enum dlm_mle_type type; - struct o2hb_callback_func mle_hb_up; - struct o2hb_callback_func mle_hb_down; - union { - struct dlm_lock_resource *res; - struct dlm_lock_name name; - } u; -}; - static void dlm_mle_node_down(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle, struct o2nm_node *node, @@ -128,98 +92,10 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, return 1; } -#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) -static void _dlm_print_nodemap(unsigned long *map, const char *mapname) -{ - int i; - printk("%s=[ ", mapname); - for (i=0; i<O2NM_MAX_NODES; i++) - if (test_bit(i, map)) - printk("%d ", i); - printk("]"); -} - -static void dlm_print_one_mle(struct dlm_master_list_entry *mle) -{ - int refs; - char *type; - char attached; - u8 master; - unsigned int namelen; - const char *name; - struct kref *k; - unsigned long *maybe = mle->maybe_map, - *vote = mle->vote_map, - *resp = mle->response_map, - *node = mle->node_map; - - k = &mle->mle_refs; - if (mle->type == DLM_MLE_BLOCK) - type = "BLK"; - else if (mle->type == DLM_MLE_MASTER) - type = "MAS"; - else - type = "MIG"; - refs = atomic_read(&k->refcount); - master = mle->master; - attached = (list_empty(&mle->hb_events) ? 'N' : 'Y'); - - if (mle->type != DLM_MLE_MASTER) { - namelen = mle->u.name.len; - name = mle->u.name.name; - } else { - namelen = mle->u.res->lockname.len; - name = mle->u.res->lockname.name; - } - - mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ", - namelen, name, type, refs, master, mle->new_master, attached, - mle->inuse); - dlm_print_nodemap(maybe); - printk(", "); - dlm_print_nodemap(vote); - printk(", "); - dlm_print_nodemap(resp); - printk(", "); - dlm_print_nodemap(node); - printk(", "); - printk("\n"); -} - -#if 0 -/* Code here is included but defined out as it aids debugging */ - -static void dlm_dump_mles(struct dlm_ctxt *dlm) -{ - struct dlm_master_list_entry *mle; - - mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); - spin_lock(&dlm->master_lock); - list_for_each_entry(mle, &dlm->master_list, list) - dlm_print_one_mle(mle); - spin_unlock(&dlm->master_lock); -} - -int dlm_dump_all_mles(const char __user *data, unsigned int len) -{ - struct dlm_ctxt *dlm; - - spin_lock(&dlm_domain_lock); - list_for_each_entry(dlm, &dlm_domains, list) { - mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); - dlm_dump_mles(dlm); - } - spin_unlock(&dlm_domain_lock); - return len; -} -EXPORT_SYMBOL_GPL(dlm_dump_all_mles); - -#endif /* 0 */ - - +static struct kmem_cache *dlm_lockres_cache = NULL; +static struct kmem_cache *dlm_lockname_cache = NULL; static struct kmem_cache *dlm_mle_cache = NULL; - static void dlm_mle_release(struct kref *kref); static void dlm_init_mle(struct dlm_master_list_entry *mle, enum dlm_mle_type type, @@ -507,7 +383,7 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm, int dlm_init_mle_cache(void) { - dlm_mle_cache = kmem_cache_create("dlm_mle_cache", + dlm_mle_cache = kmem_cache_create("o2dlm_mle", sizeof(struct dlm_master_list_entry), 0, SLAB_HWCACHE_ALIGN, NULL); @@ -560,6 +436,35 @@ static void dlm_mle_release(struct kref *kref) * LOCK RESOURCE FUNCTIONS */ +int dlm_init_master_caches(void) +{ + dlm_lockres_cache = kmem_cache_create("o2dlm_lockres", + sizeof(struct dlm_lock_resource), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!dlm_lockres_cache) + goto bail; + + dlm_lockname_cache = kmem_cache_create("o2dlm_lockname", + DLM_LOCKID_NAME_MAX, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!dlm_lockname_cache) + goto bail; + + return 0; +bail: + dlm_destroy_master_caches(); + return -ENOMEM; +} + +void dlm_destroy_master_caches(void) +{ + if (dlm_lockname_cache) + kmem_cache_destroy(dlm_lockname_cache); + + if (dlm_lockres_cache) + kmem_cache_destroy(dlm_lockres_cache); +} + static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 owner) @@ -610,6 +515,14 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", + res->lockname.len, res->lockname.name); + dlm_print_one_lock_resource(res); + } + if (!hlist_unhashed(&res->hash_node) || !list_empty(&res->granted) || !list_empty(&res->converting) || @@ -642,9 +555,9 @@ static void dlm_lockres_release(struct kref *kref) BUG_ON(!list_empty(&res->recovering)); BUG_ON(!list_empty(&res->purge)); - kfree(res->lockname.name); + kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); - kfree(res); + kmem_cache_free(dlm_lockres_cache, res); } void dlm_lockres_put(struct dlm_lock_resource *res) @@ -677,6 +590,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, INIT_LIST_HEAD(&res->dirty); INIT_LIST_HEAD(&res->recovering); INIT_LIST_HEAD(&res->purge); + INIT_LIST_HEAD(&res->tracking); atomic_set(&res->asts_reserved, 0); res->migration_pending = 0; res->inflight_locks = 0; @@ -692,6 +606,8 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->last_used = 0; + list_add_tail(&res->tracking, &dlm->tracking_list); + memset(res->lvb, 0, DLM_LVB_LEN); memset(res->refmap, 0, sizeof(res->refmap)); } @@ -700,20 +616,28 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int namelen) { - struct dlm_lock_resource *res; + struct dlm_lock_resource *res = NULL; - res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); + res = (struct dlm_lock_resource *) + kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); if (!res) - return NULL; + goto error; - res->lockname.name = kmalloc(namelen, GFP_NOFS); - if (!res->lockname.name) { - kfree(res); - return NULL; - } + res->lockname.name = (char *) + kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); + if (!res->lockname.name) + goto error; dlm_init_lockres(dlm, res, name, namelen); return res; + +error: + if (res && res->lockname.name) + kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); + + if (res) + kmem_cache_free(dlm_lockres_cache, res); + return NULL; } void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1f1873b..394d25a 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -27,18 +27,11 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/mm.h> -#include <linux/crc32.h> #include <linux/kthread.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/seq_file.h> -#include <cluster/heartbeat.h> -#include <cluster/nodemanager.h> -#include <cluster/tcp.h> - -#include <dlm/dlmapi.h> - #define MLOG_MASK_PREFIX ML_DLM_GLUE #include <cluster/masklog.h> @@ -53,6 +46,7 @@ #include "heartbeat.h" #include "inode.h" #include "journal.h" +#include "stackglue.h" #include "slot_map.h" #include "super.h" #include "uptodate.h" @@ -113,7 +107,8 @@ static void ocfs2_dump_meta_lvb_info(u64 level, unsigned int line, struct ocfs2_lock_res *lockres) { - struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + struct ocfs2_meta_lvb *lvb = + (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); mlog(level, "LVB information for %s (called from %s:%u):\n", lockres->l_name, function, line); @@ -259,31 +254,6 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = { .flags = 0, }; -/* - * This is the filesystem locking protocol version. - * - * Whenever the filesystem does new things with locks (adds or removes a - * lock, orders them differently, does different things underneath a lock), - * the version must be changed. The protocol is negotiated when joining - * the dlm domain. A node may join the domain if its major version is - * identical to all other nodes and its minor version is greater than - * or equal to all other nodes. When its minor version is greater than - * the other nodes, it will run at the minor version specified by the - * other nodes. - * - * If a locking change is made that will not be compatible with older - * versions, the major number must be increased and the minor version set - * to zero. If a change merely adds a behavior that can be disabled when - * speaking to older versions, the minor version must be increased. If a - * change adds a fully backwards compatible change (eg, LVB changes that - * are just ignored by older versions), the version does not need to be - * updated. - */ -const struct dlm_protocol_version ocfs2_locking_protocol = { - .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, - .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, -}; - static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) { return lockres->l_type == OCFS2_LOCK_TYPE_META || @@ -316,7 +286,7 @@ static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *l static int ocfs2_lock_create(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres, int level, - int dlm_flags); + u32 dlm_flags); static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, int wanted); static void ocfs2_cluster_unlock(struct ocfs2_super *osb, @@ -330,10 +300,9 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, int convert); -#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \ - mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ - "resource %s: %s\n", dlm_errname(_stat), _func, \ - _lockres->l_name, dlm_errmsg(_stat)); \ +#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ + _err, _func, _lockres->l_name); \ } while (0) static int ocfs2_downconvert_thread(void *arg); static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, @@ -342,12 +311,13 @@ static int ocfs2_inode_lock_update(struct inode *inode, struct buffer_head **bh); static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); static inline int ocfs2_highest_compat_lock_level(int level); -static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, - int new_level); +static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, + int new_level); static int ocfs2_downconvert_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres, int new_level, - int lvb); + int lvb, + unsigned int generation); static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); static int ocfs2_cancel_convert(struct ocfs2_super *osb, @@ -406,9 +376,9 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, res->l_ops = ops; res->l_priv = priv; - res->l_level = LKM_IVMODE; - res->l_requested = LKM_IVMODE; - res->l_blocking = LKM_IVMODE; + res->l_level = DLM_LOCK_IV; + res->l_requested = DLM_LOCK_IV; + res->l_blocking = DLM_LOCK_IV; res->l_action = OCFS2_AST_INVALID; res->l_unlock_action = OCFS2_UNLOCK_INVALID; @@ -604,10 +574,10 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, BUG_ON(!lockres); switch(level) { - case LKM_EXMODE: + case DLM_LOCK_EX: lockres->l_ex_holders++; break; - case LKM_PRMODE: + case DLM_LOCK_PR: lockres->l_ro_holders++; break; default: @@ -625,11 +595,11 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, BUG_ON(!lockres); switch(level) { - case LKM_EXMODE: + case DLM_LOCK_EX: BUG_ON(!lockres->l_ex_holders); lockres->l_ex_holders--; break; - case LKM_PRMODE: + case DLM_LOCK_PR: BUG_ON(!lockres->l_ro_holders); lockres->l_ro_holders--; break; @@ -644,12 +614,12 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, * lock types are added. */ static inline int ocfs2_highest_compat_lock_level(int level) { - int new_level = LKM_EXMODE; + int new_level = DLM_LOCK_EX; - if (level == LKM_EXMODE) - new_level = LKM_NLMODE; - else if (level == LKM_PRMODE) - new_level = LKM_PRMODE; + if (level == DLM_LOCK_EX) + new_level = DLM_LOCK_NL; + else if (level == DLM_LOCK_PR) + new_level = DLM_LOCK_PR; return new_level; } @@ -688,12 +658,12 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); - BUG_ON(lockres->l_blocking <= LKM_NLMODE); + BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); lockres->l_level = lockres->l_requested; if (lockres->l_level <= ocfs2_highest_compat_lock_level(lockres->l_blocking)) { - lockres->l_blocking = LKM_NLMODE; + lockres->l_blocking = DLM_LOCK_NL; lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); } lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); @@ -712,7 +682,7 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo * information is already up to data. Convert from NL to * *anything* however should mark ourselves as needing an * update */ - if (lockres->l_level == LKM_NLMODE && + if (lockres->l_level == DLM_LOCK_NL && lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); @@ -729,7 +699,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); - if (lockres->l_requested > LKM_NLMODE && + if (lockres->l_requested > DLM_LOCK_NL && !(lockres->l_flags & OCFS2_LOCK_LOCAL) && lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); @@ -767,6 +737,113 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, return needs_downconvert; } +/* + * OCFS2_LOCK_PENDING and l_pending_gen. + * + * Why does OCFS2_LOCK_PENDING exist? To close a race between setting + * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock() + * for more details on the race. + * + * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces + * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock() + * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear + * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns, + * the caller is going to try to clear PENDING again. If nothing else is + * happening, __lockres_clear_pending() sees PENDING is unset and does + * nothing. + * + * But what if another path (eg downconvert thread) has just started a + * new locking action? The other path has re-set PENDING. Our path + * cannot clear PENDING, because that will re-open the original race + * window. + * + * [Example] + * + * ocfs2_meta_lock() + * ocfs2_cluster_lock() + * set BUSY + * set PENDING + * drop l_lock + * ocfs2_dlm_lock() + * ocfs2_locking_ast() ocfs2_downconvert_thread() + * clear PENDING ocfs2_unblock_lock() + * take_l_lock + * !BUSY + * ocfs2_prepare_downconvert() + * set BUSY + * set PENDING + * drop l_lock + * take l_lock + * clear PENDING + * drop l_lock + * <window> + * ocfs2_dlm_lock() + * + * So as you can see, we now have a window where l_lock is not held, + * PENDING is not set, and ocfs2_dlm_lock() has not been called. + * + * The core problem is that ocfs2_cluster_lock() has cleared the PENDING + * set by ocfs2_prepare_downconvert(). That wasn't nice. + * + * To solve this we introduce l_pending_gen. A call to + * lockres_clear_pending() will only do so when it is passed a generation + * number that matches the lockres. lockres_set_pending() will return the + * current generation number. When ocfs2_cluster_lock() goes to clear + * PENDING, it passes the generation it got from set_pending(). In our + * example above, the generation numbers will *not* match. Thus, + * ocfs2_cluster_lock() will not clear the PENDING set by + * ocfs2_prepare_downconvert(). + */ + +/* Unlocked version for ocfs2_locking_ast() */ +static void __lockres_clear_pending(struct ocfs2_lock_res *lockres, + unsigned int generation, + struct ocfs2_super *osb) +{ + assert_spin_locked(&lockres->l_lock); + + /* + * The ast and locking functions can race us here. The winner + * will clear pending, the loser will not. + */ + if (!(lockres->l_flags & OCFS2_LOCK_PENDING) || + (lockres->l_pending_gen != generation)) + return; + + lockres_clear_flags(lockres, OCFS2_LOCK_PENDING); + lockres->l_pending_gen++; + + /* + * The downconvert thread may have skipped us because we + * were PENDING. Wake it up. + */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + ocfs2_wake_downconvert_thread(osb); +} + +/* Locked version for callers of ocfs2_dlm_lock() */ +static void lockres_clear_pending(struct ocfs2_lock_res *lockres, + unsigned int generation, + struct ocfs2_super *osb) +{ + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + __lockres_clear_pending(lockres, generation, osb); + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres) +{ + assert_spin_locked(&lockres->l_lock); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + + lockres_or_flags(lockres, OCFS2_LOCK_PENDING); + + return lockres->l_pending_gen; +} + + static void ocfs2_blocking_ast(void *opaque, int level) { struct ocfs2_lock_res *lockres = opaque; @@ -774,7 +851,7 @@ static void ocfs2_blocking_ast(void *opaque, int level) int needs_downconvert; unsigned long flags; - BUG_ON(level <= LKM_NLMODE); + BUG_ON(level <= DLM_LOCK_NL); mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", lockres->l_name, level, lockres->l_level, @@ -801,14 +878,22 @@ static void ocfs2_blocking_ast(void *opaque, int level) static void ocfs2_locking_ast(void *opaque) { struct ocfs2_lock_res *lockres = opaque; - struct dlm_lockstatus *lksb = &lockres->l_lksb; + struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); unsigned long flags; + int status; spin_lock_irqsave(&lockres->l_lock, flags); - if (lksb->status != DLM_NORMAL) { - mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", - lockres->l_name, lksb->status); + status = ocfs2_dlm_lock_status(&lockres->l_lksb); + + if (status == -EAGAIN) { + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + goto out; + } + + if (status) { + mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n", + lockres->l_name, status); spin_unlock_irqrestore(&lockres->l_lock, flags); return; } @@ -831,11 +916,23 @@ static void ocfs2_locking_ast(void *opaque) lockres->l_unlock_action); BUG(); } - +out: /* set it to something invalid so if we get called again we * can catch it. */ lockres->l_action = OCFS2_AST_INVALID; + /* Did we try to cancel this lock? Clear that state */ + if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + + /* + * We may have beaten the locking functions here. We certainly + * know that dlm_lock() has been called :-) + * Because we can't have two lock calls in flight at once, we + * can use lockres->l_pending_gen. + */ + __lockres_clear_pending(lockres, lockres->l_pending_gen, osb); + wake_up(&lockres->l_event); spin_unlock_irqrestore(&lockres->l_lock, flags); } @@ -865,15 +962,15 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, static int ocfs2_lock_create(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres, int level, - int dlm_flags) + u32 dlm_flags) { int ret = 0; - enum dlm_status status = DLM_NORMAL; unsigned long flags; + unsigned int gen; mlog_entry_void(); - mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, + mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level, dlm_flags); spin_lock_irqsave(&lockres->l_lock, flags); @@ -886,24 +983,23 @@ static int ocfs2_lock_create(struct ocfs2_super *osb, lockres->l_action = OCFS2_AST_ATTACH; lockres->l_requested = level; lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + gen = lockres_set_pending(lockres); spin_unlock_irqrestore(&lockres->l_lock, flags); - status = dlmlock(osb->dlm, - level, - &lockres->l_lksb, - dlm_flags, - lockres->l_name, - OCFS2_LOCK_ID_MAX_LEN - 1, - ocfs2_locking_ast, - lockres, - ocfs2_blocking_ast); - if (status != DLM_NORMAL) { - ocfs2_log_dlm_error("dlmlock", status, lockres); - ret = -EINVAL; + ret = ocfs2_dlm_lock(osb->cconn, + level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + OCFS2_LOCK_ID_MAX_LEN - 1, + lockres); + lockres_clear_pending(lockres, gen, osb); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); ocfs2_recover_from_dlm_error(lockres, 1); } - mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name); + mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name); bail: mlog_exit(ret); @@ -1016,21 +1112,22 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw, static int ocfs2_cluster_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres, int level, - int lkm_flags, + u32 lkm_flags, int arg_flags) { struct ocfs2_mask_waiter mw; - enum dlm_status status; int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ unsigned long flags; + unsigned int gen; + int noqueue_attempted = 0; mlog_entry_void(); ocfs2_init_mask_waiter(&mw); if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) - lkm_flags |= LKM_VALBLK; + lkm_flags |= DLM_LKF_VALBLK; again: wait = 0; @@ -1068,52 +1165,56 @@ again: } if (level > lockres->l_level) { + if (noqueue_attempted > 0) { + ret = -EAGAIN; + goto unlock; + } + if (lkm_flags & DLM_LKF_NOQUEUE) + noqueue_attempted = 1; + if (lockres->l_action != OCFS2_AST_INVALID) mlog(ML_ERROR, "lockres %s has action %u pending\n", lockres->l_name, lockres->l_action); if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { lockres->l_action = OCFS2_AST_ATTACH; - lkm_flags &= ~LKM_CONVERT; + lkm_flags &= ~DLM_LKF_CONVERT; } else { lockres->l_action = OCFS2_AST_CONVERT; - lkm_flags |= LKM_CONVERT; + lkm_flags |= DLM_LKF_CONVERT; } lockres->l_requested = level; lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + gen = lockres_set_pending(lockres); spin_unlock_irqrestore(&lockres->l_lock, flags); - BUG_ON(level == LKM_IVMODE); - BUG_ON(level == LKM_NLMODE); + BUG_ON(level == DLM_LOCK_IV); + BUG_ON(level == DLM_LOCK_NL); mlog(0, "lock %s, convert from %d to level = %d\n", lockres->l_name, lockres->l_level, level); /* call dlm_lock to upgrade lock now */ - status = dlmlock(osb->dlm, - level, - &lockres->l_lksb, - lkm_flags, - lockres->l_name, - OCFS2_LOCK_ID_MAX_LEN - 1, - ocfs2_locking_ast, - lockres, - ocfs2_blocking_ast); - if (status != DLM_NORMAL) { - if ((lkm_flags & LKM_NOQUEUE) && - (status == DLM_NOTQUEUED)) - ret = -EAGAIN; - else { - ocfs2_log_dlm_error("dlmlock", status, - lockres); - ret = -EINVAL; + ret = ocfs2_dlm_lock(osb->cconn, + level, + &lockres->l_lksb, + lkm_flags, + lockres->l_name, + OCFS2_LOCK_ID_MAX_LEN - 1, + lockres); + lockres_clear_pending(lockres, gen, osb); + if (ret) { + if (!(lkm_flags & DLM_LKF_NOQUEUE) || + (ret != -EAGAIN)) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", + ret, lockres); } ocfs2_recover_from_dlm_error(lockres, 1); goto out; } - mlog(0, "lock %s, successfull return from dlmlock\n", + mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n", lockres->l_name); /* At this point we've gone inside the dlm and need to @@ -1177,9 +1278,9 @@ static int ocfs2_create_new_lock(struct ocfs2_super *osb, int ex, int local) { - int level = ex ? LKM_EXMODE : LKM_PRMODE; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; unsigned long flags; - int lkm_flags = local ? LKM_LOCAL : 0; + u32 lkm_flags = local ? DLM_LKF_LOCAL : 0; spin_lock_irqsave(&lockres->l_lock, flags); BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); @@ -1222,7 +1323,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode) } /* - * We don't want to use LKM_LOCAL on a meta data lock as they + * We don't want to use DLM_LKF_LOCAL on a meta data lock as they * don't use a generation in their lock names. */ ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0); @@ -1261,7 +1362,7 @@ int ocfs2_rw_lock(struct inode *inode, int write) lockres = &OCFS2_I(inode)->ip_rw_lockres; - level = write ? LKM_EXMODE : LKM_PRMODE; + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, 0); @@ -1274,7 +1375,7 @@ int ocfs2_rw_lock(struct inode *inode, int write) void ocfs2_rw_unlock(struct inode *inode, int write) { - int level = write ? LKM_EXMODE : LKM_PRMODE; + int level = write ? DLM_LOCK_EX : DLM_LOCK_PR; struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -1312,7 +1413,7 @@ int ocfs2_open_lock(struct inode *inode) lockres = &OCFS2_I(inode)->ip_open_lockres; status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, - LKM_PRMODE, 0, 0); + DLM_LOCK_PR, 0, 0); if (status < 0) mlog_errno(status); @@ -1340,16 +1441,16 @@ int ocfs2_try_open_lock(struct inode *inode, int write) lockres = &OCFS2_I(inode)->ip_open_lockres; - level = write ? LKM_EXMODE : LKM_PRMODE; + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; /* * The file system may already holding a PRMODE/EXMODE open lock. - * Since we pass LKM_NOQUEUE, the request won't block waiting on + * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on * other nodes and the -EAGAIN will indicate to the caller that * this inode is still in use. */ status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, - level, LKM_NOQUEUE, 0); + level, DLM_LKF_NOQUEUE, 0); out: mlog_exit(status); @@ -1374,10 +1475,10 @@ void ocfs2_open_unlock(struct inode *inode) if(lockres->l_ro_holders) ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, - LKM_PRMODE); + DLM_LOCK_PR); if(lockres->l_ex_holders) ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, - LKM_EXMODE); + DLM_LOCK_EX); out: mlog_exit_void(); @@ -1464,7 +1565,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) ocfs2_init_mask_waiter(&mw); if ((lockres->l_flags & OCFS2_LOCK_BUSY) || - (lockres->l_level > LKM_NLMODE)) { + (lockres->l_level > DLM_LOCK_NL)) { mlog(ML_ERROR, "File lock \"%s\" has busy or locked state: flags: 0x%lx, " "level: %u\n", lockres->l_name, lockres->l_flags, @@ -1503,14 +1604,12 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); spin_unlock_irqrestore(&lockres->l_lock, flags); - ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags, - lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, - ocfs2_locking_ast, lockres, ocfs2_blocking_ast); - if (ret != DLM_NORMAL) { - if (trylock && ret == DLM_NOTQUEUED) - ret = -EAGAIN; - else { - ocfs2_log_dlm_error("dlmlock", ret, lockres); + ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, + lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, + lockres); + if (ret) { + if (!trylock || (ret != -EAGAIN)) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); ret = -EINVAL; } @@ -1537,6 +1636,10 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) * to just bubble sucess back up to the user. */ ret = ocfs2_flock_handle_signal(lockres, level); + } else if (!ret && (level > lockres->l_level)) { + /* Trylock failed asynchronously */ + BUG_ON(!trylock); + ret = -EAGAIN; } out: @@ -1549,6 +1652,7 @@ out: void ocfs2_file_unlock(struct file *file) { int ret; + unsigned int gen; unsigned long flags; struct ocfs2_file_private *fp = file->private_data; struct ocfs2_lock_res *lockres = &fp->fp_flock; @@ -1572,13 +1676,13 @@ void ocfs2_file_unlock(struct file *file) * Fake a blocking ast for the downconvert code. */ lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); - lockres->l_blocking = LKM_EXMODE; + lockres->l_blocking = DLM_LOCK_EX; - ocfs2_prepare_downconvert(lockres, LKM_NLMODE); + gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE); lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); spin_unlock_irqrestore(&lockres->l_lock, flags); - ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0); + ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen); if (ret) { mlog_errno(ret); return; @@ -1601,11 +1705,11 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, * condition. */ if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { switch(lockres->l_blocking) { - case LKM_EXMODE: + case DLM_LOCK_EX: if (!lockres->l_ex_holders && !lockres->l_ro_holders) kick = 1; break; - case LKM_PRMODE: + case DLM_LOCK_PR: if (!lockres->l_ex_holders) kick = 1; break; @@ -1648,7 +1752,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) mlog_entry_void(); - lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); /* * Invalidate the LVB of a deleted inode - this way other @@ -1700,7 +1804,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) mlog_meta_lvb(0, lockres); - lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); /* We're safe here without the lockres lock... */ spin_lock(&oi->ip_lock); @@ -1735,7 +1839,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, struct ocfs2_lock_res *lockres) { - struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + struct ocfs2_meta_lvb *lvb = + (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); if (lvb->lvb_version == OCFS2_LVB_VERSION && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) @@ -1923,7 +2028,8 @@ int ocfs2_inode_lock_full(struct inode *inode, int ex, int arg_flags) { - int status, level, dlm_flags, acquired; + int status, level, acquired; + u32 dlm_flags; struct ocfs2_lock_res *lockres = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *local_bh = NULL; @@ -1950,14 +2056,13 @@ int ocfs2_inode_lock_full(struct inode *inode, goto local; if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) - wait_event(osb->recovery_event, - ocfs2_node_map_is_empty(osb, &osb->recovery_map)); + ocfs2_wait_for_recovery(osb); lockres = &OCFS2_I(inode)->ip_inode_lockres; - level = ex ? LKM_EXMODE : LKM_PRMODE; + level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; dlm_flags = 0; if (arg_flags & OCFS2_META_LOCK_NOQUEUE) - dlm_flags |= LKM_NOQUEUE; + dlm_flags |= DLM_LKF_NOQUEUE; status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); if (status < 0) { @@ -1974,8 +2079,7 @@ int ocfs2_inode_lock_full(struct inode *inode, * committed to owning this lock so we don't allow signals to * abort the operation. */ if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) - wait_event(osb->recovery_event, - ocfs2_node_map_is_empty(osb, &osb->recovery_map)); + ocfs2_wait_for_recovery(osb); local: /* @@ -2109,7 +2213,7 @@ int ocfs2_inode_lock_atime(struct inode *inode, void ocfs2_inode_unlock(struct inode *inode, int ex) { - int level = ex ? LKM_EXMODE : LKM_PRMODE; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2130,10 +2234,8 @@ int ocfs2_super_lock(struct ocfs2_super *osb, int ex) { int status = 0; - int level = ex ? LKM_EXMODE : LKM_PRMODE; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; - struct buffer_head *bh; - struct ocfs2_slot_info *si = osb->slot_info; mlog_entry_void(); @@ -2159,11 +2261,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb, goto bail; } if (status) { - bh = si->si_bh; - status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, - si->si_inode); - if (status == 0) - ocfs2_update_slot_info(si); + status = ocfs2_refresh_slot_info(osb); ocfs2_complete_lock_res_refresh(lockres, status); @@ -2178,7 +2276,7 @@ bail: void ocfs2_super_unlock(struct ocfs2_super *osb, int ex) { - int level = ex ? LKM_EXMODE : LKM_PRMODE; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; if (!ocfs2_mount_local(osb)) @@ -2196,7 +2294,7 @@ int ocfs2_rename_lock(struct ocfs2_super *osb) if (ocfs2_mount_local(osb)) return 0; - status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0); if (status < 0) mlog_errno(status); @@ -2208,13 +2306,13 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb) struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; if (!ocfs2_mount_local(osb)) - ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); } int ocfs2_dentry_lock(struct dentry *dentry, int ex) { int ret; - int level = ex ? LKM_EXMODE : LKM_PRMODE; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; struct ocfs2_dentry_lock *dl = dentry->d_fsdata; struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); @@ -2235,7 +2333,7 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex) void ocfs2_dentry_unlock(struct dentry *dentry, int ex) { - int level = ex ? LKM_EXMODE : LKM_PRMODE; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; struct ocfs2_dentry_lock *dl = dentry->d_fsdata; struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); @@ -2400,7 +2498,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) lockres->l_blocking); /* Dump the raw LVB */ - lvb = lockres->l_lksb.lvb; + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); for(i = 0; i < DLM_LVB_LEN; i++) seq_printf(m, "0x%x\t", lvb[i]); @@ -2504,13 +2602,14 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) int ocfs2_dlm_init(struct ocfs2_super *osb) { int status = 0; - u32 dlm_key; - struct dlm_ctxt *dlm = NULL; + struct ocfs2_cluster_connection *conn = NULL; mlog_entry_void(); - if (ocfs2_mount_local(osb)) + if (ocfs2_mount_local(osb)) { + osb->node_num = 0; goto local; + } status = ocfs2_dlm_init_debug(osb); if (status < 0) { @@ -2527,26 +2626,31 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) goto bail; } - /* used by the dlm code to make message headers unique, each - * node in this domain must agree on this. */ - dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str)); - /* for now, uuid == domain */ - dlm = dlm_register_domain(osb->uuid_str, dlm_key, - &osb->osb_locking_proto); - if (IS_ERR(dlm)) { - status = PTR_ERR(dlm); + status = ocfs2_cluster_connect(osb->osb_cluster_stack, + osb->uuid_str, + strlen(osb->uuid_str), + ocfs2_do_node_down, osb, + &conn); + if (status) { mlog_errno(status); goto bail; } - dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); + status = ocfs2_cluster_this_node(&osb->node_num); + if (status < 0) { + mlog_errno(status); + mlog(ML_ERROR, + "could not find this host's node number\n"); + ocfs2_cluster_disconnect(conn, 0); + goto bail; + } local: ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); - osb->dlm = dlm; + osb->cconn = conn; status = 0; bail: @@ -2560,14 +2664,19 @@ bail: return status; } -void ocfs2_dlm_shutdown(struct ocfs2_super *osb) +void ocfs2_dlm_shutdown(struct ocfs2_super *osb, + int hangup_pending) { mlog_entry_void(); - dlm_unregister_eviction_cb(&osb->osb_eviction_cb); - ocfs2_drop_osb_locks(osb); + /* + * Now that we have dropped all locks and ocfs2_dismount_volume() + * has disabled recovery, the DLM won't be talking to us. It's + * safe to tear things down before disconnecting the cluster. + */ + if (osb->dc_task) { kthread_stop(osb->dc_task); osb->dc_task = NULL; @@ -2576,15 +2685,15 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb) ocfs2_lock_res_free(&osb->osb_super_lockres); ocfs2_lock_res_free(&osb->osb_rename_lockres); - dlm_unregister_domain(osb->dlm); - osb->dlm = NULL; + ocfs2_cluster_disconnect(osb->cconn, hangup_pending); + osb->cconn = NULL; ocfs2_dlm_shutdown_debug(osb); mlog_exit_void(); } -static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) +static void ocfs2_unlock_ast(void *opaque, int error) { struct ocfs2_lock_res *lockres = opaque; unsigned long flags; @@ -2595,24 +2704,9 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) lockres->l_unlock_action); spin_lock_irqsave(&lockres->l_lock, flags); - /* We tried to cancel a convert request, but it was already - * granted. All we want to do here is clear our unlock - * state. The wake_up call done at the bottom is redundant - * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't - * hurt anything anyway */ - if (status == DLM_CANCELGRANT && - lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { - mlog(0, "Got cancelgrant for %s\n", lockres->l_name); - - /* We don't clear the busy flag in this case as it - * should have been cleared by the ast which the dlm - * has called. */ - goto complete_unlock; - } - - if (status != DLM_NORMAL) { - mlog(ML_ERROR, "Dlm passes status %d for lock %s, " - "unlock_action %d\n", status, lockres->l_name, + if (error) { + mlog(ML_ERROR, "Dlm passes error %d for lock %s, " + "unlock_action %d\n", error, lockres->l_name, lockres->l_unlock_action); spin_unlock_irqrestore(&lockres->l_lock, flags); return; @@ -2624,14 +2718,13 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) lockres->l_action = OCFS2_AST_INVALID; break; case OCFS2_UNLOCK_DROP_LOCK: - lockres->l_level = LKM_IVMODE; + lockres->l_level = DLM_LOCK_IV; break; default: BUG(); } lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); -complete_unlock: lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; spin_unlock_irqrestore(&lockres->l_lock, flags); @@ -2643,16 +2736,16 @@ complete_unlock: static int ocfs2_drop_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres) { - enum dlm_status status; + int ret; unsigned long flags; - int lkm_flags = 0; + u32 lkm_flags = 0; /* We didn't get anywhere near actually using this lockres. */ if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) goto out; if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) - lkm_flags |= LKM_VALBLK; + lkm_flags |= DLM_LKF_VALBLK; spin_lock_irqsave(&lockres->l_lock, flags); @@ -2678,7 +2771,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { if (lockres->l_flags & OCFS2_LOCK_ATTACHED && - lockres->l_level == LKM_EXMODE && + lockres->l_level == DLM_LOCK_EX && !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) lockres->l_ops->set_lvb(lockres); } @@ -2707,15 +2800,15 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, mlog(0, "lock %s\n", lockres->l_name); - status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags, - ocfs2_unlock_ast, lockres); - if (status != DLM_NORMAL) { - ocfs2_log_dlm_error("dlmunlock", status, lockres); + ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags, + lockres); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); - dlm_print_one_lock(lockres->l_lksb.lockid); + ocfs2_dlm_dump_lksb(&lockres->l_lksb); BUG(); } - mlog(0, "lock %s, successfull return from dlmunlock\n", + mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n", lockres->l_name); ocfs2_wait_on_busy_lock(lockres); @@ -2806,15 +2899,15 @@ int ocfs2_drop_inode_locks(struct inode *inode) return status; } -static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, - int new_level) +static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, + int new_level) { assert_spin_locked(&lockres->l_lock); - BUG_ON(lockres->l_blocking <= LKM_NLMODE); + BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); if (lockres->l_level <= new_level) { - mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n", + mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n", lockres->l_level, new_level); BUG(); } @@ -2825,33 +2918,33 @@ static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, lockres->l_action = OCFS2_AST_DOWNCONVERT; lockres->l_requested = new_level; lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + return lockres_set_pending(lockres); } static int ocfs2_downconvert_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres, int new_level, - int lvb) + int lvb, + unsigned int generation) { - int ret, dlm_flags = LKM_CONVERT; - enum dlm_status status; + int ret; + u32 dlm_flags = DLM_LKF_CONVERT; mlog_entry_void(); if (lvb) - dlm_flags |= LKM_VALBLK; - - status = dlmlock(osb->dlm, - new_level, - &lockres->l_lksb, - dlm_flags, - lockres->l_name, - OCFS2_LOCK_ID_MAX_LEN - 1, - ocfs2_locking_ast, - lockres, - ocfs2_blocking_ast); - if (status != DLM_NORMAL) { - ocfs2_log_dlm_error("dlmlock", status, lockres); - ret = -EINVAL; + dlm_flags |= DLM_LKF_VALBLK; + + ret = ocfs2_dlm_lock(osb->cconn, + new_level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + OCFS2_LOCK_ID_MAX_LEN - 1, + lockres); + lockres_clear_pending(lockres, generation, osb); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); ocfs2_recover_from_dlm_error(lockres, 1); goto bail; } @@ -2862,7 +2955,7 @@ bail: return ret; } -/* returns 1 when the caller should unlock and call dlmunlock */ +/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres) { @@ -2898,24 +2991,18 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres) { int ret; - enum dlm_status status; mlog_entry_void(); mlog(0, "lock %s\n", lockres->l_name); - ret = 0; - status = dlmunlock(osb->dlm, - &lockres->l_lksb, - LKM_CANCEL, - ocfs2_unlock_ast, - lockres); - if (status != DLM_NORMAL) { - ocfs2_log_dlm_error("dlmunlock", status, lockres); - ret = -EINVAL; + ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, + DLM_LKF_CANCEL, lockres); + if (ret) { + ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); ocfs2_recover_from_dlm_error(lockres, 0); } - mlog(0, "lock %s return from dlmunlock\n", lockres->l_name); + mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name); mlog_exit(ret); return ret; @@ -2930,6 +3017,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb, int new_level; int ret = 0; int set_lvb = 0; + unsigned int gen; mlog_entry_void(); @@ -2939,6 +3027,32 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb, recheck: if (lockres->l_flags & OCFS2_LOCK_BUSY) { + /* XXX + * This is a *big* race. The OCFS2_LOCK_PENDING flag + * exists entirely for one reason - another thread has set + * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock(). + * + * If we do ocfs2_cancel_convert() before the other thread + * calls dlm_lock(), our cancel will do nothing. We will + * get no ast, and we will have no way of knowing the + * cancel failed. Meanwhile, the other thread will call + * into dlm_lock() and wait...forever. + * + * Why forever? Because another node has asked for the + * lock first; that's why we're here in unblock_lock(). + * + * The solution is OCFS2_LOCK_PENDING. When PENDING is + * set, we just requeue the unblock. Only when the other + * thread has called dlm_lock() and cleared PENDING will + * we then cancel their request. + * + * All callers of dlm_lock() must set OCFS2_DLM_PENDING + * at the same time they set OCFS2_DLM_BUSY. They must + * clear OCFS2_DLM_PENDING after dlm_lock() returns. + */ + if (lockres->l_flags & OCFS2_LOCK_PENDING) + goto leave_requeue; + ctl->requeue = 1; ret = ocfs2_prepare_cancel_convert(osb, lockres); spin_unlock_irqrestore(&lockres->l_lock, flags); @@ -2952,13 +3066,13 @@ recheck: /* if we're blocking an exclusive and we have *any* holders, * then requeue. */ - if ((lockres->l_blocking == LKM_EXMODE) + if ((lockres->l_blocking == DLM_LOCK_EX) && (lockres->l_ex_holders || lockres->l_ro_holders)) goto leave_requeue; /* If it's a PR we're blocking, then only * requeue if we've got any EX holders */ - if (lockres->l_blocking == LKM_PRMODE && + if (lockres->l_blocking == DLM_LOCK_PR && lockres->l_ex_holders) goto leave_requeue; @@ -3005,7 +3119,7 @@ downconvert: ctl->requeue = 0; if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { - if (lockres->l_level == LKM_EXMODE) + if (lockres->l_level == DLM_LOCK_EX) set_lvb = 1; /* @@ -3018,9 +3132,11 @@ downconvert: lockres->l_ops->set_lvb(lockres); } - ocfs2_prepare_downconvert(lockres, new_level); + gen = ocfs2_prepare_downconvert(lockres, new_level); spin_unlock_irqrestore(&lockres->l_lock, flags); - ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb); + ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb, + gen); + leave: mlog_exit(ret); return ret; @@ -3059,7 +3175,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, (unsigned long long)OCFS2_I(inode)->ip_blkno); } sync_mapping_buffers(mapping); - if (blocking == LKM_EXMODE) { + if (blocking == DLM_LOCK_EX) { truncate_inode_pages(mapping, 0); } else { /* We only need to wait on the I/O if we're not also @@ -3080,8 +3196,8 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, struct inode *inode = ocfs2_lock_res_inode(lockres); int checkpointed = ocfs2_inode_fully_checkpointed(inode); - BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE); - BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed); + BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR); + BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed); if (checkpointed) return 1; @@ -3145,7 +3261,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, * valid. The downconvert code will retain a PR for this node, * so there's no further work to do. */ - if (blocking == LKM_PRMODE) + if (blocking == DLM_LOCK_PR) return UNBLOCK_CONTINUE; /* @@ -3219,6 +3335,45 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, return UNBLOCK_CONTINUE_POST; } +/* + * This is the filesystem locking protocol. It provides the lock handling + * hooks for the underlying DLM. It has a maximum version number. + * The version number allows interoperability with systems running at + * the same major number and an equal or smaller minor number. + * + * Whenever the filesystem does new things with locks (adds or removes a + * lock, orders them differently, does different things underneath a lock), + * the version must be changed. The protocol is negotiated when joining + * the dlm domain. A node may join the domain if its major version is + * identical to all other nodes and its minor version is greater than + * or equal to all other nodes. When its minor version is greater than + * the other nodes, it will run at the minor version specified by the + * other nodes. + * + * If a locking change is made that will not be compatible with older + * versions, the major number must be increased and the minor version set + * to zero. If a change merely adds a behavior that can be disabled when + * speaking to older versions, the minor version must be increased. If a + * change adds a fully backwards compatible change (eg, LVB changes that + * are just ignored by older versions), the version does not need to be + * updated. + */ +static struct ocfs2_locking_protocol lproto = { + .lp_max_version = { + .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, + .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, + }, + .lp_lock_ast = ocfs2_locking_ast, + .lp_blocking_ast = ocfs2_blocking_ast, + .lp_unlock_ast = ocfs2_unlock_ast, +}; + +void ocfs2_set_locking_protocol(void) +{ + ocfs2_stack_glue_set_locking_protocol(&lproto); +} + + static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres) { diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index e3cf902..2bb01f0 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -58,7 +58,7 @@ struct ocfs2_meta_lvb { #define OCFS2_LOCK_NONBLOCK (0x04) int ocfs2_dlm_init(struct ocfs2_super *osb); -void ocfs2_dlm_shutdown(struct ocfs2_super *osb); +void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending); void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, enum ocfs2_lock_type type, @@ -114,5 +114,6 @@ void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb); struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); -extern const struct dlm_protocol_version ocfs2_locking_protocol; +/* To set the locking protocol on module initialization */ +void ocfs2_set_locking_protocol(void); #endif /* DLMGLUE_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ed5d523..9154c82d3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2242,7 +2242,7 @@ const struct file_operations ocfs2_fops = { .open = ocfs2_file_open, .aio_read = ocfs2_file_aio_read, .aio_write = ocfs2_file_aio_write, - .ioctl = ocfs2_ioctl, + .unlocked_ioctl = ocfs2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif @@ -2258,7 +2258,7 @@ const struct file_operations ocfs2_dops = { .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, - .ioctl = ocfs2_ioctl, + .unlocked_ioctl = ocfs2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 0758daf..c6e7213 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -28,9 +28,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/kmod.h> - -#include <dlm/dlmapi.h> #define MLOG_MASK_PREFIX ML_SUPER #include <cluster/masklog.h> @@ -48,7 +45,6 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, int bit); static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, int bit); -static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map); /* special case -1 for now * TODO: should *really* make sure the calling func never passes -1!! */ @@ -62,23 +58,23 @@ static void ocfs2_node_map_init(struct ocfs2_node_map *map) void ocfs2_init_node_maps(struct ocfs2_super *osb) { spin_lock_init(&osb->node_map_lock); - ocfs2_node_map_init(&osb->recovery_map); ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); } -static void ocfs2_do_node_down(int node_num, - struct ocfs2_super *osb) +void ocfs2_do_node_down(int node_num, void *data) { + struct ocfs2_super *osb = data; + BUG_ON(osb->node_num == node_num); mlog(0, "ocfs2: node down event for %d\n", node_num); - if (!osb->dlm) { + if (!osb->cconn) { /* - * No DLM means we're not even ready to participate yet. - * We check the slots after the DLM comes up, so we will - * notice the node death then. We can safely ignore it - * here. + * No cluster connection means we're not even ready to + * participate yet. We check the slots after the cluster + * comes up, so we will notice the node death then. We + * can safely ignore it here. */ return; } @@ -86,61 +82,6 @@ static void ocfs2_do_node_down(int node_num, ocfs2_recovery_thread(osb, node_num); } -/* Called from the dlm when it's about to evict a node. We may also - * get a heartbeat callback later. */ -static void ocfs2_dlm_eviction_cb(int node_num, - void *data) -{ - struct ocfs2_super *osb = (struct ocfs2_super *) data; - struct super_block *sb = osb->sb; - - mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n", - MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num); - - ocfs2_do_node_down(node_num, osb); -} - -void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) -{ - /* Not exactly a heartbeat callback, but leads to essentially - * the same path so we set it up here. */ - dlm_setup_eviction_cb(&osb->osb_eviction_cb, - ocfs2_dlm_eviction_cb, - osb); -} - -void ocfs2_stop_heartbeat(struct ocfs2_super *osb) -{ - int ret; - char *argv[5], *envp[3]; - - if (ocfs2_mount_local(osb)) - return; - - if (!osb->uuid_str) { - /* This can happen if we don't get far enough in mount... */ - mlog(0, "No UUID with which to stop heartbeat!\n\n"); - return; - } - - argv[0] = (char *)o2nm_get_hb_ctl_path(); - argv[1] = "-K"; - argv[2] = "-u"; - argv[3] = osb->uuid_str; - argv[4] = NULL; - - mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); - - /* minimal command environment taken from cpu_run_sbin_hotplug */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); - if (ret < 0) - mlog_errno(ret); -} - static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, int bit) { @@ -192,112 +133,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb, return ret; } -static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map) -{ - int bit; - bit = find_next_bit(map->map, map->num_nodes, 0); - if (bit < map->num_nodes) - return 0; - return 1; -} - -int ocfs2_node_map_is_empty(struct ocfs2_super *osb, - struct ocfs2_node_map *map) -{ - int ret; - BUG_ON(map->num_nodes == 0); - spin_lock(&osb->node_map_lock); - ret = __ocfs2_node_map_is_empty(map); - spin_unlock(&osb->node_map_lock); - return ret; -} - -#if 0 - -static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, - struct ocfs2_node_map *from) -{ - BUG_ON(from->num_nodes == 0); - ocfs2_node_map_init(target); - __ocfs2_node_map_set(target, from); -} - -/* returns 1 if bit is the only bit set in target, 0 otherwise */ -int ocfs2_node_map_is_only(struct ocfs2_super *osb, - struct ocfs2_node_map *target, - int bit) -{ - struct ocfs2_node_map temp; - int ret; - - spin_lock(&osb->node_map_lock); - __ocfs2_node_map_dup(&temp, target); - __ocfs2_node_map_clear_bit(&temp, bit); - ret = __ocfs2_node_map_is_empty(&temp); - spin_unlock(&osb->node_map_lock); - - return ret; -} - -static void __ocfs2_node_map_set(struct ocfs2_node_map *target, - struct ocfs2_node_map *from) -{ - int num_longs, i; - - BUG_ON(target->num_nodes != from->num_nodes); - BUG_ON(target->num_nodes == 0); - - num_longs = BITS_TO_LONGS(target->num_nodes); - for (i = 0; i < num_longs; i++) - target->map[i] = from->map[i]; -} - -#endif /* 0 */ - -/* Returns whether the recovery bit was actually set - it may not be - * if a node is still marked as needing recovery */ -int ocfs2_recovery_map_set(struct ocfs2_super *osb, - int num) -{ - int set = 0; - - spin_lock(&osb->node_map_lock); - - if (!test_bit(num, osb->recovery_map.map)) { - __ocfs2_node_map_set_bit(&osb->recovery_map, num); - set = 1; - } - - spin_unlock(&osb->node_map_lock); - - return set; -} - -void ocfs2_recovery_map_clear(struct ocfs2_super *osb, - int num) -{ - ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num); -} - -int ocfs2_node_map_iterate(struct ocfs2_super *osb, - struct ocfs2_node_map *map, - int idx) -{ - int i = idx; - - idx = O2NM_INVALID_NODE_NUM; - spin_lock(&osb->node_map_lock); - if ((i != O2NM_INVALID_NODE_NUM) && - (i >= 0) && - (i < map->num_nodes)) { - while(i < map->num_nodes) { - if (test_bit(i, map->map)) { - idx = i; - break; - } - i++; - } - } - spin_unlock(&osb->node_map_lock); - return idx; -} diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h index eac63ae..74b9c5d 100644 --- a/fs/ocfs2/heartbeat.h +++ b/fs/ocfs2/heartbeat.h @@ -28,13 +28,10 @@ void ocfs2_init_node_maps(struct ocfs2_super *osb); -void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); -void ocfs2_stop_heartbeat(struct ocfs2_super *osb); +void ocfs2_do_node_down(int node_num, void *data); /* node map functions - used to keep track of mounted and in-recovery * nodes. */ -int ocfs2_node_map_is_empty(struct ocfs2_super *osb, - struct ocfs2_node_map *map); void ocfs2_node_map_set_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit); @@ -44,17 +41,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, int ocfs2_node_map_test_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit); -int ocfs2_node_map_iterate(struct ocfs2_super *osb, - struct ocfs2_node_map *map, - int idx); -static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb, - struct ocfs2_node_map *map) -{ - return ocfs2_node_map_iterate(osb, map, 0); -} -int ocfs2_recovery_map_set(struct ocfs2_super *osb, - int num); -void ocfs2_recovery_map_clear(struct ocfs2_super *osb, - int num); #endif /* OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 5177fba..7b142f0 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/smp_lock.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -59,10 +60,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, goto bail; } - status = -EROFS; - if (IS_RDONLY(inode)) - goto bail_unlock; - status = -EACCES; if (!is_owner_or_cap(inode)) goto bail_unlock; @@ -112,9 +109,9 @@ bail: return status; } -int ocfs2_ioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long arg) +long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_path.dentry->d_inode; unsigned int flags; int new_clusters; int status; @@ -133,8 +130,13 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp, if (get_user(flags, (int __user *) arg)) return -EFAULT; - return ocfs2_set_inode_attr(inode, flags, + status = mnt_want_write(filp->f_path.mnt); + if (status) + return status; + status = ocfs2_set_inode_attr(inode, flags, OCFS2_FL_MODIFIABLE); + mnt_drop_write(filp->f_path.mnt); + return status; case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: @@ -168,9 +170,6 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp, #ifdef CONFIG_COMPAT long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int ret; - switch (cmd) { case OCFS2_IOC32_GETFLAGS: cmd = OCFS2_IOC_GETFLAGS; @@ -190,9 +189,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -ENOIOCTLCMD; } - lock_kernel(); - ret = ocfs2_ioctl(inode, file, cmd, arg); - unlock_kernel(); - return ret; + return ocfs2_ioctl(file, cmd, arg); } #endif diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index 4d6c4f4..cf9a5ee 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h @@ -10,8 +10,7 @@ #ifndef OCFS2_IOCTL_H #define OCFS2_IOCTL_H -int ocfs2_ioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long arg); +long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); #endif /* OCFS2_IOCTL_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f31c7e8..9698338 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -64,6 +64,137 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, int slot); static int ocfs2_commit_thread(void *arg); + +/* + * The recovery_list is a simple linked list of node numbers to recover. + * It is protected by the recovery_lock. + */ + +struct ocfs2_recovery_map { + unsigned int rm_used; + unsigned int *rm_entries; +}; + +int ocfs2_recovery_init(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + mutex_init(&osb->recovery_lock); + osb->disable_recovery = 0; + osb->recovery_thread_task = NULL; + init_waitqueue_head(&osb->recovery_event); + + rm = kzalloc(sizeof(struct ocfs2_recovery_map) + + osb->max_slots * sizeof(unsigned int), + GFP_KERNEL); + if (!rm) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + rm->rm_entries = (unsigned int *)((char *)rm + + sizeof(struct ocfs2_recovery_map)); + osb->recovery_map = rm; + + return 0; +} + +/* we can't grab the goofy sem lock from inside wait_event, so we use + * memory barriers to make sure that we'll see the null task before + * being woken up */ +static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) +{ + mb(); + return osb->recovery_thread_task != NULL; +} + +void ocfs2_recovery_exit(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + mutex_lock(&osb->recovery_lock); + osb->disable_recovery = 1; + mutex_unlock(&osb->recovery_lock); + wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); + + /* At this point, we know that no more recovery threads can be + * launched, so wait for any recovery completion work to + * complete. */ + flush_workqueue(ocfs2_wq); + + /* + * Now that recovery is shut down, and the osb is about to be + * freed, the osb_lock is not taken here. + */ + rm = osb->recovery_map; + /* XXX: Should we bug if there are dirty entries? */ + + kfree(rm); +} + +static int __ocfs2_recovery_map_test(struct ocfs2_super *osb, + unsigned int node_num) +{ + int i; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + assert_spin_locked(&osb->osb_lock); + + for (i = 0; i < rm->rm_used; i++) { + if (rm->rm_entries[i] == node_num) + return 1; + } + + return 0; +} + +/* Behaves like test-and-set. Returns the previous value */ +static int ocfs2_recovery_map_set(struct ocfs2_super *osb, + unsigned int node_num) +{ + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); + return 1; + } + + /* XXX: Can this be exploited? Not from o2dlm... */ + BUG_ON(rm->rm_used >= osb->max_slots); + + rm->rm_entries[rm->rm_used] = node_num; + rm->rm_used++; + spin_unlock(&osb->osb_lock); + + return 0; +} + +static void ocfs2_recovery_map_clear(struct ocfs2_super *osb, + unsigned int node_num) +{ + int i; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + + for (i = 0; i < rm->rm_used; i++) { + if (rm->rm_entries[i] == node_num) + break; + } + + if (i < rm->rm_used) { + /* XXX: be careful with the pointer math */ + memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]), + (rm->rm_used - i - 1) * sizeof(unsigned int)); + rm->rm_used--; + } + + spin_unlock(&osb->osb_lock); +} + static int ocfs2_commit_cache(struct ocfs2_super *osb) { int status = 0; @@ -586,8 +717,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local) mlog_entry_void(); - if (!journal) - BUG(); + BUG_ON(!journal); osb = journal->j_osb; @@ -650,6 +780,23 @@ bail: return status; } +static int ocfs2_recovery_completed(struct ocfs2_super *osb) +{ + int empty; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + empty = (rm->rm_used == 0); + spin_unlock(&osb->osb_lock); + + return empty; +} + +void ocfs2_wait_for_recovery(struct ocfs2_super *osb) +{ + wait_event(osb->recovery_event, ocfs2_recovery_completed(osb)); +} + /* * JBD Might read a cached version of another nodes journal file. We * don't want this as this file changes often and we get no @@ -848,6 +995,7 @@ static int __ocfs2_recovery_thread(void *arg) { int status, node_num; struct ocfs2_super *osb = arg; + struct ocfs2_recovery_map *rm = osb->recovery_map; mlog_entry_void(); @@ -863,26 +1011,29 @@ restart: goto bail; } - while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { - node_num = ocfs2_node_map_first_set_bit(osb, - &osb->recovery_map); - if (node_num == O2NM_INVALID_NODE_NUM) { - mlog(0, "Out of nodes to recover.\n"); - break; - } + spin_lock(&osb->osb_lock); + while (rm->rm_used) { + /* It's always safe to remove entry zero, as we won't + * clear it until ocfs2_recover_node() has succeeded. */ + node_num = rm->rm_entries[0]; + spin_unlock(&osb->osb_lock); status = ocfs2_recover_node(osb, node_num); - if (status < 0) { + if (!status) { + ocfs2_recovery_map_clear(osb, node_num); + } else { mlog(ML_ERROR, "Error %d recovering node %d on device (%u,%u)!\n", status, node_num, MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); mlog(ML_ERROR, "Volume requires unmount.\n"); - continue; } - ocfs2_recovery_map_clear(osb, node_num); + spin_lock(&osb->osb_lock); } + spin_unlock(&osb->osb_lock); + mlog(0, "All nodes recovered\n"); + ocfs2_super_unlock(osb, 1); /* We always run recovery on our own orphan dir - the dead @@ -893,8 +1044,7 @@ restart: bail: mutex_lock(&osb->recovery_lock); - if (!status && - !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { + if (!status && !ocfs2_recovery_completed(osb)) { mutex_unlock(&osb->recovery_lock); goto restart; } @@ -924,8 +1074,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) /* People waiting on recovery will wait on * the recovery map to empty. */ - if (!ocfs2_recovery_map_set(osb, node_num)) - mlog(0, "node %d already be in recovery.\n", node_num); + if (ocfs2_recovery_map_set(osb, node_num)) + mlog(0, "node %d already in recovery map.\n", node_num); mlog(0, "starting recovery thread...\n"); @@ -1079,7 +1229,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, { int status = 0; int slot_num; - struct ocfs2_slot_info *si = osb->slot_info; struct ocfs2_dinode *la_copy = NULL; struct ocfs2_dinode *tl_copy = NULL; @@ -1092,8 +1241,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, * case we should've called ocfs2_journal_load instead. */ BUG_ON(osb->node_num == node_num); - slot_num = ocfs2_node_num_to_slot(si, node_num); - if (slot_num == OCFS2_INVALID_SLOT) { + slot_num = ocfs2_node_num_to_slot(osb, node_num); + if (slot_num == -ENOENT) { status = 0; mlog(0, "no slot for this node, so no recovery required.\n"); goto done; @@ -1123,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* Likewise, this would be a strange but ultimately not so * harmful place to get an error... */ - ocfs2_clear_slot(si, slot_num); - status = ocfs2_update_disk_slots(osb, si); + status = ocfs2_clear_slot(osb, slot_num); if (status < 0) mlog_errno(status); @@ -1184,23 +1332,24 @@ bail: * slot info struct has been updated from disk. */ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) { - int status, i, node_num; - struct ocfs2_slot_info *si = osb->slot_info; + unsigned int node_num; + int status, i; /* This is called with the super block cluster lock, so we * know that the slot map can't change underneath us. */ - spin_lock(&si->si_lock); - for(i = 0; i < si->si_num_slots; i++) { + spin_lock(&osb->osb_lock); + for (i = 0; i < osb->max_slots; i++) { if (i == osb->slot_num) continue; - if (ocfs2_is_empty_slot(si, i)) + + status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); + if (status == -ENOENT) continue; - node_num = si->si_global_node_nums[i]; - if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) + if (__ocfs2_recovery_map_test(osb, node_num)) continue; - spin_unlock(&si->si_lock); + spin_unlock(&osb->osb_lock); /* Ok, we have a slot occupied by another node which * is not in the recovery map. We trylock his journal @@ -1216,9 +1365,9 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) goto bail; } - spin_lock(&si->si_lock); + spin_lock(&osb->osb_lock); } - spin_unlock(&si->si_lock); + spin_unlock(&osb->osb_lock); status = 0; bail: diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 220f3e8..db82be2 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, /* Exported only for the journal struct init code in super.c. Do not call. */ void ocfs2_complete_recovery(struct work_struct *work); +void ocfs2_wait_for_recovery(struct ocfs2_super *osb); + +int ocfs2_recovery_init(struct ocfs2_super *osb); +void ocfs2_recovery_exit(struct ocfs2_super *osb); /* * Journal Control: diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index ab83fd5..ce0dc14 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -447,6 +447,8 @@ out_mutex: iput(main_bm_inode); out: + if (!status) + ocfs2_init_inode_steal_slot(osb); mlog_exit(status); return status; } @@ -523,6 +525,8 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, } ac->ac_inode = local_alloc_inode; + /* We should never use localalloc from another slot */ + ac->ac_alloc_slot = osb->slot_num; ac->ac_which = OCFS2_AC_USE_LOCAL; get_bh(osb->local_alloc_bh); ac->ac_bh = osb->local_alloc_bh; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ae9ad95..d5d808f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -424,7 +424,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_fs_generation = cpu_to_le32(osb->fs_generation); fe->i_blkno = cpu_to_le64(fe_blkno); fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); - fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); + fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); fe->i_uid = cpu_to_le32(current->fsuid); if (dir->i_mode & S_ISGID) { fe->i_gid = cpu_to_le32(dir->i_gid); @@ -997,7 +997,7 @@ static int ocfs2_rename(struct inode *old_dir, * * And that's why, just like the VFS, we need a file system * rename lock. */ - if (old_dentry != new_dentry) { + if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) { status = ocfs2_rename_lock(osb); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 6546cef..3169237 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -36,11 +36,8 @@ #include <linux/mutex.h> #include <linux/jbd.h> -#include "cluster/nodemanager.h" -#include "cluster/heartbeat.h" -#include "cluster/tcp.h" - -#include "dlm/dlmapi.h" +/* For union ocfs2_dlm_lksb */ +#include "stackglue.h" #include "ocfs2_fs.h" #include "ocfs2_lockid.h" @@ -101,6 +98,9 @@ enum ocfs2_unlock_action { * dropped. */ #define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ #define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ +#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a + call to dlm_lock. Only + exists with BUSY set. */ struct ocfs2_lock_res_ops; @@ -120,13 +120,14 @@ struct ocfs2_lock_res { int l_level; unsigned int l_ro_holders; unsigned int l_ex_holders; - struct dlm_lockstatus l_lksb; + union ocfs2_dlm_lksb l_lksb; /* used from AST/BAST funcs. */ enum ocfs2_ast_action l_action; enum ocfs2_unlock_action l_unlock_action; int l_requested; int l_blocking; + unsigned int l_pending_gen; wait_queue_head_t l_event; @@ -179,6 +180,8 @@ enum ocfs2_mount_options #define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; +struct ocfs2_slot_info; +struct ocfs2_recovery_map; struct ocfs2_super { struct task_struct *commit_task; @@ -190,7 +193,6 @@ struct ocfs2_super struct ocfs2_slot_info *slot_info; spinlock_t node_map_lock; - struct ocfs2_node_map recovery_map; u64 root_blkno; u64 system_dir_blkno; @@ -206,25 +208,29 @@ struct ocfs2_super u32 s_feature_incompat; u32 s_feature_ro_compat; - /* Protects s_next_generaion, osb_flags. Could protect more on - * osb as it's very short lived. */ + /* Protects s_next_generation, osb_flags and s_inode_steal_slot. + * Could protect more on osb as it's very short lived. + */ spinlock_t osb_lock; u32 s_next_generation; unsigned long osb_flags; + s16 s_inode_steal_slot; + atomic_t s_num_inodes_stolen; unsigned long s_mount_opt; unsigned int s_atime_quantum; - u16 max_slots; - s16 node_num; - s16 slot_num; - s16 preferred_slot; + unsigned int max_slots; + unsigned int node_num; + int slot_num; + int preferred_slot; int s_sectsize_bits; int s_clustersize; int s_clustersize_bits; atomic_t vol_state; struct mutex recovery_lock; + struct ocfs2_recovery_map *recovery_map; struct task_struct *recovery_thread_task; int disable_recovery; wait_queue_head_t checkpoint_event; @@ -245,12 +251,11 @@ struct ocfs2_super struct ocfs2_alloc_stats alloc_stats; char dev_str[20]; /* "major,minor" of the device */ - struct dlm_ctxt *dlm; + char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; - struct dlm_eviction_cb osb_eviction_cb; struct ocfs2_dlm_debug *osb_dlm_debug; - struct dlm_protocol_version osb_locking_proto; struct dentry *osb_debug_root; @@ -367,11 +372,24 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) return ret; } +static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & + OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK); +} + static inline int ocfs2_mount_local(struct ocfs2_super *osb) { return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); } +static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & + OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP); +} + + #define OCFS2_IS_VALID_DINODE(ptr) \ (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) @@ -522,6 +540,33 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) return pages_per_cluster; } +static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) +{ + spin_lock(&osb->osb_lock); + osb->s_inode_steal_slot = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + atomic_set(&osb->s_num_inodes_stolen, 0); +} + +static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb, + s16 slot) +{ + spin_lock(&osb->osb_lock); + osb->s_inode_steal_slot = slot; + spin_unlock(&osb->osb_lock); +} + +static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) +{ + s16 slot; + + spin_lock(&osb->osb_lock); + slot = osb->s_inode_steal_slot; + spin_unlock(&osb->osb_lock); + + return slot; +} + #define ocfs2_set_bit ext2_set_bit #define ocfs2_clear_bit ext2_clear_bit #define ocfs2_test_bit ext2_test_bit diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 3633edd3..52c4266 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -88,7 +88,9 @@ #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ - | OCFS2_FEATURE_INCOMPAT_INLINE_DATA) + | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ + | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ + | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK) #define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN /* @@ -125,6 +127,21 @@ /* Support for data packed into inode blocks */ #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 +/* Support for the extended slot map */ +#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 + + +/* + * Support for alternate, userspace cluster stacks. If set, the superblock + * field s_cluster_info contains a tag for the alternate stack in use as + * well as the name of the cluster being joined. + * mount.ocfs2 must pass in a matching stack name. + * + * If not set, the classic stack will be used. This is compatbile with + * all older versions. + */ +#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -267,6 +284,10 @@ struct ocfs2_new_group_input { #define OCFS2_VOL_UUID_LEN 16 #define OCFS2_MAX_VOL_LABEL_LEN 64 +/* The alternate, userspace stack fields */ +#define OCFS2_STACK_LABEL_LEN 4 +#define OCFS2_CLUSTER_NAME_LEN 16 + /* Journal limits (in bytes) */ #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) @@ -475,6 +496,47 @@ struct ocfs2_extent_block }; /* + * On disk slot map for OCFS2. This defines the contents of the "slot_map" + * system file. A slot is valid if it contains a node number >= 0. The + * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. + */ +struct ocfs2_slot_map { +/*00*/ __le16 sm_slots[0]; +/* + * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, + * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. + */ +}; + +struct ocfs2_extended_slot { +/*00*/ __u8 es_valid; + __u8 es_reserved1[3]; + __le32 es_node_num; +/*10*/ +}; + +/* + * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP + * is set. It separates out the valid marker from the node number, and + * has room to grow. Unlike the old slot map, this format is defined by + * i_size. + */ +struct ocfs2_slot_map_extended { +/*00*/ struct ocfs2_extended_slot se_slots[0]; +/* + * Actual size is i_size of the slot_map system file. It should + * match s_max_slots * sizeof(struct ocfs2_extended_slot) + */ +}; + +struct ocfs2_cluster_info { +/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; + __le32 ci_reserved; +/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; +/*18*/ +}; + +/* * On disk superblock for OCFS2 * Note that it is contained inside an ocfs2_dinode, so all offsets * are relative to the start of ocfs2_dinode.id2. @@ -506,7 +568,20 @@ struct ocfs2_super_block { * group header */ /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ -/*A0*/ +/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace + stack. Only valid + with INCOMPAT flag. */ +/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */ +/*140*/ + + /* + * NOTE: As stated above, all offsets are relative to + * ocfs2_dinode.id2, which is at 0xC0 in the inode. + * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within + * our smallest blocksize, which is 512 bytes. To ensure this, + * we reserve the space in s_reserved2. Anything past s_reserved2 + * will not be available on the smallest blocksize. + */ }; /* diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 86f3e37..82c200f 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -100,7 +100,7 @@ static char *ocfs2_lock_type_strings[] = { static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) { #ifdef __KERNEL__ - mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); + BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); #endif return ocfs2_lock_type_strings[type]; } diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 3a50ce5..bb5ff89 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -42,81 +42,244 @@ #include "buffer_head_io.h" -static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, - s16 global); -static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, - s16 slot_num, - s16 node_num); - -/* post the slot information on disk into our slot_info struct. */ -void ocfs2_update_slot_info(struct ocfs2_slot_info *si) + +struct ocfs2_slot { + int sl_valid; + unsigned int sl_node_num; +}; + +struct ocfs2_slot_info { + int si_extended; + int si_slots_per_block; + struct inode *si_inode; + unsigned int si_blocks; + struct buffer_head **si_bh; + unsigned int si_num_slots; + struct ocfs2_slot *si_slots; +}; + + +static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + unsigned int node_num); + +static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si, + int slot_num) +{ + BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); + si->si_slots[slot_num].sl_valid = 0; +} + +static void ocfs2_set_slot(struct ocfs2_slot_info *si, + int slot_num, unsigned int node_num) +{ + BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); + + si->si_slots[slot_num].sl_valid = 1; + si->si_slots[slot_num].sl_node_num = node_num; +} + +/* This version is for the extended slot map */ +static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si) +{ + int b, i, slotno; + struct ocfs2_slot_map_extended *se; + + slotno = 0; + for (b = 0; b < si->si_blocks; b++) { + se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data; + for (i = 0; + (i < si->si_slots_per_block) && + (slotno < si->si_num_slots); + i++, slotno++) { + if (se->se_slots[i].es_valid) + ocfs2_set_slot(si, slotno, + le32_to_cpu(se->se_slots[i].es_node_num)); + else + ocfs2_invalidate_slot(si, slotno); + } + } +} + +/* + * Post the slot information on disk into our slot_info struct. + * Must be protected by osb_lock. + */ +static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si) { int i; - __le16 *disk_info; + struct ocfs2_slot_map *sm; - /* we don't read the slot block here as ocfs2_super_lock - * should've made sure we have the most recent copy. */ - spin_lock(&si->si_lock); - disk_info = (__le16 *) si->si_bh->b_data; + sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; - for (i = 0; i < si->si_size; i++) - si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); + for (i = 0; i < si->si_num_slots; i++) { + if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT) + ocfs2_invalidate_slot(si, i); + else + ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i])); + } +} - spin_unlock(&si->si_lock); +static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) +{ + /* + * The slot data will have been refreshed when ocfs2_super_lock + * was taken. + */ + if (si->si_extended) + ocfs2_update_slot_info_extended(si); + else + ocfs2_update_slot_info_old(si); +} + +int ocfs2_refresh_slot_info(struct ocfs2_super *osb) +{ + int ret; + struct ocfs2_slot_info *si = osb->slot_info; + + if (si == NULL) + return 0; + + BUG_ON(si->si_blocks == 0); + BUG_ON(si->si_bh == NULL); + + mlog(0, "Refreshing slot map, reading %u block(s)\n", + si->si_blocks); + + /* + * We pass -1 as blocknr because we expect all of si->si_bh to + * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If + * this is not true, the read of -1 (UINT64_MAX) will fail. + */ + ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0, + si->si_inode); + if (ret == 0) { + spin_lock(&osb->osb_lock); + ocfs2_update_slot_info(si); + spin_unlock(&osb->osb_lock); + } + + return ret; } /* post the our slot info stuff into it's destination bh and write it * out. */ -int ocfs2_update_disk_slots(struct ocfs2_super *osb, - struct ocfs2_slot_info *si) +static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si, + int slot_num, + struct buffer_head **bh) { - int status, i; - __le16 *disk_info = (__le16 *) si->si_bh->b_data; + int blkind = slot_num / si->si_slots_per_block; + int slotno = slot_num % si->si_slots_per_block; + struct ocfs2_slot_map_extended *se; + + BUG_ON(blkind >= si->si_blocks); + + se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data; + se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid; + if (si->si_slots[slot_num].sl_valid) + se->se_slots[slotno].es_node_num = + cpu_to_le32(si->si_slots[slot_num].sl_node_num); + *bh = si->si_bh[blkind]; +} - spin_lock(&si->si_lock); - for (i = 0; i < si->si_size; i++) - disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); - spin_unlock(&si->si_lock); +static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si, + int slot_num, + struct buffer_head **bh) +{ + int i; + struct ocfs2_slot_map *sm; + + sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; + for (i = 0; i < si->si_num_slots; i++) { + if (si->si_slots[i].sl_valid) + sm->sm_slots[i] = + cpu_to_le16(si->si_slots[i].sl_node_num); + else + sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT); + } + *bh = si->si_bh[0]; +} + +static int ocfs2_update_disk_slot(struct ocfs2_super *osb, + struct ocfs2_slot_info *si, + int slot_num) +{ + int status; + struct buffer_head *bh; + + spin_lock(&osb->osb_lock); + if (si->si_extended) + ocfs2_update_disk_slot_extended(si, slot_num, &bh); + else + ocfs2_update_disk_slot_old(si, slot_num, &bh); + spin_unlock(&osb->osb_lock); - status = ocfs2_write_block(osb, si->si_bh, si->si_inode); + status = ocfs2_write_block(osb, bh, si->si_inode); if (status < 0) mlog_errno(status); return status; } -/* try to find global node in the slot info. Returns - * OCFS2_INVALID_SLOT if nothing is found. */ -static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, - s16 global) +/* + * Calculate how many bytes are needed by the slot map. Returns + * an error if the slot map file is too small. + */ +static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb, + struct inode *inode, + unsigned long long *bytes) { - int i; - s16 ret = OCFS2_INVALID_SLOT; + unsigned long long bytes_needed; + + if (ocfs2_uses_extended_slot_map(osb)) { + bytes_needed = osb->max_slots * + sizeof(struct ocfs2_extended_slot); + } else { + bytes_needed = osb->max_slots * sizeof(__le16); + } + if (bytes_needed > i_size_read(inode)) { + mlog(ML_ERROR, + "Slot map file is too small! (size %llu, needed %llu)\n", + i_size_read(inode), bytes_needed); + return -ENOSPC; + } + + *bytes = bytes_needed; + return 0; +} + +/* try to find global node in the slot info. Returns -ENOENT + * if nothing is found. */ +static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + unsigned int node_num) +{ + int i, ret = -ENOENT; for(i = 0; i < si->si_num_slots; i++) { - if (global == si->si_global_node_nums[i]) { - ret = (s16) i; + if (si->si_slots[i].sl_valid && + (node_num == si->si_slots[i].sl_node_num)) { + ret = i; break; } } + return ret; } -static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred) +static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, + int preferred) { - int i; - s16 ret = OCFS2_INVALID_SLOT; + int i, ret = -ENOSPC; - if (preferred >= 0 && preferred < si->si_num_slots) { - if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) { + if ((preferred >= 0) && (preferred < si->si_num_slots)) { + if (!si->si_slots[preferred].sl_valid) { ret = preferred; goto out; } } for(i = 0; i < si->si_num_slots; i++) { - if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { - ret = (s16) i; + if (!si->si_slots[i].sl_valid) { + ret = i; break; } } @@ -124,58 +287,155 @@ out: return ret; } -s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, - s16 global) +int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num) { - s16 ret; + int slot; + struct ocfs2_slot_info *si = osb->slot_info; - spin_lock(&si->si_lock); - ret = __ocfs2_node_num_to_slot(si, global); - spin_unlock(&si->si_lock); - return ret; + spin_lock(&osb->osb_lock); + slot = __ocfs2_node_num_to_slot(si, node_num); + spin_unlock(&osb->osb_lock); + + return slot; +} + +int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, + unsigned int *node_num) +{ + struct ocfs2_slot_info *si = osb->slot_info; + + assert_spin_locked(&osb->osb_lock); + + BUG_ON(slot_num < 0); + BUG_ON(slot_num > osb->max_slots); + + if (!si->si_slots[slot_num].sl_valid) + return -ENOENT; + + *node_num = si->si_slots[slot_num].sl_node_num; + return 0; } -static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, - s16 slot_num, - s16 node_num) +static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) { - BUG_ON(slot_num == OCFS2_INVALID_SLOT); - BUG_ON(slot_num >= si->si_num_slots); - BUG_ON((node_num != O2NM_INVALID_NODE_NUM) && - (node_num >= O2NM_MAX_NODES)); + unsigned int i; + + if (si == NULL) + return; + + if (si->si_inode) + iput(si->si_inode); + if (si->si_bh) { + for (i = 0; i < si->si_blocks; i++) { + if (si->si_bh[i]) { + brelse(si->si_bh[i]); + si->si_bh[i] = NULL; + } + } + kfree(si->si_bh); + } - si->si_global_node_nums[slot_num] = node_num; + kfree(si); } -void ocfs2_clear_slot(struct ocfs2_slot_info *si, - s16 slot_num) +int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num) { - spin_lock(&si->si_lock); - __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); - spin_unlock(&si->si_lock); + struct ocfs2_slot_info *si = osb->slot_info; + + if (si == NULL) + return 0; + + spin_lock(&osb->osb_lock); + ocfs2_invalidate_slot(si, slot_num); + spin_unlock(&osb->osb_lock); + + return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num); } -int ocfs2_init_slot_info(struct ocfs2_super *osb) +static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, + struct ocfs2_slot_info *si) { - int status, i; + int status = 0; u64 blkno; + unsigned long long blocks, bytes; + unsigned int i; + struct buffer_head *bh; + + status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes); + if (status) + goto bail; + + blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes); + BUG_ON(blocks > UINT_MAX); + si->si_blocks = blocks; + if (!si->si_blocks) + goto bail; + + if (si->si_extended) + si->si_slots_per_block = + (osb->sb->s_blocksize / + sizeof(struct ocfs2_extended_slot)); + else + si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16); + + /* The size checks above should ensure this */ + BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks); + + mlog(0, "Slot map needs %u buffers for %llu bytes\n", + si->si_blocks, bytes); + + si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, + GFP_KERNEL); + if (!si->si_bh) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + for (i = 0; i < si->si_blocks; i++) { + status = ocfs2_extent_map_get_blocks(si->si_inode, i, + &blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Reading slot map block %u at %llu\n", i, + (unsigned long long)blkno); + + bh = NULL; /* Acquire a fresh bh */ + status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + si->si_bh[i] = bh; + } + +bail: + return status; +} + +int ocfs2_init_slot_info(struct ocfs2_super *osb) +{ + int status; struct inode *inode = NULL; - struct buffer_head *bh = NULL; struct ocfs2_slot_info *si; - si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL); + si = kzalloc(sizeof(struct ocfs2_slot_info) + + (sizeof(struct ocfs2_slot) * osb->max_slots), + GFP_KERNEL); if (!si) { status = -ENOMEM; mlog_errno(status); goto bail; } - spin_lock_init(&si->si_lock); + si->si_extended = ocfs2_uses_extended_slot_map(osb); si->si_num_slots = osb->max_slots; - si->si_size = OCFS2_MAX_SLOTS; - - for(i = 0; i < si->si_num_slots; i++) - si->si_global_node_nums[i] = OCFS2_INVALID_SLOT; + si->si_slots = (struct ocfs2_slot *)((char *)si + + sizeof(struct ocfs2_slot_info)); inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -185,61 +445,53 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) goto bail; } - status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - status = ocfs2_read_block(osb, blkno, &bh, 0, inode); + si->si_inode = inode; + status = ocfs2_map_slot_buffers(osb, si); if (status < 0) { mlog_errno(status); goto bail; } - si->si_inode = inode; - si->si_bh = bh; - osb->slot_info = si; + osb->slot_info = (struct ocfs2_slot_info *)si; bail: if (status < 0 && si) - ocfs2_free_slot_info(si); + __ocfs2_free_slot_info(si); return status; } -void ocfs2_free_slot_info(struct ocfs2_slot_info *si) +void ocfs2_free_slot_info(struct ocfs2_super *osb) { - if (si->si_inode) - iput(si->si_inode); - if (si->si_bh) - brelse(si->si_bh); - kfree(si); + struct ocfs2_slot_info *si = osb->slot_info; + + osb->slot_info = NULL; + __ocfs2_free_slot_info(si); } int ocfs2_find_slot(struct ocfs2_super *osb) { int status; - s16 slot; + int slot; struct ocfs2_slot_info *si; mlog_entry_void(); si = osb->slot_info; + spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - spin_lock(&si->si_lock); /* search for ourselves first and take the slot if it already * exists. Perhaps we need to mark this in a variable for our * own journal recovery? Possibly not, though we certainly * need to warn to the user */ slot = __ocfs2_node_num_to_slot(si, osb->node_num); - if (slot == OCFS2_INVALID_SLOT) { + if (slot < 0) { /* if no slot yet, then just take 1st available * one. */ slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); - if (slot == OCFS2_INVALID_SLOT) { - spin_unlock(&si->si_lock); + if (slot < 0) { + spin_unlock(&osb->osb_lock); mlog(ML_ERROR, "no free slots available!\n"); status = -EINVAL; goto bail; @@ -248,13 +500,13 @@ int ocfs2_find_slot(struct ocfs2_super *osb) mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", slot); - __ocfs2_fill_slot(si, slot, osb->node_num); + ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; - spin_unlock(&si->si_lock); + spin_unlock(&osb->osb_lock); mlog(0, "taking node slot %d\n", osb->slot_num); - status = ocfs2_update_disk_slots(osb, si); + status = ocfs2_update_disk_slot(osb, si, osb->slot_num); if (status < 0) mlog_errno(status); @@ -265,27 +517,27 @@ bail: void ocfs2_put_slot(struct ocfs2_super *osb) { - int status; + int status, slot_num; struct ocfs2_slot_info *si = osb->slot_info; if (!si) return; + spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - spin_lock(&si->si_lock); - __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); + slot_num = osb->slot_num; + ocfs2_invalidate_slot(si, osb->slot_num); osb->slot_num = OCFS2_INVALID_SLOT; - spin_unlock(&si->si_lock); + spin_unlock(&osb->osb_lock); - status = ocfs2_update_disk_slots(osb, si); + status = ocfs2_update_disk_slot(osb, si, slot_num); if (status < 0) { mlog_errno(status); goto bail; } bail: - osb->slot_info = NULL; - ocfs2_free_slot_info(si); + ocfs2_free_slot_info(osb); } diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h index 1025872..601c95f 100644 --- a/fs/ocfs2/slot_map.h +++ b/fs/ocfs2/slot_map.h @@ -27,38 +27,18 @@ #ifndef SLOTMAP_H #define SLOTMAP_H -struct ocfs2_slot_info { - spinlock_t si_lock; - - struct inode *si_inode; - struct buffer_head *si_bh; - unsigned int si_num_slots; - unsigned int si_size; - s16 si_global_node_nums[OCFS2_MAX_SLOTS]; -}; - int ocfs2_init_slot_info(struct ocfs2_super *osb); -void ocfs2_free_slot_info(struct ocfs2_slot_info *si); +void ocfs2_free_slot_info(struct ocfs2_super *osb); int ocfs2_find_slot(struct ocfs2_super *osb); void ocfs2_put_slot(struct ocfs2_super *osb); -void ocfs2_update_slot_info(struct ocfs2_slot_info *si); -int ocfs2_update_disk_slots(struct ocfs2_super *osb, - struct ocfs2_slot_info *si); - -s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, - s16 global); -void ocfs2_clear_slot(struct ocfs2_slot_info *si, - s16 slot_num); +int ocfs2_refresh_slot_info(struct ocfs2_super *osb); -static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, - int slot_num) -{ - BUG_ON(slot_num == OCFS2_INVALID_SLOT); - assert_spin_locked(&si->si_lock); +int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num); +int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, + unsigned int *node_num); - return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT; -} +int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num); #endif diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c new file mode 100644 index 0000000..ac1d74c --- /dev/null +++ b/fs/ocfs2/stack_o2cb.c @@ -0,0 +1,420 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stack_o2cb.c + * + * Code which interfaces ocfs2 with the o2cb stack. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/crc32.h> +#include <linux/module.h> + +/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ +#include <linux/fs.h> + +#include "cluster/masklog.h" +#include "cluster/nodemanager.h" +#include "cluster/heartbeat.h" + +#include "stackglue.h" + +struct o2dlm_private { + struct dlm_eviction_cb op_eviction_cb; +}; + +static struct ocfs2_stack_plugin o2cb_stack; + +/* These should be identical */ +#if (DLM_LOCK_IV != LKM_IVMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_NL != LKM_NLMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_CR != LKM_CRMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_CW != LKM_CWMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_PR != LKM_PRMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_PW != LKM_PWMODE) +# error Lock modes do not match +#endif +#if (DLM_LOCK_EX != LKM_EXMODE) +# error Lock modes do not match +#endif +static inline int mode_to_o2dlm(int mode) +{ + BUG_ON(mode > LKM_MAXMODE); + + return mode; +} + +#define map_flag(_generic, _o2dlm) \ + if (flags & (_generic)) { \ + flags &= ~(_generic); \ + o2dlm_flags |= (_o2dlm); \ + } +static int flags_to_o2dlm(u32 flags) +{ + int o2dlm_flags = 0; + + map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE); + map_flag(DLM_LKF_CANCEL, LKM_CANCEL); + map_flag(DLM_LKF_CONVERT, LKM_CONVERT); + map_flag(DLM_LKF_VALBLK, LKM_VALBLK); + map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK); + map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN); + map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE); + map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT); + map_flag(DLM_LKF_LOCAL, LKM_LOCAL); + + /* map_flag() should have cleared every flag passed in */ + BUG_ON(flags != 0); + + return o2dlm_flags; +} +#undef map_flag + +/* + * Map an o2dlm status to standard errno values. + * + * o2dlm only uses a handful of these, and returns even fewer to the + * caller. Still, we try to assign sane values to each error. + * + * The following value pairs have special meanings to dlmglue, thus + * the right hand side needs to stay unique - never duplicate the + * mapping elsewhere in the table! + * + * DLM_NORMAL: 0 + * DLM_NOTQUEUED: -EAGAIN + * DLM_CANCELGRANT: -EBUSY + * DLM_CANCEL: -DLM_ECANCEL + */ +/* Keep in sync with dlmapi.h */ +static int status_map[] = { + [DLM_NORMAL] = 0, /* Success */ + [DLM_GRANTED] = -EINVAL, + [DLM_DENIED] = -EACCES, + [DLM_DENIED_NOLOCKS] = -EACCES, + [DLM_WORKING] = -EACCES, + [DLM_BLOCKED] = -EINVAL, + [DLM_BLOCKED_ORPHAN] = -EINVAL, + [DLM_DENIED_GRACE_PERIOD] = -EACCES, + [DLM_SYSERR] = -ENOMEM, /* It is what it is */ + [DLM_NOSUPPORT] = -EPROTO, + [DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */ + [DLM_IVLOCKID] = -EINVAL, + [DLM_SYNC] = -EINVAL, + [DLM_BADTYPE] = -EINVAL, + [DLM_BADRESOURCE] = -EINVAL, + [DLM_MAXHANDLES] = -ENOMEM, + [DLM_NOCLINFO] = -EINVAL, + [DLM_NOLOCKMGR] = -EINVAL, + [DLM_NOPURGED] = -EINVAL, + [DLM_BADARGS] = -EINVAL, + [DLM_VOID] = -EINVAL, + [DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */ + [DLM_IVBUFLEN] = -EINVAL, + [DLM_CVTUNGRANT] = -EPERM, + [DLM_BADPARAM] = -EINVAL, + [DLM_VALNOTVALID] = -EINVAL, + [DLM_REJECTED] = -EPERM, + [DLM_ABORT] = -EINVAL, + [DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */ + [DLM_IVRESHANDLE] = -EINVAL, + [DLM_DEADLOCK] = -EDEADLK, + [DLM_DENIED_NOASTS] = -EINVAL, + [DLM_FORWARD] = -EINVAL, + [DLM_TIMEOUT] = -ETIMEDOUT, + [DLM_IVGROUPID] = -EINVAL, + [DLM_VERS_CONFLICT] = -EOPNOTSUPP, + [DLM_BAD_DEVICE_PATH] = -ENOENT, + [DLM_NO_DEVICE_PERMISSION] = -EPERM, + [DLM_NO_CONTROL_DEVICE] = -ENOENT, + [DLM_RECOVERING] = -ENOTCONN, + [DLM_MIGRATING] = -ERESTART, + [DLM_MAXSTATS] = -EINVAL, +}; + +static int dlm_status_to_errno(enum dlm_status status) +{ + BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0]))); + + return status_map[status]; +} + +static void o2dlm_lock_ast_wrapper(void *astarg) +{ + BUG_ON(o2cb_stack.sp_proto == NULL); + + o2cb_stack.sp_proto->lp_lock_ast(astarg); +} + +static void o2dlm_blocking_ast_wrapper(void *astarg, int level) +{ + BUG_ON(o2cb_stack.sp_proto == NULL); + + o2cb_stack.sp_proto->lp_blocking_ast(astarg, level); +} + +static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) +{ + int error = dlm_status_to_errno(status); + + BUG_ON(o2cb_stack.sp_proto == NULL); + + /* + * In o2dlm, you can get both the lock_ast() for the lock being + * granted and the unlock_ast() for the CANCEL failing. A + * successful cancel sends DLM_NORMAL here. If the + * lock grant happened before the cancel arrived, you get + * DLM_CANCELGRANT. + * + * There's no need for the double-ast. If we see DLM_CANCELGRANT, + * we just ignore it. We expect the lock_ast() to handle the + * granted lock. + */ + if (status == DLM_CANCELGRANT) + return; + + o2cb_stack.sp_proto->lp_unlock_ast(astarg, error); +} + +static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + union ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen, + void *astarg) +{ + enum dlm_status status; + int o2dlm_mode = mode_to_o2dlm(mode); + int o2dlm_flags = flags_to_o2dlm(flags); + int ret; + + status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, + o2dlm_flags, name, namelen, + o2dlm_lock_ast_wrapper, astarg, + o2dlm_blocking_ast_wrapper); + ret = dlm_status_to_errno(status); + return ret; +} + +static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, + union ocfs2_dlm_lksb *lksb, + u32 flags, + void *astarg) +{ + enum dlm_status status; + int o2dlm_flags = flags_to_o2dlm(flags); + int ret; + + status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, + o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg); + ret = dlm_status_to_errno(status); + return ret; +} + +static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb) +{ + return dlm_status_to_errno(lksb->lksb_o2dlm.status); +} + +static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) +{ + return (void *)(lksb->lksb_o2dlm.lvb); +} + +static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb) +{ + dlm_print_one_lock(lksb->lksb_o2dlm.lockid); +} + +/* + * Called from the dlm when it's about to evict a node. This is how the + * classic stack signals node death. + */ +static void o2dlm_eviction_cb(int node_num, void *data) +{ + struct ocfs2_cluster_connection *conn = data; + + mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", + node_num, conn->cc_namelen, conn->cc_name); + + conn->cc_recovery_handler(node_num, conn->cc_recovery_data); +} + +static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) +{ + int rc = 0; + u32 dlm_key; + struct dlm_ctxt *dlm; + struct o2dlm_private *priv; + struct dlm_protocol_version dlm_version; + + BUG_ON(conn == NULL); + BUG_ON(o2cb_stack.sp_proto == NULL); + + /* for now we only have one cluster/node, make sure we see it + * in the heartbeat universe */ + if (!o2hb_check_local_node_heartbeating()) { + rc = -EINVAL; + goto out; + } + + priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL); + if (!priv) { + rc = -ENOMEM; + goto out_free; + } + + /* This just fills the structure in. It is safe to pass conn. */ + dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb, + conn); + + conn->cc_private = priv; + + /* used by the dlm code to make message headers unique, each + * node in this domain must agree on this. */ + dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); + dlm_version.pv_major = conn->cc_version.pv_major; + dlm_version.pv_minor = conn->cc_version.pv_minor; + + dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version); + if (IS_ERR(dlm)) { + rc = PTR_ERR(dlm); + mlog_errno(rc); + goto out_free; + } + + conn->cc_version.pv_major = dlm_version.pv_major; + conn->cc_version.pv_minor = dlm_version.pv_minor; + conn->cc_lockspace = dlm; + + dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); + +out_free: + if (rc && conn->cc_private) + kfree(conn->cc_private); + +out: + return rc; +} + +static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn, + int hangup_pending) +{ + struct dlm_ctxt *dlm = conn->cc_lockspace; + struct o2dlm_private *priv = conn->cc_private; + + dlm_unregister_eviction_cb(&priv->op_eviction_cb); + conn->cc_private = NULL; + kfree(priv); + + dlm_unregister_domain(dlm); + conn->cc_lockspace = NULL; + + return 0; +} + +static void o2hb_stop(const char *group) +{ + int ret; + char *argv[5], *envp[3]; + + argv[0] = (char *)o2nm_get_hb_ctl_path(); + argv[1] = "-K"; + argv[2] = "-u"; + argv[3] = (char *)group; + argv[4] = NULL; + + mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); + + /* minimal command environment taken from cpu_run_sbin_hotplug */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + if (ret < 0) + mlog_errno(ret); +} + +/* + * Hangup is a hack for tools compatibility. Older ocfs2-tools software + * expects the filesystem to call "ocfs2_hb_ctl" during unmount. This + * happens regardless of whether the DLM got started, so we can't do it + * in ocfs2_cluster_disconnect(). We bring the o2hb_stop() function into + * the glue and provide a "hangup" API for super.c to call. + * + * Other stacks will eventually provide a NULL ->hangup() pointer. + */ +static void o2cb_cluster_hangup(const char *group, int grouplen) +{ + o2hb_stop(group); +} + +static int o2cb_cluster_this_node(unsigned int *node) +{ + int node_num; + + node_num = o2nm_this_node(); + if (node_num == O2NM_INVALID_NODE_NUM) + return -ENOENT; + + if (node_num >= O2NM_MAX_NODES) + return -EOVERFLOW; + + *node = node_num; + return 0; +} + +struct ocfs2_stack_operations o2cb_stack_ops = { + .connect = o2cb_cluster_connect, + .disconnect = o2cb_cluster_disconnect, + .hangup = o2cb_cluster_hangup, + .this_node = o2cb_cluster_this_node, + .dlm_lock = o2cb_dlm_lock, + .dlm_unlock = o2cb_dlm_unlock, + .lock_status = o2cb_dlm_lock_status, + .lock_lvb = o2cb_dlm_lvb, + .dump_lksb = o2cb_dump_lksb, +}; + +static struct ocfs2_stack_plugin o2cb_stack = { + .sp_name = "o2cb", + .sp_ops = &o2cb_stack_ops, + .sp_owner = THIS_MODULE, +}; + +static int __init o2cb_stack_init(void) +{ + return ocfs2_stack_glue_register(&o2cb_stack); +} + +static void __exit o2cb_stack_exit(void) +{ + ocfs2_stack_glue_unregister(&o2cb_stack); +} + +MODULE_AUTHOR("Oracle"); +MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack"); +MODULE_LICENSE("GPL"); +module_init(o2cb_stack_init); +module_exit(o2cb_stack_exit); diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c new file mode 100644 index 0000000..7428663 --- /dev/null +++ b/fs/ocfs2/stack_user.c @@ -0,0 +1,883 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stack_user.c + * + * Code which interfaces ocfs2 with fs/dlm and a userspace stack. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/mutex.h> +#include <linux/reboot.h> +#include <asm/uaccess.h> + +#include "ocfs2.h" /* For struct ocfs2_lock_res */ +#include "stackglue.h" + + +/* + * The control protocol starts with a handshake. Until the handshake + * is complete, the control device will fail all write(2)s. + * + * The handshake is simple. First, the client reads until EOF. Each line + * of output is a supported protocol tag. All protocol tags are a single + * character followed by a two hex digit version number. Currently the + * only things supported is T01, for "Text-base version 0x01". Next, the + * client writes the version they would like to use, including the newline. + * Thus, the protocol tag is 'T01\n'. If the version tag written is + * unknown, -EINVAL is returned. Once the negotiation is complete, the + * client can start sending messages. + * + * The T01 protocol has three messages. First is the "SETN" message. + * It has the following syntax: + * + * SETN<space><8-char-hex-nodenum><newline> + * + * This is 14 characters. + * + * The "SETN" message must be the first message following the protocol. + * It tells ocfs2_control the local node number. + * + * Next comes the "SETV" message. It has the following syntax: + * + * SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> + * + * This is 11 characters. + * + * The "SETV" message sets the filesystem locking protocol version as + * negotiated by the client. The client negotiates based on the maximum + * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major + * number from the "SETV" message must match + * user_stack.sp_proto->lp_max_version.pv_major, and the minor number + * must be less than or equal to ...->lp_max_version.pv_minor. + * + * Once this information has been set, mounts will be allowed. From this + * point on, the "DOWN" message can be sent for node down notification. + * It has the following syntax: + * + * DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> + * + * eg: + * + * DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n + * + * This is 47 characters. + */ + +/* + * Whether or not the client has done the handshake. + * For now, we have just one protocol version. + */ +#define OCFS2_CONTROL_PROTO "T01\n" +#define OCFS2_CONTROL_PROTO_LEN 4 + +/* Handshake states */ +#define OCFS2_CONTROL_HANDSHAKE_INVALID (0) +#define OCFS2_CONTROL_HANDSHAKE_READ (1) +#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL (2) +#define OCFS2_CONTROL_HANDSHAKE_VALID (3) + +/* Messages */ +#define OCFS2_CONTROL_MESSAGE_OP_LEN 4 +#define OCFS2_CONTROL_MESSAGE_SETNODE_OP "SETN" +#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN 14 +#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP "SETV" +#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN 11 +#define OCFS2_CONTROL_MESSAGE_DOWN_OP "DOWN" +#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN 47 +#define OCFS2_TEXT_UUID_LEN 32 +#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 +#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 + +/* + * ocfs2_live_connection is refcounted because the filesystem and + * miscdevice sides can detach in different order. Let's just be safe. + */ +struct ocfs2_live_connection { + struct list_head oc_list; + struct ocfs2_cluster_connection *oc_conn; +}; + +struct ocfs2_control_private { + struct list_head op_list; + int op_state; + int op_this_node; + struct ocfs2_protocol_version op_proto; +}; + +/* SETN<space><8-char-hex-nodenum><newline> */ +struct ocfs2_control_message_setn { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + char space; + char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN]; + char newline; +}; + +/* SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> */ +struct ocfs2_control_message_setv { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + char space1; + char major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN]; + char space2; + char minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN]; + char newline; +}; + +/* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */ +struct ocfs2_control_message_down { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + char space1; + char uuid[OCFS2_TEXT_UUID_LEN]; + char space2; + char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN]; + char newline; +}; + +union ocfs2_control_message { + char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; + struct ocfs2_control_message_setn u_setn; + struct ocfs2_control_message_setv u_setv; + struct ocfs2_control_message_down u_down; +}; + +static struct ocfs2_stack_plugin user_stack; + +static atomic_t ocfs2_control_opened; +static int ocfs2_control_this_node = -1; +static struct ocfs2_protocol_version running_proto; + +static LIST_HEAD(ocfs2_live_connection_list); +static LIST_HEAD(ocfs2_control_private_list); +static DEFINE_MUTEX(ocfs2_control_lock); + +static inline void ocfs2_control_set_handshake_state(struct file *file, + int state) +{ + struct ocfs2_control_private *p = file->private_data; + p->op_state = state; +} + +static inline int ocfs2_control_get_handshake_state(struct file *file) +{ + struct ocfs2_control_private *p = file->private_data; + return p->op_state; +} + +static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) +{ + size_t len = strlen(name); + struct ocfs2_live_connection *c; + + BUG_ON(!mutex_is_locked(&ocfs2_control_lock)); + + list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) { + if ((c->oc_conn->cc_namelen == len) && + !strncmp(c->oc_conn->cc_name, name, len)) + return c; + } + + return c; +} + +/* + * ocfs2_live_connection structures are created underneath the ocfs2 + * mount path. Since the VFS prevents multiple calls to + * fill_super(), we can't get dupes here. + */ +static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, + struct ocfs2_live_connection **c_ret) +{ + int rc = 0; + struct ocfs2_live_connection *c; + + c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); + if (!c) + return -ENOMEM; + + mutex_lock(&ocfs2_control_lock); + c->oc_conn = conn; + + if (atomic_read(&ocfs2_control_opened)) + list_add(&c->oc_list, &ocfs2_live_connection_list); + else { + printk(KERN_ERR + "ocfs2: Userspace control daemon is not present\n"); + rc = -ESRCH; + } + + mutex_unlock(&ocfs2_control_lock); + + if (!rc) + *c_ret = c; + else + kfree(c); + + return rc; +} + +/* + * This function disconnects the cluster connection from ocfs2_control. + * Afterwards, userspace can't affect the cluster connection. + */ +static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c) +{ + mutex_lock(&ocfs2_control_lock); + list_del_init(&c->oc_list); + c->oc_conn = NULL; + mutex_unlock(&ocfs2_control_lock); + + kfree(c); +} + +static int ocfs2_control_cfu(void *target, size_t target_len, + const char __user *buf, size_t count) +{ + /* The T01 expects write(2) calls to have exactly one command */ + if ((count != target_len) || + (count > sizeof(union ocfs2_control_message))) + return -EINVAL; + + if (copy_from_user(target, buf, target_len)) + return -EFAULT; + + return 0; +} + +static ssize_t ocfs2_control_validate_protocol(struct file *file, + const char __user *buf, + size_t count) +{ + ssize_t ret; + char kbuf[OCFS2_CONTROL_PROTO_LEN]; + + ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN, + buf, count); + if (ret) + return ret; + + if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN)) + return -EINVAL; + + ocfs2_control_set_handshake_state(file, + OCFS2_CONTROL_HANDSHAKE_PROTOCOL); + + return count; +} + +static void ocfs2_control_send_down(const char *uuid, + int nodenum) +{ + struct ocfs2_live_connection *c; + + mutex_lock(&ocfs2_control_lock); + + c = ocfs2_connection_find(uuid); + if (c) { + BUG_ON(c->oc_conn == NULL); + c->oc_conn->cc_recovery_handler(nodenum, + c->oc_conn->cc_recovery_data); + } + + mutex_unlock(&ocfs2_control_lock); +} + +/* + * Called whenever configuration elements are sent to /dev/ocfs2_control. + * If all configuration elements are present, try to set the global + * values. If there is a problem, return an error. Skip any missing + * elements, and only bump ocfs2_control_opened when we have all elements + * and are successful. + */ +static int ocfs2_control_install_private(struct file *file) +{ + int rc = 0; + int set_p = 1; + struct ocfs2_control_private *p = file->private_data; + + BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL); + + mutex_lock(&ocfs2_control_lock); + + if (p->op_this_node < 0) { + set_p = 0; + } else if ((ocfs2_control_this_node >= 0) && + (ocfs2_control_this_node != p->op_this_node)) { + rc = -EINVAL; + goto out_unlock; + } + + if (!p->op_proto.pv_major) { + set_p = 0; + } else if (!list_empty(&ocfs2_live_connection_list) && + ((running_proto.pv_major != p->op_proto.pv_major) || + (running_proto.pv_minor != p->op_proto.pv_minor))) { + rc = -EINVAL; + goto out_unlock; + } + + if (set_p) { + ocfs2_control_this_node = p->op_this_node; + running_proto.pv_major = p->op_proto.pv_major; + running_proto.pv_minor = p->op_proto.pv_minor; + } + +out_unlock: + mutex_unlock(&ocfs2_control_lock); + + if (!rc && set_p) { + /* We set the global values successfully */ + atomic_inc(&ocfs2_control_opened); + ocfs2_control_set_handshake_state(file, + OCFS2_CONTROL_HANDSHAKE_VALID); + } + + return rc; +} + +static int ocfs2_control_get_this_node(void) +{ + int rc; + + mutex_lock(&ocfs2_control_lock); + if (ocfs2_control_this_node < 0) + rc = -EINVAL; + else + rc = ocfs2_control_this_node; + mutex_unlock(&ocfs2_control_lock); + + return rc; +} + +static int ocfs2_control_do_setnode_msg(struct file *file, + struct ocfs2_control_message_setn *msg) +{ + long nodenum; + char *ptr = NULL; + struct ocfs2_control_private *p = file->private_data; + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_PROTOCOL) + return -EINVAL; + + if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + return -EINVAL; + + if ((msg->space != ' ') || (msg->newline != '\n')) + return -EINVAL; + msg->space = msg->newline = '\0'; + + nodenum = simple_strtol(msg->nodestr, &ptr, 16); + if (!ptr || *ptr) + return -EINVAL; + + if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || + (nodenum > INT_MAX) || (nodenum < 0)) + return -ERANGE; + p->op_this_node = nodenum; + + return ocfs2_control_install_private(file); +} + +static int ocfs2_control_do_setversion_msg(struct file *file, + struct ocfs2_control_message_setv *msg) + { + long major, minor; + char *ptr = NULL; + struct ocfs2_control_private *p = file->private_data; + struct ocfs2_protocol_version *max = + &user_stack.sp_proto->lp_max_version; + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_PROTOCOL) + return -EINVAL; + + if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + return -EINVAL; + + if ((msg->space1 != ' ') || (msg->space2 != ' ') || + (msg->newline != '\n')) + return -EINVAL; + msg->space1 = msg->space2 = msg->newline = '\0'; + + major = simple_strtol(msg->major, &ptr, 16); + if (!ptr || *ptr) + return -EINVAL; + minor = simple_strtol(msg->minor, &ptr, 16); + if (!ptr || *ptr) + return -EINVAL; + + /* + * The major must be between 1 and 255, inclusive. The minor + * must be between 0 and 255, inclusive. The version passed in + * must be within the maximum version supported by the filesystem. + */ + if ((major == LONG_MIN) || (major == LONG_MAX) || + (major > (u8)-1) || (major < 1)) + return -ERANGE; + if ((minor == LONG_MIN) || (minor == LONG_MAX) || + (minor > (u8)-1) || (minor < 0)) + return -ERANGE; + if ((major != max->pv_major) || + (minor > max->pv_minor)) + return -EINVAL; + + p->op_proto.pv_major = major; + p->op_proto.pv_minor = minor; + + return ocfs2_control_install_private(file); +} + +static int ocfs2_control_do_down_msg(struct file *file, + struct ocfs2_control_message_down *msg) +{ + long nodenum; + char *p = NULL; + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_VALID) + return -EINVAL; + + if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + return -EINVAL; + + if ((msg->space1 != ' ') || (msg->space2 != ' ') || + (msg->newline != '\n')) + return -EINVAL; + msg->space1 = msg->space2 = msg->newline = '\0'; + + nodenum = simple_strtol(msg->nodestr, &p, 16); + if (!p || *p) + return -EINVAL; + + if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || + (nodenum > INT_MAX) || (nodenum < 0)) + return -ERANGE; + + ocfs2_control_send_down(msg->uuid, nodenum); + + return 0; +} + +static ssize_t ocfs2_control_message(struct file *file, + const char __user *buf, + size_t count) +{ + ssize_t ret; + union ocfs2_control_message msg; + + /* Try to catch padding issues */ + WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) != + (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1))); + + memset(&msg, 0, sizeof(union ocfs2_control_message)); + ret = ocfs2_control_cfu(&msg, count, buf, count); + if (ret) + goto out; + + if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) && + !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn); + else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) && + !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv); + else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) && + !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP, + OCFS2_CONTROL_MESSAGE_OP_LEN)) + ret = ocfs2_control_do_down_msg(file, &msg.u_down); + else + ret = -EINVAL; + +out: + return ret ? ret : count; +} + +static ssize_t ocfs2_control_write(struct file *file, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + ssize_t ret; + + switch (ocfs2_control_get_handshake_state(file)) { + case OCFS2_CONTROL_HANDSHAKE_INVALID: + ret = -EINVAL; + break; + + case OCFS2_CONTROL_HANDSHAKE_READ: + ret = ocfs2_control_validate_protocol(file, buf, + count); + break; + + case OCFS2_CONTROL_HANDSHAKE_PROTOCOL: + case OCFS2_CONTROL_HANDSHAKE_VALID: + ret = ocfs2_control_message(file, buf, count); + break; + + default: + BUG(); + ret = -EIO; + break; + } + + return ret; +} + +/* + * This is a naive version. If we ever have a new protocol, we'll expand + * it. Probably using seq_file. + */ +static ssize_t ocfs2_control_read(struct file *file, + char __user *buf, + size_t count, + loff_t *ppos) +{ + char *proto_string = OCFS2_CONTROL_PROTO; + size_t to_write = 0; + + if (*ppos >= OCFS2_CONTROL_PROTO_LEN) + return 0; + + to_write = OCFS2_CONTROL_PROTO_LEN - *ppos; + if (to_write > count) + to_write = count; + if (copy_to_user(buf, proto_string + *ppos, to_write)) + return -EFAULT; + + *ppos += to_write; + + /* Have we read the whole protocol list? */ + if (*ppos >= OCFS2_CONTROL_PROTO_LEN) + ocfs2_control_set_handshake_state(file, + OCFS2_CONTROL_HANDSHAKE_READ); + + return to_write; +} + +static int ocfs2_control_release(struct inode *inode, struct file *file) +{ + struct ocfs2_control_private *p = file->private_data; + + mutex_lock(&ocfs2_control_lock); + + if (ocfs2_control_get_handshake_state(file) != + OCFS2_CONTROL_HANDSHAKE_VALID) + goto out; + + if (atomic_dec_and_test(&ocfs2_control_opened)) { + if (!list_empty(&ocfs2_live_connection_list)) { + /* XXX: Do bad things! */ + printk(KERN_ERR + "ocfs2: Unexpected release of ocfs2_control!\n" + " Loss of cluster connection requires " + "an emergency restart!\n"); + emergency_restart(); + } + /* + * Last valid close clears the node number and resets + * the locking protocol version + */ + ocfs2_control_this_node = -1; + running_proto.pv_major = 0; + running_proto.pv_major = 0; + } + +out: + list_del_init(&p->op_list); + file->private_data = NULL; + + mutex_unlock(&ocfs2_control_lock); + + kfree(p); + + return 0; +} + +static int ocfs2_control_open(struct inode *inode, struct file *file) +{ + struct ocfs2_control_private *p; + + p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL); + if (!p) + return -ENOMEM; + p->op_this_node = -1; + + mutex_lock(&ocfs2_control_lock); + file->private_data = p; + list_add(&p->op_list, &ocfs2_control_private_list); + mutex_unlock(&ocfs2_control_lock); + + return 0; +} + +static const struct file_operations ocfs2_control_fops = { + .open = ocfs2_control_open, + .release = ocfs2_control_release, + .read = ocfs2_control_read, + .write = ocfs2_control_write, + .owner = THIS_MODULE, +}; + +struct miscdevice ocfs2_control_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ocfs2_control", + .fops = &ocfs2_control_fops, +}; + +static int ocfs2_control_init(void) +{ + int rc; + + atomic_set(&ocfs2_control_opened, 0); + + rc = misc_register(&ocfs2_control_device); + if (rc) + printk(KERN_ERR + "ocfs2: Unable to register ocfs2_control device " + "(errno %d)\n", + -rc); + + return rc; +} + +static void ocfs2_control_exit(void) +{ + int rc; + + rc = misc_deregister(&ocfs2_control_device); + if (rc) + printk(KERN_ERR + "ocfs2: Unable to deregister ocfs2_control device " + "(errno %d)\n", + -rc); +} + +static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg) +{ + struct ocfs2_lock_res *res = astarg; + return &res->l_lksb.lksb_fsdlm; +} + +static void fsdlm_lock_ast_wrapper(void *astarg) +{ + struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); + int status = lksb->sb_status; + + BUG_ON(user_stack.sp_proto == NULL); + + /* + * For now we're punting on the issue of other non-standard errors + * where we can't tell if the unlock_ast or lock_ast should be called. + * The main "other error" that's possible is EINVAL which means the + * function was called with invalid args, which shouldn't be possible + * since the caller here is under our control. Other non-standard + * errors probably fall into the same category, or otherwise are fatal + * which means we can't carry on anyway. + */ + + if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) + user_stack.sp_proto->lp_unlock_ast(astarg, 0); + else + user_stack.sp_proto->lp_lock_ast(astarg); +} + +static void fsdlm_blocking_ast_wrapper(void *astarg, int level) +{ + BUG_ON(user_stack.sp_proto == NULL); + + user_stack.sp_proto->lp_blocking_ast(astarg, level); +} + +static int user_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + union ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen, + void *astarg) +{ + int ret; + + if (!lksb->lksb_fsdlm.sb_lvbptr) + lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + + sizeof(struct dlm_lksb); + + ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, + flags|DLM_LKF_NODLCKWT, name, namelen, 0, + fsdlm_lock_ast_wrapper, astarg, + fsdlm_blocking_ast_wrapper); + return ret; +} + +static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, + union ocfs2_dlm_lksb *lksb, + u32 flags, + void *astarg) +{ + int ret; + + ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, + flags, &lksb->lksb_fsdlm, astarg); + return ret; +} + +static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb) +{ + return lksb->lksb_fsdlm.sb_status; +} + +static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) +{ + return (void *)(lksb->lksb_fsdlm.sb_lvbptr); +} + +static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) +{ +} + +/* + * Compare a requested locking protocol version against the current one. + * + * If the major numbers are different, they are incompatible. + * If the current minor is greater than the request, they are incompatible. + * If the current minor is less than or equal to the request, they are + * compatible, and the requester should run at the current minor version. + */ +static int fs_protocol_compare(struct ocfs2_protocol_version *existing, + struct ocfs2_protocol_version *request) +{ + if (existing->pv_major != request->pv_major) + return 1; + + if (existing->pv_minor > request->pv_minor) + return 1; + + if (existing->pv_minor < request->pv_minor) + request->pv_minor = existing->pv_minor; + + return 0; +} + +static int user_cluster_connect(struct ocfs2_cluster_connection *conn) +{ + dlm_lockspace_t *fsdlm; + struct ocfs2_live_connection *control; + int rc = 0; + + BUG_ON(conn == NULL); + + rc = ocfs2_live_connection_new(conn, &control); + if (rc) + goto out; + + /* + * running_proto must have been set before we allowed any mounts + * to proceed. + */ + if (fs_protocol_compare(&running_proto, &conn->cc_version)) { + printk(KERN_ERR + "Unable to mount with fs locking protocol version " + "%u.%u because the userspace control daemon has " + "negotiated %u.%u\n", + conn->cc_version.pv_major, conn->cc_version.pv_minor, + running_proto.pv_major, running_proto.pv_minor); + rc = -EPROTO; + ocfs2_live_connection_drop(control); + goto out; + } + + rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name), + &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN); + if (rc) { + ocfs2_live_connection_drop(control); + goto out; + } + + conn->cc_private = control; + conn->cc_lockspace = fsdlm; +out: + return rc; +} + +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn, + int hangup_pending) +{ + dlm_release_lockspace(conn->cc_lockspace, 2); + conn->cc_lockspace = NULL; + ocfs2_live_connection_drop(conn->cc_private); + conn->cc_private = NULL; + return 0; +} + +static int user_cluster_this_node(unsigned int *this_node) +{ + int rc; + + rc = ocfs2_control_get_this_node(); + if (rc < 0) + return rc; + + *this_node = rc; + return 0; +} + +static struct ocfs2_stack_operations user_stack_ops = { + .connect = user_cluster_connect, + .disconnect = user_cluster_disconnect, + .this_node = user_cluster_this_node, + .dlm_lock = user_dlm_lock, + .dlm_unlock = user_dlm_unlock, + .lock_status = user_dlm_lock_status, + .lock_lvb = user_dlm_lvb, + .dump_lksb = user_dlm_dump_lksb, +}; + +static struct ocfs2_stack_plugin user_stack = { + .sp_name = "user", + .sp_ops = &user_stack_ops, + .sp_owner = THIS_MODULE, +}; + + +static int __init user_stack_init(void) +{ + int rc; + + rc = ocfs2_control_init(); + if (!rc) { + rc = ocfs2_stack_glue_register(&user_stack); + if (rc) + ocfs2_control_exit(); + } + + return rc; +} + +static void __exit user_stack_exit(void) +{ + ocfs2_stack_glue_unregister(&user_stack); + ocfs2_control_exit(); +} + +MODULE_AUTHOR("Oracle"); +MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks"); +MODULE_LICENSE("GPL"); +module_init(user_stack_init); +module_exit(user_stack_exit); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c new file mode 100644 index 0000000..119f60c --- /dev/null +++ b/fs/ocfs2/stackglue.c @@ -0,0 +1,568 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stackglue.c + * + * Code which implements an OCFS2 specific interface to underlying + * cluster stacks. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/fs.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> + +#include "ocfs2_fs.h" + +#include "stackglue.h" + +#define OCFS2_STACK_PLUGIN_O2CB "o2cb" +#define OCFS2_STACK_PLUGIN_USER "user" + +static struct ocfs2_locking_protocol *lproto; +static DEFINE_SPINLOCK(ocfs2_stack_lock); +static LIST_HEAD(ocfs2_stack_list); +static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; + +/* + * The stack currently in use. If not null, active_stack->sp_count > 0, + * the module is pinned, and the locking protocol cannot be changed. + */ +static struct ocfs2_stack_plugin *active_stack; + +static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) +{ + struct ocfs2_stack_plugin *p; + + assert_spin_locked(&ocfs2_stack_lock); + + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + if (!strcmp(p->sp_name, name)) + return p; + } + + return NULL; +} + +static int ocfs2_stack_driver_request(const char *stack_name, + const char *plugin_name) +{ + int rc; + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + + /* + * If the stack passed by the filesystem isn't the selected one, + * we can't continue. + */ + if (strcmp(stack_name, cluster_stack_name)) { + rc = -EBUSY; + goto out; + } + + if (active_stack) { + /* + * If the active stack isn't the one we want, it cannot + * be selected right now. + */ + if (!strcmp(active_stack->sp_name, plugin_name)) + rc = 0; + else + rc = -EBUSY; + goto out; + } + + p = ocfs2_stack_lookup(plugin_name); + if (!p || !try_module_get(p->sp_owner)) { + rc = -ENOENT; + goto out; + } + + /* Ok, the stack is pinned */ + p->sp_count++; + active_stack = p; + + rc = 0; + +out: + spin_unlock(&ocfs2_stack_lock); + return rc; +} + +/* + * This function looks up the appropriate stack and makes it active. If + * there is no stack, it tries to load it. It will fail if the stack still + * cannot be found. It will also fail if a different stack is in use. + */ +static int ocfs2_stack_driver_get(const char *stack_name) +{ + int rc; + char *plugin_name = OCFS2_STACK_PLUGIN_O2CB; + + /* + * Classic stack does not pass in a stack name. This is + * compatible with older tools as well. + */ + if (!stack_name || !*stack_name) + stack_name = OCFS2_STACK_PLUGIN_O2CB; + + if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) { + printk(KERN_ERR + "ocfs2 passed an invalid cluster stack label: \"%s\"\n", + stack_name); + return -EINVAL; + } + + /* Anything that isn't the classic stack is a user stack */ + if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB)) + plugin_name = OCFS2_STACK_PLUGIN_USER; + + rc = ocfs2_stack_driver_request(stack_name, plugin_name); + if (rc == -ENOENT) { + request_module("ocfs2_stack_%s", plugin_name); + rc = ocfs2_stack_driver_request(stack_name, plugin_name); + } + + if (rc == -ENOENT) { + printk(KERN_ERR + "ocfs2: Cluster stack driver \"%s\" cannot be found\n", + plugin_name); + } else if (rc == -EBUSY) { + printk(KERN_ERR + "ocfs2: A different cluster stack is in use\n"); + } + + return rc; +} + +static void ocfs2_stack_driver_put(void) +{ + spin_lock(&ocfs2_stack_lock); + BUG_ON(active_stack == NULL); + BUG_ON(active_stack->sp_count == 0); + + active_stack->sp_count--; + if (!active_stack->sp_count) { + module_put(active_stack->sp_owner); + active_stack = NULL; + } + spin_unlock(&ocfs2_stack_lock); +} + +int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin) +{ + int rc; + + spin_lock(&ocfs2_stack_lock); + if (!ocfs2_stack_lookup(plugin->sp_name)) { + plugin->sp_count = 0; + plugin->sp_proto = lproto; + list_add(&plugin->sp_list, &ocfs2_stack_list); + printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", + plugin->sp_name); + rc = 0; + } else { + printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n", + plugin->sp_name); + rc = -EEXIST; + } + spin_unlock(&ocfs2_stack_lock); + + return rc; +} +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register); + +void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin) +{ + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + p = ocfs2_stack_lookup(plugin->sp_name); + if (p) { + BUG_ON(p != plugin); + BUG_ON(plugin == active_stack); + BUG_ON(plugin->sp_count != 0); + list_del_init(&plugin->sp_list); + printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n", + plugin->sp_name); + } else { + printk(KERN_ERR "Stack \"%s\" is not registered\n", + plugin->sp_name); + } + spin_unlock(&ocfs2_stack_lock); +} +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); + +void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) +{ + struct ocfs2_stack_plugin *p; + + BUG_ON(proto == NULL); + + spin_lock(&ocfs2_stack_lock); + BUG_ON(active_stack != NULL); + + lproto = proto; + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + p->sp_proto = lproto; + } + + spin_unlock(&ocfs2_stack_lock); +} +EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); + + +/* + * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take + * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the + * underlying stack plugins need to pilfer the lksb off of the lock_res. + * If some other structure needs to be passed as an astarg, the plugins + * will need to be given a different avenue to the lksb. + */ +int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + union ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen, + struct ocfs2_lock_res *astarg) +{ + BUG_ON(lproto == NULL); + + return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, + name, namelen, astarg); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); + +int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, + union ocfs2_dlm_lksb *lksb, + u32 flags, + struct ocfs2_lock_res *astarg) +{ + BUG_ON(lproto == NULL); + + return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); + +int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb) +{ + return active_stack->sp_ops->lock_status(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); + +/* + * Why don't we cast to ocfs2_meta_lvb? The "clean" answer is that we + * don't cast at the glue level. The real answer is that the header + * ordering is nigh impossible. + */ +void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb) +{ + return active_stack->sp_ops->lock_lvb(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); + +void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) +{ + active_stack->sp_ops->dump_lksb(lksb); +} +EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); + +int ocfs2_cluster_connect(const char *stack_name, + const char *group, + int grouplen, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn) +{ + int rc = 0; + struct ocfs2_cluster_connection *new_conn; + + BUG_ON(group == NULL); + BUG_ON(conn == NULL); + BUG_ON(recovery_handler == NULL); + + if (grouplen > GROUP_NAME_MAX) { + rc = -EINVAL; + goto out; + } + + new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), + GFP_KERNEL); + if (!new_conn) { + rc = -ENOMEM; + goto out; + } + + memcpy(new_conn->cc_name, group, grouplen); + new_conn->cc_namelen = grouplen; + new_conn->cc_recovery_handler = recovery_handler; + new_conn->cc_recovery_data = recovery_data; + + /* Start the new connection at our maximum compatibility level */ + new_conn->cc_version = lproto->lp_max_version; + + /* This will pin the stack driver if successful */ + rc = ocfs2_stack_driver_get(stack_name); + if (rc) + goto out_free; + + rc = active_stack->sp_ops->connect(new_conn); + if (rc) { + ocfs2_stack_driver_put(); + goto out_free; + } + + *conn = new_conn; + +out_free: + if (rc) + kfree(new_conn); + +out: + return rc; +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); + +/* If hangup_pending is 0, the stack driver will be dropped */ +int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, + int hangup_pending) +{ + int ret; + + BUG_ON(conn == NULL); + + ret = active_stack->sp_ops->disconnect(conn, hangup_pending); + + /* XXX Should we free it anyway? */ + if (!ret) { + kfree(conn); + if (!hangup_pending) + ocfs2_stack_driver_put(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); + +void ocfs2_cluster_hangup(const char *group, int grouplen) +{ + BUG_ON(group == NULL); + BUG_ON(group[grouplen] != '\0'); + + if (active_stack->sp_ops->hangup) + active_stack->sp_ops->hangup(group, grouplen); + + /* cluster_disconnect() was called with hangup_pending==1 */ + ocfs2_stack_driver_put(); +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); + +int ocfs2_cluster_this_node(unsigned int *node) +{ + return active_stack->sp_ops->this_node(node); +} +EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); + + +/* + * Sysfs bits + */ + +static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret = 0; + + spin_lock(&ocfs2_stack_lock); + if (lproto) + ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", + lproto->lp_max_version.pv_major, + lproto->lp_max_version.pv_minor); + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + +static struct kobj_attribute ocfs2_attr_max_locking_protocol = + __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, + ocfs2_max_locking_protocol_show, NULL); + +static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret = 0, total = 0, remain = PAGE_SIZE; + struct ocfs2_stack_plugin *p; + + spin_lock(&ocfs2_stack_lock); + list_for_each_entry(p, &ocfs2_stack_list, sp_list) { + ret = snprintf(buf, remain, "%s\n", + p->sp_name); + if (ret < 0) { + total = ret; + break; + } + if (ret == remain) { + /* snprintf() didn't fit */ + total = -E2BIG; + break; + } + total += ret; + remain -= ret; + } + spin_unlock(&ocfs2_stack_lock); + + return total; +} + +static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = + __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, + ocfs2_loaded_cluster_plugins_show, NULL); + +static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret = 0; + + spin_lock(&ocfs2_stack_lock); + if (active_stack) { + ret = snprintf(buf, PAGE_SIZE, "%s\n", + active_stack->sp_name); + if (ret == PAGE_SIZE) + ret = -E2BIG; + } + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + +static struct kobj_attribute ocfs2_attr_active_cluster_plugin = + __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, + ocfs2_active_cluster_plugin_show, NULL); + +static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret; + spin_lock(&ocfs2_stack_lock); + ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name); + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + +static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + size_t len = count; + ssize_t ret; + + if (len == 0) + return len; + + if (buf[len - 1] == '\n') + len--; + + if ((len != OCFS2_STACK_LABEL_LEN) || + (strnlen(buf, len) != len)) + return -EINVAL; + + spin_lock(&ocfs2_stack_lock); + if (active_stack) { + if (!strncmp(buf, cluster_stack_name, len)) + ret = count; + else + ret = -EBUSY; + } else { + memcpy(cluster_stack_name, buf, len); + ret = count; + } + spin_unlock(&ocfs2_stack_lock); + + return ret; +} + + +static struct kobj_attribute ocfs2_attr_cluster_stack = + __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, + ocfs2_cluster_stack_show, + ocfs2_cluster_stack_store); + +static struct attribute *ocfs2_attrs[] = { + &ocfs2_attr_max_locking_protocol.attr, + &ocfs2_attr_loaded_cluster_plugins.attr, + &ocfs2_attr_active_cluster_plugin.attr, + &ocfs2_attr_cluster_stack.attr, + NULL, +}; + +static struct attribute_group ocfs2_attr_group = { + .attrs = ocfs2_attrs, +}; + +static struct kset *ocfs2_kset; + +static void ocfs2_sysfs_exit(void) +{ + kset_unregister(ocfs2_kset); +} + +static int ocfs2_sysfs_init(void) +{ + int ret; + + ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj); + if (!ocfs2_kset) + return -ENOMEM; + + ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group); + if (ret) + goto error; + + return 0; + +error: + kset_unregister(ocfs2_kset); + return ret; +} + +static int __init ocfs2_stack_glue_init(void) +{ + strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); + + return ocfs2_sysfs_init(); +} + +static void __exit ocfs2_stack_glue_exit(void) +{ + lproto = NULL; + ocfs2_sysfs_exit(); +} + +MODULE_AUTHOR("Oracle"); +MODULE_DESCRIPTION("ocfs2 cluter stack glue layer"); +MODULE_LICENSE("GPL"); +module_init(ocfs2_stack_glue_init); +module_exit(ocfs2_stack_glue_exit); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h new file mode 100644 index 0000000..005e4f1 --- /dev/null +++ b/fs/ocfs2/stackglue.h @@ -0,0 +1,261 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * stackglue.h + * + * Glue to the underlying cluster stack. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + + +#ifndef STACKGLUE_H +#define STACKGLUE_H + +#include <linux/types.h> +#include <linux/list.h> +#include <linux/dlmconstants.h> + +#include "dlm/dlmapi.h" +#include <linux/dlm.h> + +/* + * dlmconstants.h does not have a LOCAL flag. We hope to remove it + * some day, but right now we need it. Let's fake it. This value is larger + * than any flag in dlmconstants.h. + */ +#define DLM_LKF_LOCAL 0x00100000 + +/* + * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably + * wants to be in a public header. + */ +#define GROUP_NAME_MAX 64 + + +/* + * ocfs2_protocol_version changes when ocfs2 does something different in + * its inter-node behavior. See dlmglue.c for more information. + */ +struct ocfs2_protocol_version { + u8 pv_major; + u8 pv_minor; +}; + +/* + * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf. + */ +struct ocfs2_locking_protocol { + struct ocfs2_protocol_version lp_max_version; + void (*lp_lock_ast)(void *astarg); + void (*lp_blocking_ast)(void *astarg, int level); + void (*lp_unlock_ast)(void *astarg, int error); +}; + + +/* + * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only + * has a pointer to separately allocated lvb space. This struct exists only to + * include in the lksb union to make space for a combined dlm_lksb and lvb. + */ +struct fsdlm_lksb_plus_lvb { + struct dlm_lksb lksb; + char lvb[DLM_LVB_LEN]; +}; + +/* + * A union of all lock status structures. We define it here so that the + * size of the union is known. Lock status structures are embedded in + * ocfs2 inodes. + */ +union ocfs2_dlm_lksb { + struct dlm_lockstatus lksb_o2dlm; + struct dlm_lksb lksb_fsdlm; + struct fsdlm_lksb_plus_lvb padding; +}; + +/* + * A cluster connection. Mostly opaque to ocfs2, the connection holds + * state for the underlying stack. ocfs2 does use cc_version to determine + * locking compatibility. + */ +struct ocfs2_cluster_connection { + char cc_name[GROUP_NAME_MAX]; + int cc_namelen; + struct ocfs2_protocol_version cc_version; + void (*cc_recovery_handler)(int node_num, void *recovery_data); + void *cc_recovery_data; + void *cc_lockspace; + void *cc_private; +}; + +/* + * Each cluster stack implements the stack operations structure. Not used + * in the ocfs2 code, the stackglue code translates generic cluster calls + * into stack operations. + */ +struct ocfs2_stack_operations { + /* + * The fs code calls ocfs2_cluster_connect() to attach a new + * filesystem to the cluster stack. The ->connect() op is passed + * an ocfs2_cluster_connection with the name and recovery field + * filled in. + * + * The stack must set up any notification mechanisms and create + * the filesystem lockspace in the DLM. The lockspace should be + * stored on cc_lockspace. Any other information can be stored on + * cc_private. + * + * ->connect() must not return until it is guaranteed that + * + * - Node down notifications for the filesystem will be recieved + * and passed to conn->cc_recovery_handler(). + * - Locking requests for the filesystem will be processed. + */ + int (*connect)(struct ocfs2_cluster_connection *conn); + + /* + * The fs code calls ocfs2_cluster_disconnect() when a filesystem + * no longer needs cluster services. All DLM locks have been + * dropped, and recovery notification is being ignored by the + * fs code. The stack must disengage from the DLM and discontinue + * recovery notification. + * + * Once ->disconnect() has returned, the connection structure will + * be freed. Thus, a stack must not return from ->disconnect() + * until it will no longer reference the conn pointer. + * + * If hangup_pending is zero, ocfs2_cluster_disconnect() will also + * be dropping the reference on the module. + */ + int (*disconnect)(struct ocfs2_cluster_connection *conn, + int hangup_pending); + + /* + * ocfs2_cluster_hangup() exists for compatibility with older + * ocfs2 tools. Only the classic stack really needs it. As such + * ->hangup() is not required of all stacks. See the comment by + * ocfs2_cluster_hangup() for more details. + * + * Note that ocfs2_cluster_hangup() can only be called if + * hangup_pending was passed to ocfs2_cluster_disconnect(). + */ + void (*hangup)(const char *group, int grouplen); + + /* + * ->this_node() returns the cluster's unique identifier for the + * local node. + */ + int (*this_node)(unsigned int *node); + + /* + * Call the underlying dlm lock function. The ->dlm_lock() + * callback should convert the flags and mode as appropriate. + * + * ast and bast functions are not part of the call because the + * stack will likely want to wrap ast and bast calls before passing + * them to stack->sp_proto. + */ + int (*dlm_lock)(struct ocfs2_cluster_connection *conn, + int mode, + union ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen, + void *astarg); + + /* + * Call the underlying dlm unlock function. The ->dlm_unlock() + * function should convert the flags as appropriate. + * + * The unlock ast is not passed, as the stack will want to wrap + * it before calling stack->sp_proto->lp_unlock_ast(). + */ + int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, + union ocfs2_dlm_lksb *lksb, + u32 flags, + void *astarg); + + /* + * Return the status of the current lock status block. The fs + * code should never dereference the union. The ->lock_status() + * callback pulls out the stack-specific lksb, converts the status + * to a proper errno, and returns it. + */ + int (*lock_status)(union ocfs2_dlm_lksb *lksb); + + /* + * Pull the lvb pointer off of the stack-specific lksb. + */ + void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); + + /* + * This is an optoinal debugging hook. If provided, the + * stack can dump debugging information about this lock. + */ + void (*dump_lksb)(union ocfs2_dlm_lksb *lksb); +}; + +/* + * Each stack plugin must describe itself by registering a + * ocfs2_stack_plugin structure. This is only seen by stackglue and the + * stack driver. + */ +struct ocfs2_stack_plugin { + char *sp_name; + struct ocfs2_stack_operations *sp_ops; + struct module *sp_owner; + + /* These are managed by the stackglue code. */ + struct list_head sp_list; + unsigned int sp_count; + struct ocfs2_locking_protocol *sp_proto; +}; + + +/* Used by the filesystem */ +int ocfs2_cluster_connect(const char *stack_name, + const char *group, + int grouplen, + void (*recovery_handler)(int node_num, + void *recovery_data), + void *recovery_data, + struct ocfs2_cluster_connection **conn); +int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, + int hangup_pending); +void ocfs2_cluster_hangup(const char *group, int grouplen); +int ocfs2_cluster_this_node(unsigned int *node); + +struct ocfs2_lock_res; +int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, + int mode, + union ocfs2_dlm_lksb *lksb, + u32 flags, + void *name, + unsigned int namelen, + struct ocfs2_lock_res *astarg); +int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, + union ocfs2_dlm_lksb *lksb, + u32 flags, + struct ocfs2_lock_res *astarg); + +int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); +void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); +void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); + +void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); + + +/* Used by stack plugins */ +int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); +void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +#endif /* STACKGLUE_H */ diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 72c198a..d2d278f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -46,6 +46,11 @@ #include "buffer_head_io.h" +#define NOT_ALLOC_NEW_GROUP 0 +#define ALLOC_NEW_GROUP 1 + +#define OCFS2_MAX_INODES_TO_STEAL 1024 + static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); @@ -106,7 +111,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, u64 *bg_blkno, u16 *bg_bit_off); -void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) +static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) { struct inode *inode = ac->ac_inode; @@ -117,9 +122,17 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) mutex_unlock(&inode->i_mutex); iput(inode); + ac->ac_inode = NULL; } - if (ac->ac_bh) + if (ac->ac_bh) { brelse(ac->ac_bh); + ac->ac_bh = NULL; + } +} + +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) +{ + ocfs2_free_ac_resource(ac); kfree(ac); } @@ -391,7 +404,8 @@ bail: static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac, int type, - u32 slot) + u32 slot, + int alloc_new_group) { int status; u32 bits_wanted = ac->ac_bits_wanted; @@ -420,6 +434,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, } ac->ac_inode = alloc_inode; + ac->ac_alloc_slot = slot; fe = (struct ocfs2_dinode *) bh->b_data; if (!OCFS2_IS_VALID_DINODE(fe)) { @@ -446,6 +461,14 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, goto bail; } + if (alloc_new_group != ALLOC_NEW_GROUP) { + mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " + "and we don't alloc a new group for it.\n", + slot, bits_wanted, free_bits); + status = -ENOSPC; + goto bail; + } + status = ocfs2_block_group_alloc(osb, alloc_inode, bh); if (status < 0) { if (status != -ENOSPC) @@ -490,7 +513,8 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, (*ac)->ac_group_search = ocfs2_block_group_search; status = ocfs2_reserve_suballoc_bits(osb, (*ac), - EXTENT_ALLOC_SYSTEM_INODE, slot); + EXTENT_ALLOC_SYSTEM_INODE, + slot, ALLOC_NEW_GROUP); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -508,10 +532,42 @@ bail: return status; } +static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + int i, status = -ENOSPC; + s16 slot = ocfs2_get_inode_steal_slot(osb); + + /* Start to steal inodes from the first slot after ours. */ + if (slot == OCFS2_INVALID_SLOT) + slot = osb->slot_num + 1; + + for (i = 0; i < osb->max_slots; i++, slot++) { + if (slot == osb->max_slots) + slot = 0; + + if (slot == osb->slot_num) + continue; + + status = ocfs2_reserve_suballoc_bits(osb, ac, + INODE_ALLOC_SYSTEM_INODE, + slot, NOT_ALLOC_NEW_GROUP); + if (status >= 0) { + ocfs2_set_inode_steal_slot(osb, slot); + break; + } + + ocfs2_free_ac_resource(ac); + } + + return status; +} + int ocfs2_reserve_new_inode(struct ocfs2_super *osb, struct ocfs2_alloc_context **ac) { int status; + s16 slot = ocfs2_get_inode_steal_slot(osb); *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { @@ -525,9 +581,43 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, (*ac)->ac_group_search = ocfs2_block_group_search; + /* + * slot is set when we successfully steal inode from other nodes. + * It is reset in 3 places: + * 1. when we flush the truncate log + * 2. when we complete local alloc recovery. + * 3. when we successfully allocate from our own slot. + * After it is set, we will go on stealing inodes until we find the + * need to check our slots to see whether there is some space for us. + */ + if (slot != OCFS2_INVALID_SLOT && + atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL) + goto inode_steal; + + atomic_set(&osb->s_num_inodes_stolen, 0); status = ocfs2_reserve_suballoc_bits(osb, *ac, INODE_ALLOC_SYSTEM_INODE, - osb->slot_num); + osb->slot_num, ALLOC_NEW_GROUP); + if (status >= 0) { + status = 0; + + /* + * Some inodes must be freed by us, so try to allocate + * from our own next time. + */ + if (slot != OCFS2_INVALID_SLOT) + ocfs2_init_inode_steal_slot(osb); + goto bail; + } else if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + ocfs2_free_ac_resource(*ac); + +inode_steal: + status = ocfs2_steal_inode_from_other_nodes(osb, *ac); + atomic_inc(&osb->s_num_inodes_stolen); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -557,7 +647,8 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, status = ocfs2_reserve_suballoc_bits(osb, ac, GLOBAL_BITMAP_SYSTEM_INODE, - OCFS2_INVALID_SLOT); + OCFS2_INVALID_SLOT, + ALLOC_NEW_GROUP); if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 8799033..544c600 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -36,6 +36,7 @@ typedef int (group_search_t)(struct inode *, struct ocfs2_alloc_context { struct inode *ac_inode; /* which bitmap are we allocating from? */ struct buffer_head *ac_bh; /* file entry bh */ + u32 ac_alloc_slot; /* which slot are we allocating from? */ u32 ac_bits_wanted; u32 ac_bits_given; #define OCFS2_AC_USE_LOCAL 1 diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index bec75af..df63ba2 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -40,8 +40,7 @@ #include <linux/crc32.h> #include <linux/debugfs.h> #include <linux/mount.h> - -#include <cluster/nodemanager.h> +#include <linux/seq_file.h> #define MLOG_MASK_PREFIX ML_SUPER #include <cluster/masklog.h> @@ -88,6 +87,7 @@ struct mount_options unsigned int atime_quantum; signed short slot; unsigned int localalloc_opt; + char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; }; static int ocfs2_parse_options(struct super_block *sb, char *options, @@ -109,7 +109,6 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait); static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); static void ocfs2_release_system_inodes(struct ocfs2_super *osb); -static int ocfs2_fill_local_node_info(struct ocfs2_super *osb); static int ocfs2_check_volume(struct ocfs2_super *osb); static int ocfs2_verify_volume(struct ocfs2_dinode *di, struct buffer_head *bh, @@ -154,6 +153,7 @@ enum { Opt_commit, Opt_localalloc, Opt_localflocks, + Opt_stack, Opt_err, }; @@ -172,6 +172,7 @@ static match_table_t tokens = { {Opt_commit, "commit=%u"}, {Opt_localalloc, "localalloc=%d"}, {Opt_localflocks, "localflocks"}, + {Opt_stack, "cluster_stack=%s"}, {Opt_err, NULL} }; @@ -551,8 +552,17 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) } } + if (ocfs2_userspace_stack(osb)) { + if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + mlog(ML_ERROR, "Userspace stack expected, but " + "o2cb heartbeat arguments passed to mount\n"); + return -EINVAL; + } + } + if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { - if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) { + if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && + !ocfs2_userspace_stack(osb)) { mlog(ML_ERROR, "Heartbeat has to be started to mount " "a read-write clustered device.\n"); return -EINVAL; @@ -562,6 +572,35 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) return 0; } +/* + * If we're using a userspace stack, mount should have passed + * a name that matches the disk. If not, mount should not + * have passed a stack. + */ +static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, + struct mount_options *mopt) +{ + if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) { + mlog(ML_ERROR, + "cluster stack passed to mount, but this filesystem " + "does not support it\n"); + return -EINVAL; + } + + if (ocfs2_userspace_stack(osb) && + strncmp(osb->osb_cluster_stack, mopt->cluster_stack, + OCFS2_STACK_LABEL_LEN)) { + mlog(ML_ERROR, + "cluster stack passed to mount (\"%s\") does not " + "match the filesystem (\"%s\")\n", + mopt->cluster_stack, + osb->osb_cluster_stack); + return -EINVAL; + } + + return 0; +} + static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) { struct dentry *root; @@ -579,15 +618,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) goto read_super_error; } - /* for now we only have one cluster/node, make sure we see it - * in the heartbeat universe */ - if (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL) { - if (!o2hb_check_local_node_heartbeating()) { - status = -EINVAL; - goto read_super_error; - } - } - /* probe for superblock */ status = ocfs2_sb_probe(sb, &bh, §or_size); if (status < 0) { @@ -609,6 +639,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_commit_interval = parsed_options.commit_interval; osb->local_alloc_size = parsed_options.localalloc_opt; + status = ocfs2_verify_userspace_stack(osb, &parsed_options); + if (status) + goto read_super_error; + sb->s_magic = OCFS2_SUPER_MAGIC; /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, @@ -694,7 +728,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else - snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); + snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " "with %s data mode.\n", @@ -763,6 +797,7 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; mopt->slot = OCFS2_INVALID_SLOT; mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; + mopt->cluster_stack[0] = '\0'; if (!options) { status = 1; @@ -864,6 +899,25 @@ static int ocfs2_parse_options(struct super_block *sb, if (!is_remount) mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; break; + case Opt_stack: + /* Check both that the option we were passed + * is of the right length and that it is a proper + * string of the right length. + */ + if (((args[0].to - args[0].from) != + OCFS2_STACK_LABEL_LEN) || + (strnlen(args[0].from, + OCFS2_STACK_LABEL_LEN) != + OCFS2_STACK_LABEL_LEN)) { + mlog(ML_ERROR, + "Invalid cluster_stack option\n"); + status = 0; + goto bail; + } + memcpy(mopt->cluster_stack, args[0].from, + OCFS2_STACK_LABEL_LEN); + mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -922,6 +976,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (opts & OCFS2_MOUNT_LOCALFLOCKS) seq_printf(s, ",localflocks,"); + if (osb->osb_cluster_stack[0]) + seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, + osb->osb_cluster_stack); + return 0; } @@ -957,6 +1015,8 @@ static int __init ocfs2_init(void) mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); } + ocfs2_set_locking_protocol(); + leave: if (status < 0) { ocfs2_free_mem_caches(); @@ -1132,31 +1192,6 @@ static int ocfs2_get_sector(struct super_block *sb, return 0; } -/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */ -static int ocfs2_fill_local_node_info(struct ocfs2_super *osb) -{ - int status; - - /* XXX hold a ref on the node while mounte? easy enough, if - * desirable. */ - if (ocfs2_mount_local(osb)) - osb->node_num = 0; - else - osb->node_num = o2nm_this_node(); - - if (osb->node_num == O2NM_MAX_NODES) { - mlog(ML_ERROR, "could not find this host's node number\n"); - status = -ENOENT; - goto bail; - } - - mlog(0, "I am node %d\n", osb->node_num); - - status = 0; -bail: - return status; -} - static int ocfs2_mount_volume(struct super_block *sb) { int status = 0; @@ -1168,12 +1203,6 @@ static int ocfs2_mount_volume(struct super_block *sb) if (ocfs2_is_hard_readonly(osb)) goto leave; - status = ocfs2_fill_local_node_info(osb); - if (status < 0) { - mlog_errno(status); - goto leave; - } - status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); @@ -1224,18 +1253,9 @@ leave: return status; } -/* we can't grab the goofy sem lock from inside wait_event, so we use - * memory barriers to make sure that we'll see the null task before - * being woken up */ -static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) -{ - mb(); - return osb->recovery_thread_task != NULL; -} - static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) { - int tmp; + int tmp, hangup_needed = 0; struct ocfs2_super *osb = NULL; char nodestr[8]; @@ -1249,25 +1269,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_truncate_log_shutdown(osb); - /* disable any new recovery threads and wait for any currently - * running ones to exit. Do this before setting the vol_state. */ - mutex_lock(&osb->recovery_lock); - osb->disable_recovery = 1; - mutex_unlock(&osb->recovery_lock); - wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); - - /* At this point, we know that no more recovery threads can be - * launched, so wait for any recovery completion work to - * complete. */ - flush_workqueue(ocfs2_wq); + /* This will disable recovery and flush any recovery work. */ + ocfs2_recovery_exit(osb); ocfs2_journal_shutdown(osb); ocfs2_sync_blockdev(sb); - /* No dlm means we've failed during mount, so skip all the - * steps which depended on that to complete. */ - if (osb->dlm) { + /* No cluster connection means we've failed during mount, so skip + * all the steps which depended on that to complete. */ + if (osb->cconn) { tmp = ocfs2_super_lock(osb, 1); if (tmp < 0) { mlog_errno(tmp); @@ -1278,25 +1289,34 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) if (osb->slot_num != OCFS2_INVALID_SLOT) ocfs2_put_slot(osb); - if (osb->dlm) + if (osb->cconn) ocfs2_super_unlock(osb, 1); ocfs2_release_system_inodes(osb); - if (osb->dlm) - ocfs2_dlm_shutdown(osb); + /* + * If we're dismounting due to mount error, mount.ocfs2 will clean + * up heartbeat. If we're a local mount, there is no heartbeat. + * If we failed before we got a uuid_str yet, we can't stop + * heartbeat. Otherwise, do it. + */ + if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) + hangup_needed = 1; + + if (osb->cconn) + ocfs2_dlm_shutdown(osb, hangup_needed); debugfs_remove(osb->osb_debug_root); - if (!mnt_err) - ocfs2_stop_heartbeat(osb); + if (hangup_needed) + ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else - snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); + snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", osb->dev_str, nodestr); @@ -1355,7 +1375,6 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; sb->s_export_op = &ocfs2_export_ops; - osb->osb_locking_proto = ocfs2_locking_protocol; sb->s_time_gran = 1; sb->s_flags |= MS_NOATIME; /* this is needed to support O_LARGEFILE */ @@ -1368,7 +1387,6 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->s_sectsize_bits = blksize_bits(sector_size); BUG_ON(!osb->s_sectsize_bits); - init_waitqueue_head(&osb->recovery_event); spin_lock_init(&osb->dc_task_lock); init_waitqueue_head(&osb->dc_event); osb->dc_work_sequence = 0; @@ -1376,6 +1394,7 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_LIST_HEAD(&osb->blocked_lock_list); osb->blocked_lock_count = 0; spin_lock_init(&osb->osb_lock); + ocfs2_init_inode_steal_slot(osb); atomic_set(&osb->alloc_stats.moves, 0); atomic_set(&osb->alloc_stats.local_data, 0); @@ -1388,24 +1407,23 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); - mutex_init(&osb->recovery_lock); - - osb->disable_recovery = 0; - osb->recovery_thread_task = NULL; + status = ocfs2_recovery_init(osb); + if (status) { + mlog(ML_ERROR, "Unable to initialize recovery state\n"); + mlog_errno(status); + goto bail; + } init_waitqueue_head(&osb->checkpoint_event); atomic_set(&osb->needs_checkpoint, 0); osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; - osb->node_num = O2NM_INVALID_NODE_NUM; osb->slot_num = OCFS2_INVALID_SLOT; osb->local_alloc_state = OCFS2_LA_UNUSED; osb->local_alloc_bh = NULL; - ocfs2_setup_hb_callbacks(osb); - init_waitqueue_head(&osb->osb_mount_event); osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); @@ -1455,6 +1473,25 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } + if (ocfs2_userspace_stack(osb)) { + memcpy(osb->osb_cluster_stack, + OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, + OCFS2_STACK_LABEL_LEN); + osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { + mlog(ML_ERROR, + "couldn't mount because of an invalid " + "cluster stack label (%s) \n", + osb->osb_cluster_stack); + status = -EINVAL; + goto bail; + } + } else { + /* The empty string is identical with classic tools that + * don't know about s_cluster_info. */ + osb->osb_cluster_stack[0] = '\0'; + } + get_random_bytes(&osb->s_next_generation, sizeof(u32)); /* FIXME @@ -1724,8 +1761,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) /* This function assumes that the caller has the main osb resource */ - if (osb->slot_info) - ocfs2_free_slot_info(osb->slot_info); + ocfs2_free_slot_info(osb); kfree(osb->osb_orphan_wipes); /* FIXME @@ -244,21 +244,21 @@ static long do_sys_truncate(const char __user * path, loff_t length) if (!S_ISREG(inode->i_mode)) goto dput_and_out; - error = vfs_permission(&nd, MAY_WRITE); + error = mnt_want_write(nd.path.mnt); if (error) goto dput_and_out; - error = -EROFS; - if (IS_RDONLY(inode)) - goto dput_and_out; + error = vfs_permission(&nd, MAY_WRITE); + if (error) + goto mnt_drop_write_and_out; error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto dput_and_out; + goto mnt_drop_write_and_out; error = get_write_access(inode); if (error) - goto dput_and_out; + goto mnt_drop_write_and_out; /* * Make sure that there are no leases. get_write_access() protects @@ -276,6 +276,8 @@ static long do_sys_truncate(const char __user * path, loff_t length) put_write_and_out: put_write_access(inode); +mnt_drop_write_and_out: + mnt_drop_write(nd.path.mnt); dput_and_out: path_put(&nd.path); out: @@ -457,8 +459,17 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) if(res || !(mode & S_IWOTH) || special_file(nd.path.dentry->d_inode->i_mode)) goto out_path_release; - - if(IS_RDONLY(nd.path.dentry->d_inode)) + /* + * This is a rare case where using __mnt_is_readonly() + * is OK without a mnt_want/drop_write() pair. Since + * no actual write to the fs is performed here, we do + * not need to telegraph to that to anyone. + * + * By doing this, we accept that this access is + * inherently racy and know that the fs may change + * state before we even see this result. + */ + if (__mnt_is_readonly(nd.path.mnt)) res = -EROFS; out_path_release: @@ -567,12 +578,12 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) audit_inode(NULL, dentry); - err = -EROFS; - if (IS_RDONLY(inode)) + err = mnt_want_write(file->f_path.mnt); + if (err) goto out_putf; err = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out_putf; + goto out_drop_write; mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) mode = inode->i_mode; @@ -581,6 +592,8 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) err = notify_change(dentry, &newattrs); mutex_unlock(&inode->i_mutex); +out_drop_write: + mnt_drop_write(file->f_path.mnt); out_putf: fput(file); out: @@ -600,13 +613,13 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename, goto out; inode = nd.path.dentry->d_inode; - error = -EROFS; - if (IS_RDONLY(inode)) + error = mnt_want_write(nd.path.mnt); + if (error) goto dput_and_out; error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto dput_and_out; + goto out_drop_write; mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) @@ -616,6 +629,8 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename, error = notify_change(nd.path.dentry, &newattrs); mutex_unlock(&inode->i_mutex); +out_drop_write: + mnt_drop_write(nd.path.mnt); dput_and_out: path_put(&nd.path); out: @@ -638,9 +653,6 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group) printk(KERN_ERR "chown_common: NULL inode\n"); goto out; } - error = -EROFS; - if (IS_RDONLY(inode)) - goto out; error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto out; @@ -671,7 +683,12 @@ asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group) error = user_path_walk(filename, &nd); if (error) goto out; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_release; error = chown_common(nd.path.dentry, user, group); + mnt_drop_write(nd.path.mnt); +out_release: path_put(&nd.path); out: return error; @@ -691,7 +708,12 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, error = __user_walk_fd(dfd, filename, follow, &nd); if (error) goto out; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_release; error = chown_common(nd.path.dentry, user, group); + mnt_drop_write(nd.path.mnt); +out_release: path_put(&nd.path); out: return error; @@ -705,7 +727,12 @@ asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group error = user_path_walk_link(filename, &nd); if (error) goto out; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_release; error = chown_common(nd.path.dentry, user, group); + mnt_drop_write(nd.path.mnt); +out_release: path_put(&nd.path); out: return error; @@ -722,14 +749,48 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group) if (!file) goto out; + error = mnt_want_write(file->f_path.mnt); + if (error) + goto out_fput; dentry = file->f_path.dentry; audit_inode(NULL, dentry); error = chown_common(dentry, user, group); + mnt_drop_write(file->f_path.mnt); +out_fput: fput(file); out: return error; } +/* + * You have to be very careful that these write + * counts get cleaned up in error cases and + * upon __fput(). This should probably never + * be called outside of __dentry_open(). + */ +static inline int __get_file_write_access(struct inode *inode, + struct vfsmount *mnt) +{ + int error; + error = get_write_access(inode); + if (error) + return error; + /* + * Do not take mount writer counts on + * special files since no writes to + * the mount itself will occur. + */ + if (!special_file(inode->i_mode)) { + /* + * Balanced in __fput() + */ + error = mnt_want_write(mnt); + if (error) + put_write_access(inode); + } + return error; +} + static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, struct file *f, int (*open)(struct inode *, struct file *)) @@ -742,9 +803,11 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, FMODE_PREAD | FMODE_PWRITE; inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { - error = get_write_access(inode); + error = __get_file_write_access(inode, mnt); if (error) goto cleanup_file; + if (!special_file(inode->i_mode)) + file_take_write(f); } f->f_mapping = inode->i_mapping; @@ -784,8 +847,19 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, cleanup_all: fops_put(f->f_op); - if (f->f_mode & FMODE_WRITE) + if (f->f_mode & FMODE_WRITE) { put_write_access(inode); + if (!special_file(inode->i_mode)) { + /* + * We don't consider this a real + * mnt_want/drop_write() pair + * because it all happenend right + * here, so just reset the state. + */ + file_reset_write(f); + mnt_drop_write(mnt); + } + } file_kill(f); f->f_path.dentry = NULL; f->f_path.mnt = NULL; @@ -796,43 +870,6 @@ cleanup_file: return ERR_PTR(error); } -/* - * Note that while the flag value (low two bits) for sys_open means: - * 00 - read-only - * 01 - write-only - * 10 - read-write - * 11 - special - * it is changed into - * 00 - no permissions needed - * 01 - read-permission - * 10 - write-permission - * 11 - read-write - * for the internal routines (ie open_namei()/follow_link() etc). 00 is - * used by symlinks. - */ -static struct file *do_filp_open(int dfd, const char *filename, int flags, - int mode) -{ - int namei_flags, error; - struct nameidata nd; - - namei_flags = flags; - if ((namei_flags+1) & O_ACCMODE) - namei_flags++; - - error = open_namei(dfd, filename, namei_flags, mode, &nd); - if (!error) - return nameidata_to_filp(&nd, flags); - - return ERR_PTR(error); -} - -struct file *filp_open(const char *filename, int flags, int mode) -{ - return do_filp_open(AT_FDCWD, filename, flags, mode); -} -EXPORT_SYMBOL(filp_open); - /** * lookup_instantiate_filp - instantiates the open intent filp * @nd: pointer to nameidata diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 03f808c..6149e4b 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -473,6 +473,10 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) return 0; if (IS_ERR(state)) /* I/O error reading the partition table */ return -EIO; + + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE); + for (p = 1; p < state->limit; p++) { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; @@ -988,7 +988,10 @@ struct file *create_write_pipe(void) return f; err_dentry: + free_pipe_info(inode); dput(dentry); + return ERR_PTR(err); + err_inode: free_pipe_info(inode); iput(inode); @@ -9,6 +9,7 @@ #include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/fs.h> +#include "internal.h" #include "pnode.h" /* return the next shared peer mount of @p */ @@ -27,6 +28,57 @@ static inline struct vfsmount *next_slave(struct vfsmount *p) return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave); } +/* + * Return true if path is reachable from root + * + * namespace_sem is held, and mnt is attached + */ +static bool is_path_reachable(struct vfsmount *mnt, struct dentry *dentry, + const struct path *root) +{ + while (mnt != root->mnt && mnt->mnt_parent != mnt) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + } + return mnt == root->mnt && is_subdir(dentry, root->dentry); +} + +static struct vfsmount *get_peer_under_root(struct vfsmount *mnt, + struct mnt_namespace *ns, + const struct path *root) +{ + struct vfsmount *m = mnt; + + do { + /* Check the namespace first for optimization */ + if (m->mnt_ns == ns && is_path_reachable(m, m->mnt_root, root)) + return m; + + m = next_peer(m); + } while (m != mnt); + + return NULL; +} + +/* + * Get ID of closest dominating peer group having a representative + * under the given root. + * + * Caller must hold namespace_sem + */ +int get_dominating_id(struct vfsmount *mnt, const struct path *root) +{ + struct vfsmount *m; + + for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) { + struct vfsmount *d = get_peer_under_root(m, mnt->mnt_ns, root); + if (d) + return d->mnt_group_id; + } + + return 0; +} + static int do_make_slave(struct vfsmount *mnt) { struct vfsmount *peer_mnt = mnt, *master = mnt->mnt_master; @@ -45,7 +97,11 @@ static int do_make_slave(struct vfsmount *mnt) if (peer_mnt == mnt) peer_mnt = NULL; } + if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) + mnt_release_group_id(mnt); + list_del_init(&mnt->mnt_share); + mnt->mnt_group_id = 0; if (peer_mnt) master = peer_mnt; @@ -67,7 +123,6 @@ static int do_make_slave(struct vfsmount *mnt) } mnt->mnt_master = master; CLEAR_MNT_SHARED(mnt); - INIT_LIST_HEAD(&mnt->mnt_slave_list); return 0; } @@ -211,8 +266,7 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry, out: spin_lock(&vfsmount_lock); while (!list_empty(&tmp_list)) { - child = list_entry(tmp_list.next, struct vfsmount, mnt_hash); - list_del_init(&child->mnt_hash); + child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash); umount_tree(child, 0, &umount_list); } spin_unlock(&vfsmount_lock); @@ -35,4 +35,6 @@ int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *, struct list_head *); int propagate_umount(struct list_head *); int propagate_mount_busy(struct vfsmount *, int); +void mnt_release_group_id(struct vfsmount *); +int get_dominating_id(struct vfsmount *mnt, const struct path *root); #endif /* _LINUX_PNODE_H */ diff --git a/fs/proc/base.c b/fs/proc/base.c index 81d7d14..c5e412a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -502,17 +502,14 @@ static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; -extern const struct seq_operations mounts_op; -struct proc_mounts { - struct seq_file m; - int event; -}; - -static int mounts_open(struct inode *inode, struct file *file) +static int mounts_open_common(struct inode *inode, struct file *file, + const struct seq_operations *op) { struct task_struct *task = get_proc_task(inode); struct nsproxy *nsp; struct mnt_namespace *ns = NULL; + struct fs_struct *fs = NULL; + struct path root; struct proc_mounts *p; int ret = -EINVAL; @@ -525,40 +522,61 @@ static int mounts_open(struct inode *inode, struct file *file) get_mnt_ns(ns); } rcu_read_unlock(); - + if (ns) + fs = get_fs_struct(task); put_task_struct(task); } - if (ns) { - ret = -ENOMEM; - p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); - if (p) { - file->private_data = &p->m; - ret = seq_open(file, &mounts_op); - if (!ret) { - p->m.private = ns; - p->event = ns->event; - return 0; - } - kfree(p); - } - put_mnt_ns(ns); - } + if (!ns) + goto err; + if (!fs) + goto err_put_ns; + + read_lock(&fs->lock); + root = fs->root; + path_get(&root); + read_unlock(&fs->lock); + put_fs_struct(fs); + + ret = -ENOMEM; + p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); + if (!p) + goto err_put_path; + + file->private_data = &p->m; + ret = seq_open(file, op); + if (ret) + goto err_free; + + p->m.private = p; + p->ns = ns; + p->root = root; + p->event = ns->event; + + return 0; + + err_free: + kfree(p); + err_put_path: + path_put(&root); + err_put_ns: + put_mnt_ns(ns); + err: return ret; } static int mounts_release(struct inode *inode, struct file *file) { - struct seq_file *m = file->private_data; - struct mnt_namespace *ns = m->private; - put_mnt_ns(ns); + struct proc_mounts *p = file->private_data; + path_put(&p->root); + put_mnt_ns(p->ns); return seq_release(inode, file); } static unsigned mounts_poll(struct file *file, poll_table *wait) { struct proc_mounts *p = file->private_data; - struct mnt_namespace *ns = p->m.private; + struct mnt_namespace *ns = p->ns; unsigned res = 0; poll_wait(file, &ns->poll, wait); @@ -573,6 +591,11 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) return res; } +static int mounts_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, &mounts_op); +} + static const struct file_operations proc_mounts_operations = { .open = mounts_open, .read = seq_read, @@ -581,38 +604,22 @@ static const struct file_operations proc_mounts_operations = { .poll = mounts_poll, }; -extern const struct seq_operations mountstats_op; -static int mountstats_open(struct inode *inode, struct file *file) +static int mountinfo_open(struct inode *inode, struct file *file) { - int ret = seq_open(file, &mountstats_op); - - if (!ret) { - struct seq_file *m = file->private_data; - struct nsproxy *nsp; - struct mnt_namespace *mnt_ns = NULL; - struct task_struct *task = get_proc_task(inode); - - if (task) { - rcu_read_lock(); - nsp = task_nsproxy(task); - if (nsp) { - mnt_ns = nsp->mnt_ns; - if (mnt_ns) - get_mnt_ns(mnt_ns); - } - rcu_read_unlock(); + return mounts_open_common(inode, file, &mountinfo_op); +} - put_task_struct(task); - } +static const struct file_operations proc_mountinfo_operations = { + .open = mountinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; - if (mnt_ns) - m->private = mnt_ns; - else { - seq_release(inode, file); - ret = -EINVAL; - } - } - return ret; +static int mountstats_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, &mountstats_op); } static const struct file_operations proc_mountstats_operations = { @@ -1626,7 +1633,6 @@ static int proc_readfd_common(struct file * filp, void * dirent, unsigned int fd, ino; int retval; struct files_struct * files; - struct fdtable *fdt; retval = -ENOENT; if (!p) @@ -1649,9 +1655,8 @@ static int proc_readfd_common(struct file * filp, void * dirent, if (!files) goto out; rcu_read_lock(); - fdt = files_fdtable(files); for (fd = filp->f_pos-2; - fd < fdt->max_fds; + fd < files_fdtable(files)->max_fds; fd++, filp->f_pos++) { char name[PROC_NUMBUF]; int len; @@ -2311,6 +2316,7 @@ static const struct pid_entry tgid_base_stuff[] = { LNK("root", root), LNK("exe", exe), REG("mounts", S_IRUGO, mounts), + REG("mountinfo", S_IRUGO, mountinfo), REG("mountstats", S_IRUSR, mountstats), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, clear_refs), @@ -2643,6 +2649,7 @@ static const struct pid_entry tid_base_stuff[] = { LNK("root", root), LNK("exe", exe), REG("mounts", S_IRUGO, mounts), + REG("mountinfo", S_IRUGO, mountinfo), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, clear_refs), REG("smaps", S_IRUGO, smaps), diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 4caa5f7..13cd783 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -44,7 +44,9 @@ int seq_open_net(struct inode *ino, struct file *f, put_net(net); return -ENOMEM; } +#ifdef CONFIG_NET_NS p->net = net; +#endif return 0; } EXPORT_SYMBOL_GPL(seq_open_net); @@ -52,12 +54,10 @@ EXPORT_SYMBOL_GPL(seq_open_net); int seq_release_net(struct inode *ino, struct file *f) { struct seq_file *seq; - struct seq_net_private *p; seq = f->private_data; - p = seq->private; - put_net(p->net); + put_net(seq_file_net(seq)); seq_release_private(ino, f); return 0; } diff --git a/fs/read_write.c b/fs/read_write.c index 49a9871..f0d1240 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(generic_ro_fops); loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) { - long long retval; + loff_t retval; struct inode *inode = file->f_mapping->host; mutex_lock(&inode->i_mutex); @@ -60,7 +60,7 @@ EXPORT_SYMBOL(generic_file_llseek); loff_t remote_llseek(struct file *file, loff_t offset, int origin) { - long long retval; + loff_t retval; lock_kernel(); switch (origin) { @@ -91,7 +91,7 @@ EXPORT_SYMBOL(no_llseek); loff_t default_llseek(struct file *file, loff_t offset, int origin) { - long long retval; + loff_t retval; lock_kernel(); switch (origin) { diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index e0f0f09..74363a7 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -4,6 +4,7 @@ #include <linux/capability.h> #include <linux/fs.h> +#include <linux/mount.h> #include <linux/reiserfs_fs.h> #include <linux/time.h> #include <asm/uaccess.h> @@ -25,6 +26,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { unsigned int flags; + int err = 0; switch (cmd) { case REISERFS_IOC_UNPACK: @@ -48,50 +50,67 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, if (!reiserfs_attrs(inode->i_sb)) return -ENOTTY; - if (IS_RDONLY(inode)) - return -EROFS; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; - if (!is_owner_or_cap(inode)) - return -EPERM; - - if (get_user(flags, (int __user *)arg)) - return -EFAULT; - - /* Is it quota file? Do not allow user to mess with it. */ - if (IS_NOQUOTA(inode)) - return -EPERM; + if (!is_owner_or_cap(inode)) { + err = -EPERM; + goto setflags_out; + } + if (get_user(flags, (int __user *)arg)) { + err = -EFAULT; + goto setflags_out; + } + /* + * Is it quota file? Do not allow user to mess with it + */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; + goto setflags_out; + } if (((flags ^ REISERFS_I(inode)-> i_attrs) & (REISERFS_IMMUTABLE_FL | REISERFS_APPEND_FL)) - && !capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; - + && !capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto setflags_out; + } if ((flags & REISERFS_NOTAIL_FL) && S_ISREG(inode->i_mode)) { int result; result = reiserfs_unpack(inode, filp); - if (result) - return result; + if (result) { + err = result; + goto setflags_out; + } } sd_attrs_to_i_attrs(flags, inode); REISERFS_I(inode)->i_attrs = flags; inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - return 0; +setflags_out: + mnt_drop_write(filp->f_path.mnt); + return err; } case REISERFS_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *)arg); case REISERFS_IOC_SETVERSION: if (!is_owner_or_cap(inode)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (get_user(inode->i_generation, (int __user *)arg)) - return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(inode->i_generation, (int __user *)arg)) { + err = -EFAULT; + goto setversion_out; + } inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - return 0; +setversion_out: + mnt_drop_write(filp->f_path.mnt); + return err; default: return -ENOTTY; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index bb05a3e..060eb3f 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -38,7 +38,7 @@ #include <asm/system.h> #include <linux/time.h> -#include <asm/semaphore.h> +#include <linux/semaphore.h> #include <linux/vmalloc.h> #include <linux/reiserfs_fs.h> diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 344b9b9..d7c4935 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -44,7 +44,6 @@ #include <net/checksum.h> #include <linux/smp_lock.h> #include <linux/stat.h> -#include <asm/semaphore.h> #define FL_READONLY 128 #define FL_DIR_SEM_HELD 256 diff --git a/fs/select.c b/fs/select.c index 5633fe9..00f58c5 100644 --- a/fs/select.c +++ b/fs/select.c @@ -260,7 +260,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) wait = NULL; if (retval || !*timeout || signal_pending(current)) break; - if(table.error) { + if (table.error) { retval = table.error; break; } diff --git a/fs/seq_file.c b/fs/seq_file.c index 8537702..3f54dbd 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -25,6 +25,7 @@ * into the buffer. In case of error ->start() and ->next() return * ERR_PTR(error). In the end of sequence they return %NULL. ->show() * returns 0 in case of success and negative number in case of error. + * Returning SEQ_SKIP means "discard this element and move on". */ int seq_open(struct file *file, const struct seq_operations *op) { @@ -114,8 +115,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (!p || IS_ERR(p)) break; err = m->op->show(m, p); - if (err) + if (err < 0) break; + if (unlikely(err)) + m->count = 0; if (m->count < m->size) goto Fill; m->op->stop(m, p); @@ -140,9 +143,10 @@ Fill: break; } err = m->op->show(m, p); - if (err || m->count == m->size) { + if (m->count == m->size || err) { m->count = offs; - break; + if (likely(err <= 0)) + break; } pos = next; } @@ -199,8 +203,12 @@ static int traverse(struct seq_file *m, loff_t offset) if (IS_ERR(p)) break; error = m->op->show(m, p); - if (error) + if (error < 0) break; + if (unlikely(error)) { + error = 0; + m->count = 0; + } if (m->count == m->size) goto Eoverflow; if (pos + m->count > offset) { @@ -239,7 +247,7 @@ Eoverflow: loff_t seq_lseek(struct file *file, loff_t offset, int origin) { struct seq_file *m = (struct seq_file *)file->private_data; - long long retval = -EINVAL; + loff_t retval = -EINVAL; mutex_lock(&m->lock); m->version = file->f_version; @@ -342,28 +350,40 @@ int seq_printf(struct seq_file *m, const char *f, ...) } EXPORT_SYMBOL(seq_printf); +static char *mangle_path(char *s, char *p, char *esc) +{ + while (s <= p) { + char c = *p++; + if (!c) { + return s; + } else if (!strchr(esc, c)) { + *s++ = c; + } else if (s + 4 > p) { + break; + } else { + *s++ = '\\'; + *s++ = '0' + ((c & 0300) >> 6); + *s++ = '0' + ((c & 070) >> 3); + *s++ = '0' + (c & 07); + } + } + return NULL; +} + +/* + * return the absolute path of 'dentry' residing in mount 'mnt'. + */ int seq_path(struct seq_file *m, struct path *path, char *esc) { if (m->count < m->size) { char *s = m->buf + m->count; char *p = d_path(path, s, m->size - m->count); if (!IS_ERR(p)) { - while (s <= p) { - char c = *p++; - if (!c) { - p = m->buf + m->count; - m->count = s - m->buf; - return s - p; - } else if (!strchr(esc, c)) { - *s++ = c; - } else if (s + 4 > p) { - break; - } else { - *s++ = '\\'; - *s++ = '0' + ((c & 0300) >> 6); - *s++ = '0' + ((c & 070) >> 3); - *s++ = '0' + (c & 07); - } + s = mangle_path(s, p, esc); + if (s) { + p = m->buf + m->count; + m->count = s - m->buf; + return s - p; } } } @@ -372,6 +392,57 @@ int seq_path(struct seq_file *m, struct path *path, char *esc) } EXPORT_SYMBOL(seq_path); +/* + * Same as seq_path, but relative to supplied root. + * + * root may be changed, see __d_path(). + */ +int seq_path_root(struct seq_file *m, struct path *path, struct path *root, + char *esc) +{ + int err = -ENAMETOOLONG; + if (m->count < m->size) { + char *s = m->buf + m->count; + char *p; + + spin_lock(&dcache_lock); + p = __d_path(path, root, s, m->size - m->count); + spin_unlock(&dcache_lock); + err = PTR_ERR(p); + if (!IS_ERR(p)) { + s = mangle_path(s, p, esc); + if (s) { + p = m->buf + m->count; + m->count = s - m->buf; + return 0; + } + } + } + m->count = m->size; + return err; +} + +/* + * returns the path of the 'dentry' from the root of its filesystem. + */ +int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) +{ + if (m->count < m->size) { + char *s = m->buf + m->count; + char *p = dentry_path(dentry, s, m->size - m->count); + if (!IS_ERR(p)) { + s = mangle_path(s, p, esc); + if (s) { + p = m->buf + m->count; + m->count = s - m->buf; + return s - p; + } + } + } + m->count = m->size; + return -1; +} + static void *single_start(struct seq_file *p, loff_t *pos) { return NULL + (*pos == 0); @@ -37,7 +37,9 @@ #include <linux/idr.h> #include <linux/kobject.h> #include <linux/mutex.h> +#include <linux/file.h> #include <asm/uaccess.h> +#include "internal.h" LIST_HEAD(super_blocks); @@ -567,10 +569,29 @@ static void mark_files_ro(struct super_block *sb) { struct file *f; +retry: file_list_lock(); list_for_each_entry(f, &sb->s_files, f_u.fu_list) { - if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f)) - f->f_mode &= ~FMODE_WRITE; + struct vfsmount *mnt; + if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) + continue; + if (!file_count(f)) + continue; + if (!(f->f_mode & FMODE_WRITE)) + continue; + f->f_mode &= ~FMODE_WRITE; + if (file_check_writeable(f) != 0) + continue; + file_release_write(f); + mnt = mntget(f->f_path.mnt); + file_list_unlock(); + /* + * This can sleep, so we can't hold + * the file_list_lock() spinlock. + */ + mnt_drop_write(mnt); + mntput(mnt); + goto retry; } file_list_unlock(); } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 4948d9b..a1c3a1f 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -20,6 +20,7 @@ #include <linux/idr.h> #include <linux/completion.h> #include <linux/mutex.h> +#include <linux/slab.h> #include "sysfs.h" DEFINE_MUTEX(sysfs_mutex); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index baa663e..ade9a7e 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/kobject.h> #include <linux/kallsyms.h> +#include <linux/slab.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/list.h> @@ -128,7 +129,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) ssize_t retval = 0; mutex_lock(&buffer->mutex); - if (buffer->needs_read_fill) { + if (buffer->needs_read_fill || *ppos == 0) { retval = fill_read_buffer(file->f_path.dentry,buffer); if (retval) goto out; @@ -409,8 +410,7 @@ static int sysfs_release(struct inode *inode, struct file *filp) * return POLLERR|POLLPRI, and select will return the fd whether * it is waiting for read, write, or exceptions. * Once poll/select indicates that the value has changed, you - * need to close and re-open the file, as simply seeking and reading - * again will not get new data, or reset the state of 'poll'. + * need to close and re-open the file, or seek to 0 and read again. * Reminder: this only works for attributes which actively support * it, and it is not possible to test an attribute from userspace * to see if it supports poll (Neither 'poll' nor 'select' return diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 5f66c44..817f596 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -87,7 +87,14 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char void sysfs_remove_link(struct kobject * kobj, const char * name) { - sysfs_hash_and_remove(kobj->sd, name); + struct sysfs_dirent *parent_sd = NULL; + + if (!kobj) + parent_sd = &sysfs_root; + else + parent_sd = kobj->sd; + + sysfs_hash_and_remove(parent_sd, name); } static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, diff --git a/fs/udf/Makefile b/fs/udf/Makefile index be845e7..0d4503f 100644 --- a/fs/udf/Makefile +++ b/fs/udf/Makefile @@ -6,4 +6,4 @@ obj-$(CONFIG_UDF_FS) += udf.o udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ partition.o super.o truncate.o symlink.o fsync.o \ - crc.o directory.o misc.o udftime.o unicode.o + directory.o misc.o udftime.o unicode.o diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index f855dcb..1b809bd4 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -149,8 +149,7 @@ static bool udf_add_free_space(struct udf_sb_info *sbi, return false; lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; - lvid->freeSpaceTable[partition] = cpu_to_le32(le32_to_cpu( - lvid->freeSpaceTable[partition]) + cnt); + le32_add_cpu(&lvid->freeSpaceTable[partition], cnt); return true; } @@ -589,10 +588,8 @@ static void udf_table_free_blocks(struct super_block *sb, sptr = oepos.bh->b_data + epos.offset; aed = (struct allocExtDesc *) oepos.bh->b_data; - aed->lengthAllocDescs = - cpu_to_le32(le32_to_cpu( - aed->lengthAllocDescs) + - adsize); + le32_add_cpu(&aed->lengthAllocDescs, + adsize); } else { sptr = iinfo->i_ext.i_data + epos.offset; @@ -645,9 +642,7 @@ static void udf_table_free_blocks(struct super_block *sb, mark_inode_dirty(table); } else { aed = (struct allocExtDesc *)epos.bh->b_data; - aed->lengthAllocDescs = - cpu_to_le32(le32_to_cpu( - aed->lengthAllocDescs) + adsize); + le32_add_cpu(&aed->lengthAllocDescs, adsize); udf_update_tag(epos.bh->b_data, epos.offset); mark_buffer_dirty(epos.bh); } diff --git a/fs/udf/crc.c b/fs/udf/crc.c deleted file mode 100644 index b166129..0000000 --- a/fs/udf/crc.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * crc.c - * - * PURPOSE - * Routines to generate, calculate, and test a 16-bit CRC. - * - * DESCRIPTION - * The CRC code was devised by Don P. Mitchell of AT&T Bell Laboratories - * and Ned W. Rhodes of Software Systems Group. It has been published in - * "Design and Validation of Computer Protocols", Prentice Hall, - * Englewood Cliffs, NJ, 1991, Chapter 3, ISBN 0-13-539925-4. - * - * Copyright is held by AT&T. - * - * AT&T gives permission for the free use of the CRC source code. - * - * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - */ - -#include "udfdecl.h" - -static uint16_t crc_table[256] = { - 0x0000U, 0x1021U, 0x2042U, 0x3063U, 0x4084U, 0x50a5U, 0x60c6U, 0x70e7U, - 0x8108U, 0x9129U, 0xa14aU, 0xb16bU, 0xc18cU, 0xd1adU, 0xe1ceU, 0xf1efU, - 0x1231U, 0x0210U, 0x3273U, 0x2252U, 0x52b5U, 0x4294U, 0x72f7U, 0x62d6U, - 0x9339U, 0x8318U, 0xb37bU, 0xa35aU, 0xd3bdU, 0xc39cU, 0xf3ffU, 0xe3deU, - 0x2462U, 0x3443U, 0x0420U, 0x1401U, 0x64e6U, 0x74c7U, 0x44a4U, 0x5485U, - 0xa56aU, 0xb54bU, 0x8528U, 0x9509U, 0xe5eeU, 0xf5cfU, 0xc5acU, 0xd58dU, - 0x3653U, 0x2672U, 0x1611U, 0x0630U, 0x76d7U, 0x66f6U, 0x5695U, 0x46b4U, - 0xb75bU, 0xa77aU, 0x9719U, 0x8738U, 0xf7dfU, 0xe7feU, 0xd79dU, 0xc7bcU, - 0x48c4U, 0x58e5U, 0x6886U, 0x78a7U, 0x0840U, 0x1861U, 0x2802U, 0x3823U, - 0xc9ccU, 0xd9edU, 0xe98eU, 0xf9afU, 0x8948U, 0x9969U, 0xa90aU, 0xb92bU, - 0x5af5U, 0x4ad4U, 0x7ab7U, 0x6a96U, 0x1a71U, 0x0a50U, 0x3a33U, 0x2a12U, - 0xdbfdU, 0xcbdcU, 0xfbbfU, 0xeb9eU, 0x9b79U, 0x8b58U, 0xbb3bU, 0xab1aU, - 0x6ca6U, 0x7c87U, 0x4ce4U, 0x5cc5U, 0x2c22U, 0x3c03U, 0x0c60U, 0x1c41U, - 0xedaeU, 0xfd8fU, 0xcdecU, 0xddcdU, 0xad2aU, 0xbd0bU, 0x8d68U, 0x9d49U, - 0x7e97U, 0x6eb6U, 0x5ed5U, 0x4ef4U, 0x3e13U, 0x2e32U, 0x1e51U, 0x0e70U, - 0xff9fU, 0xefbeU, 0xdfddU, 0xcffcU, 0xbf1bU, 0xaf3aU, 0x9f59U, 0x8f78U, - 0x9188U, 0x81a9U, 0xb1caU, 0xa1ebU, 0xd10cU, 0xc12dU, 0xf14eU, 0xe16fU, - 0x1080U, 0x00a1U, 0x30c2U, 0x20e3U, 0x5004U, 0x4025U, 0x7046U, 0x6067U, - 0x83b9U, 0x9398U, 0xa3fbU, 0xb3daU, 0xc33dU, 0xd31cU, 0xe37fU, 0xf35eU, - 0x02b1U, 0x1290U, 0x22f3U, 0x32d2U, 0x4235U, 0x5214U, 0x6277U, 0x7256U, - 0xb5eaU, 0xa5cbU, 0x95a8U, 0x8589U, 0xf56eU, 0xe54fU, 0xd52cU, 0xc50dU, - 0x34e2U, 0x24c3U, 0x14a0U, 0x0481U, 0x7466U, 0x6447U, 0x5424U, 0x4405U, - 0xa7dbU, 0xb7faU, 0x8799U, 0x97b8U, 0xe75fU, 0xf77eU, 0xc71dU, 0xd73cU, - 0x26d3U, 0x36f2U, 0x0691U, 0x16b0U, 0x6657U, 0x7676U, 0x4615U, 0x5634U, - 0xd94cU, 0xc96dU, 0xf90eU, 0xe92fU, 0x99c8U, 0x89e9U, 0xb98aU, 0xa9abU, - 0x5844U, 0x4865U, 0x7806U, 0x6827U, 0x18c0U, 0x08e1U, 0x3882U, 0x28a3U, - 0xcb7dU, 0xdb5cU, 0xeb3fU, 0xfb1eU, 0x8bf9U, 0x9bd8U, 0xabbbU, 0xbb9aU, - 0x4a75U, 0x5a54U, 0x6a37U, 0x7a16U, 0x0af1U, 0x1ad0U, 0x2ab3U, 0x3a92U, - 0xfd2eU, 0xed0fU, 0xdd6cU, 0xcd4dU, 0xbdaaU, 0xad8bU, 0x9de8U, 0x8dc9U, - 0x7c26U, 0x6c07U, 0x5c64U, 0x4c45U, 0x3ca2U, 0x2c83U, 0x1ce0U, 0x0cc1U, - 0xef1fU, 0xff3eU, 0xcf5dU, 0xdf7cU, 0xaf9bU, 0xbfbaU, 0x8fd9U, 0x9ff8U, - 0x6e17U, 0x7e36U, 0x4e55U, 0x5e74U, 0x2e93U, 0x3eb2U, 0x0ed1U, 0x1ef0U -}; - -/* - * udf_crc - * - * PURPOSE - * Calculate a 16-bit CRC checksum using ITU-T V.41 polynomial. - * - * DESCRIPTION - * The OSTA-UDF(tm) 1.50 standard states that using CRCs is mandatory. - * The polynomial used is: x^16 + x^12 + x^15 + 1 - * - * PRE-CONDITIONS - * data Pointer to the data block. - * size Size of the data block. - * - * POST-CONDITIONS - * <return> CRC of the data block. - * - * HISTORY - * July 21, 1997 - Andrew E. Mileski - * Adapted from OSTA-UDF(tm) 1.50 standard. - */ -uint16_t udf_crc(uint8_t *data, uint32_t size, uint16_t crc) -{ - while (size--) - crc = crc_table[(crc >> 8 ^ *(data++)) & 0xffU] ^ (crc << 8); - - return crc; -} - -/****************************************************************************/ -#if defined(TEST) - -/* - * PURPOSE - * Test udf_crc() - * - * HISTORY - * July 21, 1997 - Andrew E. Mileski - * Adapted from OSTA-UDF(tm) 1.50 standard. - */ - -unsigned char bytes[] = { 0x70U, 0x6AU, 0x77U }; - -int main(void) -{ - unsigned short x; - - x = udf_crc(bytes, sizeof bytes); - printf("udf_crc: calculated = %4.4x, correct = %4.4x\n", x, 0x3299U); - - return 0; -} - -#endif /* defined(TEST) */ - -/****************************************************************************/ -#if defined(GENERATE) - -/* - * PURPOSE - * Generate a table for fast 16-bit CRC calculations (any polynomial). - * - * DESCRIPTION - * The ITU-T V.41 polynomial is 010041. - * - * HISTORY - * July 21, 1997 - Andrew E. Mileski - * Adapted from OSTA-UDF(tm) 1.50 standard. - */ - -#include <stdio.h> - -int main(int argc, char **argv) -{ - unsigned long crc, poly; - int n, i; - - /* Get the polynomial */ - sscanf(argv[1], "%lo", &poly); - if (poly & 0xffff0000U) { - fprintf(stderr, "polynomial is too large\en"); - exit(1); - } - - printf("/* CRC 0%o */\n", poly); - - /* Create a table */ - printf("static unsigned short crc_table[256] = {\n"); - for (n = 0; n < 256; n++) { - if (n % 8 == 0) - printf("\t"); - crc = n << 8; - for (i = 0; i < 8; i++) { - if (crc & 0x8000U) - crc = (crc << 1) ^ poly; - else - crc <<= 1; - crc &= 0xFFFFU; - } - if (n == 255) - printf("0x%04xU ", crc); - else - printf("0x%04xU, ", crc); - if (n % 8 == 7) - printf("\n"); - } - printf("};\n"); - - return 0; -} - -#endif /* defined(GENERATE) */ diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 8d8643ada..62dc270 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -39,13 +39,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, filldir_t filldir, void *dirent) { - struct udf_fileident_bh fibh; + struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; struct fileIdentDesc *fi = NULL; struct fileIdentDesc cfi; int block, iblock; loff_t nf_pos = (filp->f_pos - 1) << 2; int flen; - char fname[UDF_NAME_LEN]; + char *fname = NULL; char *nameptr; uint16_t liu; uint8_t lfi; @@ -54,23 +54,32 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, kernel_lb_addr eloc; uint32_t elen; sector_t offset; - int i, num; + int i, num, ret = 0; unsigned int dt_type; struct extent_position epos = { NULL, 0, {0, 0} }; struct udf_inode_info *iinfo; if (nf_pos >= size) - return 0; + goto out; + + fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!fname) { + ret = -ENOMEM; + goto out; + } if (nf_pos == 0) nf_pos = udf_ext0_offset(dir); fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); iinfo = UDF_I(dir); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - fibh.sbh = fibh.ebh = NULL; - } else if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, - &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30)) { + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, + &epos, &eloc, &elen, &offset) + != (EXT_RECORDED_ALLOCATED >> 30)) { + ret = -ENOENT; + goto out; + } block = udf_get_lb_pblock(dir->i_sb, eloc, offset); if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) @@ -83,8 +92,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, } if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block))) { - brelse(epos.bh); - return -EIO; + ret = -EIO; + goto out; } if (!(offset & ((16 >> (dir->i_sb->s_blocksize_bits - 9)) - 1))) { @@ -105,9 +114,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, brelse(bha[i]); } } - } else { - brelse(epos.bh); - return -ENOENT; } while (nf_pos < size) { @@ -115,13 +121,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, &elen, &offset); - if (!fi) { - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); - return 0; - } + if (!fi) + goto out; liu = le16_to_cpu(cfi.lengthOfImpUse); lfi = cfi.lengthFileIdent; @@ -167,53 +168,23 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, dt_type = DT_UNKNOWN; } - if (flen) { - if (filldir(dirent, fname, flen, filp->f_pos, iblock, dt_type) < 0) { - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - brelse(epos.bh); - return 0; - } - } + if (flen && filldir(dirent, fname, flen, filp->f_pos, + iblock, dt_type) < 0) + goto out; } /* end while */ filp->f_pos = (nf_pos >> 2) + 1; +out: if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); brelse(epos.bh); + kfree(fname); - return 0; + return ret; } -/* - * udf_readdir - * - * PURPOSE - * Read a directory entry. - * - * DESCRIPTION - * Optional - sys_getdents() will return -ENOTDIR if this routine is not - * available. - * - * Refer to sys_getdents() in fs/readdir.c - * sys_getdents() -> . - * - * PRE-CONDITIONS - * filp Pointer to directory file. - * buf Pointer to directory entry buffer. - * filldir Pointer to filldir function. - * - * POST-CONDITIONS - * <return> >=0 on success. - * - * HISTORY - * July 1, 1997 - Andrew E. Mileski - * Written, tested, and released. - */ - static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) { struct inode *dir = filp->f_path.dentry->d_inode; diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h index 5638771..a0974df 100644 --- a/fs/udf/ecma_167.h +++ b/fs/udf/ecma_167.h @@ -70,19 +70,6 @@ typedef struct { uint8_t microseconds; } __attribute__ ((packed)) timestamp; -typedef struct { - uint16_t typeAndTimezone; - int16_t year; - uint8_t month; - uint8_t day; - uint8_t hour; - uint8_t minute; - uint8_t second; - uint8_t centiseconds; - uint8_t hundredsOfMicroseconds; - uint8_t microseconds; -} __attribute__ ((packed)) kernel_timestamp; - /* Type and Time Zone (ECMA 167r3 1/7.3.1) */ #define TIMESTAMP_TYPE_MASK 0xF000 #define TIMESTAMP_TYPE_CUT 0x0000 diff --git a/fs/udf/file.c b/fs/udf/file.c index 97c71ae..0ed6e14 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -27,7 +27,6 @@ #include "udfdecl.h" #include <linux/fs.h> -#include <linux/udf_fs.h> #include <asm/uaccess.h> #include <linux/kernel.h> #include <linux/string.h> /* memset */ @@ -144,40 +143,6 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return retval; } -/* - * udf_ioctl - * - * PURPOSE - * Issue an ioctl. - * - * DESCRIPTION - * Optional - sys_ioctl() will return -ENOTTY if this routine is not - * available, and the ioctl cannot be handled without filesystem help. - * - * sys_ioctl() handles these ioctls that apply only to regular files: - * FIBMAP [requires udf_block_map()], FIGETBSZ, FIONREAD - * These ioctls are also handled by sys_ioctl(): - * FIOCLEX, FIONCLEX, FIONBIO, FIOASYNC - * All other ioctls are passed to the filesystem. - * - * Refer to sys_ioctl() in fs/ioctl.c - * sys_ioctl() -> . - * - * PRE-CONDITIONS - * inode Pointer to inode that ioctl was issued on. - * filp Pointer to file that ioctl was issued on. - * cmd The ioctl command. - * arg The ioctl argument [can be interpreted as a - * user-space pointer if desired]. - * - * POST-CONDITIONS - * <return> Success (>=0) or an error code (<=0) that - * sys_ioctl() will return. - * - * HISTORY - * July 1, 1997 - Andrew E. Mileski - * Written, tested, and released. - */ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { @@ -225,18 +190,6 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, return result; } -/* - * udf_release_file - * - * PURPOSE - * Called when all references to the file are closed - * - * DESCRIPTION - * Discard prealloced blocks - * - * HISTORY - * - */ static int udf_release_file(struct inode *inode, struct file *filp) { if (filp->f_mode & FMODE_WRITE) { diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 84360315..eb9cfa2 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -21,7 +21,6 @@ #include "udfdecl.h" #include <linux/fs.h> #include <linux/quotaops.h> -#include <linux/udf_fs.h> #include <linux/sched.h> #include <linux/slab.h> @@ -47,11 +46,9 @@ void udf_free_inode(struct inode *inode) struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sbi); if (S_ISDIR(inode->i_mode)) - lvidiu->numDirs = - cpu_to_le32(le32_to_cpu(lvidiu->numDirs) - 1); + le32_add_cpu(&lvidiu->numDirs, -1); else - lvidiu->numFiles = - cpu_to_le32(le32_to_cpu(lvidiu->numFiles) - 1); + le32_add_cpu(&lvidiu->numFiles, -1); mark_buffer_dirty(sbi->s_lvid_bh); } @@ -105,11 +102,9 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) lvhd = (struct logicalVolHeaderDesc *) (lvid->logicalVolContentsUse); if (S_ISDIR(mode)) - lvidiu->numDirs = - cpu_to_le32(le32_to_cpu(lvidiu->numDirs) + 1); + le32_add_cpu(&lvidiu->numDirs, 1); else - lvidiu->numFiles = - cpu_to_le32(le32_to_cpu(lvidiu->numFiles) + 1); + le32_add_cpu(&lvidiu->numFiles, 1); iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID); if (!(++uniqueID & 0x00000000FFFFFFFFUL)) uniqueID += 16; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 24cfa55..6e74b11 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -37,6 +37,7 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/slab.h> +#include <linux/crc-itu-t.h> #include "udf_i.h" #include "udf_sb.h" @@ -66,22 +67,7 @@ static void udf_update_extents(struct inode *, struct extent_position *); static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); -/* - * udf_delete_inode - * - * PURPOSE - * Clean-up before the specified inode is destroyed. - * - * DESCRIPTION - * This routine is called when the kernel destroys an inode structure - * ie. when iput() finds i_count == 0. - * - * HISTORY - * July 1, 1997 - Andrew E. Mileski - * Written, tested, and released. - * - * Called at the last iput() if i_nlink is zero. - */ + void udf_delete_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); @@ -323,9 +309,6 @@ static int udf_get_block(struct inode *inode, sector_t block, lock_kernel(); - if (block < 0) - goto abort_negative; - iinfo = UDF_I(inode); if (block == iinfo->i_next_alloc_block + 1) { iinfo->i_next_alloc_block++; @@ -347,10 +330,6 @@ static int udf_get_block(struct inode *inode, sector_t block, abort: unlock_kernel(); return err; - -abort_negative: - udf_warning(inode->i_sb, "udf_get_block", "block < 0"); - goto abort; } static struct buffer_head *udf_getblk(struct inode *inode, long block, @@ -1116,42 +1095,36 @@ static void __udf_read_inode(struct inode *inode) fe = (struct fileEntry *)bh->b_data; if (fe->icbTag.strategyType == cpu_to_le16(4096)) { - struct buffer_head *ibh = NULL, *nbh = NULL; - struct indirectEntry *ie; + struct buffer_head *ibh; ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1, &ident); - if (ident == TAG_IDENT_IE) { - if (ibh) { - kernel_lb_addr loc; - ie = (struct indirectEntry *)ibh->b_data; - - loc = lelb_to_cpu(ie->indirectICB.extLocation); - - if (ie->indirectICB.extLength && - (nbh = udf_read_ptagged(inode->i_sb, loc, 0, - &ident))) { - if (ident == TAG_IDENT_FE || - ident == TAG_IDENT_EFE) { - memcpy(&iinfo->i_location, - &loc, - sizeof(kernel_lb_addr)); - brelse(bh); - brelse(ibh); - brelse(nbh); - __udf_read_inode(inode); - return; - } else { - brelse(nbh); - brelse(ibh); - } - } else { + if (ident == TAG_IDENT_IE && ibh) { + struct buffer_head *nbh = NULL; + kernel_lb_addr loc; + struct indirectEntry *ie; + + ie = (struct indirectEntry *)ibh->b_data; + loc = lelb_to_cpu(ie->indirectICB.extLocation); + + if (ie->indirectICB.extLength && + (nbh = udf_read_ptagged(inode->i_sb, loc, 0, + &ident))) { + if (ident == TAG_IDENT_FE || + ident == TAG_IDENT_EFE) { + memcpy(&iinfo->i_location, + &loc, + sizeof(kernel_lb_addr)); + brelse(bh); brelse(ibh); + brelse(nbh); + __udf_read_inode(inode); + return; } + brelse(nbh); } - } else { - brelse(ibh); } + brelse(ibh); } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { printk(KERN_ERR "udf: unsupported strategy type: %d\n", le16_to_cpu(fe->icbTag.strategyType)); @@ -1168,8 +1141,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) { struct fileEntry *fe; struct extendedFileEntry *efe; - time_t convtime; - long convtime_usec; int offset; struct udf_sb_info *sbi = UDF_SB(inode->i_sb); struct udf_inode_info *iinfo = UDF_I(inode); @@ -1257,29 +1228,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); - if (udf_stamp_to_time(&convtime, &convtime_usec, - lets_to_cpu(fe->accessTime))) { - inode->i_atime.tv_sec = convtime; - inode->i_atime.tv_nsec = convtime_usec * 1000; - } else { + if (!udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime)) inode->i_atime = sbi->s_record_time; - } - if (udf_stamp_to_time(&convtime, &convtime_usec, - lets_to_cpu(fe->modificationTime))) { - inode->i_mtime.tv_sec = convtime; - inode->i_mtime.tv_nsec = convtime_usec * 1000; - } else { + if (!udf_disk_stamp_to_time(&inode->i_mtime, + fe->modificationTime)) inode->i_mtime = sbi->s_record_time; - } - if (udf_stamp_to_time(&convtime, &convtime_usec, - lets_to_cpu(fe->attrTime))) { - inode->i_ctime.tv_sec = convtime; - inode->i_ctime.tv_nsec = convtime_usec * 1000; - } else { + if (!udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime)) inode->i_ctime = sbi->s_record_time; - } iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); @@ -1289,37 +1246,18 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); - if (udf_stamp_to_time(&convtime, &convtime_usec, - lets_to_cpu(efe->accessTime))) { - inode->i_atime.tv_sec = convtime; - inode->i_atime.tv_nsec = convtime_usec * 1000; - } else { + if (!udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime)) inode->i_atime = sbi->s_record_time; - } - if (udf_stamp_to_time(&convtime, &convtime_usec, - lets_to_cpu(efe->modificationTime))) { - inode->i_mtime.tv_sec = convtime; - inode->i_mtime.tv_nsec = convtime_usec * 1000; - } else { + if (!udf_disk_stamp_to_time(&inode->i_mtime, + efe->modificationTime)) inode->i_mtime = sbi->s_record_time; - } - if (udf_stamp_to_time(&convtime, &convtime_usec, - lets_to_cpu(efe->createTime))) { - iinfo->i_crtime.tv_sec = convtime; - iinfo->i_crtime.tv_nsec = convtime_usec * 1000; - } else { + if (!udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime)) iinfo->i_crtime = sbi->s_record_time; - } - if (udf_stamp_to_time(&convtime, &convtime_usec, - lets_to_cpu(efe->attrTime))) { - inode->i_ctime.tv_sec = convtime; - inode->i_ctime.tv_nsec = convtime_usec * 1000; - } else { + if (!udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime)) inode->i_ctime = sbi->s_record_time; - } iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); @@ -1338,6 +1276,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) case ICBTAG_FILE_TYPE_REALTIME: case ICBTAG_FILE_TYPE_REGULAR: case ICBTAG_FILE_TYPE_UNDEF: + case ICBTAG_FILE_TYPE_VAT20: if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) inode->i_data.a_ops = &udf_adinicb_aops; else @@ -1363,6 +1302,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) inode->i_op = &page_symlink_inode_operations; inode->i_mode = S_IFLNK | S_IRWXUGO; break; + case ICBTAG_FILE_TYPE_MAIN: + udf_debug("METADATA FILE-----\n"); + break; + case ICBTAG_FILE_TYPE_MIRROR: + udf_debug("METADATA MIRROR FILE-----\n"); + break; + case ICBTAG_FILE_TYPE_BITMAP: + udf_debug("METADATA BITMAP FILE-----\n"); + break; default: printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown " "file type=%d\n", inode->i_ino, @@ -1416,21 +1364,6 @@ static mode_t udf_convert_permissions(struct fileEntry *fe) return mode; } -/* - * udf_write_inode - * - * PURPOSE - * Write out the specified inode. - * - * DESCRIPTION - * This routine is called whenever an inode is synced. - * Currently this routine is just a placeholder. - * - * HISTORY - * July 1, 1997 - Andrew E. Mileski - * Written, tested, and released. - */ - int udf_write_inode(struct inode *inode, int sync) { int ret; @@ -1455,7 +1388,6 @@ static int udf_update_inode(struct inode *inode, int do_sync) uint32_t udfperms; uint16_t icbflags; uint16_t crclen; - kernel_timestamp cpu_time; int err = 0; struct udf_sb_info *sbi = UDF_SB(inode->i_sb); unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; @@ -1488,9 +1420,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) iinfo->i_location. logicalBlockNum); use->descTag.descCRCLength = cpu_to_le16(crclen); - use->descTag.descCRC = cpu_to_le16(udf_crc((char *)use + - sizeof(tag), crclen, - 0)); + use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + + sizeof(tag), + crclen)); use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); mark_buffer_dirty(bh); @@ -1558,12 +1490,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> (blocksize_bits - 9)); - if (udf_time_to_stamp(&cpu_time, inode->i_atime)) - fe->accessTime = cpu_to_lets(cpu_time); - if (udf_time_to_stamp(&cpu_time, inode->i_mtime)) - fe->modificationTime = cpu_to_lets(cpu_time); - if (udf_time_to_stamp(&cpu_time, inode->i_ctime)) - fe->attrTime = cpu_to_lets(cpu_time); + udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); + udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); + udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); memset(&(fe->impIdent), 0, sizeof(regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; @@ -1598,14 +1527,10 @@ static int udf_update_inode(struct inode *inode, int do_sync) iinfo->i_crtime.tv_nsec > inode->i_ctime.tv_nsec)) iinfo->i_crtime = inode->i_ctime; - if (udf_time_to_stamp(&cpu_time, inode->i_atime)) - efe->accessTime = cpu_to_lets(cpu_time); - if (udf_time_to_stamp(&cpu_time, inode->i_mtime)) - efe->modificationTime = cpu_to_lets(cpu_time); - if (udf_time_to_stamp(&cpu_time, iinfo->i_crtime)) - efe->createTime = cpu_to_lets(cpu_time); - if (udf_time_to_stamp(&cpu_time, inode->i_ctime)) - efe->attrTime = cpu_to_lets(cpu_time); + udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime); + udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime); + udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); + udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); memset(&(efe->impIdent), 0, sizeof(regid)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); @@ -1660,8 +1585,8 @@ static int udf_update_inode(struct inode *inode, int do_sync) crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(tag); fe->descTag.descCRCLength = cpu_to_le16(crclen); - fe->descTag.descCRC = cpu_to_le16(udf_crc((char *)fe + sizeof(tag), - crclen, 0)); + fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag), + crclen)); fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); /* write the data blocks */ @@ -1778,9 +1703,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, if (epos->bh) { aed = (struct allocExtDesc *)epos->bh->b_data; - aed->lengthAllocDescs = - cpu_to_le32(le32_to_cpu( - aed->lengthAllocDescs) + adsize); + le32_add_cpu(&aed->lengthAllocDescs, adsize); } else { iinfo->i_lenAlloc += adsize; mark_inode_dirty(inode); @@ -1830,9 +1753,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)epos->bh->b_data; - aed->lengthAllocDescs = - cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) + - adsize); + le32_add_cpu(&aed->lengthAllocDescs, adsize); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(epos->bh->b_data, @@ -2046,9 +1967,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)oepos.bh->b_data; - aed->lengthAllocDescs = - cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) - - (2 * adsize)); + le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize)); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(oepos.bh->b_data, @@ -2065,9 +1984,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)oepos.bh->b_data; - aed->lengthAllocDescs = - cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) - - adsize); + le32_add_cpu(&aed->lengthAllocDescs, -adsize); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(oepos.bh->b_data, @@ -2095,11 +2012,6 @@ int8_t inode_bmap(struct inode *inode, sector_t block, int8_t etype; struct udf_inode_info *iinfo; - if (block < 0) { - printk(KERN_ERR "udf: inode_bmap: block < 0\n"); - return -1; - } - iinfo = UDF_I(inode); pos->offset = 0; pos->block = iinfo->i_location; diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 579bae7..703843f 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -23,7 +23,6 @@ #include <linux/cdrom.h> #include <asm/uaccess.h> -#include <linux/udf_fs.h> #include "udf_sb.h" unsigned int udf_get_last_session(struct super_block *sb) diff --git a/fs/udf/misc.c b/fs/udf/misc.c index a1d6da0..84bf0fd 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -23,8 +23,8 @@ #include <linux/fs.h> #include <linux/string.h> -#include <linux/udf_fs.h> #include <linux/buffer_head.h> +#include <linux/crc-itu-t.h> #include "udf_i.h" #include "udf_sb.h" @@ -136,8 +136,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, /* rewrite CRC + checksum of eahd */ crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag); eahd->descTag.descCRCLength = cpu_to_le16(crclen); - eahd->descTag.descCRC = cpu_to_le16(udf_crc((char *)eahd + - sizeof(tag), crclen, 0)); + eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd + + sizeof(tag), crclen)); eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag); iinfo->i_lenEAttr += size; return (struct genericFormat *)&ea[offset]; @@ -204,16 +204,15 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, { tag *tag_p; struct buffer_head *bh = NULL; - struct udf_sb_info *sbi = UDF_SB(sb); /* Read the block */ if (block == 0xFFFFFFFF) return NULL; - bh = udf_tread(sb, block + sbi->s_session); + bh = udf_tread(sb, block); if (!bh) { udf_debug("block=%d, location=%d: read failed\n", - block + sbi->s_session, location); + block, location); return NULL; } @@ -223,8 +222,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, if (location != le32_to_cpu(tag_p->tagLocation)) { udf_debug("location mismatch block %u, tag %u != %u\n", - block + sbi->s_session, - le32_to_cpu(tag_p->tagLocation), location); + block, le32_to_cpu(tag_p->tagLocation), location); goto error_out; } @@ -244,13 +242,13 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, /* Verify the descriptor CRC */ if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize || - le16_to_cpu(tag_p->descCRC) == udf_crc(bh->b_data + sizeof(tag), - le16_to_cpu(tag_p->descCRCLength), 0)) + le16_to_cpu(tag_p->descCRC) == crc_itu_t(0, + bh->b_data + sizeof(tag), + le16_to_cpu(tag_p->descCRCLength))) return bh; - udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", - block + sbi->s_session, le16_to_cpu(tag_p->descCRC), - le16_to_cpu(tag_p->descCRCLength)); + udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block, + le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); error_out: brelse(bh); @@ -270,7 +268,7 @@ void udf_update_tag(char *data, int length) length -= sizeof(tag); tptr->descCRCLength = cpu_to_le16(length); - tptr->descCRC = cpu_to_le16(udf_crc(data + sizeof(tag), length, 0)); + tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length)); tptr->tagChecksum = udf_tag_checksum(tptr); } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 112a5fb..ba5537d 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -31,6 +31,7 @@ #include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/sched.h> +#include <linux/crc-itu-t.h> static inline int udf_match(int len1, const char *name1, int len2, const char *name2) @@ -97,25 +98,23 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, memset(fibh->ebh->b_data, 0x00, padlen + offset); } - crc = udf_crc((uint8_t *)cfi + sizeof(tag), - sizeof(struct fileIdentDesc) - sizeof(tag), 0); + crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag), + sizeof(struct fileIdentDesc) - sizeof(tag)); if (fibh->sbh == fibh->ebh) { - crc = udf_crc((uint8_t *)sfi->impUse, + crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, crclen + sizeof(tag) - - sizeof(struct fileIdentDesc), crc); + sizeof(struct fileIdentDesc)); } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { - crc = udf_crc(fibh->ebh->b_data + + crc = crc_itu_t(crc, fibh->ebh->b_data + sizeof(struct fileIdentDesc) + fibh->soffset, crclen + sizeof(tag) - - sizeof(struct fileIdentDesc), - crc); + sizeof(struct fileIdentDesc)); } else { - crc = udf_crc((uint8_t *)sfi->impUse, - -fibh->soffset - sizeof(struct fileIdentDesc), - crc); - crc = udf_crc(fibh->ebh->b_data, fibh->eoffset, crc); + crc = crc_itu_t(crc, (uint8_t *)sfi->impUse, + -fibh->soffset - sizeof(struct fileIdentDesc)); + crc = crc_itu_t(crc, fibh->ebh->b_data, fibh->eoffset); } cfi->descTag.descCRC = cpu_to_le16(crc); @@ -149,7 +148,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, struct fileIdentDesc *fi = NULL; loff_t f_pos; int block, flen; - char fname[UDF_NAME_LEN]; + char *fname = NULL; char *nameptr; uint8_t lfi; uint16_t liu; @@ -163,12 +162,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, size = udf_ext0_offset(dir) + dir->i_size; f_pos = udf_ext0_offset(dir); + fibh->sbh = fibh->ebh = NULL; fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - fibh->sbh = fibh->ebh = NULL; - else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, - &epos, &eloc, &elen, &offset) == - (EXT_RECORDED_ALLOCATED >> 30)) { + if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, + &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) + goto out_err; block = udf_get_lb_pblock(dir->i_sb, eloc, offset); if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) @@ -179,25 +178,19 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, offset = 0; fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); - if (!fibh->sbh) { - brelse(epos.bh); - return NULL; - } - } else { - brelse(epos.bh); - return NULL; + if (!fibh->sbh) + goto out_err; } + fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!fname) + goto out_err; + while (f_pos < size) { fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, &elen, &offset); - if (!fi) { - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); - brelse(epos.bh); - return NULL; - } + if (!fi) + goto out_err; liu = le16_to_cpu(cfi->lengthOfImpUse); lfi = cfi->lengthFileIdent; @@ -237,53 +230,22 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); if (flen && udf_match(flen, fname, dentry->d_name.len, - dentry->d_name.name)) { - brelse(epos.bh); - return fi; - } + dentry->d_name.name)) + goto out_ok; } +out_err: + fi = NULL; if (fibh->sbh != fibh->ebh) brelse(fibh->ebh); brelse(fibh->sbh); +out_ok: brelse(epos.bh); + kfree(fname); - return NULL; + return fi; } -/* - * udf_lookup - * - * PURPOSE - * Look-up the inode for a given name. - * - * DESCRIPTION - * Required - lookup_dentry() will return -ENOTDIR if this routine is not - * available for a directory. The filesystem is useless if this routine is - * not available for at least the filesystem's root directory. - * - * This routine is passed an incomplete dentry - it must be completed by - * calling d_add(dentry, inode). If the name does not exist, then the - * specified inode must be set to null. An error should only be returned - * when the lookup fails for a reason other than the name not existing. - * Note that the directory inode semaphore is held during the call. - * - * Refer to lookup_dentry() in fs/namei.c - * lookup_dentry() -> lookup() -> real_lookup() -> . - * - * PRE-CONDITIONS - * dir Pointer to inode of parent directory. - * dentry Pointer to dentry to complete. - * nd Pointer to lookup nameidata - * - * POST-CONDITIONS - * <return> Zero on success. - * - * HISTORY - * July 1, 1997 - Andrew E. Mileski - * Written, tested, and released. - */ - static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { @@ -336,11 +298,9 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, { struct super_block *sb = dir->i_sb; struct fileIdentDesc *fi = NULL; - char name[UDF_NAME_LEN], fname[UDF_NAME_LEN]; + char *name = NULL; int namelen; loff_t f_pos; - int flen; - char *nameptr; loff_t size = udf_ext0_offset(dir) + dir->i_size; int nfidlen; uint8_t lfi; @@ -352,16 +312,23 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, struct extent_position epos = {}; struct udf_inode_info *dinfo; + fibh->sbh = fibh->ebh = NULL; + name = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!name) { + *err = -ENOMEM; + goto out_err; + } + if (dentry) { if (!dentry->d_name.len) { *err = -EINVAL; - return NULL; + goto out_err; } namelen = udf_put_filename(sb, dentry->d_name.name, name, dentry->d_name.len); if (!namelen) { *err = -ENAMETOOLONG; - return NULL; + goto out_err; } } else { namelen = 0; @@ -373,11 +340,14 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); dinfo = UDF_I(dir); - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - fibh->sbh = fibh->ebh = NULL; - else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, - &epos, &eloc, &elen, &offset) == - (EXT_RECORDED_ALLOCATED >> 30)) { + if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, + &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { + block = udf_get_lb_pblock(dir->i_sb, + dinfo->i_location, 0); + fibh->soffset = fibh->eoffset = sb->s_blocksize; + goto add; + } block = udf_get_lb_pblock(dir->i_sb, eloc, offset); if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) @@ -389,17 +359,11 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); if (!fibh->sbh) { - brelse(epos.bh); *err = -EIO; - return NULL; + goto out_err; } block = dinfo->i_location.logicalBlockNum; - } else { - block = udf_get_lb_pblock(dir->i_sb, dinfo->i_location, 0); - fibh->sbh = fibh->ebh = NULL; - fibh->soffset = fibh->eoffset = sb->s_blocksize; - goto add; } while (f_pos < size) { @@ -407,41 +371,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, &elen, &offset); if (!fi) { - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); - brelse(epos.bh); *err = -EIO; - return NULL; + goto out_err; } liu = le16_to_cpu(cfi->lengthOfImpUse); lfi = cfi->lengthFileIdent; - if (fibh->sbh == fibh->ebh) - nameptr = fi->fileIdent + liu; - else { - int poffset; /* Unpaded ending offset */ - - poffset = fibh->soffset + sizeof(struct fileIdentDesc) + - liu + lfi; - - if (poffset >= lfi) - nameptr = (char *)(fibh->ebh->b_data + - poffset - lfi); - else { - nameptr = fname; - memcpy(nameptr, fi->fileIdent + liu, - lfi - poffset); - memcpy(nameptr + lfi - poffset, - fibh->ebh->b_data, poffset); - } - } - if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { if (((sizeof(struct fileIdentDesc) + liu + lfi + 3) & ~3) == nfidlen) { - brelse(epos.bh); cfi->descTag.tagSerialNum = cpu_to_le16(1); cfi->fileVersionNum = cpu_to_le16(1); cfi->fileCharacteristics = 0; @@ -449,27 +388,13 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, cfi->lengthOfImpUse = cpu_to_le16(0); if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) - return fi; + goto out_ok; else { *err = -EIO; - return NULL; + goto out_err; } } } - - if (!lfi || !dentry) - continue; - - flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); - if (flen && udf_match(flen, fname, dentry->d_name.len, - dentry->d_name.name)) { - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); - brelse(epos.bh); - *err = -EEXIST; - return NULL; - } } add: @@ -496,7 +421,7 @@ add: fibh->sbh = fibh->ebh = udf_expand_dir_adinicb(dir, &block, err); if (!fibh->sbh) - return NULL; + goto out_err; epos.block = dinfo->i_location; epos.offset = udf_file_entry_alloc_offset(dir); /* Load extent udf_expand_dir_adinicb() has created */ @@ -537,11 +462,8 @@ add: dir->i_sb->s_blocksize_bits); fibh->ebh = udf_bread(dir, f_pos >> dir->i_sb->s_blocksize_bits, 1, err); - if (!fibh->ebh) { - brelse(epos.bh); - brelse(fibh->sbh); - return NULL; - } + if (!fibh->ebh) + goto out_err; if (!fibh->soffset) { if (udf_next_aext(dir, &epos, &eloc, &elen, 1) == @@ -572,20 +494,25 @@ add: cfi->lengthFileIdent = namelen; cfi->lengthOfImpUse = cpu_to_le16(0); if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) { - brelse(epos.bh); dir->i_size += nfidlen; if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) dinfo->i_lenAlloc += nfidlen; mark_inode_dirty(dir); - return fi; + goto out_ok; } else { - brelse(epos.bh); - if (fibh->sbh != fibh->ebh) - brelse(fibh->ebh); - brelse(fibh->sbh); *err = -EIO; - return NULL; + goto out_err; } + +out_err: + fi = NULL; + if (fibh->sbh != fibh->ebh) + brelse(fibh->ebh); + brelse(fibh->sbh); +out_ok: + brelse(epos.bh); + kfree(name); + return fi; } static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, @@ -940,7 +867,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, char *ea; int err; int block; - char name[UDF_NAME_LEN]; + char *name = NULL; int namelen; struct buffer_head *bh; struct udf_inode_info *iinfo; @@ -950,6 +877,12 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, if (!inode) goto out; + name = kmalloc(UDF_NAME_LEN, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto out_no_entry; + } + iinfo = UDF_I(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_data.a_ops = &udf_symlink_aops; @@ -1089,6 +1022,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, err = 0; out: + kfree(name); unlock_kernel(); return err; diff --git a/fs/udf/partition.c b/fs/udf/partition.c index fc53334..63610f0 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -24,7 +24,6 @@ #include <linux/fs.h> #include <linux/string.h> -#include <linux/udf_fs.h> #include <linux/slab.h> #include <linux/buffer_head.h> @@ -55,11 +54,10 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; struct udf_virtual_data *vdata; - struct udf_inode_info *iinfo; + struct udf_inode_info *iinfo = UDF_I(sbi->s_vat_inode); map = &sbi->s_partmaps[partition]; vdata = &map->s_type_specific.s_virtual; - index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t); if (block > vdata->s_num_entries) { udf_debug("Trying to access block beyond end of VAT " @@ -67,6 +65,12 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, return 0xFFFFFFFF; } + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + loc = le32_to_cpu(((__le32 *)(iinfo->i_ext.i_data + + vdata->s_start_offset))[block]); + goto translate; + } + index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t); if (block >= index) { block -= index; newblock = 1 + (block / (sb->s_blocksize / sizeof(uint32_t))); @@ -89,7 +93,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, brelse(bh); - iinfo = UDF_I(sbi->s_vat_inode); +translate: if (iinfo->i_location.partitionReferenceNum == partition) { udf_debug("recursive call to udf_get_pblock!\n"); return 0xFFFFFFFF; @@ -263,3 +267,58 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block) return 0; } + +static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct super_block *sb = inode->i_sb; + struct udf_part_map *map; + kernel_lb_addr eloc; + uint32_t elen; + sector_t ext_offset; + struct extent_position epos = {}; + uint32_t phyblock; + + if (inode_bmap(inode, block, &epos, &eloc, &elen, &ext_offset) != + (EXT_RECORDED_ALLOCATED >> 30)) + phyblock = 0xFFFFFFFF; + else { + map = &UDF_SB(sb)->s_partmaps[partition]; + /* map to sparable/physical partition desc */ + phyblock = udf_get_pblock(sb, eloc.logicalBlockNum, + map->s_partition_num, ext_offset + offset); + } + + brelse(epos.bh); + return phyblock; +} + +uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + struct udf_meta_data *mdata; + uint32_t retblk; + struct inode *inode; + + udf_debug("READING from METADATA\n"); + + map = &sbi->s_partmaps[partition]; + mdata = &map->s_type_specific.s_metadata; + inode = mdata->s_metadata_fe ? : mdata->s_mirror_fe; + + /* We shouldn't mount such media... */ + BUG_ON(!inode); + retblk = udf_try_read_meta(inode, block, partition, offset); + if (retblk == 0xFFFFFFFF) { + udf_warning(sb, __func__, "error reading from METADATA, " + "trying to read from MIRROR"); + inode = mdata->s_mirror_fe; + if (!inode) + return 0xFFFFFFFF; + retblk = udf_try_read_meta(inode, block, partition, offset); + } + + return retblk; +} diff --git a/fs/udf/super.c b/fs/udf/super.c index f3ac4ab..b564fc1 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -55,9 +55,10 @@ #include <linux/errno.h> #include <linux/mount.h> #include <linux/seq_file.h> +#include <linux/bitmap.h> +#include <linux/crc-itu-t.h> #include <asm/byteorder.h> -#include <linux/udf_fs.h> #include "udf_sb.h" #include "udf_i.h" @@ -84,22 +85,19 @@ static void udf_write_super(struct super_block *); static int udf_remount_fs(struct super_block *, int *, char *); static int udf_check_valid(struct super_block *, int, int); static int udf_vrs(struct super_block *sb, int silent); -static int udf_load_partition(struct super_block *, kernel_lb_addr *); -static int udf_load_logicalvol(struct super_block *, struct buffer_head *, - kernel_lb_addr *); static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad); static void udf_find_anchor(struct super_block *); static int udf_find_fileset(struct super_block *, kernel_lb_addr *, kernel_lb_addr *); -static void udf_load_pvoldesc(struct super_block *, struct buffer_head *); static void udf_load_fileset(struct super_block *, struct buffer_head *, kernel_lb_addr *); -static int udf_load_partdesc(struct super_block *, struct buffer_head *); static void udf_open_lvid(struct super_block *); static void udf_close_lvid(struct super_block *); static unsigned int udf_count_free(struct super_block *); static int udf_statfs(struct dentry *, struct kstatfs *); static int udf_show_options(struct seq_file *, struct vfsmount *); +static void udf_error(struct super_block *sb, const char *function, + const char *fmt, ...); struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) { @@ -587,48 +585,10 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) return 0; } -/* - * udf_set_blocksize - * - * PURPOSE - * Set the block size to be used in all transfers. - * - * DESCRIPTION - * To allow room for a DMA transfer, it is best to guess big when unsure. - * This routine picks 2048 bytes as the blocksize when guessing. This - * should be adequate until devices with larger block sizes become common. - * - * Note that the Linux kernel can currently only deal with blocksizes of - * 512, 1024, 2048, 4096, and 8192 bytes. - * - * PRE-CONDITIONS - * sb Pointer to _locked_ superblock. - * - * POST-CONDITIONS - * sb->s_blocksize Blocksize. - * sb->s_blocksize_bits log2 of blocksize. - * <return> 0 Blocksize is valid. - * <return> 1 Blocksize is invalid. - * - * HISTORY - * July 1, 1997 - Andrew E. Mileski - * Written, tested, and released. - */ -static int udf_set_blocksize(struct super_block *sb, int bsize) -{ - if (!sb_min_blocksize(sb, bsize)) { - udf_debug("Bad block size (%d)\n", bsize); - printk(KERN_ERR "udf: bad block size (%d)\n", bsize); - return 0; - } - - return sb->s_blocksize; -} - static int udf_vrs(struct super_block *sb, int silent) { struct volStructDesc *vsd = NULL; - int sector = 32768; + loff_t sector = 32768; int sectorsize; struct buffer_head *bh = NULL; int iso9660 = 0; @@ -649,7 +609,8 @@ static int udf_vrs(struct super_block *sb, int silent) sector += (sbi->s_session << sb->s_blocksize_bits); udf_debug("Starting at sector %u (%ld byte sectors)\n", - (sector >> sb->s_blocksize_bits), sb->s_blocksize); + (unsigned int)(sector >> sb->s_blocksize_bits), + sb->s_blocksize); /* Process the sequence (if applicable) */ for (; !nsr02 && !nsr03; sector += sectorsize) { /* Read a block */ @@ -719,162 +680,140 @@ static int udf_vrs(struct super_block *sb, int silent) } /* - * udf_find_anchor - * - * PURPOSE - * Find an anchor volume descriptor. - * - * PRE-CONDITIONS - * sb Pointer to _locked_ superblock. - * lastblock Last block on media. - * - * POST-CONDITIONS - * <return> 1 if not found, 0 if ok - * - * HISTORY - * July 1, 1997 - Andrew E. Mileski - * Written, tested, and released. + * Check whether there is an anchor block in the given block */ -static void udf_find_anchor(struct super_block *sb) +static int udf_check_anchor_block(struct super_block *sb, sector_t block, + bool varconv) { - int lastblock; struct buffer_head *bh = NULL; + tag *t; uint16_t ident; uint32_t location; - int i; - struct udf_sb_info *sbi; - sbi = UDF_SB(sb); - lastblock = sbi->s_last_block; + if (varconv) { + if (udf_fixed_to_variable(block) >= + sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) + return 0; + bh = sb_bread(sb, udf_fixed_to_variable(block)); + } + else + bh = sb_bread(sb, block); - if (lastblock) { - int varlastblock = udf_variable_to_fixed(lastblock); - int last[] = { lastblock, lastblock - 2, - lastblock - 150, lastblock - 152, - varlastblock, varlastblock - 2, - varlastblock - 150, varlastblock - 152 }; - - lastblock = 0; - - /* Search for an anchor volume descriptor pointer */ - - /* according to spec, anchor is in either: - * block 256 - * lastblock-256 - * lastblock - * however, if the disc isn't closed, it could be 512 */ - - for (i = 0; !lastblock && i < ARRAY_SIZE(last); i++) { - ident = location = 0; - if (last[i] >= 0) { - bh = sb_bread(sb, last[i]); - if (bh) { - tag *t = (tag *)bh->b_data; - ident = le16_to_cpu(t->tagIdent); - location = le32_to_cpu(t->tagLocation); - brelse(bh); - } - } + if (!bh) + return 0; - if (ident == TAG_IDENT_AVDP) { - if (location == last[i] - sbi->s_session) { - lastblock = last[i] - sbi->s_session; - sbi->s_anchor[0] = lastblock; - sbi->s_anchor[1] = lastblock - 256; - } else if (location == - udf_variable_to_fixed(last[i]) - - sbi->s_session) { - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); - lastblock = - udf_variable_to_fixed(last[i]) - - sbi->s_session; - sbi->s_anchor[0] = lastblock; - sbi->s_anchor[1] = lastblock - 256 - - sbi->s_session; - } else { - udf_debug("Anchor found at block %d, " - "location mismatch %d.\n", - last[i], location); - } - } else if (ident == TAG_IDENT_FE || - ident == TAG_IDENT_EFE) { - lastblock = last[i]; - sbi->s_anchor[3] = 512; - } else { - ident = location = 0; - if (last[i] >= 256) { - bh = sb_bread(sb, last[i] - 256); - if (bh) { - tag *t = (tag *)bh->b_data; - ident = le16_to_cpu( - t->tagIdent); - location = le32_to_cpu( - t->tagLocation); - brelse(bh); - } - } + t = (tag *)bh->b_data; + ident = le16_to_cpu(t->tagIdent); + location = le32_to_cpu(t->tagLocation); + brelse(bh); + if (ident != TAG_IDENT_AVDP) + return 0; + return location == block; +} - if (ident == TAG_IDENT_AVDP && - location == last[i] - 256 - - sbi->s_session) { - lastblock = last[i]; - sbi->s_anchor[1] = last[i] - 256; - } else { - ident = location = 0; - if (last[i] >= 312 + sbi->s_session) { - bh = sb_bread(sb, - last[i] - 312 - - sbi->s_session); - if (bh) { - tag *t = (tag *) - bh->b_data; - ident = le16_to_cpu( - t->tagIdent); - location = le32_to_cpu( - t->tagLocation); - brelse(bh); - } - } +/* Search for an anchor volume descriptor pointer */ +static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, + sector_t lastblock) +{ + sector_t last[6]; + int i; + struct udf_sb_info *sbi = UDF_SB(sb); - if (ident == TAG_IDENT_AVDP && - location == udf_variable_to_fixed(last[i]) - 256) { - UDF_SET_FLAG(sb, - UDF_FLAG_VARCONV); - lastblock = udf_variable_to_fixed(last[i]); - sbi->s_anchor[1] = lastblock - 256; - } - } - } + last[0] = lastblock; + last[1] = last[0] - 1; + last[2] = last[0] + 1; + last[3] = last[0] - 2; + last[4] = last[0] - 150; + last[5] = last[0] - 152; + + /* according to spec, anchor is in either: + * block 256 + * lastblock-256 + * lastblock + * however, if the disc isn't closed, it could be 512 */ + + for (i = 0; i < ARRAY_SIZE(last); i++) { + if (last[i] < 0) + continue; + if (last[i] >= sb->s_bdev->bd_inode->i_size >> + sb->s_blocksize_bits) + continue; + + if (udf_check_anchor_block(sb, last[i], varconv)) { + sbi->s_anchor[0] = last[i]; + sbi->s_anchor[1] = last[i] - 256; + return last[i]; } - } - if (!lastblock) { - /* We haven't found the lastblock. check 312 */ - bh = sb_bread(sb, 312 + sbi->s_session); - if (bh) { - tag *t = (tag *)bh->b_data; - ident = le16_to_cpu(t->tagIdent); - location = le32_to_cpu(t->tagLocation); - brelse(bh); + if (last[i] < 256) + continue; - if (ident == TAG_IDENT_AVDP && location == 256) - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + if (udf_check_anchor_block(sb, last[i] - 256, varconv)) { + sbi->s_anchor[1] = last[i] - 256; + return last[i]; } } + if (udf_check_anchor_block(sb, sbi->s_session + 256, varconv)) { + sbi->s_anchor[0] = sbi->s_session + 256; + return last[0]; + } + if (udf_check_anchor_block(sb, sbi->s_session + 512, varconv)) { + sbi->s_anchor[0] = sbi->s_session + 512; + return last[0]; + } + return 0; +} + +/* + * Find an anchor volume descriptor. The function expects sbi->s_lastblock to + * be the last block on the media. + * + * Return 1 if not found, 0 if ok + * + */ +static void udf_find_anchor(struct super_block *sb) +{ + sector_t lastblock; + struct buffer_head *bh = NULL; + uint16_t ident; + int i; + struct udf_sb_info *sbi = UDF_SB(sb); + + lastblock = udf_scan_anchors(sb, 0, sbi->s_last_block); + if (lastblock) + goto check_anchor; + + /* No anchor found? Try VARCONV conversion of block numbers */ + /* Firstly, we try to not convert number of the last block */ + lastblock = udf_scan_anchors(sb, 1, + udf_variable_to_fixed(sbi->s_last_block)); + if (lastblock) { + UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + goto check_anchor; + } + + /* Secondly, we try with converted number of the last block */ + lastblock = udf_scan_anchors(sb, 1, sbi->s_last_block); + if (lastblock) + UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + +check_anchor: + /* + * Check located anchors and the anchor block supplied via + * mount options + */ for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { - if (sbi->s_anchor[i]) { - bh = udf_read_tagged(sb, sbi->s_anchor[i], - sbi->s_anchor[i], &ident); - if (!bh) + if (!sbi->s_anchor[i]) + continue; + bh = udf_read_tagged(sb, sbi->s_anchor[i], + sbi->s_anchor[i], &ident); + if (!bh) + sbi->s_anchor[i] = 0; + else { + brelse(bh); + if (ident != TAG_IDENT_AVDP) sbi->s_anchor[i] = 0; - else { - brelse(bh); - if ((ident != TAG_IDENT_AVDP) && - (i || (ident != TAG_IDENT_FE && - ident != TAG_IDENT_EFE))) - sbi->s_anchor[i] = 0; - } } } @@ -971,27 +910,30 @@ static int udf_find_fileset(struct super_block *sb, return 1; } -static void udf_load_pvoldesc(struct super_block *sb, struct buffer_head *bh) +static int udf_load_pvoldesc(struct super_block *sb, sector_t block) { struct primaryVolDesc *pvoldesc; - time_t recording; - long recording_usec; struct ustr instr; struct ustr outstr; + struct buffer_head *bh; + uint16_t ident; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return 1; + BUG_ON(ident != TAG_IDENT_PVD); pvoldesc = (struct primaryVolDesc *)bh->b_data; - if (udf_stamp_to_time(&recording, &recording_usec, - lets_to_cpu(pvoldesc->recordingDateAndTime))) { - kernel_timestamp ts; - ts = lets_to_cpu(pvoldesc->recordingDateAndTime); - udf_debug("recording time %ld/%ld, %04u/%02u/%02u" + if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, + pvoldesc->recordingDateAndTime)) { +#ifdef UDFFS_DEBUG + timestamp *ts = &pvoldesc->recordingDateAndTime; + udf_debug("recording time %04u/%02u/%02u" " %02u:%02u (%x)\n", - recording, recording_usec, - ts.year, ts.month, ts.day, ts.hour, - ts.minute, ts.typeAndTimezone); - UDF_SB(sb)->s_record_time.tv_sec = recording; - UDF_SB(sb)->s_record_time.tv_nsec = recording_usec * 1000; + le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, + ts->minute, le16_to_cpu(ts->typeAndTimezone)); +#endif } if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32)) @@ -1005,6 +947,104 @@ static void udf_load_pvoldesc(struct super_block *sb, struct buffer_head *bh) if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128)) if (udf_CS0toUTF8(&outstr, &instr)) udf_debug("volSetIdent[] = '%s'\n", outstr.u_name); + + brelse(bh); + return 0; +} + +static int udf_load_metadata_files(struct super_block *sb, int partition) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map; + struct udf_meta_data *mdata; + kernel_lb_addr addr; + int fe_error = 0; + + map = &sbi->s_partmaps[partition]; + mdata = &map->s_type_specific.s_metadata; + + /* metadata address */ + addr.logicalBlockNum = mdata->s_meta_file_loc; + addr.partitionReferenceNum = map->s_partition_num; + + udf_debug("Metadata file location: block = %d part = %d\n", + addr.logicalBlockNum, addr.partitionReferenceNum); + + mdata->s_metadata_fe = udf_iget(sb, addr); + + if (mdata->s_metadata_fe == NULL) { + udf_warning(sb, __func__, "metadata inode efe not found, " + "will try mirror inode."); + fe_error = 1; + } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type != + ICBTAG_FLAG_AD_SHORT) { + udf_warning(sb, __func__, "metadata inode efe does not have " + "short allocation descriptors!"); + fe_error = 1; + iput(mdata->s_metadata_fe); + mdata->s_metadata_fe = NULL; + } + + /* mirror file entry */ + addr.logicalBlockNum = mdata->s_mirror_file_loc; + addr.partitionReferenceNum = map->s_partition_num; + + udf_debug("Mirror metadata file location: block = %d part = %d\n", + addr.logicalBlockNum, addr.partitionReferenceNum); + + mdata->s_mirror_fe = udf_iget(sb, addr); + + if (mdata->s_mirror_fe == NULL) { + if (fe_error) { + udf_error(sb, __func__, "mirror inode efe not found " + "and metadata inode is missing too, exiting..."); + goto error_exit; + } else + udf_warning(sb, __func__, "mirror inode efe not found," + " but metadata inode is OK"); + } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type != + ICBTAG_FLAG_AD_SHORT) { + udf_warning(sb, __func__, "mirror inode efe does not have " + "short allocation descriptors!"); + iput(mdata->s_mirror_fe); + mdata->s_mirror_fe = NULL; + if (fe_error) + goto error_exit; + } + + /* + * bitmap file entry + * Note: + * Load only if bitmap file location differs from 0xFFFFFFFF (DCN-5102) + */ + if (mdata->s_bitmap_file_loc != 0xFFFFFFFF) { + addr.logicalBlockNum = mdata->s_bitmap_file_loc; + addr.partitionReferenceNum = map->s_partition_num; + + udf_debug("Bitmap file location: block = %d part = %d\n", + addr.logicalBlockNum, addr.partitionReferenceNum); + + mdata->s_bitmap_fe = udf_iget(sb, addr); + + if (mdata->s_bitmap_fe == NULL) { + if (sb->s_flags & MS_RDONLY) + udf_warning(sb, __func__, "bitmap inode efe " + "not found but it's ok since the disc" + " is mounted read-only"); + else { + udf_error(sb, __func__, "bitmap inode efe not " + "found and attempted read-write mount"); + goto error_exit; + } + } + } + + udf_debug("udf_load_metadata_files Ok\n"); + + return 0; + +error_exit: + return 1; } static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, @@ -1025,10 +1065,9 @@ static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, int udf_compute_nr_groups(struct super_block *sb, u32 partition) { struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; - return (map->s_partition_len + - (sizeof(struct spaceBitmapDesc) << 3) + - (sb->s_blocksize * 8) - 1) / - (sb->s_blocksize * 8); + return DIV_ROUND_UP(map->s_partition_len + + (sizeof(struct spaceBitmapDesc) << 3), + sb->s_blocksize * 8); } static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) @@ -1059,134 +1098,241 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) return bitmap; } -static int udf_load_partdesc(struct super_block *sb, struct buffer_head *bh) +static int udf_fill_partdesc_info(struct super_block *sb, + struct partitionDesc *p, int p_index) +{ + struct udf_part_map *map; + struct udf_sb_info *sbi = UDF_SB(sb); + struct partitionHeaderDesc *phd; + + map = &sbi->s_partmaps[p_index]; + + map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */ + map->s_partition_root = le32_to_cpu(p->partitionStartingLocation); + + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY)) + map->s_partition_flags |= UDF_PART_FLAG_READ_ONLY; + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE)) + map->s_partition_flags |= UDF_PART_FLAG_WRITE_ONCE; + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE)) + map->s_partition_flags |= UDF_PART_FLAG_REWRITABLE; + if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) + map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE; + + udf_debug("Partition (%d type %x) starts at physical %d, " + "block length %d\n", p_index, + map->s_partition_type, map->s_partition_root, + map->s_partition_len); + + if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && + strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) + return 0; + + phd = (struct partitionHeaderDesc *)p->partitionContentsUse; + if (phd->unallocSpaceTable.extLength) { + kernel_lb_addr loc = { + .logicalBlockNum = le32_to_cpu( + phd->unallocSpaceTable.extPosition), + .partitionReferenceNum = p_index, + }; + + map->s_uspace.s_table = udf_iget(sb, loc); + if (!map->s_uspace.s_table) { + udf_debug("cannot load unallocSpaceTable (part %d)\n", + p_index); + return 1; + } + map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; + udf_debug("unallocSpaceTable (part %d) @ %ld\n", + p_index, map->s_uspace.s_table->i_ino); + } + + if (phd->unallocSpaceBitmap.extLength) { + struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); + if (!bitmap) + return 1; + map->s_uspace.s_bitmap = bitmap; + bitmap->s_extLength = le32_to_cpu( + phd->unallocSpaceBitmap.extLength); + bitmap->s_extPosition = le32_to_cpu( + phd->unallocSpaceBitmap.extPosition); + map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; + udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index, + bitmap->s_extPosition); + } + + if (phd->partitionIntegrityTable.extLength) + udf_debug("partitionIntegrityTable (part %d)\n", p_index); + + if (phd->freedSpaceTable.extLength) { + kernel_lb_addr loc = { + .logicalBlockNum = le32_to_cpu( + phd->freedSpaceTable.extPosition), + .partitionReferenceNum = p_index, + }; + + map->s_fspace.s_table = udf_iget(sb, loc); + if (!map->s_fspace.s_table) { + udf_debug("cannot load freedSpaceTable (part %d)\n", + p_index); + return 1; + } + + map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; + udf_debug("freedSpaceTable (part %d) @ %ld\n", + p_index, map->s_fspace.s_table->i_ino); + } + + if (phd->freedSpaceBitmap.extLength) { + struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); + if (!bitmap) + return 1; + map->s_fspace.s_bitmap = bitmap; + bitmap->s_extLength = le32_to_cpu( + phd->freedSpaceBitmap.extLength); + bitmap->s_extPosition = le32_to_cpu( + phd->freedSpaceBitmap.extPosition); + map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; + udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index, + bitmap->s_extPosition); + } + return 0; +} + +static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map = &sbi->s_partmaps[p_index]; + kernel_lb_addr ino; + struct buffer_head *bh = NULL; + struct udf_inode_info *vati; + uint32_t pos; + struct virtualAllocationTable20 *vat20; + + /* VAT file entry is in the last recorded block */ + ino.partitionReferenceNum = type1_index; + ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; + sbi->s_vat_inode = udf_iget(sb, ino); + if (!sbi->s_vat_inode) + return 1; + + if (map->s_partition_type == UDF_VIRTUAL_MAP15) { + map->s_type_specific.s_virtual.s_start_offset = 0; + map->s_type_specific.s_virtual.s_num_entries = + (sbi->s_vat_inode->i_size - 36) >> 2; + } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) { + vati = UDF_I(sbi->s_vat_inode); + if (vati->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + pos = udf_block_map(sbi->s_vat_inode, 0); + bh = sb_bread(sb, pos); + if (!bh) + return 1; + vat20 = (struct virtualAllocationTable20 *)bh->b_data; + } else { + vat20 = (struct virtualAllocationTable20 *) + vati->i_ext.i_data; + } + + map->s_type_specific.s_virtual.s_start_offset = + le16_to_cpu(vat20->lengthHeader); + map->s_type_specific.s_virtual.s_num_entries = + (sbi->s_vat_inode->i_size - + map->s_type_specific.s_virtual. + s_start_offset) >> 2; + brelse(bh); + } + return 0; +} + +static int udf_load_partdesc(struct super_block *sb, sector_t block) { + struct buffer_head *bh; struct partitionDesc *p; - int i; struct udf_part_map *map; - struct udf_sb_info *sbi; + struct udf_sb_info *sbi = UDF_SB(sb); + int i, type1_idx; + uint16_t partitionNumber; + uint16_t ident; + int ret = 0; + + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return 1; + if (ident != TAG_IDENT_PD) + goto out_bh; p = (struct partitionDesc *)bh->b_data; - sbi = UDF_SB(sb); + partitionNumber = le16_to_cpu(p->partitionNumber); + /* First scan for TYPE1, SPARABLE and METADATA partitions */ for (i = 0; i < sbi->s_partitions; i++) { map = &sbi->s_partmaps[i]; udf_debug("Searching map: (%d == %d)\n", - map->s_partition_num, - le16_to_cpu(p->partitionNumber)); - if (map->s_partition_num == - le16_to_cpu(p->partitionNumber)) { - map->s_partition_len = - le32_to_cpu(p->partitionLength); /* blocks */ - map->s_partition_root = - le32_to_cpu(p->partitionStartingLocation); - if (p->accessType == - cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY)) - map->s_partition_flags |= - UDF_PART_FLAG_READ_ONLY; - if (p->accessType == - cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE)) - map->s_partition_flags |= - UDF_PART_FLAG_WRITE_ONCE; - if (p->accessType == - cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE)) - map->s_partition_flags |= - UDF_PART_FLAG_REWRITABLE; - if (p->accessType == - cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE)) - map->s_partition_flags |= - UDF_PART_FLAG_OVERWRITABLE; - - if (!strcmp(p->partitionContents.ident, - PD_PARTITION_CONTENTS_NSR02) || - !strcmp(p->partitionContents.ident, - PD_PARTITION_CONTENTS_NSR03)) { - struct partitionHeaderDesc *phd; - - phd = (struct partitionHeaderDesc *) - (p->partitionContentsUse); - if (phd->unallocSpaceTable.extLength) { - kernel_lb_addr loc = { - .logicalBlockNum = le32_to_cpu(phd->unallocSpaceTable.extPosition), - .partitionReferenceNum = i, - }; - - map->s_uspace.s_table = - udf_iget(sb, loc); - if (!map->s_uspace.s_table) { - udf_debug("cannot load unallocSpaceTable (part %d)\n", i); - return 1; - } - map->s_partition_flags |= - UDF_PART_FLAG_UNALLOC_TABLE; - udf_debug("unallocSpaceTable (part %d) @ %ld\n", - i, map->s_uspace.s_table->i_ino); - } - if (phd->unallocSpaceBitmap.extLength) { - struct udf_bitmap *bitmap = - udf_sb_alloc_bitmap(sb, i); - map->s_uspace.s_bitmap = bitmap; - if (bitmap != NULL) { - bitmap->s_extLength = - le32_to_cpu(phd->unallocSpaceBitmap.extLength); - bitmap->s_extPosition = - le32_to_cpu(phd->unallocSpaceBitmap.extPosition); - map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP; - udf_debug("unallocSpaceBitmap (part %d) @ %d\n", - i, bitmap->s_extPosition); - } - } - if (phd->partitionIntegrityTable.extLength) - udf_debug("partitionIntegrityTable (part %d)\n", i); - if (phd->freedSpaceTable.extLength) { - kernel_lb_addr loc = { - .logicalBlockNum = le32_to_cpu(phd->freedSpaceTable.extPosition), - .partitionReferenceNum = i, - }; - - map->s_fspace.s_table = - udf_iget(sb, loc); - if (!map->s_fspace.s_table) { - udf_debug("cannot load freedSpaceTable (part %d)\n", i); - return 1; - } - map->s_partition_flags |= - UDF_PART_FLAG_FREED_TABLE; - udf_debug("freedSpaceTable (part %d) @ %ld\n", - i, map->s_fspace.s_table->i_ino); - } - if (phd->freedSpaceBitmap.extLength) { - struct udf_bitmap *bitmap = - udf_sb_alloc_bitmap(sb, i); - map->s_fspace.s_bitmap = bitmap; - if (bitmap != NULL) { - bitmap->s_extLength = - le32_to_cpu(phd->freedSpaceBitmap.extLength); - bitmap->s_extPosition = - le32_to_cpu(phd->freedSpaceBitmap.extPosition); - map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; - udf_debug("freedSpaceBitmap (part %d) @ %d\n", - i, bitmap->s_extPosition); - } - } - } + map->s_partition_num, partitionNumber); + if (map->s_partition_num == partitionNumber && + (map->s_partition_type == UDF_TYPE1_MAP15 || + map->s_partition_type == UDF_SPARABLE_MAP15)) break; - } } - if (i == sbi->s_partitions) + + if (i >= sbi->s_partitions) { udf_debug("Partition (%d) not found in partition map\n", - le16_to_cpu(p->partitionNumber)); - else - udf_debug("Partition (%d:%d type %x) starts at physical %d, " - "block length %d\n", - le16_to_cpu(p->partitionNumber), i, - map->s_partition_type, - map->s_partition_root, - map->s_partition_len); - return 0; + partitionNumber); + goto out_bh; + } + + ret = udf_fill_partdesc_info(sb, p, i); + + /* + * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and + * PHYSICAL partitions are already set up + */ + type1_idx = i; + for (i = 0; i < sbi->s_partitions; i++) { + map = &sbi->s_partmaps[i]; + + if (map->s_partition_num == partitionNumber && + (map->s_partition_type == UDF_VIRTUAL_MAP15 || + map->s_partition_type == UDF_VIRTUAL_MAP20 || + map->s_partition_type == UDF_METADATA_MAP25)) + break; + } + + if (i >= sbi->s_partitions) + goto out_bh; + + ret = udf_fill_partdesc_info(sb, p, i); + if (ret) + goto out_bh; + + if (map->s_partition_type == UDF_METADATA_MAP25) { + ret = udf_load_metadata_files(sb, i); + if (ret) { + printk(KERN_ERR "UDF-fs: error loading MetaData " + "partition map %d\n", i); + goto out_bh; + } + } else { + ret = udf_load_vat(sb, i, type1_idx); + if (ret) + goto out_bh; + /* + * Mark filesystem read-only if we have a partition with + * virtual map since we don't handle writing to it (we + * overwrite blocks instead of relocating them). + */ + sb->s_flags |= MS_RDONLY; + printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only " + "because writing to pseudooverwrite partition is " + "not implemented.\n"); + } +out_bh: + /* In case loading failed, we handle cleanup in udf_fill_super */ + brelse(bh); + return ret; } -static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh, +static int udf_load_logicalvol(struct super_block *sb, sector_t block, kernel_lb_addr *fileset) { struct logicalVolDesc *lvd; @@ -1194,12 +1340,21 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh, uint8_t type; struct udf_sb_info *sbi = UDF_SB(sb); struct genericPartitionMap *gpm; + uint16_t ident; + struct buffer_head *bh; + int ret = 0; + bh = udf_read_tagged(sb, block, block, &ident); + if (!bh) + return 1; + BUG_ON(ident != TAG_IDENT_LVD); lvd = (struct logicalVolDesc *)bh->b_data; i = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps)); - if (i != 0) - return i; + if (i != 0) { + ret = i; + goto out_bh; + } for (i = 0, offset = 0; i < sbi->s_partitions && offset < le32_to_cpu(lvd->mapTableLength); @@ -1223,12 +1378,12 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh, u16 suf = le16_to_cpu(((__le16 *)upm2->partIdent. identSuffix)[0]); - if (suf == 0x0150) { + if (suf < 0x0200) { map->s_partition_type = UDF_VIRTUAL_MAP15; map->s_partition_func = udf_get_pblock_virt15; - } else if (suf == 0x0200) { + } else { map->s_partition_type = UDF_VIRTUAL_MAP20; map->s_partition_func = @@ -1238,7 +1393,6 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh, UDF_ID_SPARABLE, strlen(UDF_ID_SPARABLE))) { uint32_t loc; - uint16_t ident; struct sparingTable *st; struct sparablePartitionMap *spm = (struct sparablePartitionMap *)gpm; @@ -1256,22 +1410,64 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh, map->s_type_specific.s_sparing. s_spar_map[j] = bh2; - if (bh2 != NULL) { - st = (struct sparingTable *) - bh2->b_data; - if (ident != 0 || strncmp( - st->sparingIdent.ident, - UDF_ID_SPARING, - strlen(UDF_ID_SPARING))) { - brelse(bh2); - map->s_type_specific. - s_sparing. - s_spar_map[j] = - NULL; - } + if (bh2 == NULL) + continue; + + st = (struct sparingTable *)bh2->b_data; + if (ident != 0 || strncmp( + st->sparingIdent.ident, + UDF_ID_SPARING, + strlen(UDF_ID_SPARING))) { + brelse(bh2); + map->s_type_specific.s_sparing. + s_spar_map[j] = NULL; } } map->s_partition_func = udf_get_pblock_spar15; + } else if (!strncmp(upm2->partIdent.ident, + UDF_ID_METADATA, + strlen(UDF_ID_METADATA))) { + struct udf_meta_data *mdata = + &map->s_type_specific.s_metadata; + struct metadataPartitionMap *mdm = + (struct metadataPartitionMap *) + &(lvd->partitionMaps[offset]); + udf_debug("Parsing Logical vol part %d " + "type %d id=%s\n", i, type, + UDF_ID_METADATA); + + map->s_partition_type = UDF_METADATA_MAP25; + map->s_partition_func = udf_get_pblock_meta25; + + mdata->s_meta_file_loc = + le32_to_cpu(mdm->metadataFileLoc); + mdata->s_mirror_file_loc = + le32_to_cpu(mdm->metadataMirrorFileLoc); + mdata->s_bitmap_file_loc = + le32_to_cpu(mdm->metadataBitmapFileLoc); + mdata->s_alloc_unit_size = + le32_to_cpu(mdm->allocUnitSize); + mdata->s_align_unit_size = + le16_to_cpu(mdm->alignUnitSize); + mdata->s_dup_md_flag = + mdm->flags & 0x01; + + udf_debug("Metadata Ident suffix=0x%x\n", + (le16_to_cpu( + ((__le16 *) + mdm->partIdent.identSuffix)[0]))); + udf_debug("Metadata part num=%d\n", + le16_to_cpu(mdm->partitionNum)); + udf_debug("Metadata part alloc unit size=%d\n", + le32_to_cpu(mdm->allocUnitSize)); + udf_debug("Metadata file loc=%d\n", + le32_to_cpu(mdm->metadataFileLoc)); + udf_debug("Mirror file loc=%d\n", + le32_to_cpu(mdm->metadataMirrorFileLoc)); + udf_debug("Bitmap file loc=%d\n", + le32_to_cpu(mdm->metadataBitmapFileLoc)); + udf_debug("Duplicate Flag: %d %d\n", + mdata->s_dup_md_flag, mdm->flags); } else { udf_debug("Unknown ident: %s\n", upm2->partIdent.ident); @@ -1296,7 +1492,9 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh, if (lvd->integritySeqExt.extLength) udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt)); - return 0; +out_bh: + brelse(bh); + return ret; } /* @@ -1345,7 +1543,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc) * July 1, 1997 - Andrew E. Mileski * Written, tested, and released. */ -static int udf_process_sequence(struct super_block *sb, long block, +static noinline int udf_process_sequence(struct super_block *sb, long block, long lastblock, kernel_lb_addr *fileset) { struct buffer_head *bh = NULL; @@ -1354,19 +1552,25 @@ static int udf_process_sequence(struct super_block *sb, long block, struct generic_desc *gd; struct volDescPtr *vdp; int done = 0; - int i, j; uint32_t vdsn; uint16_t ident; long next_s = 0, next_e = 0; memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); - /* Read the main descriptor sequence */ + /* + * Read the main descriptor sequence and find which descriptors + * are in it. + */ for (; (!done && block <= lastblock); block++) { bh = udf_read_tagged(sb, block, block, &ident); - if (!bh) - break; + if (!bh) { + printk(KERN_ERR "udf: Block %Lu of volume descriptor " + "sequence is corrupted or we could not read " + "it.\n", (unsigned long long)block); + return 1; + } /* Process each descriptor (ISO 13346 3/8.3-8.4) */ gd = (struct generic_desc *)bh->b_data; @@ -1432,41 +1636,31 @@ static int udf_process_sequence(struct super_block *sb, long block, } brelse(bh); } - for (i = 0; i < VDS_POS_LENGTH; i++) { - if (vds[i].block) { - bh = udf_read_tagged(sb, vds[i].block, vds[i].block, - &ident); - - if (i == VDS_POS_PRIMARY_VOL_DESC) { - udf_load_pvoldesc(sb, bh); - } else if (i == VDS_POS_LOGICAL_VOL_DESC) { - if (udf_load_logicalvol(sb, bh, fileset)) { - brelse(bh); - return 1; - } - } else if (i == VDS_POS_PARTITION_DESC) { - struct buffer_head *bh2 = NULL; - if (udf_load_partdesc(sb, bh)) { - brelse(bh); - return 1; - } - for (j = vds[i].block + 1; - j < vds[VDS_POS_TERMINATING_DESC].block; - j++) { - bh2 = udf_read_tagged(sb, j, j, &ident); - gd = (struct generic_desc *)bh2->b_data; - if (ident == TAG_IDENT_PD) - if (udf_load_partdesc(sb, - bh2)) { - brelse(bh); - brelse(bh2); - return 1; - } - brelse(bh2); - } - } - brelse(bh); - } + /* + * Now read interesting descriptors again and process them + * in a suitable order + */ + if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { + printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n"); + return 1; + } + if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block)) + return 1; + + if (vds[VDS_POS_LOGICAL_VOL_DESC].block && udf_load_logicalvol(sb, + vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset)) + return 1; + + if (vds[VDS_POS_PARTITION_DESC].block) { + /* + * We rescan the whole descriptor sequence to find + * partition descriptor blocks and process them. + */ + for (block = vds[VDS_POS_PARTITION_DESC].block; + block < vds[VDS_POS_TERMINATING_DESC].block; + block++) + if (udf_load_partdesc(sb, block)) + return 1; } return 0; @@ -1478,6 +1672,7 @@ static int udf_process_sequence(struct super_block *sb, long block, static int udf_check_valid(struct super_block *sb, int novrs, int silent) { long block; + struct udf_sb_info *sbi = UDF_SB(sb); if (novrs) { udf_debug("Validity check skipped because of novrs option\n"); @@ -1485,27 +1680,22 @@ static int udf_check_valid(struct super_block *sb, int novrs, int silent) } /* Check that it is NSR02 compliant */ /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ - else { - block = udf_vrs(sb, silent); - if (block == -1) { - struct udf_sb_info *sbi = UDF_SB(sb); - udf_debug("Failed to read byte 32768. Assuming open " - "disc. Skipping validity check\n"); - if (!sbi->s_last_block) - sbi->s_last_block = udf_get_last_block(sb); - return 0; - } else - return !block; - } + block = udf_vrs(sb, silent); + if (block == -1) + udf_debug("Failed to read byte 32768. Assuming open " + "disc. Skipping validity check\n"); + if (block && !sbi->s_last_block) + sbi->s_last_block = udf_get_last_block(sb); + return !block; } -static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset) +static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset) { struct anchorVolDescPtr *anchor; uint16_t ident; struct buffer_head *bh; long main_s, main_e, reserve_s, reserve_e; - int i, j; + int i; struct udf_sb_info *sbi; if (!sb) @@ -1515,6 +1705,7 @@ static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset) for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { if (!sbi->s_anchor[i]) continue; + bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i], &ident); if (!bh) @@ -1553,76 +1744,6 @@ static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset) } udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]); - for (i = 0; i < sbi->s_partitions; i++) { - kernel_lb_addr uninitialized_var(ino); - struct udf_part_map *map = &sbi->s_partmaps[i]; - switch (map->s_partition_type) { - case UDF_VIRTUAL_MAP15: - case UDF_VIRTUAL_MAP20: - if (!sbi->s_last_block) { - sbi->s_last_block = udf_get_last_block(sb); - udf_find_anchor(sb); - } - - if (!sbi->s_last_block) { - udf_debug("Unable to determine Lastblock (For " - "Virtual Partition)\n"); - return 1; - } - - for (j = 0; j < sbi->s_partitions; j++) { - struct udf_part_map *map2 = &sbi->s_partmaps[j]; - if (j != i && - map->s_volumeseqnum == - map2->s_volumeseqnum && - map->s_partition_num == - map2->s_partition_num) { - ino.partitionReferenceNum = j; - ino.logicalBlockNum = - sbi->s_last_block - - map2->s_partition_root; - break; - } - } - - if (j == sbi->s_partitions) - return 1; - - sbi->s_vat_inode = udf_iget(sb, ino); - if (!sbi->s_vat_inode) - return 1; - - if (map->s_partition_type == UDF_VIRTUAL_MAP15) { - map->s_type_specific.s_virtual.s_start_offset = - udf_ext0_offset(sbi->s_vat_inode); - map->s_type_specific.s_virtual.s_num_entries = - (sbi->s_vat_inode->i_size - 36) >> 2; - } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) { - uint32_t pos; - struct virtualAllocationTable20 *vat20; - - pos = udf_block_map(sbi->s_vat_inode, 0); - bh = sb_bread(sb, pos); - if (!bh) - return 1; - vat20 = (struct virtualAllocationTable20 *) - bh->b_data + - udf_ext0_offset(sbi->s_vat_inode); - map->s_type_specific.s_virtual.s_start_offset = - le16_to_cpu(vat20->lengthHeader) + - udf_ext0_offset(sbi->s_vat_inode); - map->s_type_specific.s_virtual.s_num_entries = - (sbi->s_vat_inode->i_size - - map->s_type_specific.s_virtual. - s_start_offset) >> 2; - brelse(bh); - } - map->s_partition_root = udf_get_pblock(sb, 0, i, 0); - map->s_partition_len = - sbi->s_partmaps[ino.partitionReferenceNum]. - s_partition_len; - } - } return 0; } @@ -1630,65 +1751,61 @@ static void udf_open_lvid(struct super_block *sb) { struct udf_sb_info *sbi = UDF_SB(sb); struct buffer_head *bh = sbi->s_lvid_bh; - if (bh) { - kernel_timestamp cpu_time; - struct logicalVolIntegrityDesc *lvid = - (struct logicalVolIntegrityDesc *)bh->b_data; - struct logicalVolIntegrityDescImpUse *lvidiu = - udf_sb_lvidiu(sbi); + struct logicalVolIntegrityDesc *lvid; + struct logicalVolIntegrityDescImpUse *lvidiu; + if (!bh) + return; - lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; - lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; - if (udf_time_to_stamp(&cpu_time, CURRENT_TIME)) - lvid->recordingDateAndTime = cpu_to_lets(cpu_time); - lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN; + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + lvidiu = udf_sb_lvidiu(sbi); - lvid->descTag.descCRC = cpu_to_le16( - udf_crc((char *)lvid + sizeof(tag), - le16_to_cpu(lvid->descTag.descCRCLength), - 0)); + lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + udf_time_to_disk_stamp(&lvid->recordingDateAndTime, + CURRENT_TIME); + lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN; - lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); - mark_buffer_dirty(bh); - } + lvid->descTag.descCRC = cpu_to_le16( + crc_itu_t(0, (char *)lvid + sizeof(tag), + le16_to_cpu(lvid->descTag.descCRCLength))); + + lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + mark_buffer_dirty(bh); } static void udf_close_lvid(struct super_block *sb) { - kernel_timestamp cpu_time; struct udf_sb_info *sbi = UDF_SB(sb); struct buffer_head *bh = sbi->s_lvid_bh; struct logicalVolIntegrityDesc *lvid; + struct logicalVolIntegrityDescImpUse *lvidiu; if (!bh) return; lvid = (struct logicalVolIntegrityDesc *)bh->b_data; - if (lvid->integrityType == LVID_INTEGRITY_TYPE_OPEN) { - struct logicalVolIntegrityDescImpUse *lvidiu = - udf_sb_lvidiu(sbi); - lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; - lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; - if (udf_time_to_stamp(&cpu_time, CURRENT_TIME)) - lvid->recordingDateAndTime = cpu_to_lets(cpu_time); - if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev)) - lvidiu->maxUDFWriteRev = - cpu_to_le16(UDF_MAX_WRITE_VERSION); - if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev)) - lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev); - if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev)) - lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev); - lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); - - lvid->descTag.descCRC = cpu_to_le16( - udf_crc((char *)lvid + sizeof(tag), - le16_to_cpu(lvid->descTag.descCRCLength), - 0)); - - lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); - mark_buffer_dirty(bh); - } + if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN) + return; + + lvidiu = udf_sb_lvidiu(sbi); + lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME); + if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev)) + lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION); + if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev)) + lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev); + if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev)) + lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev); + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); + + lvid->descTag.descCRC = cpu_to_le16( + crc_itu_t(0, (char *)lvid + sizeof(tag), + le16_to_cpu(lvid->descTag.descCRCLength))); + + lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + mark_buffer_dirty(bh); } static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) @@ -1708,22 +1825,35 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) vfree(bitmap); } -/* - * udf_read_super - * - * PURPOSE - * Complete the specified super block. - * - * PRE-CONDITIONS - * sb Pointer to superblock to complete - never NULL. - * sb->s_dev Device to read suberblock from. - * options Pointer to mount options. - * silent Silent flag. - * - * HISTORY - * July 1, 1997 - Andrew E. Mileski - * Written, tested, and released. - */ +static void udf_free_partition(struct udf_part_map *map) +{ + int i; + struct udf_meta_data *mdata; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) + iput(map->s_uspace.s_table); + if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) + iput(map->s_fspace.s_table); + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) + udf_sb_free_bitmap(map->s_uspace.s_bitmap); + if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + udf_sb_free_bitmap(map->s_fspace.s_bitmap); + if (map->s_partition_type == UDF_SPARABLE_MAP15) + for (i = 0; i < 4; i++) + brelse(map->s_type_specific.s_sparing.s_spar_map[i]); + else if (map->s_partition_type == UDF_METADATA_MAP25) { + mdata = &map->s_type_specific.s_metadata; + iput(mdata->s_metadata_fe); + mdata->s_metadata_fe = NULL; + + iput(mdata->s_mirror_fe); + mdata->s_mirror_fe = NULL; + + iput(mdata->s_bitmap_fe); + mdata->s_bitmap_fe = NULL; + } +} + static int udf_fill_super(struct super_block *sb, void *options, int silent) { int i; @@ -1776,8 +1906,11 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) sbi->s_nls_map = uopt.nls_map; /* Set the block size for all transfers */ - if (!udf_set_blocksize(sb, uopt.blocksize)) + if (!sb_min_blocksize(sb, uopt.blocksize)) { + udf_debug("Bad block size (%d)\n", uopt.blocksize); + printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize); goto error_out; + } if (uopt.session == 0xFFFFFFFF) sbi->s_session = udf_get_last_session(sb); @@ -1789,7 +1922,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) sbi->s_last_block = uopt.lastblock; sbi->s_anchor[0] = sbi->s_anchor[1] = 0; sbi->s_anchor[2] = uopt.anchor; - sbi->s_anchor[3] = 256; if (udf_check_valid(sb, uopt.novrs, silent)) { /* read volume recognition sequences */ @@ -1806,7 +1938,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) sb->s_magic = UDF_SUPER_MAGIC; sb->s_time_gran = 1000; - if (udf_load_partition(sb, &fileset)) { + if (udf_load_sequence(sb, &fileset)) { printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); goto error_out; } @@ -1856,12 +1988,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) } if (!silent) { - kernel_timestamp ts; - udf_time_to_stamp(&ts, sbi->s_record_time); + timestamp ts; + udf_time_to_disk_stamp(&ts, sbi->s_record_time); udf_info("UDF: Mounting volume '%s', " "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", - sbi->s_volume_ident, ts.year, ts.month, ts.day, - ts.hour, ts.minute, ts.typeAndTimezone); + sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day, + ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); } if (!(sb->s_flags & MS_RDONLY)) udf_open_lvid(sb); @@ -1890,21 +2022,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) error_out: if (sbi->s_vat_inode) iput(sbi->s_vat_inode); - if (sbi->s_partitions) { - struct udf_part_map *map = &sbi->s_partmaps[sbi->s_partition]; - if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) - iput(map->s_uspace.s_table); - if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - iput(map->s_fspace.s_table); - if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) - udf_sb_free_bitmap(map->s_uspace.s_bitmap); - if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - udf_sb_free_bitmap(map->s_fspace.s_bitmap); - if (map->s_partition_type == UDF_SPARABLE_MAP15) - for (i = 0; i < 4; i++) - brelse(map->s_type_specific.s_sparing. - s_spar_map[i]); - } + if (sbi->s_partitions) + for (i = 0; i < sbi->s_partitions; i++) + udf_free_partition(&sbi->s_partmaps[i]); #ifdef CONFIG_UDF_NLS if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) unload_nls(sbi->s_nls_map); @@ -1920,8 +2040,8 @@ error_out: return -EINVAL; } -void udf_error(struct super_block *sb, const char *function, - const char *fmt, ...) +static void udf_error(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; @@ -1948,19 +2068,6 @@ void udf_warning(struct super_block *sb, const char *function, sb->s_id, function, error_buf); } -/* - * udf_put_super - * - * PURPOSE - * Prepare for destruction of the superblock. - * - * DESCRIPTION - * Called before the filesystem is unmounted. - * - * HISTORY - * July 1, 1997 - Andrew E. Mileski - * Written, tested, and released. - */ static void udf_put_super(struct super_block *sb) { int i; @@ -1969,21 +2076,9 @@ static void udf_put_super(struct super_block *sb) sbi = UDF_SB(sb); if (sbi->s_vat_inode) iput(sbi->s_vat_inode); - if (sbi->s_partitions) { - struct udf_part_map *map = &sbi->s_partmaps[sbi->s_partition]; - if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) - iput(map->s_uspace.s_table); - if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - iput(map->s_fspace.s_table); - if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) - udf_sb_free_bitmap(map->s_uspace.s_bitmap); - if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - udf_sb_free_bitmap(map->s_fspace.s_bitmap); - if (map->s_partition_type == UDF_SPARABLE_MAP15) - for (i = 0; i < 4; i++) - brelse(map->s_type_specific.s_sparing. - s_spar_map[i]); - } + if (sbi->s_partitions) + for (i = 0; i < sbi->s_partitions; i++) + udf_free_partition(&sbi->s_partmaps[i]); #ifdef CONFIG_UDF_NLS if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) unload_nls(sbi->s_nls_map); @@ -1996,19 +2091,6 @@ static void udf_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -/* - * udf_stat_fs - * - * PURPOSE - * Return info about the filesystem. - * - * DESCRIPTION - * Called by sys_statfs() - * - * HISTORY - * July 1, 1997 - Andrew E. Mileski - * Written, tested, and released. - */ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -2035,10 +2117,6 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static unsigned char udf_bitmap_lookup[16] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 -}; - static unsigned int udf_count_free_bitmap(struct super_block *sb, struct udf_bitmap *bitmap) { @@ -2048,7 +2126,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, int block = 0, newblock; kernel_lb_addr loc; uint32_t bytes; - uint8_t value; uint8_t *ptr; uint16_t ident; struct spaceBitmapDesc *bm; @@ -2074,13 +2151,10 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb, ptr = (uint8_t *)bh->b_data; while (bytes > 0) { - while ((bytes > 0) && (index < sb->s_blocksize)) { - value = ptr[index]; - accum += udf_bitmap_lookup[value & 0x0f]; - accum += udf_bitmap_lookup[value >> 4]; - index++; - bytes--; - } + u32 cur_bytes = min_t(u32, bytes, sb->s_blocksize - index); + accum += bitmap_weight((const unsigned long *)(ptr + index), + cur_bytes * 8); + bytes -= cur_bytes; if (bytes) { brelse(bh); newblock = udf_get_lb_pblock(sb, loc, ++block); diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 6ec9922..c3265e1 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -23,7 +23,6 @@ #include <asm/uaccess.h> #include <linux/errno.h> #include <linux/fs.h> -#include <linux/udf_fs.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/stat.h> diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index fe61be1..65e19b4 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -22,7 +22,6 @@ #include "udfdecl.h" #include <linux/fs.h> #include <linux/mm.h> -#include <linux/udf_fs.h> #include <linux/buffer_head.h> #include "udf_i.h" @@ -180,6 +179,24 @@ void udf_discard_prealloc(struct inode *inode) brelse(epos.bh); } +static void udf_update_alloc_ext_desc(struct inode *inode, + struct extent_position *epos, + u32 lenalloc) +{ + struct super_block *sb = inode->i_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + + struct allocExtDesc *aed = (struct allocExtDesc *) (epos->bh->b_data); + int len = sizeof(struct allocExtDesc); + + aed->lengthAllocDescs = cpu_to_le32(lenalloc); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || sbi->s_udfrev >= 0x0201) + len += lenalloc; + + udf_update_tag(epos->bh->b_data, len); + mark_buffer_dirty_inode(epos->bh, inode); +} + void udf_truncate_extents(struct inode *inode) { struct extent_position epos; @@ -187,7 +204,6 @@ void udf_truncate_extents(struct inode *inode) uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; int8_t etype; struct super_block *sb = inode->i_sb; - struct udf_sb_info *sbi = UDF_SB(sb); sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset; loff_t byte_offset; int adsize; @@ -224,35 +240,15 @@ void udf_truncate_extents(struct inode *inode) if (indirect_ext_len) { /* We managed to free all extents in the * indirect extent - free it too */ - if (!epos.bh) - BUG(); + BUG_ON(!epos.bh); udf_free_blocks(sb, inode, epos.block, 0, indirect_ext_len); - } else { - if (!epos.bh) { - iinfo->i_lenAlloc = - lenalloc; - mark_inode_dirty(inode); - } else { - struct allocExtDesc *aed = - (struct allocExtDesc *) - (epos.bh->b_data); - int len = - sizeof(struct allocExtDesc); - - aed->lengthAllocDescs = - cpu_to_le32(lenalloc); - if (!UDF_QUERY_FLAG(sb, - UDF_FLAG_STRICT) || - sbi->s_udfrev >= 0x0201) - len += lenalloc; - - udf_update_tag(epos.bh->b_data, - len); - mark_buffer_dirty_inode( - epos.bh, inode); - } - } + } else if (!epos.bh) { + iinfo->i_lenAlloc = lenalloc; + mark_inode_dirty(inode); + } else + udf_update_alloc_ext_desc(inode, + &epos, lenalloc); brelse(epos.bh); epos.offset = sizeof(struct allocExtDesc); epos.block = eloc; @@ -272,29 +268,14 @@ void udf_truncate_extents(struct inode *inode) } if (indirect_ext_len) { - if (!epos.bh) - BUG(); + BUG_ON(!epos.bh); udf_free_blocks(sb, inode, epos.block, 0, indirect_ext_len); - } else { - if (!epos.bh) { - iinfo->i_lenAlloc = lenalloc; - mark_inode_dirty(inode); - } else { - struct allocExtDesc *aed = - (struct allocExtDesc *)(epos.bh->b_data); - aed->lengthAllocDescs = cpu_to_le32(lenalloc); - if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || - sbi->s_udfrev >= 0x0201) - udf_update_tag(epos.bh->b_data, - lenalloc + - sizeof(struct allocExtDesc)); - else - udf_update_tag(epos.bh->b_data, - sizeof(struct allocExtDesc)); - mark_buffer_dirty_inode(epos.bh, inode); - } - } + } else if (!epos.bh) { + iinfo->i_lenAlloc = lenalloc; + mark_inode_dirty(inode); + } else + udf_update_alloc_ext_desc(inode, &epos, lenalloc); } else if (inode->i_size) { if (byte_offset) { kernel_long_ad extent; diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index ccc52f1..4f86b1d 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -1,10 +1,32 @@ -#ifndef __LINUX_UDF_I_H -#define __LINUX_UDF_I_H +#ifndef _UDF_I_H +#define _UDF_I_H + +struct udf_inode_info { + struct timespec i_crtime; + /* Physical address of inode */ + kernel_lb_addr i_location; + __u64 i_unique; + __u32 i_lenEAttr; + __u32 i_lenAlloc; + __u64 i_lenExtents; + __u32 i_next_alloc_block; + __u32 i_next_alloc_goal; + unsigned i_alloc_type : 3; + unsigned i_efe : 1; /* extendedFileEntry */ + unsigned i_use : 1; /* unallocSpaceEntry */ + unsigned i_strat4096 : 1; + unsigned reserved : 26; + union { + short_ad *i_sad; + long_ad *i_lad; + __u8 *i_data; + } i_ext; + struct inode vfs_inode; +}; -#include <linux/udf_fs_i.h> static inline struct udf_inode_info *UDF_I(struct inode *inode) { return list_entry(inode, struct udf_inode_info, vfs_inode); } -#endif /* !defined(_LINUX_UDF_I_H) */ +#endif /* _UDF_I_H) */ diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 737d1c6..1c1c514 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -1,10 +1,12 @@ #ifndef __LINUX_UDF_SB_H #define __LINUX_UDF_SB_H +#include <linux/mutex.h> + /* Since UDF 2.01 is ISO 13346 based... */ #define UDF_SUPER_MAGIC 0x15013346 -#define UDF_MAX_READ_VERSION 0x0201 +#define UDF_MAX_READ_VERSION 0x0250 #define UDF_MAX_WRITE_VERSION 0x0201 #define UDF_FLAG_USE_EXTENDED_FE 0 @@ -38,6 +40,111 @@ #define UDF_PART_FLAG_REWRITABLE 0x0040 #define UDF_PART_FLAG_OVERWRITABLE 0x0080 +#define UDF_MAX_BLOCK_LOADED 8 + +#define UDF_TYPE1_MAP15 0x1511U +#define UDF_VIRTUAL_MAP15 0x1512U +#define UDF_VIRTUAL_MAP20 0x2012U +#define UDF_SPARABLE_MAP15 0x1522U +#define UDF_METADATA_MAP25 0x2511U + +#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ + +struct udf_meta_data { + __u32 s_meta_file_loc; + __u32 s_mirror_file_loc; + __u32 s_bitmap_file_loc; + __u32 s_alloc_unit_size; + __u16 s_align_unit_size; + __u8 s_dup_md_flag; + struct inode *s_metadata_fe; + struct inode *s_mirror_fe; + struct inode *s_bitmap_fe; +}; + +struct udf_sparing_data { + __u16 s_packet_len; + struct buffer_head *s_spar_map[4]; +}; + +struct udf_virtual_data { + __u32 s_num_entries; + __u16 s_start_offset; +}; + +struct udf_bitmap { + __u32 s_extLength; + __u32 s_extPosition; + __u16 s_nr_groups; + struct buffer_head **s_block_bitmap; +}; + +struct udf_part_map { + union { + struct udf_bitmap *s_bitmap; + struct inode *s_table; + } s_uspace; + union { + struct udf_bitmap *s_bitmap; + struct inode *s_table; + } s_fspace; + __u32 s_partition_root; + __u32 s_partition_len; + __u16 s_partition_type; + __u16 s_partition_num; + union { + struct udf_sparing_data s_sparing; + struct udf_virtual_data s_virtual; + struct udf_meta_data s_metadata; + } s_type_specific; + __u32 (*s_partition_func)(struct super_block *, __u32, __u16, __u32); + __u16 s_volumeseqnum; + __u16 s_partition_flags; +}; + +#pragma pack() + +struct udf_sb_info { + struct udf_part_map *s_partmaps; + __u8 s_volume_ident[32]; + + /* Overall info */ + __u16 s_partitions; + __u16 s_partition; + + /* Sector headers */ + __s32 s_session; + __u32 s_anchor[3]; + __u32 s_last_block; + + struct buffer_head *s_lvid_bh; + + /* Default permissions */ + mode_t s_umask; + gid_t s_gid; + uid_t s_uid; + + /* Root Info */ + struct timespec s_record_time; + + /* Fileset Info */ + __u16 s_serial_number; + + /* highest UDF revision we have recorded to this media */ + __u16 s_udfrev; + + /* Miscellaneous flags */ + __u32 s_flags; + + /* Encoding info */ + struct nls_table *s_nls_map; + + /* VAT inode */ + struct inode *s_vat_inode; + + struct mutex s_alloc_mutex; +}; + static inline struct udf_sb_info *UDF_SB(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 681dc2b..f3f45d0 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -1,17 +1,37 @@ #ifndef __UDF_DECL_H #define __UDF_DECL_H -#include <linux/udf_fs.h> #include "ecma_167.h" #include "osta_udf.h" #include <linux/fs.h> #include <linux/types.h> -#include <linux/udf_fs_i.h> -#include <linux/udf_fs_sb.h> #include <linux/buffer_head.h> +#include <linux/udf_fs_i.h> +#include "udf_sb.h" #include "udfend.h" +#include "udf_i.h" + +#define UDF_PREALLOCATE +#define UDF_DEFAULT_PREALLOC_BLOCKS 8 + +#define UDFFS_DEBUG + +#ifdef UDFFS_DEBUG +#define udf_debug(f, a...) \ +do { \ + printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \ + __FILE__, __LINE__, __func__); \ + printk(f, ##a); \ +} while (0) +#else +#define udf_debug(f, a...) /**/ +#endif + +#define udf_info(f, a...) \ + printk(KERN_INFO "UDF-fs INFO " f, ##a); + #define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) #define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) @@ -23,16 +43,24 @@ #define UDF_NAME_LEN 256 #define UDF_PATH_LEN 1023 -#define udf_file_entry_alloc_offset(inode)\ - (UDF_I(inode)->i_use ?\ - sizeof(struct unallocSpaceEntry) :\ - ((UDF_I(inode)->i_efe ?\ - sizeof(struct extendedFileEntry) :\ - sizeof(struct fileEntry)) + UDF_I(inode)->i_lenEAttr)) - -#define udf_ext0_offset(inode)\ - (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB ?\ - udf_file_entry_alloc_offset(inode) : 0) +static inline size_t udf_file_entry_alloc_offset(struct inode *inode) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + if (iinfo->i_use) + return sizeof(struct unallocSpaceEntry); + else if (iinfo->i_efe) + return sizeof(struct extendedFileEntry) + iinfo->i_lenEAttr; + else + return sizeof(struct fileEntry) + iinfo->i_lenEAttr; +} + +static inline size_t udf_ext0_offset(struct inode *inode) +{ + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + return udf_file_entry_alloc_offset(inode); + else + return 0; +} #define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset)) @@ -83,7 +111,6 @@ struct extent_position { }; /* super.c */ -extern void udf_error(struct super_block *, const char *, const char *, ...); extern void udf_warning(struct super_block *, const char *, const char *, ...); /* namei.c */ @@ -150,6 +177,8 @@ extern uint32_t udf_get_pblock_virt20(struct super_block *, uint32_t, uint16_t, uint32_t); extern uint32_t udf_get_pblock_spar15(struct super_block *, uint32_t, uint16_t, uint32_t); +extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t, + uint32_t); extern int udf_relocate_blocks(struct super_block *, long, long *); /* unicode.c */ @@ -157,7 +186,7 @@ extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int); extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, int); extern int udf_build_ustr(struct ustr *, dstring *, int); -extern int udf_CS0toUTF8(struct ustr *, struct ustr *); +extern int udf_CS0toUTF8(struct ustr *, const struct ustr *); /* ialloc.c */ extern void udf_free_inode(struct inode *); @@ -191,11 +220,9 @@ extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); -/* crc.c */ -extern uint16_t udf_crc(uint8_t *, uint32_t, uint16_t); - /* udftime.c */ -extern time_t *udf_stamp_to_time(time_t *, long *, kernel_timestamp); -extern kernel_timestamp *udf_time_to_stamp(kernel_timestamp *, struct timespec); +extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest, + timestamp src); +extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src); #endif /* __UDF_DECL_H */ diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h index c4bd120..489f52f 100644 --- a/fs/udf/udfend.h +++ b/fs/udf/udfend.h @@ -24,17 +24,6 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in) return out; } -static inline kernel_timestamp lets_to_cpu(timestamp in) -{ - kernel_timestamp out; - - memcpy(&out, &in, sizeof(timestamp)); - out.typeAndTimezone = le16_to_cpu(in.typeAndTimezone); - out.year = le16_to_cpu(in.year); - - return out; -} - static inline short_ad lesa_to_cpu(short_ad in) { short_ad out; @@ -85,15 +74,4 @@ static inline kernel_extent_ad leea_to_cpu(extent_ad in) return out; } -static inline timestamp cpu_to_lets(kernel_timestamp in) -{ - timestamp out; - - memcpy(&out, &in, sizeof(timestamp)); - out.typeAndTimezone = cpu_to_le16(in.typeAndTimezone); - out.year = cpu_to_le16(in.year); - - return out; -} - #endif /* __UDF_ENDIAN_H */ diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index ce59573..5f811655 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -85,39 +85,38 @@ extern struct timezone sys_tz; #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) -time_t *udf_stamp_to_time(time_t *dest, long *dest_usec, kernel_timestamp src) +struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src) { int yday; - uint8_t type = src.typeAndTimezone >> 12; + u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone); + u16 year = le16_to_cpu(src.year); + uint8_t type = typeAndTimezone >> 12; int16_t offset; if (type == 1) { - offset = src.typeAndTimezone << 4; + offset = typeAndTimezone << 4; /* sign extent offset */ offset = (offset >> 4); if (offset == -2047) /* unspecified offset */ offset = 0; - } else { + } else offset = 0; - } - if ((src.year < EPOCH_YEAR) || - (src.year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) { - *dest = -1; - *dest_usec = -1; + if ((year < EPOCH_YEAR) || + (year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) { return NULL; } - *dest = year_seconds[src.year - EPOCH_YEAR]; - *dest -= offset * 60; + dest->tv_sec = year_seconds[year - EPOCH_YEAR]; + dest->tv_sec -= offset * 60; - yday = ((__mon_yday[__isleap(src.year)][src.month - 1]) + src.day - 1); - *dest += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second; - *dest_usec = src.centiseconds * 10000 + - src.hundredsOfMicroseconds * 100 + src.microseconds; + yday = ((__mon_yday[__isleap(year)][src.month - 1]) + src.day - 1); + dest->tv_sec += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second; + dest->tv_nsec = 1000 * (src.centiseconds * 10000 + + src.hundredsOfMicroseconds * 100 + src.microseconds); return dest; } -kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec ts) +timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts) { long int days, rem, y; const unsigned short int *ip; @@ -128,7 +127,7 @@ kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec ts) if (!dest) return NULL; - dest->typeAndTimezone = 0x1000 | (offset & 0x0FFF); + dest->typeAndTimezone = cpu_to_le16(0x1000 | (offset & 0x0FFF)); ts.tv_sec += offset * 60; days = ts.tv_sec / SECS_PER_DAY; @@ -151,7 +150,7 @@ kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec ts) - LEAPS_THRU_END_OF(y - 1)); y = yg; } - dest->year = y; + dest->year = cpu_to_le16(y); ip = __mon_yday[__isleap(y)]; for (y = 11; days < (long int)ip[y]; --y) continue; diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index e533b11..9fdf8c9 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -23,7 +23,7 @@ #include <linux/kernel.h> #include <linux/string.h> /* for memset */ #include <linux/nls.h> -#include <linux/udf_fs.h> +#include <linux/crc-itu-t.h> #include "udf_sb.h" @@ -49,14 +49,16 @@ int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) { int usesize; - if ((!dest) || (!ptr) || (!size)) + if (!dest || !ptr || !size) return -1; + BUG_ON(size < 2); - memset(dest, 0, sizeof(struct ustr)); - usesize = (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size; + usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name)); + usesize = min(usesize, size - 2); dest->u_cmpID = ptr[0]; - dest->u_len = ptr[size - 1]; - memcpy(dest->u_name, ptr + 1, usesize - 1); + dest->u_len = usesize; + memcpy(dest->u_name, ptr + 1, usesize); + memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize); return 0; } @@ -83,9 +85,6 @@ static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) * PURPOSE * Convert OSTA Compressed Unicode to the UTF-8 equivalent. * - * DESCRIPTION - * This routine is only called by udf_filldir(). - * * PRE-CONDITIONS * utf Pointer to UTF-8 output buffer. * ocu Pointer to OSTA Compressed Unicode input buffer @@ -99,43 +98,39 @@ static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) * November 12, 1997 - Andrew E. Mileski * Written, tested, and released. */ -int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i) +int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) { - uint8_t *ocu; - uint32_t c; + const uint8_t *ocu; uint8_t cmp_id, ocu_len; int i; - ocu = ocu_i->u_name; - ocu_len = ocu_i->u_len; - cmp_id = ocu_i->u_cmpID; - utf_o->u_len = 0; - if (ocu_len == 0) { memset(utf_o, 0, sizeof(struct ustr)); - utf_o->u_cmpID = 0; - utf_o->u_len = 0; return 0; } - if ((cmp_id != 8) && (cmp_id != 16)) { + cmp_id = ocu_i->u_cmpID; + if (cmp_id != 8 && cmp_id != 16) { + memset(utf_o, 0, sizeof(struct ustr)); printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); return 0; } + ocu = ocu_i->u_name; + utf_o->u_len = 0; for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { /* Expand OSTA compressed Unicode to Unicode */ - c = ocu[i++]; + uint32_t c = ocu[i++]; if (cmp_id == 16) c = (c << 8) | ocu[i++]; /* Compress Unicode to UTF-8 */ - if (c < 0x80U) { + if (c < 0x80U) utf_o->u_name[utf_o->u_len++] = (uint8_t)c; - } else if (c < 0x800U) { + else if (c < 0x800U) { utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6)); utf_o->u_name[utf_o->u_len++] = @@ -255,35 +250,32 @@ error_out: } static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, - struct ustr *ocu_i) + const struct ustr *ocu_i) { - uint8_t *ocu; - uint32_t c; + const uint8_t *ocu; uint8_t cmp_id, ocu_len; int i; - ocu = ocu_i->u_name; ocu_len = ocu_i->u_len; - cmp_id = ocu_i->u_cmpID; - utf_o->u_len = 0; - if (ocu_len == 0) { memset(utf_o, 0, sizeof(struct ustr)); - utf_o->u_cmpID = 0; - utf_o->u_len = 0; return 0; } - if ((cmp_id != 8) && (cmp_id != 16)) { + cmp_id = ocu_i->u_cmpID; + if (cmp_id != 8 && cmp_id != 16) { + memset(utf_o, 0, sizeof(struct ustr)); printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); return 0; } + ocu = ocu_i->u_name; + utf_o->u_len = 0; for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { /* Expand OSTA compressed Unicode to Unicode */ - c = ocu[i++]; + uint32_t c = ocu[i++]; if (cmp_id == 16) c = (c << 8) | ocu[i++]; @@ -463,7 +455,7 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, } else if (newIndex > 250) newIndex = 250; newName[newIndex++] = CRC_MARK; - valueCRC = udf_crc(fidName, fidNameLen, 0); + valueCRC = crc_itu_t(0, fidName, fidNameLen); newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12]; newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8]; newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4]; diff --git a/fs/utimes.c b/fs/utimes.c index b18da9c..a2bef77 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -2,6 +2,7 @@ #include <linux/file.h> #include <linux/fs.h> #include <linux/linkage.h> +#include <linux/mount.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/stat.h> @@ -59,6 +60,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags struct inode *inode; struct iattr newattrs; struct file *f = NULL; + struct vfsmount *mnt; error = -EINVAL; if (times && (!nsec_valid(times[0].tv_nsec) || @@ -79,18 +81,20 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags if (!f) goto out; dentry = f->f_path.dentry; + mnt = f->f_path.mnt; } else { error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd); if (error) goto out; dentry = nd.path.dentry; + mnt = nd.path.mnt; } inode = dentry->d_inode; - error = -EROFS; - if (IS_RDONLY(inode)) + error = mnt_want_write(mnt); + if (error) goto dput_and_out; /* Don't worry, the checks are done in inode_change_ok() */ @@ -98,7 +102,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags if (times) { error = -EPERM; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - goto dput_and_out; + goto mnt_drop_write_and_out; if (times[0].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_ATIME; @@ -118,22 +122,24 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags } else { error = -EACCES; if (IS_IMMUTABLE(inode)) - goto dput_and_out; + goto mnt_drop_write_and_out; if (!is_owner_or_cap(inode)) { if (f) { if (!(f->f_mode & FMODE_WRITE)) - goto dput_and_out; + goto mnt_drop_write_and_out; } else { error = vfs_permission(&nd, MAY_WRITE); if (error) - goto dput_and_out; + goto mnt_drop_write_and_out; } } } mutex_lock(&inode->i_mutex); error = notify_change(dentry, &newattrs); mutex_unlock(&inode->i_mutex); +mnt_drop_write_and_out: + mnt_drop_write(mnt); dput_and_out: if (f) fput(f); @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/file.h> #include <linux/xattr.h> +#include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -32,8 +33,6 @@ xattr_permission(struct inode *inode, const char *name, int mask) * filesystem or on an immutable / append-only inode. */ if (mask & MAY_WRITE) { - if (IS_RDONLY(inode)) - return -EROFS; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; } @@ -262,7 +261,11 @@ sys_setxattr(char __user *path, char __user *name, void __user *value, error = user_path_walk(path, &nd); if (error) return error; - error = setxattr(nd.path.dentry, name, value, size, flags); + error = mnt_want_write(nd.path.mnt); + if (!error) { + error = setxattr(nd.path.dentry, name, value, size, flags); + mnt_drop_write(nd.path.mnt); + } path_put(&nd.path); return error; } @@ -277,7 +280,11 @@ sys_lsetxattr(char __user *path, char __user *name, void __user *value, error = user_path_walk_link(path, &nd); if (error) return error; - error = setxattr(nd.path.dentry, name, value, size, flags); + error = mnt_want_write(nd.path.mnt); + if (!error) { + error = setxattr(nd.path.dentry, name, value, size, flags); + mnt_drop_write(nd.path.mnt); + } path_put(&nd.path); return error; } @@ -295,7 +302,11 @@ sys_fsetxattr(int fd, char __user *name, void __user *value, return error; dentry = f->f_path.dentry; audit_inode(NULL, dentry); - error = setxattr(dentry, name, value, size, flags); + error = mnt_want_write(f->f_path.mnt); + if (!error) { + error = setxattr(dentry, name, value, size, flags); + mnt_drop_write(f->f_path.mnt); + } fput(f); return error; } @@ -482,7 +493,11 @@ sys_removexattr(char __user *path, char __user *name) error = user_path_walk(path, &nd); if (error) return error; - error = removexattr(nd.path.dentry, name); + error = mnt_want_write(nd.path.mnt); + if (!error) { + error = removexattr(nd.path.dentry, name); + mnt_drop_write(nd.path.mnt); + } path_put(&nd.path); return error; } @@ -496,7 +511,11 @@ sys_lremovexattr(char __user *path, char __user *name) error = user_path_walk_link(path, &nd); if (error) return error; - error = removexattr(nd.path.dentry, name); + error = mnt_want_write(nd.path.mnt); + if (!error) { + error = removexattr(nd.path.dentry, name); + mnt_drop_write(nd.path.mnt); + } path_put(&nd.path); return error; } @@ -513,7 +532,11 @@ sys_fremovexattr(int fd, char __user *name) return error; dentry = f->f_path.dentry; audit_inode(NULL, dentry); - error = removexattr(dentry, name); + error = mnt_want_write(f->f_path.mnt); + if (!error) { + error = removexattr(dentry, name); + mnt_drop_write(f->f_path.mnt); + } fput(f); return error; } diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 35115bc..524021f 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -35,18 +35,6 @@ config XFS_QUOTA with or without the generic quota support enabled (CONFIG_QUOTA) - they are completely independent subsystems. -config XFS_SECURITY - bool "XFS Security Label support" - depends on XFS_FS - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute namespace for inode security - labels in the XFS filesystem. - - If you are not using a security module that requires using - extended attributes for inode security labels, say N. - config XFS_POSIX_ACL bool "XFS POSIX ACL support" depends on XFS_FS diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index e040f1c..9b1bb17 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -37,7 +37,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags) #ifdef DEBUG if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) { printk(KERN_WARNING "Large %s attempt, size=%ld\n", - __FUNCTION__, (long)size); + __func__, (long)size); dump_stack(); } #endif @@ -52,7 +52,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags) if (!(++retries % 100)) printk(KERN_ERR "XFS: possible memory allocation " "deadlock in %s (mode:0x%x)\n", - __FUNCTION__, lflags); + __func__, lflags); congestion_wait(WRITE, HZ/50); } while (1); } @@ -129,7 +129,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) if (!(++retries % 100)) printk(KERN_ERR "XFS: possible memory allocation " "deadlock in %s (mode:0x%x)\n", - __FUNCTION__, lflags); + __func__, lflags); congestion_wait(WRITE, HZ/50); } while (1); } diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h index 2009e6d..3abe7e9 100644 --- a/fs/xfs/linux-2.6/sema.h +++ b/fs/xfs/linux-2.6/sema.h @@ -20,8 +20,8 @@ #include <linux/time.h> #include <linux/wait.h> +#include <linux/semaphore.h> #include <asm/atomic.h> -#include <asm/semaphore.h> /* * sema_t structure just maps to struct semaphore in Linux kernel. diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index e051952..a55c3b2 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -243,8 +243,12 @@ xfs_end_bio_unwritten( size_t size = ioend->io_size; if (likely(!ioend->io_error)) { - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) - xfs_iomap_write_unwritten(ip, offset, size); + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + int error; + error = xfs_iomap_write_unwritten(ip, offset, size); + if (error) + ioend->io_error = error; + } xfs_setfilesize(ioend); } xfs_destroy_ioend(ioend); @@ -1532,9 +1536,9 @@ xfs_vm_bmap( struct xfs_inode *ip = XFS_I(inode); xfs_itrace_entry(XFS_I(inode)); - xfs_rwlock(ip, VRWLOCK_READ); + xfs_ilock(ip, XFS_IOLOCK_SHARED); xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); - xfs_rwunlock(ip, VRWLOCK_READ); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index e347bfd..52f6846 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -400,7 +400,7 @@ _xfs_buf_lookup_pages( printk(KERN_ERR "XFS: possible memory allocation " "deadlock in %s (mode:0x%x)\n", - __FUNCTION__, gfp_mask); + __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); xfsbufd_wakeup(0, gfp_mask); @@ -598,7 +598,7 @@ xfs_buf_get_flags( error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { printk(KERN_WARNING "%s: failed to map pages\n", - __FUNCTION__); + __func__); goto no_buffer; } } @@ -778,7 +778,7 @@ xfs_buf_get_noaddr( error = _xfs_buf_map_pages(bp, XBF_MAPPED); if (unlikely(error)) { printk(KERN_WARNING "%s: failed to map pages\n", - __FUNCTION__); + __func__); goto fail_free_mem; } @@ -1060,7 +1060,7 @@ xfs_buf_iostart( bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC); bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC); xfs_buf_delwri_queue(bp, 1); - return status; + return 0; } bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index a3d207d..841d788 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -387,11 +387,15 @@ static inline int XFS_bwrite(xfs_buf_t *bp) return error; } -static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp) +/* + * No error can be returned from xfs_buf_iostart for delwri + * buffers as they are queued and no I/O is issued. + */ +static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp) { bp->b_strat = xfs_bdstrat_cb; bp->b_fspriv3 = mp; - return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC); + (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC); } #define XFS_bdstrat(bp) xfs_buf_iorequest(bp) diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h index e7f3da6..652721c 100644 --- a/fs/xfs/linux-2.6/xfs_cred.h +++ b/fs/xfs/linux-2.6/xfs_cred.h @@ -30,7 +30,7 @@ typedef struct cred { extern struct cred *sys_cred; /* this is a hack.. (assumes sys_cred is the only cred_t in the system) */ -static __inline int capable_cred(cred_t *cr, int cid) +static inline int capable_cred(cred_t *cr, int cid) { return (cr == sys_cred) ? 1 : capable(cid); } diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index ca4f66c..265f0168 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -22,6 +22,7 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" +#include "xfs_dir2.h" #include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_export.h" @@ -30,8 +31,6 @@ #include "xfs_inode.h" #include "xfs_vfsops.h" -static struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, }; - /* * Note that we only accept fileids which are long enough rather than allow * the parent generation number to default to zero. XFS considers zero a @@ -66,7 +65,7 @@ xfs_fs_encode_fh( int len; /* Directories don't need their parent encoded, they have ".." */ - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || !connectable) fileid_type = FILEID_INO32_GEN; else fileid_type = FILEID_INO32_GEN_PARENT; @@ -213,17 +212,16 @@ xfs_fs_get_parent( struct dentry *child) { int error; - bhv_vnode_t *cvp; + struct xfs_inode *cip; struct dentry *parent; - cvp = NULL; - error = xfs_lookup(XFS_I(child->d_inode), &dotdot, &cvp); + error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip); if (unlikely(error)) return ERR_PTR(-error); - parent = d_alloc_anon(vn_to_inode(cvp)); + parent = d_alloc_anon(cip->i_vnode); if (unlikely(!parent)) { - VN_RELE(cvp); + iput(cip->i_vnode); return ERR_PTR(-ENOMEM); } return parent; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index edab1ff..0590524 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -469,16 +469,11 @@ xfs_file_open_exec( struct inode *inode) { struct xfs_mount *mp = XFS_M(inode->i_sb); + struct xfs_inode *ip = XFS_I(inode); - if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI)) { - if (DM_EVENT_ENABLED(XFS_I(inode), DM_EVENT_READ)) { - bhv_vnode_t *vp = vn_from_inode(inode); - - return -XFS_SEND_DATA(mp, DM_EVENT_READ, - vp, 0, 0, 0, NULL); - } - } - + if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI) && + DM_EVENT_ENABLED(ip, DM_EVENT_READ)) + return -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); return 0; } #endif /* HAVE_FOP_OPEN_EXEC */ diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index ac6d34c..1eefe61 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -17,18 +17,7 @@ */ #include "xfs.h" #include "xfs_vnodeops.h" - -/* - * The following six includes are needed so that we can include - * xfs_inode.h. What a mess.. - */ #include "xfs_bmap_btree.h" -#include "xfs_inum.h" -#include "xfs_dir2.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" - #include "xfs_inode.h" int fs_noerr(void) { return 0; } @@ -42,11 +31,10 @@ xfs_tosspages( xfs_off_t last, int fiopt) { - bhv_vnode_t *vp = XFS_ITOV(ip); - struct inode *inode = vn_to_inode(vp); + struct address_space *mapping = ip->i_vnode->i_mapping; - if (VN_CACHED(vp)) - truncate_inode_pages(inode->i_mapping, first); + if (mapping->nrpages) + truncate_inode_pages(mapping, first); } int @@ -56,15 +44,14 @@ xfs_flushinval_pages( xfs_off_t last, int fiopt) { - bhv_vnode_t *vp = XFS_ITOV(ip); - struct inode *inode = vn_to_inode(vp); + struct address_space *mapping = ip->i_vnode->i_mapping; int ret = 0; - if (VN_CACHED(vp)) { + if (mapping->nrpages) { xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait(mapping); if (!ret) - truncate_inode_pages(inode->i_mapping, first); + truncate_inode_pages(mapping, first); } return ret; } @@ -77,17 +64,16 @@ xfs_flush_pages( uint64_t flags, int fiopt) { - bhv_vnode_t *vp = XFS_ITOV(ip); - struct inode *inode = vn_to_inode(vp); + struct address_space *mapping = ip->i_vnode->i_mapping; int ret = 0; int ret2; - if (VN_DIRTY(vp)) { + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = filemap_fdatawrite(inode->i_mapping); + ret = filemap_fdatawrite(mapping); if (flags & XFS_B_ASYNC) return ret; - ret2 = filemap_fdatawait(inode->i_mapping); + ret2 = filemap_fdatawait(mapping); if (!ret) ret = ret2; } diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index f34bd01..4ddb86b 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -535,8 +535,6 @@ xfs_attrmulti_attr_set( char *kbuf; int error = EFAULT; - if (IS_RDONLY(inode)) - return -EROFS; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return EPERM; if (len > XATTR_SIZE_MAX) @@ -562,8 +560,6 @@ xfs_attrmulti_attr_remove( char *name, __uint32_t flags) { - if (IS_RDONLY(inode)) - return -EROFS; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return EPERM; return xfs_attr_remove(XFS_I(inode), name, flags); @@ -573,6 +569,7 @@ STATIC int xfs_attrmulti_by_handle( xfs_mount_t *mp, void __user *arg, + struct file *parfilp, struct inode *parinode) { int error; @@ -626,13 +623,21 @@ xfs_attrmulti_by_handle( &ops[i].am_length, ops[i].am_flags); break; case ATTR_OP_SET: + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; ops[i].am_error = xfs_attrmulti_attr_set(inode, attr_name, ops[i].am_attrvalue, ops[i].am_length, ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); break; case ATTR_OP_REMOVE: + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; ops[i].am_error = xfs_attrmulti_attr_remove(inode, attr_name, ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); break; default: ops[i].am_error = EINVAL; @@ -651,314 +656,6 @@ xfs_attrmulti_by_handle( return -error; } -/* prototypes for a few of the stack-hungry cases that have - * their own functions. Functions are defined after their use - * so gcc doesn't get fancy and inline them with -03 */ - -STATIC int -xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, - struct file *filp, - int flags, - unsigned int cmd, - void __user *arg); - -STATIC int -xfs_ioc_bulkstat( - xfs_mount_t *mp, - unsigned int cmd, - void __user *arg); - -STATIC int -xfs_ioc_fsgeometry_v1( - xfs_mount_t *mp, - void __user *arg); - -STATIC int -xfs_ioc_fsgeometry( - xfs_mount_t *mp, - void __user *arg); - -STATIC int -xfs_ioc_xattr( - xfs_inode_t *ip, - struct file *filp, - unsigned int cmd, - void __user *arg); - -STATIC int -xfs_ioc_fsgetxattr( - xfs_inode_t *ip, - int attr, - void __user *arg); - -STATIC int -xfs_ioc_getbmap( - struct xfs_inode *ip, - int flags, - unsigned int cmd, - void __user *arg); - -STATIC int -xfs_ioc_getbmapx( - struct xfs_inode *ip, - void __user *arg); - -int -xfs_ioctl( - xfs_inode_t *ip, - struct file *filp, - int ioflags, - unsigned int cmd, - void __user *arg) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - xfs_mount_t *mp = ip->i_mount; - int error; - - xfs_itrace_entry(XFS_I(inode)); - switch (cmd) { - - case XFS_IOC_ALLOCSP: - case XFS_IOC_FREESP: - case XFS_IOC_RESVSP: - case XFS_IOC_UNRESVSP: - case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP64: - /* - * Only allow the sys admin to reserve space unless - * unwritten extents are enabled. - */ - if (!xfs_sb_version_hasextflgbit(&mp->m_sb) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg); - - case XFS_IOC_DIOINFO: { - struct dioattr da; - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - da.d_mem = da.d_miniosz = 1 << target->bt_sshift; - da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); - - if (copy_to_user(arg, &da, sizeof(da))) - return -XFS_ERROR(EFAULT); - return 0; - } - - case XFS_IOC_FSBULKSTAT_SINGLE: - case XFS_IOC_FSBULKSTAT: - case XFS_IOC_FSINUMBERS: - return xfs_ioc_bulkstat(mp, cmd, arg); - - case XFS_IOC_FSGEOMETRY_V1: - return xfs_ioc_fsgeometry_v1(mp, arg); - - case XFS_IOC_FSGEOMETRY: - return xfs_ioc_fsgeometry(mp, arg); - - case XFS_IOC_GETVERSION: - return put_user(inode->i_generation, (int __user *)arg); - - case XFS_IOC_FSGETXATTR: - return xfs_ioc_fsgetxattr(ip, 0, arg); - case XFS_IOC_FSGETXATTRA: - return xfs_ioc_fsgetxattr(ip, 1, arg); - case XFS_IOC_GETXFLAGS: - case XFS_IOC_SETXFLAGS: - case XFS_IOC_FSSETXATTR: - return xfs_ioc_xattr(ip, filp, cmd, arg); - - case XFS_IOC_FSSETDM: { - struct fsdmidata dmi; - - if (copy_from_user(&dmi, arg, sizeof(dmi))) - return -XFS_ERROR(EFAULT); - - error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, - dmi.fsd_dmstate); - return -error; - } - - case XFS_IOC_GETBMAP: - case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(ip, ioflags, cmd, arg); - - case XFS_IOC_GETBMAPX: - return xfs_ioc_getbmapx(ip, arg); - - case XFS_IOC_FD_TO_HANDLE: - case XFS_IOC_PATH_TO_HANDLE: - case XFS_IOC_PATH_TO_FSHANDLE: - return xfs_find_handle(cmd, arg); - - case XFS_IOC_OPEN_BY_HANDLE: - return xfs_open_by_handle(mp, arg, filp, inode); - - case XFS_IOC_FSSETDM_BY_HANDLE: - return xfs_fssetdm_by_handle(mp, arg, inode); - - case XFS_IOC_READLINK_BY_HANDLE: - return xfs_readlink_by_handle(mp, arg, inode); - - case XFS_IOC_ATTRLIST_BY_HANDLE: - return xfs_attrlist_by_handle(mp, arg, inode); - - case XFS_IOC_ATTRMULTI_BY_HANDLE: - return xfs_attrmulti_by_handle(mp, arg, inode); - - case XFS_IOC_SWAPEXT: { - error = xfs_swapext((struct xfs_swapext __user *)arg); - return -error; - } - - case XFS_IOC_FSCOUNTS: { - xfs_fsop_counts_t out; - - error = xfs_fs_counts(mp, &out); - if (error) - return -error; - - if (copy_to_user(arg, &out, sizeof(out))) - return -XFS_ERROR(EFAULT); - return 0; - } - - case XFS_IOC_SET_RESBLKS: { - xfs_fsop_resblks_t inout; - __uint64_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&inout, arg, sizeof(inout))) - return -XFS_ERROR(EFAULT); - - /* input parameter is passed in resblks field of structure */ - in = inout.resblks; - error = xfs_reserve_blocks(mp, &in, &inout); - if (error) - return -error; - - if (copy_to_user(arg, &inout, sizeof(inout))) - return -XFS_ERROR(EFAULT); - return 0; - } - - case XFS_IOC_GET_RESBLKS: { - xfs_fsop_resblks_t out; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - error = xfs_reserve_blocks(mp, NULL, &out); - if (error) - return -error; - - if (copy_to_user(arg, &out, sizeof(out))) - return -XFS_ERROR(EFAULT); - - return 0; - } - - case XFS_IOC_FSGROWFSDATA: { - xfs_growfs_data_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); - - error = xfs_growfs_data(mp, &in); - return -error; - } - - case XFS_IOC_FSGROWFSLOG: { - xfs_growfs_log_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); - - error = xfs_growfs_log(mp, &in); - return -error; - } - - case XFS_IOC_FSGROWFSRT: { - xfs_growfs_rt_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); - - error = xfs_growfs_rt(mp, &in); - return -error; - } - - case XFS_IOC_FREEZE: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (inode->i_sb->s_frozen == SB_UNFROZEN) - freeze_bdev(inode->i_sb->s_bdev); - return 0; - - case XFS_IOC_THAW: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (inode->i_sb->s_frozen != SB_UNFROZEN) - thaw_bdev(inode->i_sb->s_bdev, inode->i_sb); - return 0; - - case XFS_IOC_GOINGDOWN: { - __uint32_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (get_user(in, (__uint32_t __user *)arg)) - return -XFS_ERROR(EFAULT); - - error = xfs_fs_goingdown(mp, in); - return -error; - } - - case XFS_IOC_ERROR_INJECTION: { - xfs_error_injection_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); - - error = xfs_errortag_add(in.errtag, mp); - return -error; - } - - case XFS_IOC_ERROR_CLEARALL: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - error = xfs_errortag_clearall(mp, 1); - return -error; - - default: - return -ENOTTY; - } -} - STATIC int xfs_ioc_space( struct xfs_inode *ip, @@ -1179,85 +876,85 @@ xfs_ioc_fsgetxattr( } STATIC int -xfs_ioc_xattr( +xfs_ioc_fssetxattr( xfs_inode_t *ip, struct file *filp, - unsigned int cmd, void __user *arg) { struct fsxattr fa; struct bhv_vattr *vattr; - int error = 0; + int error; int attr_flags; - unsigned int flags; + + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); if (unlikely(!vattr)) return -ENOMEM; - switch (cmd) { - case XFS_IOC_FSSETXATTR: { - if (copy_from_user(&fa, arg, sizeof(fa))) { - error = -EFAULT; - break; - } + attr_flags = 0; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + attr_flags |= ATTR_NONBLOCK; - attr_flags = 0; - if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= ATTR_NONBLOCK; + vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID; + vattr->va_xflags = fa.fsx_xflags; + vattr->va_extsize = fa.fsx_extsize; + vattr->va_projid = fa.fsx_projid; - vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID; - vattr->va_xflags = fa.fsx_xflags; - vattr->va_extsize = fa.fsx_extsize; - vattr->va_projid = fa.fsx_projid; + error = -xfs_setattr(ip, vattr, attr_flags, NULL); + if (!error) + vn_revalidate(XFS_ITOV(ip)); /* update flags */ + kfree(vattr); + return 0; +} - error = xfs_setattr(ip, vattr, attr_flags, NULL); - if (likely(!error)) - vn_revalidate(XFS_ITOV(ip)); /* update flags */ - error = -error; - break; - } +STATIC int +xfs_ioc_getxflags( + xfs_inode_t *ip, + void __user *arg) +{ + unsigned int flags; - case XFS_IOC_GETXFLAGS: { - flags = xfs_di2lxflags(ip->i_d.di_flags); - if (copy_to_user(arg, &flags, sizeof(flags))) - error = -EFAULT; - break; - } + flags = xfs_di2lxflags(ip->i_d.di_flags); + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} - case XFS_IOC_SETXFLAGS: { - if (copy_from_user(&flags, arg, sizeof(flags))) { - error = -EFAULT; - break; - } +STATIC int +xfs_ioc_setxflags( + xfs_inode_t *ip, + struct file *filp, + void __user *arg) +{ + struct bhv_vattr *vattr; + unsigned int flags; + int attr_flags; + int error; - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ - FS_NOATIME_FL | FS_NODUMP_FL | \ - FS_SYNC_FL)) { - error = -EOPNOTSUPP; - break; - } + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; - attr_flags = 0; - if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= ATTR_NONBLOCK; + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL)) + return -EOPNOTSUPP; - vattr->va_mask = XFS_AT_XFLAGS; - vattr->va_xflags = xfs_merge_ioc_xflags(flags, - xfs_ip2xflags(ip)); + vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); + if (unlikely(!vattr)) + return -ENOMEM; - error = xfs_setattr(ip, vattr, attr_flags, NULL); - if (likely(!error)) - vn_revalidate(XFS_ITOV(ip)); /* update flags */ - error = -error; - break; - } + attr_flags = 0; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + attr_flags |= ATTR_NONBLOCK; - default: - error = -ENOTTY; - break; - } + vattr->va_mask = XFS_AT_XFLAGS; + vattr->va_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); + error = -xfs_setattr(ip, vattr, attr_flags, NULL); + if (likely(!error)) + vn_revalidate(XFS_ITOV(ip)); /* update flags */ kfree(vattr); return error; } @@ -1332,3 +1029,259 @@ xfs_ioc_getbmapx( return 0; } + +int +xfs_ioctl( + xfs_inode_t *ip, + struct file *filp, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + xfs_mount_t *mp = ip->i_mount; + int error; + + xfs_itrace_entry(XFS_I(inode)); + switch (cmd) { + + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + /* + * Only allow the sys admin to reserve space unless + * unwritten extents are enabled. + */ + if (!xfs_sb_version_hasextflgbit(&mp->m_sb) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg); + + case XFS_IOC_DIOINFO: { + struct dioattr da; + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + da.d_mem = da.d_miniosz = 1 << target->bt_sshift; + da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); + + if (copy_to_user(arg, &da, sizeof(da))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_FSBULKSTAT_SINGLE: + case XFS_IOC_FSBULKSTAT: + case XFS_IOC_FSINUMBERS: + return xfs_ioc_bulkstat(mp, cmd, arg); + + case XFS_IOC_FSGEOMETRY_V1: + return xfs_ioc_fsgeometry_v1(mp, arg); + + case XFS_IOC_FSGEOMETRY: + return xfs_ioc_fsgeometry(mp, arg); + + case XFS_IOC_GETVERSION: + return put_user(inode->i_generation, (int __user *)arg); + + case XFS_IOC_FSGETXATTR: + return xfs_ioc_fsgetxattr(ip, 0, arg); + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_fsgetxattr(ip, 1, arg); + case XFS_IOC_FSSETXATTR: + return xfs_ioc_fssetxattr(ip, filp, arg); + case XFS_IOC_GETXFLAGS: + return xfs_ioc_getxflags(ip, arg); + case XFS_IOC_SETXFLAGS: + return xfs_ioc_setxflags(ip, filp, arg); + + case XFS_IOC_FSSETDM: { + struct fsdmidata dmi; + + if (copy_from_user(&dmi, arg, sizeof(dmi))) + return -XFS_ERROR(EFAULT); + + error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, + dmi.fsd_dmstate); + return -error; + } + + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + + case XFS_IOC_GETBMAPX: + return xfs_ioc_getbmapx(ip, arg); + + case XFS_IOC_FD_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_FSHANDLE: + return xfs_find_handle(cmd, arg); + + case XFS_IOC_OPEN_BY_HANDLE: + return xfs_open_by_handle(mp, arg, filp, inode); + + case XFS_IOC_FSSETDM_BY_HANDLE: + return xfs_fssetdm_by_handle(mp, arg, inode); + + case XFS_IOC_READLINK_BY_HANDLE: + return xfs_readlink_by_handle(mp, arg, inode); + + case XFS_IOC_ATTRLIST_BY_HANDLE: + return xfs_attrlist_by_handle(mp, arg, inode); + + case XFS_IOC_ATTRMULTI_BY_HANDLE: + return xfs_attrmulti_by_handle(mp, arg, filp, inode); + + case XFS_IOC_SWAPEXT: { + error = xfs_swapext((struct xfs_swapext __user *)arg); + return -error; + } + + case XFS_IOC_FSCOUNTS: { + xfs_fsop_counts_t out; + + error = xfs_fs_counts(mp, &out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_SET_RESBLKS: { + xfs_fsop_resblks_t inout; + __uint64_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&inout, arg, sizeof(inout))) + return -XFS_ERROR(EFAULT); + + /* input parameter is passed in resblks field of structure */ + in = inout.resblks; + error = xfs_reserve_blocks(mp, &in, &inout); + if (error) + return -error; + + if (copy_to_user(arg, &inout, sizeof(inout))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_GET_RESBLKS: { + xfs_fsop_resblks_t out; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_reserve_blocks(mp, NULL, &out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + + return 0; + } + + case XFS_IOC_FSGROWFSDATA: { + xfs_growfs_data_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_data(mp, &in); + return -error; + } + + case XFS_IOC_FSGROWFSLOG: { + xfs_growfs_log_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_log(mp, &in); + return -error; + } + + case XFS_IOC_FSGROWFSRT: { + xfs_growfs_rt_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_rt(mp, &in); + return -error; + } + + case XFS_IOC_FREEZE: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (inode->i_sb->s_frozen == SB_UNFROZEN) + freeze_bdev(inode->i_sb->s_bdev); + return 0; + + case XFS_IOC_THAW: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (inode->i_sb->s_frozen != SB_UNFROZEN) + thaw_bdev(inode->i_sb->s_bdev, inode->i_sb); + return 0; + + case XFS_IOC_GOINGDOWN: { + __uint32_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__uint32_t __user *)arg)) + return -XFS_ERROR(EFAULT); + + error = xfs_fs_goingdown(mp, in); + return -error; + } + + case XFS_IOC_ERROR_INJECTION: { + xfs_error_injection_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_errortag_add(in.errtag, mp); + return -error; + } + + case XFS_IOC_ERROR_CLEARALL: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_errortag_clearall(mp, 1); + return -error; + + default: + return -ENOTTY; + } +} diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index cc4abd3..a1237da 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -62,12 +62,11 @@ void xfs_synchronize_atime( xfs_inode_t *ip) { - bhv_vnode_t *vp; + struct inode *inode = ip->i_vnode; - vp = XFS_ITOV_NULL(ip); - if (vp) { - ip->i_d.di_atime.t_sec = (__int32_t)vp->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)vp->i_atime.tv_nsec; + if (inode) { + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; } } @@ -80,11 +79,10 @@ void xfs_mark_inode_dirty_sync( xfs_inode_t *ip) { - bhv_vnode_t *vp; + struct inode *inode = ip->i_vnode; - vp = XFS_ITOV_NULL(ip); - if (vp) - mark_inode_dirty_sync(vn_to_inode(vp)); + if (inode) + mark_inode_dirty_sync(inode); } /* @@ -157,13 +155,6 @@ xfs_ichgtime_fast( */ ASSERT((flags & XFS_ICHGTIME_ACC) == 0); - /* - * We're not supposed to change timestamps in readonly-mounted - * filesystems. Throw it away if anyone asks us. - */ - if (unlikely(IS_RDONLY(inode))) - return; - if (flags & XFS_ICHGTIME_MOD) { tvp = &inode->i_mtime; ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec; @@ -215,66 +206,62 @@ xfs_validate_fields( */ STATIC int xfs_init_security( - bhv_vnode_t *vp, + struct inode *inode, struct inode *dir) { - struct inode *ip = vn_to_inode(vp); + struct xfs_inode *ip = XFS_I(inode); size_t length; void *value; char *name; int error; - error = security_inode_init_security(ip, dir, &name, &value, &length); + error = security_inode_init_security(inode, dir, &name, + &value, &length); if (error) { if (error == -EOPNOTSUPP) return 0; return -error; } - error = xfs_attr_set(XFS_I(ip), name, value, - length, ATTR_SECURE); + error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); if (!error) - xfs_iflags_set(XFS_I(ip), XFS_IMODIFIED); + xfs_iflags_set(ip, XFS_IMODIFIED); kfree(name); kfree(value); return error; } -/* - * Determine whether a process has a valid fs_struct (kernel daemons - * like knfsd don't have an fs_struct). - * - * XXX(hch): nfsd is broken, better fix it instead. - */ -STATIC_INLINE int -xfs_has_fs_struct(struct task_struct *task) +static void +xfs_dentry_to_name( + struct xfs_name *namep, + struct dentry *dentry) { - return (task->fs != init_task.fs); + namep->name = dentry->d_name.name; + namep->len = dentry->d_name.len; } STATIC void xfs_cleanup_inode( struct inode *dir, - bhv_vnode_t *vp, + struct inode *inode, struct dentry *dentry, int mode) { - struct dentry teardown = {}; + struct xfs_name teardown; /* Oh, the horror. * If we can't add the ACL or we fail in * xfs_init_security we must back out. * ENOSPC can hit here, among other things. */ - teardown.d_inode = vn_to_inode(vp); - teardown.d_name = dentry->d_name; + xfs_dentry_to_name(&teardown, dentry); if (S_ISDIR(mode)) - xfs_rmdir(XFS_I(dir), &teardown); + xfs_rmdir(XFS_I(dir), &teardown, XFS_I(inode)); else - xfs_remove(XFS_I(dir), &teardown); - VN_RELE(vp); + xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); + iput(inode); } STATIC int @@ -284,9 +271,10 @@ xfs_vn_mknod( int mode, dev_t rdev) { - struct inode *ip; - bhv_vnode_t *vp = NULL, *dvp = vn_from_inode(dir); + struct inode *inode; + struct xfs_inode *ip = NULL; xfs_acl_t *default_acl = NULL; + struct xfs_name name; attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS; int error; @@ -297,59 +285,67 @@ xfs_vn_mknod( if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) return -EINVAL; - if (unlikely(test_default_acl && test_default_acl(dvp))) { + if (test_default_acl && test_default_acl(dir)) { if (!_ACL_ALLOC(default_acl)) { return -ENOMEM; } - if (!_ACL_GET_DEFAULT(dvp, default_acl)) { + if (!_ACL_GET_DEFAULT(dir, default_acl)) { _ACL_FREE(default_acl); default_acl = NULL; } } - if (IS_POSIXACL(dir) && !default_acl && xfs_has_fs_struct(current)) + xfs_dentry_to_name(&name, dentry); + + if (IS_POSIXACL(dir) && !default_acl) mode &= ~current->fs->umask; switch (mode & S_IFMT) { - case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: rdev = sysv_encode_dev(rdev); case S_IFREG: - error = xfs_create(XFS_I(dir), dentry, mode, rdev, &vp, NULL); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); break; case S_IFDIR: - error = xfs_mkdir(XFS_I(dir), dentry, mode, &vp, NULL); + error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL); break; default: error = EINVAL; break; } - if (unlikely(!error)) { - error = xfs_init_security(vp, dir); - if (error) - xfs_cleanup_inode(dir, vp, dentry, mode); - } + if (unlikely(error)) + goto out_free_acl; - if (unlikely(default_acl)) { - if (!error) { - error = _ACL_INHERIT(vp, mode, default_acl); - if (!error) - xfs_iflags_set(XFS_I(vp), XFS_IMODIFIED); - else - xfs_cleanup_inode(dir, vp, dentry, mode); - } + inode = ip->i_vnode; + + error = xfs_init_security(inode, dir); + if (unlikely(error)) + goto out_cleanup_inode; + + if (default_acl) { + error = _ACL_INHERIT(inode, mode, default_acl); + if (unlikely(error)) + goto out_cleanup_inode; + xfs_iflags_set(ip, XFS_IMODIFIED); _ACL_FREE(default_acl); } - if (likely(!error)) { - ASSERT(vp); - ip = vn_to_inode(vp); - if (S_ISDIR(mode)) - xfs_validate_fields(ip); - d_instantiate(dentry, ip); - xfs_validate_fields(dir); - } + if (S_ISDIR(mode)) + xfs_validate_fields(inode); + d_instantiate(dentry, inode); + xfs_validate_fields(dir); + return -error; + + out_cleanup_inode: + xfs_cleanup_inode(dir, inode, dentry, mode); + out_free_acl: + if (default_acl) + _ACL_FREE(default_acl); return -error; } @@ -378,13 +374,15 @@ xfs_vn_lookup( struct dentry *dentry, struct nameidata *nd) { - bhv_vnode_t *cvp; + struct xfs_inode *cip; + struct xfs_name name; int error; if (dentry->d_name.len >= MAXNAMELEN) return ERR_PTR(-ENAMETOOLONG); - error = xfs_lookup(XFS_I(dir), dentry, &cvp); + xfs_dentry_to_name(&name, dentry); + error = xfs_lookup(XFS_I(dir), &name, &cip); if (unlikely(error)) { if (unlikely(error != ENOENT)) return ERR_PTR(-error); @@ -392,7 +390,7 @@ xfs_vn_lookup( return NULL; } - return d_splice_alias(vn_to_inode(cvp), dentry); + return d_splice_alias(cip->i_vnode, dentry); } STATIC int @@ -401,23 +399,24 @@ xfs_vn_link( struct inode *dir, struct dentry *dentry) { - struct inode *ip; /* inode of guy being linked to */ - bhv_vnode_t *vp; /* vp of name being linked */ + struct inode *inode; /* inode of guy being linked to */ + struct xfs_name name; int error; - ip = old_dentry->d_inode; /* inode being linked to */ - vp = vn_from_inode(ip); + inode = old_dentry->d_inode; + xfs_dentry_to_name(&name, dentry); - VN_HOLD(vp); - error = xfs_link(XFS_I(dir), vp, dentry); + igrab(inode); + error = xfs_link(XFS_I(dir), XFS_I(inode), &name); if (unlikely(error)) { - VN_RELE(vp); - } else { - xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); - xfs_validate_fields(ip); - d_instantiate(dentry, ip); + iput(inode); + return -error; } - return -error; + + xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); + xfs_validate_fields(inode); + d_instantiate(dentry, inode); + return 0; } STATIC int @@ -426,11 +425,13 @@ xfs_vn_unlink( struct dentry *dentry) { struct inode *inode; + struct xfs_name name; int error; inode = dentry->d_inode; + xfs_dentry_to_name(&name, dentry); - error = xfs_remove(XFS_I(dir), dentry); + error = xfs_remove(XFS_I(dir), &name, XFS_I(inode)); if (likely(!error)) { xfs_validate_fields(dir); /* size needs update */ xfs_validate_fields(inode); @@ -444,29 +445,34 @@ xfs_vn_symlink( struct dentry *dentry, const char *symname) { - struct inode *ip; - bhv_vnode_t *cvp; /* used to lookup symlink to put in dentry */ + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; int error; mode_t mode; - cvp = NULL; - mode = S_IFLNK | (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); + xfs_dentry_to_name(&name, dentry); - error = xfs_symlink(XFS_I(dir), dentry, (char *)symname, mode, - &cvp, NULL); - if (likely(!error && cvp)) { - error = xfs_init_security(cvp, dir); - if (likely(!error)) { - ip = vn_to_inode(cvp); - d_instantiate(dentry, ip); - xfs_validate_fields(dir); - xfs_validate_fields(ip); - } else { - xfs_cleanup_inode(dir, cvp, dentry, 0); - } - } + error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); + if (unlikely(error)) + goto out; + + inode = cip->i_vnode; + + error = xfs_init_security(inode, dir); + if (unlikely(error)) + goto out_cleanup_inode; + + d_instantiate(dentry, inode); + xfs_validate_fields(dir); + xfs_validate_fields(inode); + return 0; + + out_cleanup_inode: + xfs_cleanup_inode(dir, inode, dentry, 0); + out: return -error; } @@ -476,9 +482,12 @@ xfs_vn_rmdir( struct dentry *dentry) { struct inode *inode = dentry->d_inode; + struct xfs_name name; int error; - error = xfs_rmdir(XFS_I(dir), dentry); + xfs_dentry_to_name(&name, dentry); + + error = xfs_rmdir(XFS_I(dir), &name, XFS_I(inode)); if (likely(!error)) { xfs_validate_fields(inode); xfs_validate_fields(dir); @@ -494,12 +503,15 @@ xfs_vn_rename( struct dentry *ndentry) { struct inode *new_inode = ndentry->d_inode; - bhv_vnode_t *tvp; /* target directory */ + struct xfs_name oname; + struct xfs_name nname; int error; - tvp = vn_from_inode(ndir); + xfs_dentry_to_name(&oname, odentry); + xfs_dentry_to_name(&nname, ndentry); - error = xfs_rename(XFS_I(odir), odentry, tvp, ndentry); + error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), + XFS_I(ndir), &nname); if (likely(!error)) { if (new_inode) xfs_validate_fields(new_inode); @@ -700,11 +712,19 @@ xfs_vn_setattr( return -error; } +/* + * block_truncate_page can return an error, but we can't propagate it + * at all here. Leave a complaint + stack trace in the syslog because + * this could be bad. If it is bad, we need to propagate the error further. + */ STATIC void xfs_vn_truncate( struct inode *inode) { - block_truncate_page(inode->i_mapping, inode->i_size, xfs_get_blocks); + int error; + error = block_truncate_page(inode->i_mapping, inode->i_size, + xfs_get_blocks); + WARN_ON(error); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 3ca39c4..e514332 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -99,7 +99,6 @@ /* * Feature macros (disable/enable) */ -#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ #define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ #ifdef CONFIG_SMP #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 1663533..1ebd800 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -51,6 +51,7 @@ #include "xfs_vnodeops.h" #include <linux/capability.h> +#include <linux/mount.h> #include <linux/writeback.h> @@ -176,7 +177,6 @@ xfs_read( { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - bhv_vnode_t *vp = XFS_ITOV(ip); xfs_mount_t *mp = ip->i_mount; size_t size = 0; ssize_t ret = 0; @@ -228,11 +228,11 @@ xfs_read( xfs_ilock(ip, XFS_IOLOCK_SHARED); if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { - bhv_vrwlock_t locktype = VRWLOCK_READ; int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); + int iolock = XFS_IOLOCK_SHARED; - ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *offset, size, - dmflags, &locktype); + ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size, + dmflags, &iolock); if (ret) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); if (unlikely(ioflags & IO_ISDIRECT)) @@ -242,7 +242,7 @@ xfs_read( } if (unlikely(ioflags & IO_ISDIRECT)) { - if (VN_CACHED(vp)) + if (inode->i_mapping->nrpages) ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); mutex_unlock(&inode->i_mutex); @@ -276,7 +276,6 @@ xfs_splice_read( int flags, int ioflags) { - bhv_vnode_t *vp = XFS_ITOV(ip); xfs_mount_t *mp = ip->i_mount; ssize_t ret; @@ -287,11 +286,11 @@ xfs_splice_read( xfs_ilock(ip, XFS_IOLOCK_SHARED); if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { - bhv_vrwlock_t locktype = VRWLOCK_READ; + int iolock = XFS_IOLOCK_SHARED; int error; - error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *ppos, count, - FILP_DELAY_FLAG(infilp), &locktype); + error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, + FILP_DELAY_FLAG(infilp), &iolock); if (error) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); return -error; @@ -317,7 +316,6 @@ xfs_splice_write( int flags, int ioflags) { - bhv_vnode_t *vp = XFS_ITOV(ip); xfs_mount_t *mp = ip->i_mount; ssize_t ret; struct inode *inode = outfilp->f_mapping->host; @@ -330,11 +328,11 @@ xfs_splice_write( xfs_ilock(ip, XFS_IOLOCK_EXCL); if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { - bhv_vrwlock_t locktype = VRWLOCK_WRITE; + int iolock = XFS_IOLOCK_EXCL; int error; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, *ppos, count, - FILP_DELAY_FLAG(outfilp), &locktype); + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, + FILP_DELAY_FLAG(outfilp), &iolock); if (error) { xfs_iunlock(ip, XFS_IOLOCK_EXCL); return -error; @@ -573,14 +571,12 @@ xfs_write( struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - bhv_vnode_t *vp = XFS_ITOV(xip); unsigned long segs = nsegs; xfs_mount_t *mp; ssize_t ret = 0, error = 0; xfs_fsize_t isize, new_size; int iolock; int eventsent = 0; - bhv_vrwlock_t locktype; size_t ocount = 0, count; loff_t pos; int need_i_mutex; @@ -607,11 +603,9 @@ xfs_write( relock: if (ioflags & IO_ISDIRECT) { iolock = XFS_IOLOCK_SHARED; - locktype = VRWLOCK_WRITE_DIRECT; need_i_mutex = 0; } else { iolock = XFS_IOLOCK_EXCL; - locktype = VRWLOCK_WRITE; need_i_mutex = 1; mutex_lock(&inode->i_mutex); } @@ -634,9 +628,8 @@ start: dmflags |= DM_FLAGS_IMUX; xfs_iunlock(xip, XFS_ILOCK_EXCL); - error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, - pos, count, - dmflags, &locktype); + error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip, + pos, count, dmflags, &iolock); if (error) { goto out_unlock_internal; } @@ -664,10 +657,9 @@ start: return XFS_ERROR(-EINVAL); } - if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) { + if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); iolock = XFS_IOLOCK_EXCL; - locktype = VRWLOCK_WRITE; need_i_mutex = 1; mutex_lock(&inode->i_mutex); xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); @@ -679,10 +671,16 @@ start: if (new_size > xip->i_size) xip->i_new_size = new_size; - if (likely(!(ioflags & IO_INVIS))) { + /* + * We're not supposed to change timestamps in readonly-mounted + * filesystems. Throw it away if anyone asks us. + */ + if (likely(!(ioflags & IO_INVIS) && + !mnt_want_write(file->f_path.mnt))) { file_update_time(file); xfs_ichgtime_fast(xip, inode, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + mnt_drop_write(file->f_path.mnt); } /* @@ -727,7 +725,7 @@ retry: current->backing_dev_info = mapping->backing_dev_info; if ((ioflags & IO_ISDIRECT)) { - if (VN_CACHED(vp)) { + if (mapping->nrpages) { WARN_ON(need_i_mutex == 0); xfs_inval_cached_trace(xip, pos, -1, (pos & PAGE_CACHE_MASK), -1); @@ -744,7 +742,6 @@ retry: mutex_unlock(&inode->i_mutex); iolock = XFS_IOLOCK_SHARED; - locktype = VRWLOCK_WRITE_DIRECT; need_i_mutex = 0; } @@ -781,15 +778,15 @@ retry: if (ret == -ENOSPC && DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { - xfs_rwunlock(xip, locktype); + xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); - error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, - DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, + error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip, + DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (need_i_mutex) mutex_lock(&inode->i_mutex); - xfs_rwlock(xip, locktype); + xfs_ilock(xip, iolock); if (error) goto out_unlock_internal; pos = xip->i_size; @@ -817,7 +814,8 @@ retry: /* Handle various SYNC-type writes */ if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { int error2; - xfs_rwunlock(xip, locktype); + + xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); error2 = sync_page_range(inode, mapping, pos, ret); @@ -825,7 +823,7 @@ retry: error = error2; if (need_i_mutex) mutex_lock(&inode->i_mutex); - xfs_rwlock(xip, locktype); + xfs_ilock(xip, iolock); error2 = xfs_write_sync_logforce(mp, xip); if (!error) error = error2; @@ -846,7 +844,7 @@ retry: xip->i_d.di_size = xip->i_size; xfs_iunlock(xip, XFS_ILOCK_EXCL); } - xfs_rwunlock(xip, locktype); + xfs_iunlock(xip, iolock); out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); @@ -884,28 +882,23 @@ xfs_bdstrat_cb(struct xfs_buf *bp) } /* - * Wrapper around bdstrat so that we can stop data - * from going to disk in case we are shutting down the filesystem. - * Typically user data goes thru this path; one of the exceptions - * is the superblock. + * Wrapper around bdstrat so that we can stop data from going to disk in case + * we are shutting down the filesystem. Typically user data goes thru this + * path; one of the exceptions is the superblock. */ -int +void xfsbdstrat( struct xfs_mount *mp, struct xfs_buf *bp) { ASSERT(mp); if (!XFS_FORCED_SHUTDOWN(mp)) { - /* Grio redirection would go here - * if (XFS_BUF_IS_GRIO(bp)) { - */ - xfs_buf_iorequest(bp); - return 0; + return; } xfs_buftrace("XFSBDSTRAT IOERROR", bp); - return (xfs_bioerror_relse(bp)); + xfs_bioerror_relse(bp); } /* diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index e200253..e1d498b 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -68,7 +68,8 @@ extern void xfs_inval_cached_trace(struct xfs_inode *, #define xfs_inval_cached_trace(ip, offset, len, first, last) #endif -extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *); +/* errors from xfsbdstrat() must be extracted from the buffer */ +extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); extern int xfs_bdstrat_cb(struct xfs_buf *); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h index 8ba7a2f..afd0b0d 100644 --- a/fs/xfs/linux-2.6/xfs_stats.h +++ b/fs/xfs/linux-2.6/xfs_stats.h @@ -144,8 +144,8 @@ extern void xfs_cleanup_procfs(void); # define XFS_STATS_DEC(count) # define XFS_STATS_ADD(count, inc) -static __inline void xfs_init_procfs(void) { }; -static __inline void xfs_cleanup_procfs(void) { }; +static inline void xfs_init_procfs(void) { }; +static inline void xfs_cleanup_procfs(void) { }; #endif /* !CONFIG_PROC_FS */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 8831d95..865eb70 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -896,7 +896,8 @@ xfs_fs_write_inode( struct inode *inode, int sync) { - int error = 0, flags = FLUSH_INODE; + int error = 0; + int flags = 0; xfs_itrace_entry(XFS_I(inode)); if (sync) { @@ -934,7 +935,7 @@ xfs_fs_clear_inode( xfs_inactive(ip); xfs_iflags_clear(ip, XFS_IMODIFIED); if (xfs_reclaim(ip)) - panic("%s: cannot reclaim 0x%p\n", __FUNCTION__, inode); + panic("%s: cannot reclaim 0x%p\n", __func__, inode); } ASSERT(XFS_I(inode) == NULL); @@ -1027,8 +1028,7 @@ xfs_sync_worker( int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) - error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR | - SYNC_REFCACHE | SYNC_SUPER); + error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR); mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); } @@ -1306,7 +1306,7 @@ xfs_fs_fill_super( void *data, int silent) { - struct inode *rootvp; + struct inode *root; struct xfs_mount *mp = NULL; struct xfs_mount_args *args = xfs_args_allocate(sb, silent); int error; @@ -1344,19 +1344,18 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - rootvp = igrab(mp->m_rootip->i_vnode); - if (!rootvp) { + root = igrab(mp->m_rootip->i_vnode); + if (!root) { error = ENOENT; goto fail_unmount; } - - sb->s_root = d_alloc_root(vn_to_inode(rootvp)); - if (!sb->s_root) { - error = ENOMEM; + if (is_bad_inode(root)) { + error = EINVAL; goto fail_vnrele; } - if (is_bad_inode(sb->s_root->d_inode)) { - error = EINVAL; + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + error = ENOMEM; goto fail_vnrele; } @@ -1378,7 +1377,7 @@ fail_vnrele: dput(sb->s_root); sb->s_root = NULL; } else { - VN_RELE(rootvp); + iput(root); } fail_unmount: diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 3efcf45..3efb7c6 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -50,13 +50,7 @@ extern void xfs_qm_exit(void); # define set_posix_acl_flag(sb) do { } while (0) #endif -#ifdef CONFIG_XFS_SECURITY -# define XFS_SECURITY_STRING "security attributes, " -# define ENOSECURITY 0 -#else -# define XFS_SECURITY_STRING -# define ENOSECURITY EOPNOTSUPP -#endif +#define XFS_SECURITY_STRING "security attributes, " #ifdef CONFIG_XFS_RT # define XFS_REALTIME_STRING "realtime, " diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h index 4da03a4..7e60c77 100644 --- a/fs/xfs/linux-2.6/xfs_vfs.h +++ b/fs/xfs/linux-2.6/xfs_vfs.h @@ -49,7 +49,6 @@ typedef struct bhv_vfs_sync_work { #define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ #define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ #define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */ -#define SYNC_SUPER 0x0200 /* flush superblock to disk */ /* * When remounting a filesystem read-only or freezing the filesystem, diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index b5ea418..8b4d63c 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -23,8 +23,6 @@ struct bhv_vattr; struct xfs_iomap; struct attrlist_cursor_kern; -typedef struct dentry bhv_vname_t; -typedef __u64 bhv_vnumber_t; typedef struct inode bhv_vnode_t; #define VN_ISLNK(vp) S_ISLNK((vp)->i_mode) @@ -46,18 +44,6 @@ static inline struct inode *vn_to_inode(bhv_vnode_t *vnode) } /* - * Values for the vop_rwlock/rwunlock flags parameter. - */ -typedef enum bhv_vrwlock { - VRWLOCK_NONE, - VRWLOCK_READ, - VRWLOCK_WRITE, - VRWLOCK_WRITE_DIRECT, - VRWLOCK_TRY_READ, - VRWLOCK_TRY_WRITE -} bhv_vrwlock_t; - -/* * Return values for xfs_inactive. A return value of * VN_INACTIVE_NOCACHE implies that the file system behavior * has disassociated its state and bhv_desc_t from the vnode. @@ -73,12 +59,9 @@ typedef enum bhv_vrwlock { #define IO_INVIS 0x00020 /* don't update inode timestamps */ /* - * Flags for vop_iflush call + * Flags for xfs_inode_flush */ #define FLUSH_SYNC 1 /* wait for flush to complete */ -#define FLUSH_INODE 2 /* flush the inode itself */ -#define FLUSH_LOG 4 /* force the last log entry for - * this inode out to disk */ /* * Flush/Invalidate options for vop_toss/flush/flushinval_pages. @@ -226,13 +209,6 @@ static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp) } /* - * Vname handling macros. - */ -#define VNAME(dentry) ((char *) (dentry)->d_name.name) -#define VNAMELEN(dentry) ((dentry)->d_name.len) -#define VNAME_TO_VNODE(dentry) (vn_from_inode((dentry)->d_inode)) - -/* * Dealing with bad inodes */ static inline int VN_BAD(bhv_vnode_t *vp) @@ -303,9 +279,9 @@ extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *); extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *); extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *); #define xfs_itrace_entry(ip) \ - _xfs_itrace_entry(ip, __FUNCTION__, (inst_t *)__return_address) + _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address) #define xfs_itrace_exit(ip) \ - _xfs_itrace_exit(ip, __FUNCTION__, (inst_t *)__return_address) + _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address) #define xfs_itrace_exit_tag(ip, tag) \ _xfs_itrace_exit(ip, tag, (inst_t *)__return_address) #define xfs_itrace_ref(ip) \ diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 665babc..631ebb3 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -1291,7 +1291,7 @@ xfs_qm_dqflush( if (flags & XFS_QMOPT_DELWRI) { xfs_bdwrite(mp, bp); } else if (flags & XFS_QMOPT_ASYNC) { - xfs_bawrite(mp, bp); + error = xfs_bawrite(mp, bp); } else { error = xfs_bwrite(mp, bp); } @@ -1439,9 +1439,7 @@ xfs_qm_dqpurge( uint flags) { xfs_dqhash_t *thishash; - xfs_mount_t *mp; - - mp = dqp->q_mount; + xfs_mount_t *mp = dqp->q_mount; ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash)); @@ -1485,6 +1483,7 @@ xfs_qm_dqpurge( * we're unmounting, we do care, so we flush it and wait. */ if (XFS_DQ_IS_DIRTY(dqp)) { + int error; xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY"); /* dqflush unlocks dqflock */ /* @@ -1495,7 +1494,10 @@ xfs_qm_dqpurge( * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - (void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); + error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); + if (error) + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_qm_dqpurge: dquot %p flush failed", dqp); xfs_dqflock(dqp); } ASSERT(dqp->q_pincount == 0); @@ -1580,12 +1582,18 @@ xfs_qm_dqflock_pushbuf_wait( XFS_INCORE_TRYLOCK); if (bp != NULL) { if (XFS_BUF_ISDELAYWRITE(bp)) { + int error; if (XFS_BUF_ISPINNED(bp)) { xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); } - xfs_bawrite(dqp->q_mount, bp); + error = xfs_bawrite(dqp->q_mount, bp); + if (error) + xfs_fs_cmn_err(CE_WARN, dqp->q_mount, + "xfs_qm_dqflock_pushbuf_wait: " + "pushbuf error %d on dqp %p, bp %p", + error, dqp, bp); } else { xfs_buf_relse(bp); } diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 1800e8d..36e05ca 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -146,6 +146,7 @@ xfs_qm_dquot_logitem_push( xfs_dq_logitem_t *logitem) { xfs_dquot_t *dqp; + int error; dqp = logitem->qli_dquot; @@ -161,7 +162,11 @@ xfs_qm_dquot_logitem_push( * lock without sleeping, then there must not have been * anyone in the process of flushing the dquot. */ - xfs_qm_dqflush(dqp, XFS_B_DELWRI); + error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + if (error) + xfs_fs_cmn_err(CE_WARN, dqp->q_mount, + "xfs_qm_dquot_logitem_push: push error %d on dqp %p", + error, dqp); xfs_dqunlock(dqp); } @@ -262,11 +267,16 @@ xfs_qm_dquot_logitem_pushbuf( XFS_LOG_FORCE); } if (dopush) { + int error; #ifdef XFSRACEDEBUG delay_for_intr(); delay(300); #endif - xfs_bawrite(mp, bp); + error = xfs_bawrite(mp, bp); + if (error) + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p", + error, qip, bp); } else { xfs_buf_relse(bp); } diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 8e9c5ae..40ea564 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -304,8 +304,11 @@ xfs_qm_unmount_quotadestroy( * necessary data structures like quotainfo. This is also responsible for * running a quotacheck as necessary. We are guaranteed that the superblock * is consistently read in at this point. + * + * If we fail here, the mount will continue with quota turned off. We don't + * need to inidicate success or failure at all. */ -int +void xfs_qm_mount_quotas( xfs_mount_t *mp, int mfsi_flags) @@ -313,7 +316,6 @@ xfs_qm_mount_quotas( int error = 0; uint sbf; - /* * If quotas on realtime volumes is not supported, we disable * quotas immediately. @@ -332,7 +334,8 @@ xfs_qm_mount_quotas( * Allocate the quotainfo structure inside the mount struct, and * create quotainode(s), and change/rev superblock if necessary. */ - if ((error = xfs_qm_init_quotainfo(mp))) { + error = xfs_qm_init_quotainfo(mp); + if (error) { /* * We must turn off quotas. */ @@ -344,12 +347,11 @@ xfs_qm_mount_quotas( * If any of the quotas are not consistent, do a quotacheck. */ if (XFS_QM_NEED_QUOTACHECK(mp) && - !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) { - if ((error = xfs_qm_quotacheck(mp))) { - /* Quotacheck has failed and quotas have - * been disabled. - */ - return XFS_ERROR(error); + !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) { + error = xfs_qm_quotacheck(mp); + if (error) { + /* Quotacheck failed and disabled quotas. */ + return; } } /* @@ -357,12 +359,10 @@ xfs_qm_mount_quotas( * quotachecked status, since we won't be doing accounting for * that type anymore. */ - if (!XFS_IS_UQUOTA_ON(mp)) { + if (!XFS_IS_UQUOTA_ON(mp)) mp->m_qflags &= ~XFS_UQUOTA_CHKD; - } - if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) { + if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) mp->m_qflags &= ~XFS_OQUOTA_CHKD; - } write_changes: /* @@ -392,7 +392,7 @@ xfs_qm_mount_quotas( xfs_fs_cmn_err(CE_WARN, mp, "Failed to initialize disk quotas."); } - return XFS_ERROR(error); + return; } /* @@ -1438,7 +1438,7 @@ xfs_qm_qino_alloc( } -STATIC int +STATIC void xfs_qm_reset_dqcounts( xfs_mount_t *mp, xfs_buf_t *bp, @@ -1478,8 +1478,6 @@ xfs_qm_reset_dqcounts( ddq->d_rtbwarns = 0; ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); } - - return 0; } STATIC int @@ -1520,7 +1518,7 @@ xfs_qm_dqiter_bufs( if (error) break; - (void) xfs_qm_reset_dqcounts(mp, bp, firstid, type); + xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_bdwrite(mp, bp); /* * goto the next block. @@ -1810,7 +1808,7 @@ xfs_qm_dqusage_adjust( * Now release the inode. This will send it to 'inactive', and * possibly even free blocks. */ - VN_RELE(XFS_ITOV(ip)); + IRELE(ip); /* * Goto next inode. @@ -1880,6 +1878,14 @@ xfs_qm_quotacheck( } while (! done); /* + * We've made all the changes that we need to make incore. + * Flush them down to disk buffers if everything was updated + * successfully. + */ + if (!error) + error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); + + /* * We can get this error if we couldn't do a dquot allocation inside * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the * dirty dquots that might be cached, we just want to get rid of them @@ -1890,11 +1896,6 @@ xfs_qm_quotacheck( xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); goto error_return; } - /* - * We've made all the changes that we need to make incore. - * Now flush_them down to disk buffers. - */ - xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); /* * We didn't log anything, because if we crashed, we'll have to @@ -1926,7 +1927,10 @@ xfs_qm_quotacheck( ASSERT(mp->m_quotainfo != NULL); ASSERT(xfs_Gqm != NULL); xfs_qm_destroy_quotainfo(mp); - (void)xfs_mount_reset_sbqflags(mp); + if (xfs_mount_reset_sbqflags(mp)) { + cmn_err(CE_WARN, "XFS quotacheck %s: " + "Failed to reset quota flags.", mp->m_fsname); + } } else { cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); } @@ -1968,7 +1972,7 @@ xfs_qm_init_quotainos( if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &gip, 0))) { if (uip) - VN_RELE(XFS_ITOV(uip)); + IRELE(uip); return XFS_ERROR(error); } } @@ -1999,7 +2003,7 @@ xfs_qm_init_quotainos( sbflags | XFS_SB_GQUOTINO, flags); if (error) { if (uip) - VN_RELE(XFS_ITOV(uip)); + IRELE(uip); return XFS_ERROR(error); } @@ -2093,12 +2097,17 @@ xfs_qm_shake_freelist( * dirty dquots. */ if (XFS_DQ_IS_DIRTY(dqp)) { + int error; xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY"); /* * We flush it delayed write, so don't bother * releasing the mplock. */ - (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + if (error) { + xfs_fs_cmn_err(CE_WARN, dqp->q_mount, + "xfs_qm_dqflush_all: dquot %p flush failed", dqp); + } xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ dqp = dqp->dq_flnext; continue; @@ -2265,12 +2274,17 @@ xfs_qm_dqreclaim_one(void) * dirty dquots. */ if (XFS_DQ_IS_DIRTY(dqp)) { + int error; xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY"); /* * We flush it delayed write, so don't bother * releasing the freelist lock. */ - (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + if (error) { + xfs_fs_cmn_err(CE_WARN, dqp->q_mount, + "xfs_qm_dqreclaim: dquot %p flush failed", dqp); + } xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ continue; } @@ -2378,9 +2392,9 @@ xfs_qm_write_sb_changes( } xfs_mod_sb(tp, flags); - (void) xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp, 0); - return 0; + return error; } diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index baf537c..cd2300e 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct { #define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); -extern int xfs_qm_mount_quotas(xfs_mount_t *, int); +extern void xfs_qm_mount_quotas(xfs_mount_t *, int); extern int xfs_qm_quotacheck(xfs_mount_t *); extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); extern int xfs_qm_unmount_quotas(xfs_mount_t *); diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h index a50ffab..5b964fc 100644 --- a/fs/xfs/quota/xfs_qm_stats.h +++ b/fs/xfs/quota/xfs_qm_stats.h @@ -45,8 +45,8 @@ extern void xfs_qm_cleanup_procfs(void); # define XQM_STATS_INC(count) do { } while (0) -static __inline void xfs_qm_init_procfs(void) { }; -static __inline void xfs_qm_cleanup_procfs(void) { }; +static inline void xfs_qm_init_procfs(void) { }; +static inline void xfs_qm_cleanup_procfs(void) { }; #endif diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index d2b8be7..8342823 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -279,9 +279,12 @@ xfs_qm_scall_quotaoff( /* * Write the LI_QUOTAOFF log record, and do SB changes atomically, - * and synchronously. + * and synchronously. If we fail to write, we should abort the + * operation as it cannot be recovered safely if we crash. */ - xfs_qm_log_quotaoff(mp, &qoffstart, flags); + error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); + if (error) + goto out_error; /* * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct @@ -337,7 +340,12 @@ xfs_qm_scall_quotaoff( * So, we have QUOTAOFF start and end logitems; the start * logitem won't get overwritten until the end logitem appears... */ - xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + if (error) { + /* We're screwed now. Shutdown is the only option. */ + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto out_error; + } /* * If quotas is completely disabled, close shop. @@ -361,6 +369,7 @@ xfs_qm_scall_quotaoff( XFS_PURGE_INODE(XFS_QI_GQIP(mp)); XFS_QI_GQIP(mp) = NULL; } +out_error: mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); return (error); @@ -371,12 +380,11 @@ xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) { - int error; + int error = 0, error2 = 0; xfs_inode_t *qip; if (!capable(CAP_SYS_ADMIN)) return XFS_ERROR(EPERM); - error = 0; if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); return XFS_ERROR(EINVAL); @@ -384,22 +392,22 @@ xfs_qm_scall_trunc_qfiles( if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0); - if (! error) { - (void) xfs_truncate_file(mp, qip); - VN_RELE(XFS_ITOV(qip)); + if (!error) { + error = xfs_truncate_file(mp, qip); + IRELE(qip); } } if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) && mp->m_sb.sb_gquotino != NULLFSINO) { - error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0); - if (! error) { - (void) xfs_truncate_file(mp, qip); - VN_RELE(XFS_ITOV(qip)); + error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0); + if (!error2) { + error2 = xfs_truncate_file(mp, qip); + IRELE(qip); } } - return (error); + return error ? error : error2; } @@ -552,13 +560,13 @@ xfs_qm_scall_getqstat( out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; if (tempuqip) - VN_RELE(XFS_ITOV(uip)); + IRELE(uip); } if (gip) { out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; if (tempgqip) - VN_RELE(XFS_ITOV(gip)); + IRELE(gip); } if (mp->m_quotainfo) { out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); @@ -726,12 +734,12 @@ xfs_qm_scall_setqlim( xfs_trans_log_dquot(tp, dqp); xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT"); - xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp, 0); xfs_qm_dqprint(dqp); xfs_qm_dqrele(dqp); mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - return (0); + return error; } STATIC int @@ -1095,7 +1103,7 @@ again: * inactive code in hell. */ if (vnode_refd) - VN_RELE(vp); + IRELE(ip); XFS_MOUNT_ILOCK(mp); /* * If an inode was inserted or removed, we gotta diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c index 129067c..0b75d30 100644 --- a/fs/xfs/support/ktrace.c +++ b/fs/xfs/support/ktrace.c @@ -24,7 +24,7 @@ static int ktrace_zentries; void __init ktrace_init(int zentries) { - ktrace_zentries = zentries; + ktrace_zentries = roundup_pow_of_two(zentries); ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t), "ktrace_hdr"); @@ -47,13 +47,16 @@ ktrace_uninit(void) * ktrace_alloc() * * Allocate a ktrace header and enough buffering for the given - * number of entries. + * number of entries. Round the number of entries up to a + * power of 2 so we can do fast masking to get the index from + * the atomic index counter. */ ktrace_t * ktrace_alloc(int nentries, unsigned int __nocast sleep) { ktrace_t *ktp; ktrace_entry_t *ktep; + int entries; ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep); @@ -70,11 +73,12 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep) /* * Special treatment for buffers with the ktrace_zentries entries */ - if (nentries == ktrace_zentries) { + entries = roundup_pow_of_two(nentries); + if (entries == ktrace_zentries) { ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone, sleep); } else { - ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)), + ktep = (ktrace_entry_t*)kmem_zalloc((entries * sizeof(*ktep)), sleep | KM_LARGE); } @@ -91,8 +95,10 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep) } ktp->kt_entries = ktep; - ktp->kt_nentries = nentries; - ktp->kt_index = 0; + ktp->kt_nentries = entries; + ASSERT(is_power_of_2(entries)); + ktp->kt_index_mask = entries - 1; + atomic_set(&ktp->kt_index, 0); ktp->kt_rollover = 0; return ktp; } @@ -151,8 +157,6 @@ ktrace_enter( void *val14, void *val15) { - static DEFINE_SPINLOCK(wrap_lock); - unsigned long flags; int index; ktrace_entry_t *ktep; @@ -161,12 +165,8 @@ ktrace_enter( /* * Grab an entry by pushing the index up to the next one. */ - spin_lock_irqsave(&wrap_lock, flags); - index = ktp->kt_index; - if (++ktp->kt_index == ktp->kt_nentries) - ktp->kt_index = 0; - spin_unlock_irqrestore(&wrap_lock, flags); - + index = atomic_add_return(1, &ktp->kt_index); + index = (index - 1) & ktp->kt_index_mask; if (!ktp->kt_rollover && index == ktp->kt_nentries - 1) ktp->kt_rollover = 1; @@ -199,11 +199,12 @@ int ktrace_nentries( ktrace_t *ktp) { - if (ktp == NULL) { + int index; + if (ktp == NULL) return 0; - } - return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index); + index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask; + return (ktp->kt_rollover ? ktp->kt_nentries : index); } /* @@ -228,7 +229,7 @@ ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp) int nentries; if (ktp->kt_rollover) - index = ktp->kt_index; + index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask; else index = 0; diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h index 56e72b4..741d694 100644 --- a/fs/xfs/support/ktrace.h +++ b/fs/xfs/support/ktrace.h @@ -30,7 +30,8 @@ typedef struct ktrace_entry { */ typedef struct ktrace { int kt_nentries; /* number of entries in trace buf */ - int kt_index; /* current index in entries */ + atomic_t kt_index; /* current index in entries */ + unsigned int kt_index_mask; int kt_rollover; ktrace_entry_t *kt_entries; /* buffer of entries */ } ktrace_t; diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 540e4c9..765aaf6 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -22,7 +22,7 @@ #define STATIC #define DEBUG 1 #define XFS_BUF_LOCK_TRACKING 1 -/* #define QUOTADEBUG 1 */ +#define QUOTADEBUG 1 #endif #ifdef CONFIG_XFS_TRACE diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 7272fe3..8e130b9 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -307,12 +307,13 @@ xfs_acl_vset( VN_HOLD(vp); error = xfs_acl_allow_set(vp, kind); - if (error) - goto out; /* Incoming ACL exists, set file mode based on its value */ - if (kind == _ACL_TYPE_ACCESS) - xfs_acl_setmode(vp, xfs_acl, &basicperms); + if (!error && kind == _ACL_TYPE_ACCESS) + error = xfs_acl_setmode(vp, xfs_acl, &basicperms); + + if (error) + goto out; /* * If we have more than std unix permissions, set up the actual attr. @@ -323,7 +324,7 @@ xfs_acl_vset( if (!basicperms) { xfs_acl_set_attr(vp, xfs_acl, kind, &error); } else { - xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); + error = -xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); } out: @@ -707,7 +708,9 @@ xfs_acl_inherit( memcpy(cacl, pdaclp, sizeof(xfs_acl_t)); xfs_acl_filter_mode(mode, cacl); - xfs_acl_setmode(vp, cacl, &basicperms); + error = xfs_acl_setmode(vp, cacl, &basicperms); + if (error) + goto out_error; /* * Set the Default and Access ACL on the file. The mode is already @@ -720,6 +723,7 @@ xfs_acl_inherit( xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error); if (!error && !basicperms) xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error); +out_error: _ACL_FREE(cacl); return error; } diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index bdbfbbe..1956f83 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -45,7 +45,7 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC int +STATIC void xfs_alloc_search_busy(xfs_trans_t *tp, xfs_agnumber_t agno, xfs_agblock_t bno, @@ -55,24 +55,24 @@ xfs_alloc_search_busy(xfs_trans_t *tp, ktrace_t *xfs_alloc_trace_buf; #define TRACE_ALLOC(s,a) \ - xfs_alloc_trace_alloc(__FUNCTION__, s, a, __LINE__) + xfs_alloc_trace_alloc(__func__, s, a, __LINE__) #define TRACE_FREE(s,a,b,x,f) \ - xfs_alloc_trace_free(__FUNCTION__, s, mp, a, b, x, f, __LINE__) + xfs_alloc_trace_free(__func__, s, mp, a, b, x, f, __LINE__) #define TRACE_MODAGF(s,a,f) \ - xfs_alloc_trace_modagf(__FUNCTION__, s, mp, a, f, __LINE__) -#define TRACE_BUSY(__FUNCTION__,s,ag,agb,l,sl,tp) \ - xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) -#define TRACE_UNBUSY(__FUNCTION__,s,ag,sl,tp) \ - xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) -#define TRACE_BUSYSEARCH(__FUNCTION__,s,ag,agb,l,sl,tp) \ - xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) + xfs_alloc_trace_modagf(__func__, s, mp, a, f, __LINE__) +#define TRACE_BUSY(__func__,s,ag,agb,l,sl,tp) \ + xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) +#define TRACE_UNBUSY(__func__,s,ag,sl,tp) \ + xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) +#define TRACE_BUSYSEARCH(__func__,s,ag,agb,l,tp) \ + xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, 0, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) #else #define TRACE_ALLOC(s,a) #define TRACE_FREE(s,a,b,x,f) #define TRACE_MODAGF(s,a,f) #define TRACE_BUSY(s,a,ag,agb,l,sl,tp) #define TRACE_UNBUSY(fname,s,ag,sl,tp) -#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) +#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,tp) #endif /* XFS_ALLOC_TRACE */ /* @@ -93,7 +93,7 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, * Compute aligned version of the found extent. * Takes alignment and min length into account. */ -STATIC int /* success (>= minlen) */ +STATIC void xfs_alloc_compute_aligned( xfs_agblock_t foundbno, /* starting block in found extent */ xfs_extlen_t foundlen, /* length in found extent */ @@ -116,7 +116,6 @@ xfs_alloc_compute_aligned( } *resbno = bno; *reslen = len; - return len >= minlen; } /* @@ -837,9 +836,9 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (!xfs_alloc_compute_aligned(ltbno, ltlen, - args->alignment, args->minlen, - <bnoa, <lena)) + xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, + args->minlen, <bnoa, <lena); + if (ltlena < args->minlen) continue; args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); @@ -958,9 +957,9 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (xfs_alloc_compute_aligned(ltbno, ltlen, - args->alignment, args->minlen, - <bnoa, <lena)) + xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, + args->minlen, <bnoa, <lena); + if (ltlena >= args->minlen) break; if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) goto error0; @@ -974,9 +973,9 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (xfs_alloc_compute_aligned(gtbno, gtlen, - args->alignment, args->minlen, - >bnoa, >lena)) + xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment, + args->minlen, >bnoa, >lena); + if (gtlena >= args->minlen) break; if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) goto error0; @@ -2562,9 +2561,10 @@ xfs_alloc_clear_busy(xfs_trans_t *tp, /* - * returns non-zero if any of (agno,bno):len is in a busy list + * If we find the extent in the busy list, force the log out to get the + * extent out of the busy list so the caller can use it straight away. */ -STATIC int +STATIC void xfs_alloc_search_busy(xfs_trans_t *tp, xfs_agnumber_t agno, xfs_agblock_t bno, @@ -2572,7 +2572,6 @@ xfs_alloc_search_busy(xfs_trans_t *tp, { xfs_mount_t *mp; xfs_perag_busy_t *bsy; - int n; xfs_agblock_t uend, bend; xfs_lsn_t lsn; int cnt; @@ -2585,21 +2584,18 @@ xfs_alloc_search_busy(xfs_trans_t *tp, uend = bno + len - 1; /* search pagb_list for this slot, skipping open slots */ - for (bsy = mp->m_perag[agno].pagb_list, n = 0; - cnt; bsy++, n++) { + for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) { /* * (start1,length1) within (start2, length2) */ if (bsy->busy_tp != NULL) { bend = bsy->busy_start + bsy->busy_length - 1; - if ((bno > bend) || - (uend < bsy->busy_start)) { + if ((bno > bend) || (uend < bsy->busy_start)) { cnt--; } else { TRACE_BUSYSEARCH("xfs_alloc_search_busy", - "found1", agno, bno, len, n, - tp); + "found1", agno, bno, len, tp); break; } } @@ -2610,15 +2606,12 @@ xfs_alloc_search_busy(xfs_trans_t *tp, * transaction that freed the block */ if (cnt) { - TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp); + TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp); lsn = bsy->busy_tp->t_commit_lsn; spin_unlock(&mp->m_perag[agno].pagb_lock); xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); } else { - TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp); - n = -1; + TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp); spin_unlock(&mp->m_perag[agno].pagb_lock); } - - return n; } diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index e58f321..36d781e 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2647,14 +2647,6 @@ attr_trusted_capable( } STATIC int -attr_secure_capable( - bhv_vnode_t *vp, - cred_t *cred) -{ - return -ENOSECURITY; -} - -STATIC int attr_system_set( bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) { @@ -2724,7 +2716,7 @@ struct attrnames attr_secure = { .attr_get = attr_generic_get, .attr_set = attr_generic_set, .attr_remove = attr_generic_remove, - .attr_capable = attr_secure_capable, + .attr_capable = (attrcapable_t)fs_noerr, }; struct attrnames attr_user = { diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 96ba6aa..303d41e 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -166,7 +166,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { if (bytes <= XFS_IFORK_ASIZE(dp)) - return mp->m_attroffset >> 3; + return dp->i_d.di_forkoff; return 0; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 2def273..eb198c0 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -323,13 +323,13 @@ xfs_bmap_trace_pre_update( int whichfork); /* data or attr fork */ #define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) \ - xfs_bmap_trace_delete(__FUNCTION__,d,ip,i,c,w) + xfs_bmap_trace_delete(__func__,d,ip,i,c,w) #define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) \ - xfs_bmap_trace_insert(__FUNCTION__,d,ip,i,c,r1,r2,w) + xfs_bmap_trace_insert(__func__,d,ip,i,c,r1,r2,w) #define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) \ - xfs_bmap_trace_post_update(__FUNCTION__,d,ip,i,w) + xfs_bmap_trace_post_update(__func__,d,ip,i,w) #define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) \ - xfs_bmap_trace_pre_update(__FUNCTION__,d,ip,i,w) + xfs_bmap_trace_pre_update(__func__,d,ip,i,w) #else #define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) #define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) @@ -2402,7 +2402,7 @@ xfs_bmap_extsize_align( #define XFS_ALLOC_GAP_UNITS 4 -STATIC int +STATIC void xfs_bmap_adjacent( xfs_bmalloca_t *ap) /* bmap alloc argument struct */ { @@ -2548,7 +2548,6 @@ xfs_bmap_adjacent( ap->rval = gotbno; } #undef ISVALID - return 0; } STATIC int @@ -4154,16 +4153,21 @@ xfs_bmap_compute_maxlevels( * number of leaf entries, is controlled by the type of di_nextents * (a signed 32-bit number, xfs_extnum_t), or by di_anextents * (a signed 16-bit number, xfs_aextnum_t). + * + * Note that we can no longer assume that if we are in ATTR1 that + * the fork offset of all the inodes will be (m_attroffset >> 3) + * because we could have mounted with ATTR2 and then mounted back + * with ATTR1, keeping the di_forkoff's fixed but probably at + * various positions. Therefore, for both ATTR1 and ATTR2 + * we have to assume the worst case scenario of a minimum size + * available. */ if (whichfork == XFS_DATA_FORK) { maxleafents = MAXEXTNUM; - sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? - XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset; + sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); } else { maxleafents = MAXAEXTNUM; - sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? - XFS_BMDR_SPACE_CALC(MINABTPTRS) : - mp->m_sb.sb_inodesize - mp->m_attroffset; + sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); } maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); minleafrecs = mp->m_bmap_dmnr[0]; @@ -5772,7 +5776,6 @@ xfs_getbmap( int error; /* return value */ __int64_t fixlen; /* length for -1 case */ int i; /* extent number */ - bhv_vnode_t *vp; /* corresponding vnode */ int lock; /* lock state */ xfs_bmbt_irec_t *map; /* buffer for user's data */ xfs_mount_t *mp; /* file system mount point */ @@ -5789,7 +5792,6 @@ xfs_getbmap( int bmapi_flags; /* flags for xfs_bmapi */ __int32_t oflags; /* getbmapx bmv_oflags field */ - vp = XFS_ITOV(ip); mp = ip->i_mount; whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; @@ -5811,7 +5813,7 @@ xfs_getbmap( if ((interface & BMV_IF_NO_DMAPI_READ) == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_READ) && whichfork == XFS_DATA_FORK) { - error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL); + error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); if (error) return XFS_ERROR(error); } @@ -5869,6 +5871,10 @@ xfs_getbmap( /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ error = xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return error; + } } ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0); @@ -6162,10 +6168,10 @@ xfs_check_block( } if (*thispa == *pp) { cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", - __FUNCTION__, j, i, + __func__, j, i, (unsigned long long)be64_to_cpu(*thispa)); panic("%s: ptrs are equal in node\n", - __FUNCTION__); + __func__); } } } @@ -6192,7 +6198,7 @@ xfs_bmap_check_leaf_extents( xfs_mount_t *mp; /* file system mount structure */ __be64 *pp; /* pointer to block address */ xfs_bmbt_rec_t *ep; /* pointer to current extent */ - xfs_bmbt_rec_t *lastp; /* pointer to previous extent */ + xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ xfs_bmbt_rec_t *nextp; /* pointer to next extent */ int bp_release = 0; @@ -6262,7 +6268,6 @@ xfs_bmap_check_leaf_extents( /* * Loop over all leaf nodes checking that all extents are in the right order. */ - lastp = NULL; for (;;) { xfs_fsblock_t nextbno; xfs_extnum_t num_recs; @@ -6283,18 +6288,16 @@ xfs_bmap_check_leaf_extents( */ ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); + if (i) { + xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep); + } for (j = 1; j < num_recs; j++) { nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1); - if (lastp) { - xfs_btree_check_rec(XFS_BTNUM_BMAP, - (void *)lastp, (void *)ep); - } - xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep, - (void *)(nextp)); - lastp = ep; + xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp); ep = nextp; } + last = *ep; i += num_recs; if (bp_release) { bp_release = 0; @@ -6325,13 +6328,13 @@ xfs_bmap_check_leaf_extents( return; error0: - cmn_err(CE_WARN, "%s: at error0", __FUNCTION__); + cmn_err(CE_WARN, "%s: at error0", __func__); if (bp_release) xfs_trans_brelse(NULL, bp); error_norelse: cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", - __FUNCTION__, i); - panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__); + __func__, i); + panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); return; } #endif diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 87224b7..6ff70cd 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -151,7 +151,7 @@ xfs_bmap_trace_exlist( xfs_extnum_t cnt, /* count of entries in list */ int whichfork); /* data or attr fork */ #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ - xfs_bmap_trace_exlist(__FUNCTION__,ip,c,w) + xfs_bmap_trace_exlist(__func__,ip,c,w) #else #define XFS_BMAP_TRACE_EXLIST(ip,c,w) #endif diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index bd18987..4f0e849 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -275,21 +275,21 @@ xfs_bmbt_trace_cursor( } #define XFS_BMBT_TRACE_ARGBI(c,b,i) \ - xfs_bmbt_trace_argbi(__FUNCTION__, c, b, i, __LINE__) + xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__) #define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \ - xfs_bmbt_trace_argbii(__FUNCTION__, c, b, i, j, __LINE__) + xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__) #define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \ - xfs_bmbt_trace_argfffi(__FUNCTION__, c, o, b, i, j, __LINE__) + xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__) #define XFS_BMBT_TRACE_ARGI(c,i) \ - xfs_bmbt_trace_argi(__FUNCTION__, c, i, __LINE__) + xfs_bmbt_trace_argi(__func__, c, i, __LINE__) #define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \ - xfs_bmbt_trace_argifk(__FUNCTION__, c, i, f, s, __LINE__) + xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__) #define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \ - xfs_bmbt_trace_argifr(__FUNCTION__, c, i, f, r, __LINE__) + xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__) #define XFS_BMBT_TRACE_ARGIK(c,i,k) \ - xfs_bmbt_trace_argik(__FUNCTION__, c, i, k, __LINE__) + xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__) #define XFS_BMBT_TRACE_CURSOR(c,s) \ - xfs_bmbt_trace_cursor(__FUNCTION__, c, s, __LINE__) + xfs_bmbt_trace_cursor(__func__, c, s, __LINE__) #else #define XFS_BMBT_TRACE_ARGBI(c,b,i) #define XFS_BMBT_TRACE_ARGBII(c,b,i,j) @@ -2027,6 +2027,24 @@ xfs_bmbt_increment( /* * Insert the current record at the point referenced by cur. + * + * A multi-level split of the tree on insert will invalidate the original + * cursor. It appears, however, that some callers assume that the cursor is + * always valid. Hence if we do a multi-level split we need to revalidate the + * cursor. + * + * When a split occurs, we will see a new cursor returned. Use that as a + * trigger to determine if we need to revalidate the original cursor. If we get + * a split, then use the original irec to lookup up the path of the record we + * just inserted. + * + * Note that the fact that the btree root is in the inode means that we can + * have the level of the tree change without a "split" occurring at the root + * level. What happens is that the root is migrated to an allocated block and + * the inode root is pointed to it. This means a single split can change the + * level of the tree (level 2 -> level 3) and invalidate the old cursor. Hence + * the level change should be accounted as a split so as to correctly trigger a + * revalidation of the old cursor. */ int /* error */ xfs_bmbt_insert( @@ -2039,11 +2057,14 @@ xfs_bmbt_insert( xfs_fsblock_t nbno; xfs_btree_cur_t *ncur; xfs_bmbt_rec_t nrec; + xfs_bmbt_irec_t oirec; /* original irec */ xfs_btree_cur_t *pcur; + int splits = 0; XFS_BMBT_TRACE_CURSOR(cur, ENTRY); level = 0; nbno = NULLFSBLOCK; + oirec = cur->bc_rec.b; xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); ncur = NULL; pcur = cur; @@ -2052,11 +2073,13 @@ xfs_bmbt_insert( &i))) { if (pcur != cur) xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; + goto error0; } XFS_WANT_CORRUPTED_GOTO(i == 1, error0); if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) { + /* allocating a new root is effectively a split */ + if (cur->bc_nlevels != pcur->bc_nlevels) + splits++; cur->bc_nlevels = pcur->bc_nlevels; cur->bc_private.b.allocated += pcur->bc_private.b.allocated; @@ -2070,10 +2093,21 @@ xfs_bmbt_insert( xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); } if (ncur) { + splits++; pcur = ncur; ncur = NULL; } } while (nbno != NULLFSBLOCK); + + if (splits > 1) { + /* revalidate the old cursor as we had a multi-level split */ + error = xfs_bmbt_lookup_eq(cur, oirec.br_startoff, + oirec.br_startblock, oirec.br_blockcount, &i); + if (error) + goto error0; + ASSERT(i == 1); + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); *stat = i; return 0; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 63debd1..53a71c6 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -645,7 +645,12 @@ xfs_buf_item_push( bp = bip->bli_buf; if (XFS_BUF_ISDELAYWRITE(bp)) { - xfs_bawrite(bip->bli_item.li_mountp, bp); + int error; + error = xfs_bawrite(bip->bli_item.li_mountp, bp); + if (error) + xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp, + "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p", + error, bip, bp); } else { xfs_buf_relse(bp); } diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index e92e73f..7cb2652 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -44,6 +44,7 @@ #include "xfs_error.h" #include "xfs_vnodeops.h" +struct xfs_name xfs_name_dotdot = {"..", 2}; void xfs_dir_mount( @@ -146,8 +147,7 @@ int xfs_dir_createname( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, - int namelen, + struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_fsblock_t *first, /* bmap's firstblock */ xfs_bmap_free_t *flist, /* bmap's freeblock list */ @@ -162,9 +162,9 @@ xfs_dir_createname( return rval; XFS_STATS_INC(xs_dir_create); - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); + args.name = name->name; + args.namelen = name->len; + args.hashval = xfs_da_hashname(name->name, name->len); args.inumber = inum; args.dp = dp; args.firstblock = first; @@ -197,8 +197,7 @@ int xfs_dir_lookup( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, - int namelen, + struct xfs_name *name, xfs_ino_t *inum) /* out: inode number */ { xfs_da_args_t args; @@ -207,18 +206,14 @@ xfs_dir_lookup( ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); XFS_STATS_INC(xs_dir_lookup); + memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); - args.inumber = 0; + args.name = name->name; + args.namelen = name->len; + args.hashval = xfs_da_hashname(name->name, name->len); args.dp = dp; - args.firstblock = NULL; - args.flist = NULL; - args.total = 0; args.whichfork = XFS_DATA_FORK; args.trans = tp; - args.justcheck = args.addname = 0; args.oknoent = 1; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) @@ -247,8 +242,7 @@ int xfs_dir_removename( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, - int namelen, + struct xfs_name *name, xfs_ino_t ino, xfs_fsblock_t *first, /* bmap's firstblock */ xfs_bmap_free_t *flist, /* bmap's freeblock list */ @@ -261,9 +255,9 @@ xfs_dir_removename( ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); XFS_STATS_INC(xs_dir_remove); - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); + args.name = name->name; + args.namelen = name->len; + args.hashval = xfs_da_hashname(name->name, name->len); args.inumber = ino; args.dp = dp; args.firstblock = first; @@ -329,8 +323,7 @@ int xfs_dir_replace( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, /* name of entry to replace */ - int namelen, + struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_fsblock_t *first, /* bmap's firstblock */ xfs_bmap_free_t *flist, /* bmap's freeblock list */ @@ -345,9 +338,9 @@ xfs_dir_replace( if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) return rval; - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); + args.name = name->name; + args.namelen = name->len; + args.hashval = xfs_da_hashname(name->name, name->len); args.inumber = inum; args.dp = dp; args.firstblock = first; @@ -374,28 +367,29 @@ xfs_dir_replace( /* * See if this entry can be added to the directory without allocating space. + * First checks that the caller couldn't reserve enough space (resblks = 0). */ int xfs_dir_canenter( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, /* name of entry to add */ - int namelen) + struct xfs_name *name, /* name of entry to add */ + uint resblks) { xfs_da_args_t args; int rval; int v; /* type-checking value */ + if (resblks) + return 0; + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); - args.inumber = 0; + args.name = name->name; + args.namelen = name->len; + args.hashval = xfs_da_hashname(name->name, name->len); args.dp = dp; - args.firstblock = NULL; - args.flist = NULL; - args.total = 0; args.whichfork = XFS_DATA_FORK; args.trans = tp; args.justcheck = args.addname = args.oknoent = 1; diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index b265197..6392f93 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -59,6 +59,8 @@ typedef __uint32_t xfs_dir2_db_t; */ typedef xfs_off_t xfs_dir2_off_t; +extern struct xfs_name xfs_name_dotdot; + /* * Generic directory interface routines */ @@ -68,21 +70,21 @@ extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen, xfs_ino_t inum, + struct xfs_name *name, xfs_ino_t inum, xfs_fsblock_t *first, struct xfs_bmap_free *flist, xfs_extlen_t tot); extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen, xfs_ino_t *inum); + struct xfs_name *name, xfs_ino_t *inum); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen, xfs_ino_t ino, + struct xfs_name *name, xfs_ino_t ino, xfs_fsblock_t *first, struct xfs_bmap_free *flist, xfs_extlen_t tot); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen, xfs_ino_t inum, + struct xfs_name *name, xfs_ino_t inum, xfs_fsblock_t *first, struct xfs_bmap_free *flist, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen); + struct xfs_name *name, uint resblks); extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); /* diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index eb03eab..3f3785b 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -73,7 +73,7 @@ xfs_filestreams_trace( #define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) #define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) #define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ - xfs_filestreams_trace(mp, t, __FUNCTION__, __LINE__, \ + xfs_filestreams_trace(mp, t, __func__, __LINE__, \ (__psunsigned_t)a0, (__psunsigned_t)a1, \ (__psunsigned_t)a2, (__psunsigned_t)a3, \ (__psunsigned_t)a4, (__psunsigned_t)a5) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 5a146cb..a64dfbd 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -107,6 +107,16 @@ xfs_ialloc_log_di( /* * Allocation group level functions. */ +static inline int +xfs_ialloc_cluster_alignment( + xfs_alloc_arg_t *args) +{ + if (xfs_sb_version_hasalign(&args->mp->m_sb) && + args->mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) + return args->mp->m_sb.sb_inoalignmt; + return 1; +} /* * Allocate new inodes in the allocation group specified by agbp. @@ -167,10 +177,24 @@ xfs_ialloc_ag_alloc( args.mod = args.total = args.wasdel = args.isfl = args.userdata = args.minalignslop = 0; args.prod = 1; - args.alignment = 1; + /* - * Allow space for the inode btree to split. + * We need to take into account alignment here to ensure that + * we don't modify the free list if we fail to have an exact + * block. If we don't have an exact match, and every oher + * attempt allocation attempt fails, we'll end up cancelling + * a dirty transaction and shutting down. + * + * For an exact allocation, alignment must be 1, + * however we need to take cluster alignment into account when + * fixing up the freelist. Use the minalignslop field to + * indicate that extra blocks might be required for alignment, + * but not to use them in the actual exact allocation. */ + args.alignment = 1; + args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; + + /* Allow space for the inode btree to split. */ args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; if ((error = xfs_alloc_vextent(&args))) return error; @@ -191,13 +215,8 @@ xfs_ialloc_ag_alloc( ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); args.alignment = args.mp->m_dalign; isaligned = 1; - } else if (xfs_sb_version_hasalign(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args.mp, - XFS_INODE_CLUSTER_SIZE(args.mp))) - args.alignment = args.mp->m_sb.sb_inoalignmt; - else - args.alignment = 1; + } else + args.alignment = xfs_ialloc_cluster_alignment(&args); /* * Need to figure out where to allocate the inode blocks. * Ideally they should be spaced out through the a.g. @@ -230,12 +249,7 @@ xfs_ialloc_ag_alloc( args.agbno = be32_to_cpu(agi->agi_root); args.fsbno = XFS_AGB_TO_FSB(args.mp, be32_to_cpu(agi->agi_seqno), args.agbno); - if (xfs_sb_version_hasalign(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) - args.alignment = args.mp->m_sb.sb_inoalignmt; - else - args.alignment = 1; + args.alignment = xfs_ialloc_cluster_alignment(&args); if ((error = xfs_alloc_vextent(&args))) return error; } diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 8e09b71..e657c51 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -78,7 +78,6 @@ xfs_iget_core( xfs_inode_t *ip; xfs_inode_t *iq; int error; - xfs_icluster_t *icl, *new_icl = NULL; unsigned long first_index, mask; xfs_perag_t *pag; xfs_agino_t agino; @@ -229,11 +228,9 @@ finish_inode: } /* - * This is a bit messy - we preallocate everything we _might_ - * need before we pick up the ici lock. That way we don't have to - * juggle locks and go all the way back to the start. + * Preload the radix tree so we can insert safely under the + * write spinlock. */ - new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP); if (radix_tree_preload(GFP_KERNEL)) { xfs_idestroy(ip); delay(1); @@ -242,17 +239,6 @@ finish_inode: mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); first_index = agino & mask; write_lock(&pag->pag_ici_lock); - - /* - * Find the cluster if it exists - */ - icl = NULL; - if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq, - first_index, 1)) { - if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index) - icl = iq->i_cluster; - } - /* * insert the new inode */ @@ -267,30 +253,13 @@ finish_inode: } /* - * These values _must_ be set before releasing ihlock! + * These values _must_ be set before releasing the radix tree lock! */ ip->i_udquot = ip->i_gdquot = NULL; xfs_iflags_set(ip, XFS_INEW); - ASSERT(ip->i_cluster == NULL); - - if (!icl) { - spin_lock_init(&new_icl->icl_lock); - INIT_HLIST_HEAD(&new_icl->icl_inodes); - icl = new_icl; - new_icl = NULL; - } else { - ASSERT(!hlist_empty(&icl->icl_inodes)); - } - spin_lock(&icl->icl_lock); - hlist_add_head(&ip->i_cnode, &icl->icl_inodes); - ip->i_cluster = icl; - spin_unlock(&icl->icl_lock); - write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); - if (new_icl) - kmem_zone_free(xfs_icluster_zone, new_icl); /* * Link ip to its mount and thread it on the mount's inode list. @@ -529,18 +498,6 @@ xfs_iextract( xfs_put_perag(mp, pag); /* - * Remove from cluster list - */ - mp = ip->i_mount; - spin_lock(&ip->i_cluster->icl_lock); - hlist_del(&ip->i_cnode); - spin_unlock(&ip->i_cluster->icl_lock); - - /* was last inode in cluster? */ - if (hlist_empty(&ip->i_cluster->icl_inodes)) - kmem_zone_free(xfs_icluster_zone, ip->i_cluster); - - /* * Remove from mount's inode list. */ XFS_MOUNT_ILOCK(mp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f43a6e0..ca12acb 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -55,7 +55,6 @@ kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; -kmem_zone_t *xfs_icluster_zone; /* * Used in xfs_itruncate(). This is the maximum number of extents @@ -126,6 +125,90 @@ xfs_inobp_check( #endif /* + * Find the buffer associated with the given inode map + * We do basic validation checks on the buffer once it has been + * retrieved from disk. + */ +STATIC int +xfs_imap_to_bp( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_imap_t *imap, + xfs_buf_t **bpp, + uint buf_flags, + uint imap_flags) +{ + int error; + int i; + int ni; + xfs_buf_t *bp; + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + (int)imap->im_len, buf_flags, &bp); + if (error) { + if (error != EAGAIN) { + cmn_err(CE_WARN, + "xfs_imap_to_bp: xfs_trans_read_buf()returned " + "an error %d on %s. Returning error.", + error, mp->m_fsname); + } else { + ASSERT(buf_flags & XFS_BUF_TRYLOCK); + } + return error; + } + + /* + * Validate the magic number and version of every inode in the buffer + * (if DEBUG kernel) or the first inode in the buffer, otherwise. + */ +#ifdef DEBUG + ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; +#else /* usual case */ + ni = 1; +#endif + + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && + XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (imap_flags & XFS_IMAP_BULKSTAT) { + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EINVAL); + } + XFS_CORRUPTION_ERROR("xfs_imap_to_bp", + XFS_ERRLEVEL_HIGH, mp, dip); +#ifdef DEBUG + cmn_err(CE_PANIC, + "Device %s - bad inode magic/vsn " + "daddr %lld #%d (magic=%x)", + XFS_BUFTARG_NAME(mp->m_ddev_targp), + (unsigned long long)imap->im_blkno, i, + be16_to_cpu(dip->di_core.di_magic)); +#endif + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); + } + } + + xfs_inobp_check(mp, bp); + + /* + * Mark the buffer as an inode buffer now that it looks good + */ + XFS_BUF_SET_VTYPE(bp, B_FS_INO); + + *bpp = bp; + return 0; +} + +/* * This routine is called to map an inode number within a file * system to the buffer containing the on-disk version of the * inode. It returns a pointer to the buffer containing the @@ -147,72 +230,19 @@ xfs_inotobp( xfs_buf_t **bpp, int *offset) { - int di_ok; xfs_imap_t imap; xfs_buf_t *bp; int error; - xfs_dinode_t *dip; - /* - * Call the space management code to find the location of the - * inode on disk. - */ imap.im_blkno = 0; error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); - if (error != 0) { - cmn_err(CE_WARN, - "xfs_inotobp: xfs_imap() returned an " - "error %d on %s. Returning error.", error, mp->m_fsname); + if (error) return error; - } - /* - * If the inode number maps to a block outside the bounds of the - * file system then return NULL rather than calling read_buf - * and panicing when we get an error from the driver. - */ - if ((imap.im_blkno + imap.im_len) > - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { - cmn_err(CE_WARN, - "xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds " - "of the file system %s. Returning EINVAL.", - (unsigned long long)imap.im_blkno, - imap.im_len, mp->m_fsname); - return XFS_ERROR(EINVAL); - } - - /* - * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will - * default to just a read_buf() call. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, - (int)imap.im_len, XFS_BUF_LOCK, &bp); - - if (error) { - cmn_err(CE_WARN, - "xfs_inotobp: xfs_trans_read_buf() returned an " - "error %d on %s. Returning error.", error, mp->m_fsname); + error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0); + if (error) return error; - } - dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0); - di_ok = - be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && - XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip); - xfs_trans_brelse(tp, bp); - cmn_err(CE_WARN, - "xfs_inotobp: XFS_TEST_ERROR() returned an " - "error on %s. Returning EFSCORRUPTED.", mp->m_fsname); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_inobp_check(mp, bp); - - /* - * Set *dipp to point to the on-disk inode in the buffer. - */ *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); *bpp = bp; *offset = imap.im_boffset; @@ -248,46 +278,21 @@ xfs_itobp( xfs_dinode_t **dipp, xfs_buf_t **bpp, xfs_daddr_t bno, - uint imap_flags) + uint imap_flags, + uint buf_flags) { xfs_imap_t imap; xfs_buf_t *bp; int error; - int i; - int ni; if (ip->i_blkno == (xfs_daddr_t)0) { - /* - * Call the space management code to find the location of the - * inode on disk. - */ imap.im_blkno = bno; - if ((error = xfs_imap(mp, tp, ip->i_ino, &imap, - XFS_IMAP_LOOKUP | imap_flags))) + error = xfs_imap(mp, tp, ip->i_ino, &imap, + XFS_IMAP_LOOKUP | imap_flags); + if (error) return error; /* - * If the inode number maps to a block outside the bounds - * of the file system then return NULL rather than calling - * read_buf and panicing when we get an error from the - * driver. - */ - if ((imap.im_blkno + imap.im_len) > - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " - "(imap.im_blkno (0x%llx) " - "+ imap.im_len (0x%llx)) > " - " XFS_FSB_TO_BB(mp, " - "mp->m_sb.sb_dblocks) (0x%llx)", - (unsigned long long) imap.im_blkno, - (unsigned long long) imap.im_len, - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); -#endif /* DEBUG */ - return XFS_ERROR(EINVAL); - } - - /* * Fill in the fields in the inode that will be used to * map the inode to its buffer from now on. */ @@ -305,76 +310,17 @@ xfs_itobp( } ASSERT(bno == 0 || bno == imap.im_blkno); - /* - * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will - * default to just a read_buf() call. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, - (int)imap.im_len, XFS_BUF_LOCK, &bp); - if (error) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " - "xfs_trans_read_buf() returned error %d, " - "imap.im_blkno 0x%llx, imap.im_len 0x%llx", - error, (unsigned long long) imap.im_blkno, - (unsigned long long) imap.im_len); -#endif /* DEBUG */ + error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags); + if (error) return error; - } - - /* - * Validate the magic number and version of every inode in the buffer - * (if DEBUG kernel) or the first inode in the buffer, otherwise. - * No validation is done here in userspace (xfs_repair). - */ -#if !defined(__KERNEL__) - ni = 0; -#elif defined(DEBUG) - ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog; -#else /* usual case */ - ni = 1; -#endif - - for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); - di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && - XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - if (imap_flags & XFS_IMAP_BULKSTAT) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EINVAL); - } -#ifdef DEBUG - cmn_err(CE_ALERT, - "Device %s - bad inode magic/vsn " - "daddr %lld #%d (magic=%x)", - XFS_BUFTARG_NAME(mp->m_ddev_targp), - (unsigned long long)imap.im_blkno, i, - be16_to_cpu(dip->di_core.di_magic)); -#endif - XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH, - mp, dip); - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } + if (!bp) { + ASSERT(buf_flags & XFS_BUF_TRYLOCK); + ASSERT(tp == NULL); + *bpp = NULL; + return EAGAIN; } - xfs_inobp_check(mp, bp); - - /* - * Mark the buffer as an inode buffer now that it looks good - */ - XFS_BUF_SET_VTYPE(bp, B_FS_INO); - - /* - * Set *dipp to point to the on-disk inode in the buffer. - */ *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); *bpp = bp; return 0; @@ -878,7 +824,7 @@ xfs_iread( * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will * know that this is a new incore inode. */ - error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags); + error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK); if (error) { kmem_zone_free(xfs_inode_zone, ip); return error; @@ -1518,51 +1464,50 @@ xfs_itruncate_start( } /* - * Shrink the file to the given new_size. The new - * size must be smaller than the current size. - * This will free up the underlying blocks - * in the removed range after a call to xfs_itruncate_start() - * or xfs_atruncate_start(). + * Shrink the file to the given new_size. The new size must be smaller than + * the current size. This will free up the underlying blocks in the removed + * range after a call to xfs_itruncate_start() or xfs_atruncate_start(). * - * The transaction passed to this routine must have made - * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES. - * This routine may commit the given transaction and - * start new ones, so make sure everything involved in - * the transaction is tidy before calling here. - * Some transaction will be returned to the caller to be - * committed. The incoming transaction must already include - * the inode, and both inode locks must be held exclusively. - * The inode must also be "held" within the transaction. On - * return the inode will be "held" within the returned transaction. - * This routine does NOT require any disk space to be reserved - * for it within the transaction. + * The transaction passed to this routine must have made a permanent log + * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the + * given transaction and start new ones, so make sure everything involved in + * the transaction is tidy before calling here. Some transaction will be + * returned to the caller to be committed. The incoming transaction must + * already include the inode, and both inode locks must be held exclusively. + * The inode must also be "held" within the transaction. On return the inode + * will be "held" within the returned transaction. This routine does NOT + * require any disk space to be reserved for it within the transaction. * - * The fork parameter must be either xfs_attr_fork or xfs_data_fork, - * and it indicates the fork which is to be truncated. For the - * attribute fork we only support truncation to size 0. + * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it + * indicates the fork which is to be truncated. For the attribute fork we only + * support truncation to size 0. * - * We use the sync parameter to indicate whether or not the first - * transaction we perform might have to be synchronous. For the attr fork, - * it needs to be so if the unlink of the inode is not yet known to be - * permanent in the log. This keeps us from freeing and reusing the - * blocks of the attribute fork before the unlink of the inode becomes - * permanent. + * We use the sync parameter to indicate whether or not the first transaction + * we perform might have to be synchronous. For the attr fork, it needs to be + * so if the unlink of the inode is not yet known to be permanent in the log. + * This keeps us from freeing and reusing the blocks of the attribute fork + * before the unlink of the inode becomes permanent. * - * For the data fork, we normally have to run synchronously if we're - * being called out of the inactive path or we're being called - * out of the create path where we're truncating an existing file. - * Either way, the truncate needs to be sync so blocks don't reappear - * in the file with altered data in case of a crash. wsync filesystems - * can run the first case async because anything that shrinks the inode - * has to run sync so by the time we're called here from inactive, the - * inode size is permanently set to 0. + * For the data fork, we normally have to run synchronously if we're being + * called out of the inactive path or we're being called out of the create path + * where we're truncating an existing file. Either way, the truncate needs to + * be sync so blocks don't reappear in the file with altered data in case of a + * crash. wsync filesystems can run the first case async because anything that + * shrinks the inode has to run sync so by the time we're called here from + * inactive, the inode size is permanently set to 0. * - * Calls from the truncate path always need to be sync unless we're - * in a wsync filesystem and the file has already been unlinked. + * Calls from the truncate path always need to be sync unless we're in a wsync + * filesystem and the file has already been unlinked. * - * The caller is responsible for correctly setting the sync parameter. - * It gets too hard for us to guess here which path we're being called - * out of just based on inode state. + * The caller is responsible for correctly setting the sync parameter. It gets + * too hard for us to guess here which path we're being called out of just + * based on inode state. + * + * If we get an error, we must return with the inode locked and linked into the + * current transaction. This keeps things simple for the higher level code, + * because it always knows that the inode is locked and held in the transaction + * that returns to it whether errors occur or not. We don't mark the inode + * dirty on error so that transactions can be easily aborted if possible. */ int xfs_itruncate_finish( @@ -1741,65 +1686,51 @@ xfs_itruncate_finish( */ error = xfs_bmap_finish(tp, &free_list, &committed); ntp = *tp; + if (committed) { + /* link the inode into the next xact in the chain */ + xfs_trans_ijoin(ntp, ip, + XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ihold(ntp, ip); + } + if (error) { /* - * If the bmap finish call encounters an error, - * return to the caller where the transaction - * can be properly aborted. We just need to - * make sure we're not holding any resources - * that we were not when we came in. + * If the bmap finish call encounters an error, return + * to the caller where the transaction can be properly + * aborted. We just need to make sure we're not + * holding any resources that we were not when we came + * in. * - * Aborting from this point might lose some - * blocks in the file system, but oh well. + * Aborting from this point might lose some blocks in + * the file system, but oh well. */ xfs_bmap_cancel(&free_list); - if (committed) { - /* - * If the passed in transaction committed - * in xfs_bmap_finish(), then we want to - * add the inode to this one before returning. - * This keeps things simple for the higher - * level code, because it always knows that - * the inode is locked and held in the - * transaction that returns to it whether - * errors occur or not. We don't mark the - * inode dirty so that this transaction can - * be easily aborted if possible. - */ - xfs_trans_ijoin(ntp, ip, - XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(ntp, ip); - } return error; } if (committed) { /* - * The first xact was committed, - * so add the inode to the new one. - * Mark it dirty so it will be logged - * and moved forward in the log as - * part of every commit. + * Mark the inode dirty so it will be logged and + * moved forward in the log as part of every commit. */ - xfs_trans_ijoin(ntp, ip, - XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(ntp, ip); xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); } + ntp = xfs_trans_dup(ntp); - (void) xfs_trans_commit(*tp, 0); + error = xfs_trans_commit(*tp, 0); *tp = ntp; - error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - /* - * Add the inode being truncated to the next chained - * transaction. - */ + + /* link the inode into the next transaction in the chain */ xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_ihold(ntp, ip); + + if (!error) + error = xfs_trans_reserve(ntp, 0, + XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); if (error) - return (error); + return error; } /* * Only update the size in the case of the data fork, but @@ -1967,7 +1898,7 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); if (error) return error; @@ -2075,7 +2006,7 @@ xfs_iunlink_remove( * of dealing with the buffer when there is no need to * change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", @@ -2137,7 +2068,7 @@ xfs_iunlink_remove( * Now last_ibp points to the buffer previous to us on * the unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", @@ -2172,13 +2103,6 @@ xfs_iunlink_remove( return 0; } -STATIC_INLINE int xfs_inode_clean(xfs_inode_t *ip) -{ - return (((ip->i_itemp == NULL) || - !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && - (ip->i_update_core == 0)); -} - STATIC void xfs_ifree_cluster( xfs_inode_t *free_ip, @@ -2400,7 +2324,7 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0); + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); if (error) return error; @@ -2678,14 +2602,31 @@ xfs_imap( fsbno = imap->im_blkno ? XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK; error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags); - if (error != 0) { + if (error) return error; - } + imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno); imap->im_len = XFS_FSB_TO_BB(mp, len); imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno); imap->im_ioffset = (ushort)off; imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog); + + /* + * If the inode number maps to a block outside the bounds + * of the file system then return NULL rather than calling + * read_buf and panicing when we get an error from the + * driver. + */ + if ((imap->im_blkno + imap->im_len) > + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " + "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " + " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", + (unsigned long long) imap->im_blkno, + (unsigned long long) imap->im_len, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); + return EINVAL; + } return 0; } @@ -2826,38 +2767,41 @@ xfs_iunpin( } /* - * This is called to wait for the given inode to be unpinned. - * It will sleep until this happens. The caller must have the - * inode locked in at least shared mode so that the buffer cannot - * be subsequently pinned once someone is waiting for it to be - * unpinned. + * This is called to unpin an inode. It can be directed to wait or to return + * immediately without waiting for the inode to be unpinned. The caller must + * have the inode locked in at least shared mode so that the buffer cannot be + * subsequently pinned once someone is waiting for it to be unpinned. */ STATIC void -xfs_iunpin_wait( - xfs_inode_t *ip) +__xfs_iunpin_wait( + xfs_inode_t *ip, + int wait) { - xfs_inode_log_item_t *iip; - xfs_lsn_t lsn; + xfs_inode_log_item_t *iip = ip->i_itemp; ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); - - if (atomic_read(&ip->i_pincount) == 0) { + if (atomic_read(&ip->i_pincount) == 0) return; - } - iip = ip->i_itemp; - if (iip && iip->ili_last_lsn) { - lsn = iip->ili_last_lsn; - } else { - lsn = (xfs_lsn_t)0; - } + /* Give the log a push to start the unpinning I/O */ + xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? + iip->ili_last_lsn : 0, XFS_LOG_FORCE); + if (wait) + wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); +} - /* - * Give the log a push so we don't wait here too long. - */ - xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE); +static inline void +xfs_iunpin_wait( + xfs_inode_t *ip) +{ + __xfs_iunpin_wait(ip, 1); +} - wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); +static inline void +xfs_iunpin_nowait( + xfs_inode_t *ip) +{ + __xfs_iunpin_wait(ip, 0); } @@ -2932,7 +2876,7 @@ xfs_iextents_copy( * format indicates the current state of the fork. */ /*ARGSUSED*/ -STATIC int +STATIC void xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, @@ -2953,16 +2897,16 @@ xfs_iflush_fork( static const short extflag[2] = { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; - if (iip == NULL) - return 0; + if (!iip) + return; ifp = XFS_IFORK_PTR(ip, whichfork); /* * This can happen if we gave up in iformat in an error path, * for the attribute fork. */ - if (ifp == NULL) { + if (!ifp) { ASSERT(whichfork == XFS_ATTR_FORK); - return 0; + return; } cp = XFS_DFORK_PTR(dip, whichfork); mp = ip->i_mount; @@ -3023,8 +2967,145 @@ xfs_iflush_fork( ASSERT(0); break; } +} + +STATIC int +xfs_iflush_cluster( + xfs_inode_t *ip, + xfs_buf_t *bp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); + unsigned long first_index, mask; + int ilist_size; + xfs_inode_t **ilist; + xfs_inode_t *iq; + int nr_found; + int clcount = 0; + int bufwasdelwri; + int i; + + ASSERT(pag->pagi_inodeok); + ASSERT(pag->pag_ici_init); + + ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *); + ilist = kmem_alloc(ilist_size, KM_MAYFAIL); + if (!ilist) + return 0; + + mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; + read_lock(&pag->pag_ici_lock); + /* really need a gang lookup range call here */ + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + first_index, + XFS_INODE_CLUSTER_SIZE(mp)); + if (nr_found == 0) + goto out_free; + + for (i = 0; i < nr_found; i++) { + iq = ilist[i]; + if (iq == ip) + continue; + /* if the inode lies outside this cluster, we're done. */ + if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) + break; + /* + * Do an un-protected check to see if the inode is dirty and + * is a candidate for flushing. These checks will be repeated + * later after the appropriate locks are acquired. + */ + if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) + continue; + + /* + * Try to get locks. If any are unavailable or it is pinned, + * then this inode cannot be flushed and is skipped. + */ + + if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + continue; + if (!xfs_iflock_nowait(iq)) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + if (xfs_ipincount(iq)) { + xfs_ifunlock(iq); + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + + /* + * arriving here means that this inode can be flushed. First + * re-check that it's dirty before flushing. + */ + if (!xfs_inode_clean(iq)) { + int error; + error = xfs_iflush_int(iq, bp); + if (error) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + goto cluster_corrupt_out; + } + clcount++; + } else { + xfs_ifunlock(iq); + } + xfs_iunlock(iq, XFS_ILOCK_SHARED); + } + + if (clcount) { + XFS_STATS_INC(xs_icluster_flushcnt); + XFS_STATS_ADD(xs_icluster_flushinode, clcount); + } +out_free: + read_unlock(&pag->pag_ici_lock); + kmem_free(ilist, ilist_size); return 0; + + +cluster_corrupt_out: + /* + * Corruption detected in the clustering loop. Invalidate the + * inode buffer and shut down the filesystem. + */ + read_unlock(&pag->pag_ici_lock); + /* + * Clean up the buffer. If it was B_DELWRI, just release it -- + * brelse can handle it with no problems. If not, shut down the + * filesystem before releasing the buffer. + */ + bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); + if (bufwasdelwri) + xfs_buf_relse(bp); + + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + + if (!bufwasdelwri) { + /* + * Just like incore_relse: if we have b_iodone functions, + * mark the buffer as an error and call them. Otherwise + * mark it as stale and brelse. + */ + if (XFS_BUF_IODONE_FUNC(bp)) { + XFS_BUF_CLR_BDSTRAT_FUNC(bp); + XFS_BUF_UNDONE(bp); + XFS_BUF_STALE(bp); + XFS_BUF_SHUT(bp); + XFS_BUF_ERROR(bp,EIO); + xfs_biodone(bp); + } else { + XFS_BUF_STALE(bp); + xfs_buf_relse(bp); + } + } + + /* + * Unlocks the flush lock + */ + xfs_iflush_abort(iq); + kmem_free(ilist, ilist_size); + return XFS_ERROR(EFSCORRUPTED); } /* @@ -3046,11 +3127,7 @@ xfs_iflush( xfs_dinode_t *dip; xfs_mount_t *mp; int error; - /* REFERENCED */ - xfs_inode_t *iq; - int clcount; /* count of inodes clustered */ - int bufwasdelwri; - struct hlist_node *entry; + int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK); enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; XFS_STATS_INC(xs_iflush_count); @@ -3067,8 +3144,7 @@ xfs_iflush( * If the inode isn't dirty, then just release the inode * flush lock and do nothing. */ - if ((ip->i_update_core == 0) && - ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { + if (xfs_inode_clean(ip)) { ASSERT((iip != NULL) ? !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1); xfs_ifunlock(ip); @@ -3076,11 +3152,21 @@ xfs_iflush( } /* - * We can't flush the inode until it is unpinned, so - * wait for it. We know noone new can pin it, because - * we are holding the inode lock shared and you need - * to hold it exclusively to pin the inode. + * We can't flush the inode until it is unpinned, so wait for it if we + * are allowed to block. We know noone new can pin it, because we are + * holding the inode lock shared and you need to hold it exclusively to + * pin the inode. + * + * If we are not allowed to block, force the log out asynchronously so + * that when we come back the inode will be unpinned. If other inodes + * in the same cluster are dirty, they will probably write the inode + * out for us if they occur after the log force completes. */ + if (noblock && xfs_ipincount(ip)) { + xfs_iunpin_nowait(ip); + xfs_ifunlock(ip); + return EAGAIN; + } xfs_iunpin_wait(ip); /* @@ -3097,15 +3183,6 @@ xfs_iflush( } /* - * Get the buffer containing the on-disk inode. - */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0); - if (error) { - xfs_ifunlock(ip); - return error; - } - - /* * Decide how buffer will be flushed out. This is done before * the call to xfs_iflush_int because this field is zeroed by it. */ @@ -3121,6 +3198,7 @@ xfs_iflush( case XFS_IFLUSH_DELWRI_ELSE_SYNC: flags = 0; break; + case XFS_IFLUSH_ASYNC_NOBLOCK: case XFS_IFLUSH_ASYNC: case XFS_IFLUSH_DELWRI_ELSE_ASYNC: flags = INT_ASYNC; @@ -3140,6 +3218,7 @@ xfs_iflush( case XFS_IFLUSH_DELWRI: flags = INT_DELWRI; break; + case XFS_IFLUSH_ASYNC_NOBLOCK: case XFS_IFLUSH_ASYNC: flags = INT_ASYNC; break; @@ -3154,94 +3233,41 @@ xfs_iflush( } /* - * First flush out the inode that xfs_iflush was called with. + * Get the buffer containing the on-disk inode. */ - error = xfs_iflush_int(ip, bp); - if (error) { - goto corrupt_out; + error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0, + noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); + if (error || !bp) { + xfs_ifunlock(ip); + return error; } /* - * inode clustering: - * see if other inodes can be gathered into this write + * First flush out the inode that xfs_iflush was called with. */ - spin_lock(&ip->i_cluster->icl_lock); - ip->i_cluster->icl_buf = bp; - - clcount = 0; - hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) { - if (iq == ip) - continue; - - /* - * Do an un-protected check to see if the inode is dirty and - * is a candidate for flushing. These checks will be repeated - * later after the appropriate locks are acquired. - */ - iip = iq->i_itemp; - if ((iq->i_update_core == 0) && - ((iip == NULL) || - !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) && - xfs_ipincount(iq) == 0) { - continue; - } - - /* - * Try to get locks. If any are unavailable, - * then this inode cannot be flushed and is skipped. - */ - - /* get inode locks (just i_lock) */ - if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) { - /* get inode flush lock */ - if (xfs_iflock_nowait(iq)) { - /* check if pinned */ - if (xfs_ipincount(iq) == 0) { - /* arriving here means that - * this inode can be flushed. - * first re-check that it's - * dirty - */ - iip = iq->i_itemp; - if ((iq->i_update_core != 0)|| - ((iip != NULL) && - (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { - clcount++; - error = xfs_iflush_int(iq, bp); - if (error) { - xfs_iunlock(iq, - XFS_ILOCK_SHARED); - goto cluster_corrupt_out; - } - } else { - xfs_ifunlock(iq); - } - } else { - xfs_ifunlock(iq); - } - } - xfs_iunlock(iq, XFS_ILOCK_SHARED); - } - } - spin_unlock(&ip->i_cluster->icl_lock); - - if (clcount) { - XFS_STATS_INC(xs_icluster_flushcnt); - XFS_STATS_ADD(xs_icluster_flushinode, clcount); - } + error = xfs_iflush_int(ip, bp); + if (error) + goto corrupt_out; /* - * If the buffer is pinned then push on the log so we won't + * If the buffer is pinned then push on the log now so we won't * get stuck waiting in the write for too long. */ - if (XFS_BUF_ISPINNED(bp)){ + if (XFS_BUF_ISPINNED(bp)) xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); - } + + /* + * inode clustering: + * see if other inodes can be gathered into this write + */ + error = xfs_iflush_cluster(ip, bp); + if (error) + goto cluster_corrupt_out; if (flags & INT_DELWRI) { xfs_bdwrite(mp, bp); } else if (flags & INT_ASYNC) { - xfs_bawrite(mp, bp); + error = xfs_bawrite(mp, bp); } else { error = xfs_bwrite(mp, bp); } @@ -3250,52 +3276,11 @@ xfs_iflush( corrupt_out: xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - xfs_iflush_abort(ip); - /* - * Unlocks the flush lock - */ - return XFS_ERROR(EFSCORRUPTED); - cluster_corrupt_out: - /* Corruption detected in the clustering loop. Invalidate the - * inode buffer and shut down the filesystem. - */ - spin_unlock(&ip->i_cluster->icl_lock); - - /* - * Clean up the buffer. If it was B_DELWRI, just release it -- - * brelse can handle it with no problems. If not, shut down the - * filesystem before releasing the buffer. - */ - if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) { - xfs_buf_relse(bp); - } - - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - - if(!bufwasdelwri) { - /* - * Just like incore_relse: if we have b_iodone functions, - * mark the buffer as an error and call them. Otherwise - * mark it as stale and brelse. - */ - if (XFS_BUF_IODONE_FUNC(bp)) { - XFS_BUF_CLR_BDSTRAT_FUNC(bp); - XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); - XFS_BUF_SHUT(bp); - XFS_BUF_ERROR(bp,EIO); - xfs_biodone(bp); - } else { - XFS_BUF_STALE(bp); - xfs_buf_relse(bp); - } - } - - xfs_iflush_abort(iq); /* * Unlocks the flush lock */ + xfs_iflush_abort(ip); return XFS_ERROR(EFSCORRUPTED); } @@ -3325,8 +3310,7 @@ xfs_iflush_int( * If the inode isn't dirty, then just release the inode * flush lock and do nothing. */ - if ((ip->i_update_core == 0) && - ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { + if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); return 0; } @@ -3459,16 +3443,9 @@ xfs_iflush_int( } } - if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) { - goto corrupt_out; - } - - if (XFS_IFORK_Q(ip)) { - /* - * The only error from xfs_iflush_fork is on the data fork. - */ - (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); - } + xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); + if (XFS_IFORK_Q(ip)) + xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); xfs_inobp_check(mp, bp); /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index bfcd72c..93c3769 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -133,19 +133,6 @@ typedef struct dm_attrs_s { } dm_attrs_t; /* - * This is the xfs inode cluster structure. This structure is used by - * xfs_iflush to find inodes that share a cluster and can be flushed to disk at - * the same time. - */ -typedef struct xfs_icluster { - struct hlist_head icl_inodes; /* list of inodes on cluster */ - xfs_daddr_t icl_blkno; /* starting block number of - * the cluster */ - struct xfs_buf *icl_buf; /* the inode buffer */ - spinlock_t icl_lock; /* inode list lock */ -} xfs_icluster_t; - -/* * This is the xfs in-core inode structure. * Most of the on-disk inode is embedded in the i_d field. * @@ -240,10 +227,6 @@ typedef struct xfs_inode { atomic_t i_pincount; /* inode pin count */ wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ spinlock_t i_flags_lock; /* inode i_flags lock */ -#ifdef HAVE_REFCACHE - struct xfs_inode **i_refcache; /* ptr to entry in ref cache */ - struct xfs_inode *i_release; /* inode to unref */ -#endif /* Miscellaneous state. */ unsigned short i_flags; /* see defined flags below */ unsigned char i_update_core; /* timestamps/size is dirty */ @@ -252,8 +235,6 @@ typedef struct xfs_inode { unsigned int i_delayed_blks; /* count of delay alloc blks */ xfs_icdinode_t i_d; /* most of ondisk inode */ - xfs_icluster_t *i_cluster; /* cluster list header */ - struct hlist_node i_cnode; /* cluster link node */ xfs_fsize_t i_size; /* in-memory size */ xfs_fsize_t i_new_size; /* size when write completes */ @@ -461,6 +442,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) #define XFS_IFLUSH_SYNC 3 #define XFS_IFLUSH_ASYNC 4 #define XFS_IFLUSH_DELWRI 5 +#define XFS_IFLUSH_ASYNC_NOBLOCK 6 /* * Flags for xfs_itruncate_start(). @@ -515,7 +497,7 @@ int xfs_finish_reclaim_all(struct xfs_mount *, int); */ int xfs_itobp(struct xfs_mount *, struct xfs_trans *, xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **, - xfs_daddr_t, uint); + xfs_daddr_t, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, xfs_inode_t **, xfs_daddr_t, uint); int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); @@ -597,7 +579,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #define xfs_inobp_check(mp, bp) #endif /* DEBUG */ -extern struct kmem_zone *xfs_icluster_zone; extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_inode_zone; extern struct kmem_zone *xfs_ili_zone; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 2c775b4..93b5db4 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -40,6 +40,7 @@ #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_rw.h" +#include "xfs_error.h" kmem_zone_t *xfs_ili_zone; /* inode log item zone */ @@ -813,7 +814,12 @@ xfs_inode_item_pushbuf( XFS_LOG_FORCE); } if (dopush) { - xfs_bawrite(mp, bp); + int error; + error = xfs_bawrite(mp, bp); + if (error) + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p", + error, iip, bp); } else { xfs_buf_relse(bp); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index bfe92ea..4051307 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -168,6 +168,14 @@ static inline int xfs_ilog_fext(int w) return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); } +static inline int xfs_inode_clean(xfs_inode_t *ip) +{ + return (!ip->i_itemp || + !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && + !ip->i_update_core; +} + + #ifdef __KERNEL__ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index fde37f8..fb3cf11 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -802,8 +802,11 @@ xfs_iomap_write_allocate( */ nimaps = 1; end_fsb = XFS_B_TO_FSB(mp, ip->i_size); - xfs_bmap_last_offset(NULL, ip, &last_block, - XFS_DATA_FORK); + error = xfs_bmap_last_offset(NULL, ip, &last_block, + XFS_DATA_FORK); + if (error) + goto trans_cancel; + last_block = XFS_FILEOFF_MAX(last_block, end_fsb); if ((map_start_fsb + count_fsb) > last_block) { count_fsb = last_block - map_start_fsb; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f615e04..eb85bde 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -129,7 +129,7 @@ xfs_bulkstat_one_iget( return error; } -STATIC int +STATIC void xfs_bulkstat_one_dinode( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t ino, /* inode number to get data for */ @@ -198,8 +198,6 @@ xfs_bulkstat_one_dinode( buf->bs_blocks = be64_to_cpu(dic->di_nblocks); break; } - - return 0; } STATIC int @@ -614,7 +612,8 @@ xfs_bulkstat( xfs_buf_relse(bp); error = xfs_itobp(mp, NULL, ip, &dip, &bp, bno, - XFS_IMAP_BULKSTAT); + XFS_IMAP_BULKSTAT, + XFS_BUF_LOCK); if (!error) clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; kmem_zone_free(xfs_inode_zone, ip); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 31f2b04..afaee30 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -41,6 +41,7 @@ #include "xfs_inode.h" #include "xfs_rw.h" +kmem_zone_t *xfs_log_ticket_zone; #define xlog_write_adv_cnt(ptr, len, off, bytes) \ { (ptr) += (bytes); \ @@ -73,8 +74,6 @@ STATIC int xlog_state_get_iclog_space(xlog_t *log, xlog_ticket_t *ticket, int *continued_write, int *logoffsetp); -STATIC void xlog_state_put_ticket(xlog_t *log, - xlog_ticket_t *tic); STATIC int xlog_state_release_iclog(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_state_switch_iclogs(xlog_t *log, @@ -101,7 +100,6 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, /* local ticket functions */ -STATIC void xlog_state_ticket_alloc(xlog_t *log); STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log, int unit_bytes, int count, @@ -330,7 +328,7 @@ xfs_log_done(xfs_mount_t *mp, */ xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); xlog_ungrant_log_space(log, ticket); - xlog_state_put_ticket(log, ticket); + xlog_ticket_put(log, ticket); } else { xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); xlog_regrant_reserve_log_space(log, ticket); @@ -384,7 +382,27 @@ _xfs_log_force( return xlog_state_sync_all(log, flags, log_flushed); else return xlog_state_sync(log, lsn, flags, log_flushed); -} /* xfs_log_force */ +} /* _xfs_log_force */ + +/* + * Wrapper for _xfs_log_force(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force( + xfs_mount_t *mp, + xfs_lsn_t lsn, + uint flags) +{ + int error; + error = _xfs_log_force(mp, lsn, flags, NULL); + if (error) { + xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " + "error %d returned.", error); + } +} + /* * Attaches a new iclog I/O completion callback routine during @@ -397,12 +415,10 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ void *iclog_hndl, /* iclog to hang callback off */ xfs_log_callback_t *cb) { - xlog_t *log = mp->m_log; xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; int abortflg; - cb->cb_next = NULL; - spin_lock(&log->l_icloglock); + spin_lock(&iclog->ic_callback_lock); abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); if (!abortflg) { ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || @@ -411,7 +427,7 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ *(iclog->ic_callback_tail) = cb; iclog->ic_callback_tail = &(cb->cb_next); } - spin_unlock(&log->l_icloglock); + spin_unlock(&iclog->ic_callback_lock); return abortflg; } /* xfs_log_notify */ @@ -471,6 +487,8 @@ xfs_log_reserve(xfs_mount_t *mp, /* may sleep if need to allocate more tickets */ internal_ticket = xlog_ticket_get(log, unit_bytes, cnt, client, flags); + if (!internal_ticket) + return XFS_ERROR(ENOMEM); internal_ticket->t_trans_type = t_type; *ticket = internal_ticket; xlog_trace_loggrant(log, internal_ticket, @@ -636,7 +654,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (mp->m_flags & XFS_MOUNT_RDONLY) return 0; - xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC); + error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL); + ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); #ifdef DEBUG first_iclog = iclog = log->l_iclog; @@ -675,10 +694,10 @@ xfs_log_unmount_write(xfs_mount_t *mp) spin_lock(&log->l_icloglock); iclog = log->l_iclog; - iclog->ic_refcnt++; + atomic_inc(&iclog->ic_refcnt); spin_unlock(&log->l_icloglock); xlog_state_want_sync(log, iclog); - (void) xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog); spin_lock(&log->l_icloglock); if (!(iclog->ic_state == XLOG_STATE_ACTIVE || @@ -695,7 +714,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (tic) { xlog_trace_loggrant(log, tic, "unmount rec"); xlog_ungrant_log_space(log, tic); - xlog_state_put_ticket(log, tic); + xlog_ticket_put(log, tic); } } else { /* @@ -713,11 +732,11 @@ xfs_log_unmount_write(xfs_mount_t *mp) */ spin_lock(&log->l_icloglock); iclog = log->l_iclog; - iclog->ic_refcnt++; + atomic_inc(&iclog->ic_refcnt); spin_unlock(&log->l_icloglock); xlog_state_want_sync(log, iclog); - (void) xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog); spin_lock(&log->l_icloglock); @@ -732,7 +751,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) } } - return 0; + return error; } /* xfs_log_unmount_write */ /* @@ -1210,7 +1229,6 @@ xlog_alloc_log(xfs_mount_t *mp, spin_lock_init(&log->l_icloglock); spin_lock_init(&log->l_grant_lock); initnsema(&log->l_flushsema, 0, "ic-flush"); - xlog_state_ticket_alloc(log); /* wait until after icloglock inited */ /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1240,9 +1258,9 @@ xlog_alloc_log(xfs_mount_t *mp, XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); iclog->ic_bp = bp; iclog->hic_data = bp->b_addr; - +#ifdef DEBUG log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); - +#endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); @@ -1253,10 +1271,11 @@ xlog_alloc_log(xfs_mount_t *mp, head->h_fmt = cpu_to_be32(XLOG_FMT); memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); - iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; + atomic_set(&iclog->ic_refcnt, 0); + spin_lock_init(&iclog->ic_callback_lock); iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; @@ -1405,7 +1424,7 @@ xlog_sync(xlog_t *log, int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); XFS_STATS_INC(xs_log_writes); - ASSERT(iclog->ic_refcnt == 0); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); /* Add for LR header */ count_init = log->l_iclog_hsize + iclog->ic_offset; @@ -1538,7 +1557,6 @@ STATIC void xlog_dealloc_log(xlog_t *log) { xlog_in_core_t *iclog, *next_iclog; - xlog_ticket_t *tic, *next_tic; int i; iclog = log->l_iclog; @@ -1559,22 +1577,6 @@ xlog_dealloc_log(xlog_t *log) spinlock_destroy(&log->l_icloglock); spinlock_destroy(&log->l_grant_lock); - /* XXXsup take a look at this again. */ - if ((log->l_ticket_cnt != log->l_ticket_tcnt) && - !XLOG_FORCED_SHUTDOWN(log)) { - xfs_fs_cmn_err(CE_WARN, log->l_mp, - "xlog_dealloc_log: (cnt: %d, total: %d)", - log->l_ticket_cnt, log->l_ticket_tcnt); - /* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */ - - } else { - tic = log->l_unmount_free; - while (tic) { - next_tic = tic->t_next; - kmem_free(tic, PAGE_SIZE); - tic = next_tic; - } - } xfs_buf_free(log->l_xbuf); #ifdef XFS_LOG_TRACE if (log->l_trace != NULL) { @@ -1987,7 +1989,7 @@ xlog_state_clean_log(xlog_t *log) if (iclog->ic_state == XLOG_STATE_DIRTY) { iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_offset = 0; - iclog->ic_callback = NULL; /* don't need to free */ + ASSERT(iclog->ic_callback == NULL); /* * If the number of ops in this iclog indicate it just * contains the dummy transaction, we can @@ -2190,37 +2192,40 @@ xlog_state_do_callback( be64_to_cpu(iclog->ic_header.h_lsn); spin_unlock(&log->l_grant_lock); - /* - * Keep processing entries in the callback list - * until we come around and it is empty. We - * need to atomically see that the list is - * empty and change the state to DIRTY so that - * we don't miss any more callbacks being added. - */ - spin_lock(&log->l_icloglock); } else { + spin_unlock(&log->l_icloglock); ioerrors++; } - cb = iclog->ic_callback; + /* + * Keep processing entries in the callback list until + * we come around and it is empty. We need to + * atomically see that the list is empty and change the + * state to DIRTY so that we don't miss any more + * callbacks being added. + */ + spin_lock(&iclog->ic_callback_lock); + cb = iclog->ic_callback; while (cb) { iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_callback = NULL; - spin_unlock(&log->l_icloglock); + spin_unlock(&iclog->ic_callback_lock); /* perform callbacks in the order given */ for (; cb; cb = cb_next) { cb_next = cb->cb_next; cb->cb_func(cb->cb_arg, aborted); } - spin_lock(&log->l_icloglock); + spin_lock(&iclog->ic_callback_lock); cb = iclog->ic_callback; } loopdidcallbacks++; funcdidcallbacks++; + spin_lock(&log->l_icloglock); ASSERT(iclog->ic_callback == NULL); + spin_unlock(&iclog->ic_callback_lock); if (!(iclog->ic_state & XLOG_STATE_IOERROR)) iclog->ic_state = XLOG_STATE_DIRTY; @@ -2241,7 +2246,7 @@ xlog_state_do_callback( repeats = 0; xfs_fs_cmn_err(CE_WARN, log->l_mp, "%s: possible infinite loop (%d iterations)", - __FUNCTION__, flushcnt); + __func__, flushcnt); } } while (!ioerrors && loopdidcallbacks); @@ -2309,7 +2314,7 @@ xlog_state_done_syncing( ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || iclog->ic_state == XLOG_STATE_IOERROR); - ASSERT(iclog->ic_refcnt == 0); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); @@ -2391,7 +2396,7 @@ restart: ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); head = &iclog->ic_header; - iclog->ic_refcnt++; /* prevents sync */ + atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset; /* On the 1st write to an iclog, figure out lsn. This works @@ -2423,12 +2428,12 @@ restart: xlog_state_switch_iclogs(log, iclog, iclog->ic_size); /* If I'm the only one writing to this iclog, sync it to disk */ - if (iclog->ic_refcnt == 1) { + if (atomic_read(&iclog->ic_refcnt) == 1) { spin_unlock(&log->l_icloglock); if ((error = xlog_state_release_iclog(log, iclog))) return error; } else { - iclog->ic_refcnt--; + atomic_dec(&iclog->ic_refcnt); spin_unlock(&log->l_icloglock); } goto restart; @@ -2792,18 +2797,6 @@ xlog_ungrant_log_space(xlog_t *log, /* - * Atomically put back used ticket. - */ -STATIC void -xlog_state_put_ticket(xlog_t *log, - xlog_ticket_t *tic) -{ - spin_lock(&log->l_icloglock); - xlog_ticket_put(log, tic); - spin_unlock(&log->l_icloglock); -} /* xlog_state_put_ticket */ - -/* * Flush iclog to disk if this is the last reference to the given iclog and * the WANT_SYNC bit is set. * @@ -2813,33 +2806,35 @@ xlog_state_put_ticket(xlog_t *log, * */ STATIC int -xlog_state_release_iclog(xlog_t *log, - xlog_in_core_t *iclog) +xlog_state_release_iclog( + xlog_t *log, + xlog_in_core_t *iclog) { int sync = 0; /* do we sync? */ - xlog_assign_tail_lsn(log->l_mp); + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); - spin_lock(&log->l_icloglock); + ASSERT(atomic_read(&iclog->ic_refcnt) > 0); + if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) + return 0; if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } - - ASSERT(iclog->ic_refcnt > 0); ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_WANT_SYNC); - if (--iclog->ic_refcnt == 0 && - iclog->ic_state == XLOG_STATE_WANT_SYNC) { + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { + /* update tail before writing to iclog */ + xlog_assign_tail_lsn(log->l_mp); sync++; iclog->ic_state = XLOG_STATE_SYNCING; iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); /* cycle incremented when incrementing curr_block */ } - spin_unlock(&log->l_icloglock); /* @@ -2849,11 +2844,9 @@ xlog_state_release_iclog(xlog_t *log, * this iclog has consistent data, so we ignore IOERROR * flags after this point. */ - if (sync) { + if (sync) return xlog_sync(log, iclog); - } return 0; - } /* xlog_state_release_iclog */ @@ -2953,7 +2946,8 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) * previous iclog and go to sleep. */ if (iclog->ic_state == XLOG_STATE_DIRTY || - (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) { + (atomic_read(&iclog->ic_refcnt) == 0 + && iclog->ic_offset == 0)) { iclog = iclog->ic_prev; if (iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY) @@ -2961,14 +2955,14 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) else goto maybe_sleep; } else { - if (iclog->ic_refcnt == 0) { + if (atomic_read(&iclog->ic_refcnt) == 0) { /* We are the only one with access to this * iclog. Flush it out now. There should * be a roundoff of zero to show that someone * has already taken care of the roundoff from * the previous sync. */ - iclog->ic_refcnt++; + atomic_inc(&iclog->ic_refcnt); lsn = be64_to_cpu(iclog->ic_header.h_lsn); xlog_state_switch_iclogs(log, iclog, 0); spin_unlock(&log->l_icloglock); @@ -3100,7 +3094,7 @@ try_again: already_slept = 1; goto try_again; } else { - iclog->ic_refcnt++; + atomic_inc(&iclog->ic_refcnt); xlog_state_switch_iclogs(log, iclog, 0); spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) @@ -3172,92 +3166,19 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) */ /* - * Algorithm doesn't take into account page size. ;-( - */ -STATIC void -xlog_state_ticket_alloc(xlog_t *log) -{ - xlog_ticket_t *t_list; - xlog_ticket_t *next; - xfs_caddr_t buf; - uint i = (PAGE_SIZE / sizeof(xlog_ticket_t)) - 2; - - /* - * The kmem_zalloc may sleep, so we shouldn't be holding the - * global lock. XXXmiken: may want to use zone allocator. - */ - buf = (xfs_caddr_t) kmem_zalloc(PAGE_SIZE, KM_SLEEP); - - spin_lock(&log->l_icloglock); - - /* Attach 1st ticket to Q, so we can keep track of allocated memory */ - t_list = (xlog_ticket_t *)buf; - t_list->t_next = log->l_unmount_free; - log->l_unmount_free = t_list++; - log->l_ticket_cnt++; - log->l_ticket_tcnt++; - - /* Next ticket becomes first ticket attached to ticket free list */ - if (log->l_freelist != NULL) { - ASSERT(log->l_tail != NULL); - log->l_tail->t_next = t_list; - } else { - log->l_freelist = t_list; - } - log->l_ticket_cnt++; - log->l_ticket_tcnt++; - - /* Cycle through rest of alloc'ed memory, building up free Q */ - for ( ; i > 0; i--) { - next = t_list + 1; - t_list->t_next = next; - t_list = next; - log->l_ticket_cnt++; - log->l_ticket_tcnt++; - } - t_list->t_next = NULL; - log->l_tail = t_list; - spin_unlock(&log->l_icloglock); -} /* xlog_state_ticket_alloc */ - - -/* - * Put ticket into free list - * - * Assumption: log lock is held around this call. + * Free a used ticket. */ STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket) { sv_destroy(&ticket->t_sema); - - /* - * Don't think caching will make that much difference. It's - * more important to make debug easier. - */ -#if 0 - /* real code will want to use LIFO for caching */ - ticket->t_next = log->l_freelist; - log->l_freelist = ticket; - /* no need to clear fields */ -#else - /* When we debug, it is easier if tickets are cycled */ - ticket->t_next = NULL; - if (log->l_tail) { - log->l_tail->t_next = ticket; - } else { - ASSERT(log->l_freelist == NULL); - log->l_freelist = ticket; - } - log->l_tail = ticket; -#endif /* DEBUG */ - log->l_ticket_cnt++; + kmem_zone_free(xfs_log_ticket_zone, ticket); } /* xlog_ticket_put */ /* - * Grab ticket off freelist or allocation some more + * Allocate and initialise a new log ticket. */ STATIC xlog_ticket_t * xlog_ticket_get(xlog_t *log, @@ -3269,21 +3190,9 @@ xlog_ticket_get(xlog_t *log, xlog_ticket_t *tic; uint num_headers; - alloc: - if (log->l_freelist == NULL) - xlog_state_ticket_alloc(log); /* potentially sleep */ - - spin_lock(&log->l_icloglock); - if (log->l_freelist == NULL) { - spin_unlock(&log->l_icloglock); - goto alloc; - } - tic = log->l_freelist; - log->l_freelist = tic->t_next; - if (log->l_freelist == NULL) - log->l_tail = NULL; - log->l_ticket_cnt--; - spin_unlock(&log->l_icloglock); + tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); + if (!tic) + return NULL; /* * Permanent reservations have up to 'cnt'-1 active log operations @@ -3611,8 +3520,8 @@ xfs_log_force_umount( * before we mark the filesystem SHUTDOWN and wake * everybody up to tell the bad news. */ - spin_lock(&log->l_grant_lock); spin_lock(&log->l_icloglock); + spin_lock(&log->l_grant_lock); mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; XFS_BUF_DONE(mp->m_sb_bp); /* diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 4cdac04..d1d678e 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -142,8 +142,9 @@ int _xfs_log_force(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, int *log_forced); -#define xfs_log_force(mp, lsn, flags) \ - _xfs_log_force(mp, lsn, flags, NULL); +void xfs_log_force(struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags); int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, xfs_daddr_t start_block, diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index c6244cc..8952a39 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -242,7 +242,7 @@ typedef struct xlog_res { typedef struct xlog_ticket { sv_t t_sema; /* sleep on this semaphore : 20 */ - struct xlog_ticket *t_next; /* :4|8 */ + struct xlog_ticket *t_next; /* :4|8 */ struct xlog_ticket *t_prev; /* :4|8 */ xlog_tid_t t_tid; /* transaction identifier : 4 */ int t_curr_res; /* current reservation in bytes : 4 */ @@ -324,6 +324,19 @@ typedef struct xlog_rec_ext_header { * - ic_offset is the current number of bytes written to in this iclog. * - ic_refcnt is bumped when someone is writing to the log. * - ic_state is the state of the iclog. + * + * Because of cacheline contention on large machines, we need to separate + * various resources onto different cachelines. To start with, make the + * structure cacheline aligned. The following fields can be contended on + * by independent processes: + * + * - ic_callback_* + * - ic_refcnt + * - fields protected by the global l_icloglock + * + * so we need to ensure that these fields are located in separate cachelines. + * We'll put all the read-only and l_icloglock fields in the first cacheline, + * and move everything else out to subsequent cachelines. */ typedef struct xlog_iclog_fields { sv_t ic_forcesema; @@ -332,17 +345,22 @@ typedef struct xlog_iclog_fields { struct xlog_in_core *ic_prev; struct xfs_buf *ic_bp; struct log *ic_log; - xfs_log_callback_t *ic_callback; - xfs_log_callback_t **ic_callback_tail; -#ifdef XFS_LOG_TRACE - struct ktrace *ic_trace; -#endif int ic_size; int ic_offset; - int ic_refcnt; int ic_bwritecnt; ushort_t ic_state; char *ic_datap; /* pointer to iclog data */ +#ifdef XFS_LOG_TRACE + struct ktrace *ic_trace; +#endif + + /* Callback structures need their own cacheline */ + spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; + xfs_log_callback_t *ic_callback; + xfs_log_callback_t **ic_callback_tail; + + /* reference counts need their own cacheline */ + atomic_t ic_refcnt ____cacheline_aligned_in_smp; } xlog_iclog_fields_t; typedef union xlog_in_core2 { @@ -366,6 +384,7 @@ typedef struct xlog_in_core { #define ic_bp hic_fields.ic_bp #define ic_log hic_fields.ic_log #define ic_callback hic_fields.ic_callback +#define ic_callback_lock hic_fields.ic_callback_lock #define ic_callback_tail hic_fields.ic_callback_tail #define ic_trace hic_fields.ic_trace #define ic_size hic_fields.ic_size @@ -383,43 +402,46 @@ typedef struct xlog_in_core { * that round off problems won't occur when releasing partial reservations. */ typedef struct log { + /* The following fields don't need locking */ + struct xfs_mount *l_mp; /* mount point */ + struct xfs_buf *l_xbuf; /* extra buffer for log + * wrapping */ + struct xfs_buftarg *l_targ; /* buftarg of log */ + uint l_flags; + uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ + struct xfs_buf_cancel **l_buf_cancel_table; + int l_iclog_hsize; /* size of iclog header */ + int l_iclog_heads; /* # of iclog header sectors */ + uint l_sectbb_log; /* log2 of sector size in BBs */ + uint l_sectbb_mask; /* sector size (in BBs) + * alignment mask */ + int l_iclog_size; /* size of log in bytes */ + int l_iclog_size_log; /* log power size of log */ + int l_iclog_bufs; /* number of iclog buffers */ + xfs_daddr_t l_logBBstart; /* start block of log */ + int l_logsize; /* size of log in bytes */ + int l_logBBsize; /* size of log in BB chunks */ + /* The following block of fields are changed while holding icloglock */ - sema_t l_flushsema; /* iclog flushing semaphore */ + sema_t l_flushsema ____cacheline_aligned_in_smp; + /* iclog flushing semaphore */ int l_flushcnt; /* # of procs waiting on this * sema */ - int l_ticket_cnt; /* free ticket count */ - int l_ticket_tcnt; /* total ticket count */ int l_covered_state;/* state of "covering disk * log entries" */ - xlog_ticket_t *l_freelist; /* free list of tickets */ - xlog_ticket_t *l_unmount_free;/* kmem_free these addresses */ - xlog_ticket_t *l_tail; /* free list of tickets */ xlog_in_core_t *l_iclog; /* head log queue */ spinlock_t l_icloglock; /* grab to change iclog state */ xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed * buffers */ xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ - struct xfs_mount *l_mp; /* mount point */ - struct xfs_buf *l_xbuf; /* extra buffer for log - * wrapping */ - struct xfs_buftarg *l_targ; /* buftarg of log */ - xfs_daddr_t l_logBBstart; /* start block of log */ - int l_logsize; /* size of log in bytes */ - int l_logBBsize; /* size of log in BB chunks */ int l_curr_cycle; /* Cycle number of log writes */ int l_prev_cycle; /* Cycle number before last * block increment */ int l_curr_block; /* current logical log block */ int l_prev_block; /* previous logical log block */ - int l_iclog_size; /* size of log in bytes */ - int l_iclog_size_log; /* log power size of log */ - int l_iclog_bufs; /* number of iclog buffers */ - - /* The following field are used for debugging; need to hold icloglock */ - char *l_iclog_bak[XLOG_MAX_ICLOGS]; /* The following block of fields are changed while holding grant_lock */ - spinlock_t l_grant_lock; + spinlock_t l_grant_lock ____cacheline_aligned_in_smp; xlog_ticket_t *l_reserve_headq; xlog_ticket_t *l_write_headq; int l_grant_reserve_cycle; @@ -427,19 +449,16 @@ typedef struct log { int l_grant_write_cycle; int l_grant_write_bytes; - /* The following fields don't need locking */ #ifdef XFS_LOG_TRACE struct ktrace *l_trace; struct ktrace *l_grant_trace; #endif - uint l_flags; - uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ - struct xfs_buf_cancel **l_buf_cancel_table; - int l_iclog_hsize; /* size of iclog header */ - int l_iclog_heads; /* # of iclog header sectors */ - uint l_sectbb_log; /* log2 of sector size in BBs */ - uint l_sectbb_mask; /* sector size (in BBs) - * alignment mask */ + + /* The following field are used for debugging; need to hold icloglock */ +#ifdef DEBUG + char *l_iclog_bak[XLOG_MAX_ICLOGS]; +#endif + } xlog_t; #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) @@ -459,6 +478,8 @@ extern struct xfs_buf *xlog_get_bp(xlog_t *, int); extern void xlog_put_bp(struct xfs_buf *); extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); +extern kmem_zone_t *xfs_log_ticket_zone; + /* iclog tracing */ #define XLOG_TRACE_GRAB_FLUSH 1 #define XLOG_TRACE_REL_FLUSH 2 diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b2b70eb..e65ab4a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -46,6 +46,7 @@ #include "xfs_trans_priv.h" #include "xfs_quota.h" #include "xfs_rw.h" +#include "xfs_utils.h" STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); @@ -120,7 +121,8 @@ xlog_bread( XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); xfsbdstrat(log->l_mp, bp); - if ((error = xfs_iowait(bp))) + error = xfs_iowait(bp); + if (error) xfs_ioerror_alert("xlog_bread", log->l_mp, bp, XFS_BUF_ADDR(bp)); return error; @@ -191,7 +193,7 @@ xlog_header_check_dump( { int b; - cmn_err(CE_DEBUG, "%s: SB : uuid = ", __FUNCTION__); + cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); for (b = 0; b < 16; b++) cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); @@ -1160,10 +1162,14 @@ xlog_write_log_records( if (j == 0 && (start_block + endcount > ealign)) { offset = XFS_BUF_PTR(bp); balign = BBTOB(ealign - start_block); - XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb)); - if ((error = xlog_bread(log, ealign, sectbb, bp))) + error = XFS_BUF_SET_PTR(bp, offset + balign, + BBTOB(sectbb)); + if (!error) + error = xlog_bread(log, ealign, sectbb, bp); + if (!error) + error = XFS_BUF_SET_PTR(bp, offset, bufblks); + if (error) break; - XFS_BUF_SET_PTR(bp, offset, bufblks); } offset = xlog_align(log, start_block, endcount, bp); @@ -2280,7 +2286,9 @@ xlog_recover_do_inode_trans( * invalidate the buffer when we write it out below. */ imap.im_blkno = 0; - xfs_imap(log->l_mp, NULL, ino, &imap, 0); + error = xfs_imap(log->l_mp, NULL, ino, &imap, 0); + if (error) + goto error; } /* @@ -2964,7 +2972,7 @@ xlog_recover_process_data( * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. */ -STATIC void +STATIC int xlog_recover_process_efi( xfs_mount_t *mp, xfs_efi_log_item_t *efip) @@ -2972,6 +2980,7 @@ xlog_recover_process_efi( xfs_efd_log_item_t *efdp; xfs_trans_t *tp; int i; + int error = 0; xfs_extent_t *extp; xfs_fsblock_t startblock_fsb; @@ -2995,23 +3004,32 @@ xlog_recover_process_efi( * free the memory associated with it. */ xfs_efi_release(efip, efip->efi_format.efi_nextents); - return; + return XFS_ERROR(EIO); } } tp = xfs_trans_alloc(mp, 0); - xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + if (error) + goto abort_error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &(efip->efi_format.efi_extents[i]); - xfs_free_extent(tp, extp->ext_start, extp->ext_len); + error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); + if (error) + goto abort_error; xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, extp->ext_len); } efip->efi_flags |= XFS_EFI_RECOVERED; - xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp, 0); + return error; + +abort_error: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return error; } /* @@ -3059,7 +3077,7 @@ xlog_recover_check_ail( * everything already in the AIL, we stop processing as soon as * we see something other than an EFI in the AIL. */ -STATIC void +STATIC int xlog_recover_process_efis( xlog_t *log) { @@ -3067,6 +3085,7 @@ xlog_recover_process_efis( xfs_efi_log_item_t *efip; int gen; xfs_mount_t *mp; + int error = 0; mp = log->l_mp; spin_lock(&mp->m_ail_lock); @@ -3091,11 +3110,14 @@ xlog_recover_process_efis( } spin_unlock(&mp->m_ail_lock); - xlog_recover_process_efi(mp, efip); + error = xlog_recover_process_efi(mp, efip); + if (error) + return error; spin_lock(&mp->m_ail_lock); lip = xfs_trans_next_ail(mp, lip, &gen, NULL); } spin_unlock(&mp->m_ail_lock); + return error; } /* @@ -3115,21 +3137,18 @@ xlog_recover_clear_agi_bucket( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); - - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); + if (!error) + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agibp); - if (error) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); - return; - } + if (error) + goto out_abort; + error = EINVAL; agi = XFS_BUF_TO_AGI(agibp); - if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); - return; - } + if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) + goto out_abort; agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + @@ -3137,7 +3156,17 @@ xlog_recover_clear_agi_bucket( xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); - (void) xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp, 0); + if (error) + goto out_error; + return; + +out_abort: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); +out_error: + xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " + "failed to clear agi %d. Continuing.", agno); + return; } /* @@ -3214,7 +3243,8 @@ xlog_recover_process_iunlinks( * next inode in the bucket. */ error = xfs_itobp(mp, NULL, ip, &dip, - &ibp, 0, 0); + &ibp, 0, 0, + XFS_BUF_LOCK); ASSERT(error || (dip != NULL)); } @@ -3247,7 +3277,7 @@ xlog_recover_process_iunlinks( if (ip->i_d.di_mode == 0) xfs_iput_new(ip, 0); else - VN_RELE(XFS_ITOV(ip)); + IRELE(ip); } else { /* * We can't read in the inode @@ -3445,7 +3475,7 @@ xlog_valid_rec_header( (!rhead->h_version || (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { xlog_warn("XFS: %s: unrecognised log version (%d).", - __FUNCTION__, be32_to_cpu(rhead->h_version)); + __func__, be32_to_cpu(rhead->h_version)); return XFS_ERROR(EIO); } @@ -3604,15 +3634,19 @@ xlog_do_recovery_pass( * _first_, then the log start (LR header end) * - order is important. */ + wrapped_hblks = hblks - split_hblks; bufaddr = XFS_BUF_PTR(hbp); - XFS_BUF_SET_PTR(hbp, + error = XFS_BUF_SET_PTR(hbp, bufaddr + BBTOB(split_hblks), BBTOB(hblks - split_hblks)); - wrapped_hblks = hblks - split_hblks; - error = xlog_bread(log, 0, wrapped_hblks, hbp); + if (!error) + error = xlog_bread(log, 0, + wrapped_hblks, hbp); + if (!error) + error = XFS_BUF_SET_PTR(hbp, bufaddr, + BBTOB(hblks)); if (error) goto bread_err2; - XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks)); if (!offset) offset = xlog_align(log, 0, wrapped_hblks, hbp); @@ -3664,13 +3698,18 @@ xlog_do_recovery_pass( * - order is important. */ bufaddr = XFS_BUF_PTR(dbp); - XFS_BUF_SET_PTR(dbp, + error = XFS_BUF_SET_PTR(dbp, bufaddr + BBTOB(split_bblks), BBTOB(bblks - split_bblks)); - if ((error = xlog_bread(log, wrapped_hblks, - bblks - split_bblks, dbp))) + if (!error) + error = xlog_bread(log, wrapped_hblks, + bblks - split_bblks, + dbp); + if (!error) + error = XFS_BUF_SET_PTR(dbp, bufaddr, + h_size); + if (error) goto bread_err2; - XFS_BUF_SET_PTR(dbp, bufaddr, h_size); if (!offset) offset = xlog_align(log, wrapped_hblks, bblks - split_bblks, dbp); @@ -3826,7 +3865,8 @@ xlog_do_recover( XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); xfsbdstrat(log->l_mp, bp); - if ((error = xfs_iowait(bp))) { + error = xfs_iowait(bp); + if (error) { xfs_ioerror_alert("xlog_do_recover", log->l_mp, bp, XFS_BUF_ADDR(bp)); ASSERT(0); @@ -3917,7 +3957,14 @@ xlog_recover_finish( * rather than accepting new requests. */ if (log->l_flags & XLOG_RECOVERY_NEEDED) { - xlog_recover_process_efis(log); + int error; + error = xlog_recover_process_efis(log); + if (error) { + cmn_err(CE_ALERT, + "Failed to recover EFIs on filesystem: %s", + log->l_mp->m_fsname); + return error; + } /* * Sync the log to get all the EFIs out of the AIL. * This isn't absolutely necessary, but it helps in diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 8ed164e..2fec452 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,8 +43,9 @@ #include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_fsops.h" +#include "xfs_utils.h" -STATIC void xfs_mount_log_sb(xfs_mount_t *, __int64_t); +STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t); STATIC int xfs_uuid_mount(xfs_mount_t *); STATIC void xfs_uuid_unmount(xfs_mount_t *mp); STATIC void xfs_unmountfs_wait(xfs_mount_t *); @@ -57,7 +58,7 @@ STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, STATIC void xfs_icsb_sync_counters(xfs_mount_t *); STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t, int64_t, int); -STATIC int xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); +STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); #else @@ -956,7 +957,6 @@ xfs_mountfs( { xfs_sb_t *sbp = &(mp->m_sb); xfs_inode_t *rip; - bhv_vnode_t *rvp = NULL; __uint64_t resblks; __int64_t update_flags = 0LL; uint quotamount, quotaflags; @@ -964,11 +964,6 @@ xfs_mountfs( int uuid_mounted = 0; int error = 0; - if (mp->m_sb_bp == NULL) { - error = xfs_readsb(mp, mfsi_flags); - if (error) - return error; - } xfs_mount_common(mp, sbp); /* @@ -1163,7 +1158,6 @@ xfs_mountfs( } ASSERT(rip != NULL); - rvp = XFS_ITOV(rip); if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { cmn_err(CE_WARN, "XFS: corrupted root inode"); @@ -1195,8 +1189,13 @@ xfs_mountfs( /* * If fs is not mounted readonly, then update the superblock changes. */ - if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) - xfs_mount_log_sb(mp, update_flags); + if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + error = xfs_mount_log_sb(mp, update_flags); + if (error) { + cmn_err(CE_WARN, "XFS: failed to write sb changes"); + goto error4; + } + } /* * Initialise the XFS quota management subsystem for this mount @@ -1233,12 +1232,15 @@ xfs_mountfs( * * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. * This may drive us straight to ENOSPC on mount, but that implies - * we were already there on the last unmount. + * we were already there on the last unmount. Warn if this occurs. */ resblks = mp->m_sb.sb_dblocks; do_div(resblks, 20); resblks = min_t(__uint64_t, resblks, 1024); - xfs_reserve_blocks(mp, &resblks, NULL); + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. " + "Continuing without a reserve pool."); return 0; @@ -1246,7 +1248,7 @@ xfs_mountfs( /* * Free up the root inode. */ - VN_RELE(rvp); + IRELE(rip); error3: xfs_log_unmount_dealloc(mp); error2: @@ -1274,6 +1276,7 @@ int xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) { __uint64_t resblks; + int error = 0; /* * We can potentially deadlock here if we have an inode cluster @@ -1317,9 +1320,15 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) * value does not matter.... */ resblks = 0; - xfs_reserve_blocks(mp, &resblks, NULL); + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. " + "Freespace may not be correct on next mount."); - xfs_log_sbcount(mp, 1); + error = xfs_log_sbcount(mp, 1); + if (error) + cmn_err(CE_WARN, "XFS: Unable to update superblock counters. " + "Freespace may not be correct on next mount."); xfs_unmountfs_writesb(mp); xfs_unmountfs_wait(mp); /* wait for async bufs */ xfs_log_unmount(mp); /* Done! No more fs ops. */ @@ -1411,9 +1420,8 @@ xfs_log_sbcount( xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS); if (sync) xfs_trans_set_sync(tp); - xfs_trans_commit(tp, 0); - - return 0; + error = xfs_trans_commit(tp, 0); + return error; } STATIC void @@ -1462,7 +1470,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) XFS_BUF_UNASYNC(sbp); ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); xfsbdstrat(mp, sbp); - /* Nevermind errors we might get here. */ error = xfs_iowait(sbp); if (error) xfs_ioerror_alert("xfs_unmountfs_writesb", @@ -1911,24 +1918,27 @@ xfs_uuid_unmount( * be altered by the mount options, as well as any potential sb_features2 * fixup. Only the first superblock is updated. */ -STATIC void +STATIC int xfs_mount_log_sb( xfs_mount_t *mp, __int64_t fields) { xfs_trans_t *tp; + int error; ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID | XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2)); tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); - if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT)) { + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { xfs_trans_cancel(tp, 0); - return; + return error; } xfs_mod_sb(tp, fields); - xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp, 0); + return error; } @@ -2189,7 +2199,7 @@ xfs_icsb_counter_disabled( return test_bit(field, &mp->m_icsb_counters); } -STATIC int +STATIC void xfs_icsb_disable_counter( xfs_mount_t *mp, xfs_sb_field_t field) @@ -2207,7 +2217,7 @@ xfs_icsb_disable_counter( * the m_icsb_mutex. */ if (xfs_icsb_counter_disabled(mp, field)) - return 0; + return; xfs_icsb_lock_all_counters(mp); if (!test_and_set_bit(field, &mp->m_icsb_counters)) { @@ -2230,8 +2240,6 @@ xfs_icsb_disable_counter( } xfs_icsb_unlock_all_counters(mp); - - return 0; } STATIC void diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1d8a472..1ed5751 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -66,17 +66,17 @@ struct xfs_mru_cache; * Prototypes and functions for the Data Migration subsystem. */ -typedef int (*xfs_send_data_t)(int, bhv_vnode_t *, - xfs_off_t, size_t, int, bhv_vrwlock_t *); +typedef int (*xfs_send_data_t)(int, struct xfs_inode *, + xfs_off_t, size_t, int, int *); typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint); -typedef int (*xfs_send_destroy_t)(bhv_vnode_t *, dm_right_t); +typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t); typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *, - bhv_vnode_t *, - dm_right_t, bhv_vnode_t *, dm_right_t, - char *, char *, mode_t, int, int); + struct xfs_inode *, dm_right_t, + struct xfs_inode *, dm_right_t, + const char *, const char *, mode_t, int, int); typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t, char *, char *); -typedef void (*xfs_send_unmount_t)(struct xfs_mount *, bhv_vnode_t *, +typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *, dm_right_t, mode_t, int, int); typedef struct xfs_dmops { @@ -88,20 +88,20 @@ typedef struct xfs_dmops { xfs_send_unmount_t xfs_send_unmount; } xfs_dmops_t; -#define XFS_SEND_DATA(mp, ev,vp,off,len,fl,lock) \ - (*(mp)->m_dm_ops->xfs_send_data)(ev,vp,off,len,fl,lock) +#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \ + (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock) #define XFS_SEND_MMAP(mp, vma,fl) \ (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl) -#define XFS_SEND_DESTROY(mp, vp,right) \ - (*(mp)->m_dm_ops->xfs_send_destroy)(vp,right) +#define XFS_SEND_DESTROY(mp, ip,right) \ + (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right) #define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl) #define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) #define XFS_SEND_MOUNT(mp,right,path,name) \ (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name) -#define XFS_SEND_UNMOUNT(mp, vp,right,mode,rval,fl) \ - (*(mp)->m_dm_ops->xfs_send_unmount)(mp,vp,right,mode,rval,fl) +#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \ + (*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl) /* @@ -220,7 +220,7 @@ extern void xfs_icsb_sync_counters_flags(struct xfs_mount *, int); #endif typedef struct xfs_ail { - xfs_ail_entry_t xa_ail; + struct list_head xa_ail; uint xa_gen; struct task_struct *xa_task; xfs_lsn_t xa_target; @@ -401,7 +401,7 @@ typedef struct xfs_mount { /* * Allow large block sizes to be reported to userspace programs if the - * "largeio" mount option is used. + * "largeio" mount option is used. * * If compatibility mode is specified, simply return the basic unit of caching * so that we don't get inefficient read/modify/write I/O from user apps. diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 7eb157a..ee37189 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -36,7 +36,6 @@ #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" -#include "xfs_refcache.h" #include "xfs_utils.h" #include "xfs_trans_space.h" #include "xfs_vnodeops.h" @@ -84,25 +83,23 @@ int xfs_rename_skip, xfs_rename_nskip; */ STATIC int xfs_lock_for_rename( - xfs_inode_t *dp1, /* old (source) directory inode */ - xfs_inode_t *dp2, /* new (target) directory inode */ - bhv_vname_t *vname1,/* old entry name */ - bhv_vname_t *vname2,/* new entry name */ - xfs_inode_t **ipp1, /* inode of old entry */ - xfs_inode_t **ipp2, /* inode of new entry, if it + xfs_inode_t *dp1, /* in: old (source) directory inode */ + xfs_inode_t *dp2, /* in: new (target) directory inode */ + xfs_inode_t *ip1, /* in: inode of old entry */ + struct xfs_name *name2, /* in: new entry name */ + xfs_inode_t **ipp2, /* out: inode of new entry, if it already exists, NULL otherwise. */ - xfs_inode_t **i_tab,/* array of inode returned, sorted */ - int *num_inodes) /* number of inodes in array */ + xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ + int *num_inodes) /* out: number of inodes in array */ { - xfs_inode_t *ip1, *ip2, *temp; + xfs_inode_t *ip2 = NULL; + xfs_inode_t *temp; xfs_ino_t inum1, inum2; int error; int i, j; uint lock_mode; int diff_dirs = (dp1 != dp2); - ip2 = NULL; - /* * First, find out the current inums of the entries so that we * can determine the initial locking order. We'll have to @@ -110,27 +107,20 @@ xfs_lock_for_rename( * to see if we still have the right inodes, directories, etc. */ lock_mode = xfs_ilock_map_shared(dp1); - error = xfs_get_dir_entry(vname1, &ip1); - if (error) { - xfs_iunlock_map_shared(dp1, lock_mode); - return error; - } + IHOLD(ip1); + xfs_itrace_ref(ip1); inum1 = ip1->i_ino; - ASSERT(ip1); - xfs_itrace_ref(ip1); - /* * Unlock dp1 and lock dp2 if they are different. */ - if (diff_dirs) { xfs_iunlock_map_shared(dp1, lock_mode); lock_mode = xfs_ilock_map_shared(dp2); } - error = xfs_dir_lookup_int(dp2, lock_mode, vname2, &inum2, &ip2); + error = xfs_dir_lookup_int(dp2, lock_mode, name2, &inum2, &ip2); if (error == ENOENT) { /* target does not need to exist. */ inum2 = 0; } else if (error) { @@ -162,6 +152,7 @@ xfs_lock_for_rename( *num_inodes = 4; i_tab[3] = ip2; } + *ipp2 = i_tab[3]; /* * Sort the elements via bubble sort. (Remember, there are at @@ -199,21 +190,6 @@ xfs_lock_for_rename( xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED); } - /* - * Set the return value. Null out any unused entries in i_tab. - */ - *ipp1 = *ipp2 = NULL; - for (i=0; i < *num_inodes; i++) { - if (i_tab[i]->i_ino == inum1) { - *ipp1 = i_tab[i]; - } - if (i_tab[i]->i_ino == inum2) { - *ipp2 = i_tab[i]; - } - } - for (;i < 4; i++) { - i_tab[i] = NULL; - } return 0; } @@ -223,13 +199,13 @@ xfs_lock_for_rename( int xfs_rename( xfs_inode_t *src_dp, - bhv_vname_t *src_vname, - bhv_vnode_t *target_dir_vp, - bhv_vname_t *target_vname) + struct xfs_name *src_name, + xfs_inode_t *src_ip, + xfs_inode_t *target_dp, + struct xfs_name *target_name) { - bhv_vnode_t *src_dir_vp = XFS_ITOV(src_dp); xfs_trans_t *tp; - xfs_inode_t *target_dp, *src_ip, *target_ip; + xfs_inode_t *target_ip; xfs_mount_t *mp = src_dp->i_mount; int new_parent; /* moving to a new dir */ int src_is_directory; /* src_name is a directory */ @@ -243,29 +219,16 @@ xfs_rename( int spaceres; int target_link_zero = 0; int num_inodes; - char *src_name = VNAME(src_vname); - char *target_name = VNAME(target_vname); - int src_namelen = VNAMELEN(src_vname); - int target_namelen = VNAMELEN(target_vname); xfs_itrace_entry(src_dp); - xfs_itrace_entry(xfs_vtoi(target_dir_vp)); - - /* - * Find the XFS behavior descriptor for the target directory - * vnode since it was not handed to us. - */ - target_dp = xfs_vtoi(target_dir_vp); - if (target_dp == NULL) { - return XFS_ERROR(EXDEV); - } + xfs_itrace_entry(target_dp); if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) || DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME, - src_dir_vp, DM_RIGHT_NULL, - target_dir_vp, DM_RIGHT_NULL, - src_name, target_name, + src_dp, DM_RIGHT_NULL, + target_dp, DM_RIGHT_NULL, + src_name->name, target_name->name, 0, 0, 0); if (error) { return error; @@ -282,10 +245,8 @@ xfs_rename( * does not exist in the source directory. */ tp = NULL; - error = xfs_lock_for_rename(src_dp, target_dp, src_vname, - target_vname, &src_ip, &target_ip, inodes, - &num_inodes); - + error = xfs_lock_for_rename(src_dp, target_dp, src_ip, target_name, + &target_ip, inodes, &num_inodes); if (error) { /* * We have nothing locked, no inode references, and @@ -331,7 +292,7 @@ xfs_rename( XFS_BMAP_INIT(&free_list, &first_block); tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen); + spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); if (error == ENOSPC) { @@ -365,10 +326,10 @@ xfs_rename( * them when they unlock the inodes. Also, we need to be careful * not to add an inode to the transaction more than once. */ - VN_HOLD(src_dir_vp); + IHOLD(src_dp); xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); if (new_parent) { - VN_HOLD(target_dir_vp); + IHOLD(target_dp); xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); } if ((src_ip != src_dp) && (src_ip != target_dp)) { @@ -389,9 +350,8 @@ xfs_rename( * If there's no space reservation, check the entry will * fit before actually inserting it. */ - if (spaceres == 0 && - (error = xfs_dir_canenter(tp, target_dp, target_name, - target_namelen))) + error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); + if (error) goto error_return; /* * If target does not exist and the rename crosses @@ -399,8 +359,8 @@ xfs_rename( * to account for the ".." reference from the new entry. */ error = xfs_dir_createname(tp, target_dp, target_name, - target_namelen, src_ip->i_ino, - &first_block, &free_list, spaceres); + src_ip->i_ino, &first_block, + &free_list, spaceres); if (error == ENOSPC) goto error_return; if (error) @@ -439,7 +399,7 @@ xfs_rename( * name at the destination directory, remove it first. */ error = xfs_dir_replace(tp, target_dp, target_name, - target_namelen, src_ip->i_ino, + src_ip->i_ino, &first_block, &free_list, spaceres); if (error) goto abort_return; @@ -476,7 +436,8 @@ xfs_rename( * Rewrite the ".." entry to point to the new * directory. */ - error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino, + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, &first_block, &free_list, spaceres); ASSERT(error != EEXIST); if (error) @@ -512,8 +473,8 @@ xfs_rename( goto abort_return; } - error = xfs_dir_removename(tp, src_dp, src_name, src_namelen, - src_ip->i_ino, &first_block, &free_list, spaceres); + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); if (error) goto abort_return; xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -580,10 +541,8 @@ xfs_rename( * the vnode references. */ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (target_ip != NULL) { - xfs_refcache_purge_ip(target_ip); + if (target_ip != NULL) IRELE(target_ip); - } /* * Let interposed file systems know about removed links. */ @@ -598,9 +557,9 @@ std_return: if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) || DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) { (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME, - src_dir_vp, DM_RIGHT_NULL, - target_dir_vp, DM_RIGHT_NULL, - src_name, target_name, + src_dp, DM_RIGHT_NULL, + target_dp, DM_RIGHT_NULL, + src_name->name, target_name->name, 0, error, 0); } return error; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 47082c0..a0dc6e5 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -44,6 +44,7 @@ #include "xfs_rw.h" #include "xfs_inode_item.h" #include "xfs_trans_space.h" +#include "xfs_utils.h" /* @@ -123,14 +124,14 @@ xfs_growfs_rt_alloc( XFS_GROWRTALLOC_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_PERM_LOG_COUNT))) - goto error_exit; + goto error_cancel; cancelflags = XFS_TRANS_RELEASE_LOG_RES; /* * Lock the inode. */ if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip))) - goto error_exit; + goto error_cancel; XFS_BMAP_INIT(&flist, &firstblock); /* * Allocate blocks to the bitmap file. @@ -143,14 +144,16 @@ xfs_growfs_rt_alloc( if (!error && nmap < 1) error = XFS_ERROR(ENOSPC); if (error) - goto error_exit; + goto error_cancel; /* * Free any blocks freed up in the transaction, then commit. */ error = xfs_bmap_finish(&tp, &flist, &committed); if (error) - goto error_exit; - xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + goto error_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error; /* * Now we need to clear the allocated blocks. * Do this one block per transaction, to keep it simple. @@ -165,13 +168,13 @@ xfs_growfs_rt_alloc( */ if ((error = xfs_trans_reserve(tp, 0, XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0))) - goto error_exit; + goto error_cancel; /* * Lock the bitmap inode. */ if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip))) - goto error_exit; + goto error_cancel; /* * Get a buffer for the block. */ @@ -180,14 +183,16 @@ xfs_growfs_rt_alloc( mp->m_bsize, 0); if (bp == NULL) { error = XFS_ERROR(EIO); - goto error_exit; + goto error_cancel; } memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* * Commit the transaction. */ - xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp, 0); + if (error) + goto error; } /* * Go on to the next extent, if any. @@ -195,8 +200,9 @@ xfs_growfs_rt_alloc( oblocks = map.br_startoff + map.br_blockcount; } return 0; -error_exit: +error_cancel: xfs_trans_cancel(tp, cancelflags); +error: return error; } @@ -1875,6 +1881,7 @@ xfs_growfs_rt( xfs_trans_t *tp; /* transaction pointer */ sbp = &mp->m_sb; + cancelflags = 0; /* * Initial error checking. */ @@ -2041,13 +2048,15 @@ xfs_growfs_rt( */ mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; - /* - * Commit the transaction. - */ - xfs_trans_commit(tp, 0); + + error = xfs_trans_commit(tp, 0); + if (error) { + tp = NULL; + break; + } } - if (error) + if (error && tp) xfs_trans_cancel(tp, cancelflags); /* @@ -2278,7 +2287,7 @@ xfs_rtmount_inodes( ASSERT(sbp->sb_rsumino != NULLFSINO); error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0); if (error) { - VN_RELE(XFS_ITOV(mp->m_rbmip)); + IRELE(mp->m_rbmip); return error; } ASSERT(mp->m_rsumip != NULL); diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index cd3ece6..b0f31c0 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -126,11 +126,11 @@ xfs_write_sync_logforce( * when we return. */ if (iip && iip->ili_last_lsn) { - xfs_log_force(mp, iip->ili_last_lsn, - XFS_LOG_FORCE | XFS_LOG_SYNC); + error = _xfs_log_force(mp, iip->ili_last_lsn, + XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); } else if (xfs_ipincount(ip) > 0) { - xfs_log_force(mp, (xfs_lsn_t)0, - XFS_LOG_FORCE | XFS_LOG_SYNC); + error = _xfs_log_force(mp, (xfs_lsn_t)0, + XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); } } else { diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 7f40628..0804207 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -113,13 +113,8 @@ struct xfs_mount; struct xfs_trans; struct xfs_dquot_acct; -typedef struct xfs_ail_entry { - struct xfs_log_item *ail_forw; /* AIL forw pointer */ - struct xfs_log_item *ail_back; /* AIL back pointer */ -} xfs_ail_entry_t; - typedef struct xfs_log_item { - xfs_ail_entry_t li_ail; /* AIL pointers */ + struct list_head li_ail; /* AIL pointers */ xfs_lsn_t li_lsn; /* last on-disk lsn */ struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ struct xfs_mount *li_mountp; /* ptr to fs mount */ @@ -341,7 +336,6 @@ typedef struct xfs_trans { unsigned int t_rtx_res; /* # of rt extents resvd */ unsigned int t_rtx_res_used; /* # of resvd rt extents used */ xfs_log_ticket_t t_ticket; /* log mgr ticket */ - sema_t t_sema; /* sema for commit completion */ xfs_lsn_t t_lsn; /* log seq num of start of * transaction. */ xfs_lsn_t t_commit_lsn; /* log seq num of end of diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 76d470d..1f77c00 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -28,13 +28,13 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" -STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *); -STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *); -STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *); -STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *); +STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *); +STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *); +STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *); +STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *); #ifdef DEBUG -STATIC void xfs_ail_check(xfs_ail_entry_t *, xfs_log_item_t *); +STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *); #else #define xfs_ail_check(a,l) #endif /* DEBUG */ @@ -57,7 +57,7 @@ xfs_trans_tail_ail( xfs_log_item_t *lip; spin_lock(&mp->m_ail_lock); - lip = xfs_ail_min(&(mp->m_ail.xa_ail)); + lip = xfs_ail_min(&mp->m_ail); if (lip == NULL) { lsn = (xfs_lsn_t)0; } else { @@ -91,7 +91,7 @@ xfs_trans_push_ail( { xfs_log_item_t *lip; - lip = xfs_ail_min(&mp->m_ail.xa_ail); + lip = xfs_ail_min(&mp->m_ail); if (lip && !XFS_FORCED_SHUTDOWN(mp)) { if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0) xfsaild_wakeup(mp, threshold_lsn); @@ -111,15 +111,17 @@ xfs_trans_first_push_ail( { xfs_log_item_t *lip; - lip = xfs_ail_min(&(mp->m_ail.xa_ail)); + lip = xfs_ail_min(&mp->m_ail); *gen = (int)mp->m_ail.xa_gen; if (lsn == 0) return lip; - while (lip && (XFS_LSN_CMP(lip->li_lsn, lsn) < 0)) - lip = lip->li_ail.ail_forw; + list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) + return lip; + } - return lip; + return NULL; } /* @@ -329,7 +331,7 @@ xfs_trans_unlocked_item( * the call to xfs_log_move_tail() doesn't do anything if there's * not enough free space to wake people up so we're safe calling it. */ - min_lip = xfs_ail_min(&mp->m_ail.xa_ail); + min_lip = xfs_ail_min(&mp->m_ail); if (min_lip == lip) xfs_log_move_tail(mp, 1); @@ -357,15 +359,13 @@ xfs_trans_update_ail( xfs_log_item_t *lip, xfs_lsn_t lsn) __releases(mp->m_ail_lock) { - xfs_ail_entry_t *ailp; xfs_log_item_t *dlip=NULL; xfs_log_item_t *mlip; /* ptr to minimum lip */ - ailp = &(mp->m_ail.xa_ail); - mlip = xfs_ail_min(ailp); + mlip = xfs_ail_min(&mp->m_ail); if (lip->li_flags & XFS_LI_IN_AIL) { - dlip = xfs_ail_delete(ailp, lip); + dlip = xfs_ail_delete(&mp->m_ail, lip); ASSERT(dlip == lip); } else { lip->li_flags |= XFS_LI_IN_AIL; @@ -373,11 +373,11 @@ xfs_trans_update_ail( lip->li_lsn = lsn; - xfs_ail_insert(ailp, lip); + xfs_ail_insert(&mp->m_ail, lip); mp->m_ail.xa_gen++; if (mlip == dlip) { - mlip = xfs_ail_min(&(mp->m_ail.xa_ail)); + mlip = xfs_ail_min(&mp->m_ail); spin_unlock(&mp->m_ail_lock); xfs_log_move_tail(mp, mlip->li_lsn); } else { @@ -407,14 +407,12 @@ xfs_trans_delete_ail( xfs_mount_t *mp, xfs_log_item_t *lip) __releases(mp->m_ail_lock) { - xfs_ail_entry_t *ailp; xfs_log_item_t *dlip; xfs_log_item_t *mlip; if (lip->li_flags & XFS_LI_IN_AIL) { - ailp = &(mp->m_ail.xa_ail); - mlip = xfs_ail_min(ailp); - dlip = xfs_ail_delete(ailp, lip); + mlip = xfs_ail_min(&mp->m_ail); + dlip = xfs_ail_delete(&mp->m_ail, lip); ASSERT(dlip == lip); @@ -423,7 +421,7 @@ xfs_trans_delete_ail( mp->m_ail.xa_gen++; if (mlip == dlip) { - mlip = xfs_ail_min(&(mp->m_ail.xa_ail)); + mlip = xfs_ail_min(&mp->m_ail); spin_unlock(&mp->m_ail_lock); xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0)); } else { @@ -440,7 +438,7 @@ xfs_trans_delete_ail( else { xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, "%s: attempting to delete a log item that is not in the AIL", - __FUNCTION__); + __func__); spin_unlock(&mp->m_ail_lock); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } @@ -461,7 +459,7 @@ xfs_trans_first_ail( { xfs_log_item_t *lip; - lip = xfs_ail_min(&(mp->m_ail.xa_ail)); + lip = xfs_ail_min(&mp->m_ail); *gen = (int)mp->m_ail.xa_gen; return lip; @@ -485,9 +483,9 @@ xfs_trans_next_ail( ASSERT(mp && lip && gen); if (mp->m_ail.xa_gen == *gen) { - nlip = xfs_ail_next(&(mp->m_ail.xa_ail), lip); + nlip = xfs_ail_next(&mp->m_ail, lip); } else { - nlip = xfs_ail_min(&(mp->m_ail).xa_ail); + nlip = xfs_ail_min(&mp->m_ail); *gen = (int)mp->m_ail.xa_gen; if (restarts != NULL) { XFS_STATS_INC(xs_push_ail_restarts); @@ -517,8 +515,7 @@ int xfs_trans_ail_init( xfs_mount_t *mp) { - mp->m_ail.xa_ail.ail_forw = (xfs_log_item_t*)&mp->m_ail.xa_ail; - mp->m_ail.xa_ail.ail_back = (xfs_log_item_t*)&mp->m_ail.xa_ail; + INIT_LIST_HEAD(&mp->m_ail.xa_ail); return xfsaild_start(mp); } @@ -537,7 +534,7 @@ xfs_trans_ail_destroy( */ STATIC void xfs_ail_insert( - xfs_ail_entry_t *base, + xfs_ail_t *ailp, xfs_log_item_t *lip) /* ARGSUSED */ { @@ -546,27 +543,22 @@ xfs_ail_insert( /* * If the list is empty, just insert the item. */ - if (base->ail_back == (xfs_log_item_t*)base) { - base->ail_forw = lip; - base->ail_back = lip; - lip->li_ail.ail_forw = (xfs_log_item_t*)base; - lip->li_ail.ail_back = (xfs_log_item_t*)base; + if (list_empty(&ailp->xa_ail)) { + list_add(&lip->li_ail, &ailp->xa_ail); return; } - next_lip = base->ail_back; - while ((next_lip != (xfs_log_item_t*)base) && - (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) { - next_lip = next_lip->li_ail.ail_back; + list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) + break; } - ASSERT((next_lip == (xfs_log_item_t*)base) || + + ASSERT((&next_lip->li_ail == &ailp->xa_ail) || (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); - lip->li_ail.ail_forw = next_lip->li_ail.ail_forw; - lip->li_ail.ail_back = next_lip; - next_lip->li_ail.ail_forw = lip; - lip->li_ail.ail_forw->li_ail.ail_back = lip; - xfs_ail_check(base, lip); + list_add(&lip->li_ail, &next_lip->li_ail); + + xfs_ail_check(ailp, lip); return; } @@ -576,15 +568,13 @@ xfs_ail_insert( /*ARGSUSED*/ STATIC xfs_log_item_t * xfs_ail_delete( - xfs_ail_entry_t *base, + xfs_ail_t *ailp, xfs_log_item_t *lip) /* ARGSUSED */ { - xfs_ail_check(base, lip); - lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back; - lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw; - lip->li_ail.ail_forw = NULL; - lip->li_ail.ail_back = NULL; + xfs_ail_check(ailp, lip); + + list_del(&lip->li_ail); return lip; } @@ -595,14 +585,13 @@ xfs_ail_delete( */ STATIC xfs_log_item_t * xfs_ail_min( - xfs_ail_entry_t *base) + xfs_ail_t *ailp) /* ARGSUSED */ { - register xfs_log_item_t *forw = base->ail_forw; - if (forw == (xfs_log_item_t*)base) { + if (list_empty(&ailp->xa_ail)) return NULL; - } - return forw; + + return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); } /* @@ -612,15 +601,14 @@ xfs_ail_min( */ STATIC xfs_log_item_t * xfs_ail_next( - xfs_ail_entry_t *base, + xfs_ail_t *ailp, xfs_log_item_t *lip) /* ARGSUSED */ { - if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) { + if (lip->li_ail.next == &ailp->xa_ail) return NULL; - } - return lip->li_ail.ail_forw; + return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); } #ifdef DEBUG @@ -629,57 +617,40 @@ xfs_ail_next( */ STATIC void xfs_ail_check( - xfs_ail_entry_t *base, + xfs_ail_t *ailp, xfs_log_item_t *lip) { xfs_log_item_t *prev_lip; - prev_lip = base->ail_forw; - if (prev_lip == (xfs_log_item_t*)base) { - /* - * Make sure the pointers are correct when the list - * is empty. - */ - ASSERT(base->ail_back == (xfs_log_item_t*)base); + if (list_empty(&ailp->xa_ail)) return; - } /* * Check the next and previous entries are valid. */ ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); - prev_lip = lip->li_ail.ail_back; - if (prev_lip != (xfs_log_item_t*)base) { - ASSERT(prev_lip->li_ail.ail_forw == lip); + prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); - } - prev_lip = lip->li_ail.ail_forw; - if (prev_lip != (xfs_log_item_t*)base) { - ASSERT(prev_lip->li_ail.ail_back == lip); + + prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); - } #ifdef XFS_TRANS_DEBUG /* - * Walk the list checking forward and backward pointers, - * lsn ordering, and that every entry has the XFS_LI_IN_AIL - * flag set. This is really expensive, so only do it when - * specifically debugging the transaction subsystem. + * Walk the list checking lsn ordering, and that every entry has the + * XFS_LI_IN_AIL flag set. This is really expensive, so only do it + * when specifically debugging the transaction subsystem. */ - prev_lip = (xfs_log_item_t*)base; - while (lip != (xfs_log_item_t*)base) { - if (prev_lip != (xfs_log_item_t*)base) { - ASSERT(prev_lip->li_ail.ail_forw == lip); + prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); + list_for_each_entry(lip, &ailp->xa_ail, li_ail) { + if (&prev_lip->li_ail != &ailp->xa_ail) ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); - } - ASSERT(lip->li_ail.ail_back == prev_lip); ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); prev_lip = lip; - lip = lip->li_ail.ail_forw; } - ASSERT(lip == (xfs_log_item_t*)base); - ASSERT(base->ail_back == prev_lip); #endif /* XFS_TRANS_DEBUG */ } #endif /* DEBUG */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 60b6b89..cb0c583 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -304,7 +304,8 @@ xfs_trans_read_buf( if (tp == NULL) { bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); if (!bp) - return XFS_ERROR(ENOMEM); + return (flags & XFS_BUF_TRYLOCK) ? + EAGAIN : XFS_ERROR(ENOMEM); if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { xfs_ioerror_alert("xfs_trans_read_buf", mp, @@ -353,17 +354,15 @@ xfs_trans_read_buf( ASSERT(!XFS_BUF_ISASYNC(bp)); XFS_BUF_READ(bp); xfsbdstrat(tp->t_mountp, bp); - xfs_iowait(bp); - if (XFS_BUF_GETERROR(bp) != 0) { + error = xfs_iowait(bp); + if (error) { xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); - error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); /* - * We can gracefully recover from most - * read errors. Ones we can't are those - * that happen after the transaction's - * already dirty. + * We can gracefully recover from most read + * errors. Ones we can't are those that happen + * after the transaction's already dirty. */ if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 5c89be4..0f51916 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -160,4 +160,9 @@ typedef enum { XFS_BTNUM_MAX } xfs_btnum_t; +struct xfs_name { + const char *name; + int len; +}; + #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 18a85e7..2b8dc7e 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -40,34 +40,12 @@ #include "xfs_itable.h" #include "xfs_utils.h" -/* - * xfs_get_dir_entry is used to get a reference to an inode given - * its parent directory inode and the name of the file. It does - * not lock the child inode, and it unlocks the directory before - * returning. The directory's generation number is returned for - * use by a later call to xfs_lock_dir_and_entry. - */ -int -xfs_get_dir_entry( - bhv_vname_t *dentry, - xfs_inode_t **ipp) -{ - bhv_vnode_t *vp; - - vp = VNAME_TO_VNODE(dentry); - - *ipp = xfs_vtoi(vp); - if (!*ipp) - return XFS_ERROR(ENOENT); - VN_HOLD(vp); - return 0; -} int xfs_dir_lookup_int( xfs_inode_t *dp, uint lock_mode, - bhv_vname_t *dentry, + struct xfs_name *name, xfs_ino_t *inum, xfs_inode_t **ipp) { @@ -75,7 +53,7 @@ xfs_dir_lookup_int( xfs_itrace_entry(dp); - error = xfs_dir_lookup(NULL, dp, VNAME(dentry), VNAMELEN(dentry), inum); + error = xfs_dir_lookup(NULL, dp, name, inum); if (!error) { /* * Unlock the directory. We do this because we can't diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index f857fcc..175b126 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h @@ -21,15 +21,14 @@ #define IRELE(ip) VN_RELE(XFS_ITOV(ip)) #define IHOLD(ip) VN_HOLD(XFS_ITOV(ip)) -extern int xfs_get_dir_entry (bhv_vname_t *, xfs_inode_t **); -extern int xfs_dir_lookup_int (xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *, - xfs_inode_t **); -extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *); -extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, +extern int xfs_dir_lookup_int(xfs_inode_t *, uint, struct xfs_name *, + xfs_ino_t *, xfs_inode_t **); +extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *); +extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, xfs_dev_t, cred_t *, prid_t, int, xfs_inode_t **, int *); -extern int xfs_droplink (xfs_trans_t *, xfs_inode_t *); -extern int xfs_bumplink (xfs_trans_t *, xfs_inode_t *); -extern void xfs_bump_ino_vers2 (xfs_trans_t *, xfs_inode_t *); +extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); +extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); +extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); #endif /* __XFS_UTILS_H__ */ diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 7094caf..fc48158 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -43,7 +43,6 @@ #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_rw.h" -#include "xfs_refcache.h" #include "xfs_buf_item.h" #include "xfs_log_priv.h" #include "xfs_dir2_trace.h" @@ -56,6 +55,7 @@ #include "xfs_fsops.h" #include "xfs_vnodeops.h" #include "xfs_vfsops.h" +#include "xfs_utils.h" int __init @@ -69,15 +69,17 @@ xfs_init(void) /* * Initialize all of the zone allocators we use. */ + xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), + "xfs_log_ticket"); xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), - "xfs_bmap_free_item"); + "xfs_bmap_free_item"); xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), - "xfs_btree_cur"); - xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); - xfs_da_state_zone = - kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state"); + "xfs_btree_cur"); + xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), + "xfs_da_state"); xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); + xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); xfs_mru_cache_init(); xfs_filestream_init(); @@ -113,9 +115,6 @@ xfs_init(void) xfs_ili_zone = kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", KM_ZONE_SPREAD, NULL); - xfs_icluster_zone = - kmem_zone_init_flags(sizeof(xfs_icluster_t), "xfs_icluster", - KM_ZONE_SPREAD, NULL); /* * Allocate global trace buffers. @@ -153,11 +152,9 @@ xfs_cleanup(void) extern kmem_zone_t *xfs_inode_zone; extern kmem_zone_t *xfs_efd_zone; extern kmem_zone_t *xfs_efi_zone; - extern kmem_zone_t *xfs_icluster_zone; xfs_cleanup_procfs(); xfs_sysctl_unregister(); - xfs_refcache_destroy(); xfs_filestream_uninit(); xfs_mru_cache_uninit(); xfs_acl_zone_destroy(xfs_acl_zone); @@ -189,7 +186,6 @@ xfs_cleanup(void) kmem_zone_destroy(xfs_efi_zone); kmem_zone_destroy(xfs_ifork_zone); kmem_zone_destroy(xfs_ili_zone); - kmem_zone_destroy(xfs_icluster_zone); } /* @@ -573,7 +569,7 @@ xfs_unmount( #ifdef HAVE_DMAPI if (mp->m_flags & XFS_MOUNT_DMAPI) { error = XFS_SEND_PREUNMOUNT(mp, - rvp, DM_RIGHT_NULL, rvp, DM_RIGHT_NULL, + rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL, NULL, NULL, 0, 0, (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))? 0:DM_FLAGS_UNWANTED); @@ -584,11 +580,6 @@ xfs_unmount( 0 : DM_FLAGS_UNWANTED; } #endif - /* - * First blow any referenced inode from this file system - * out of the reference cache, and delete the timer. - */ - xfs_refcache_purge_mp(mp); /* * Blow away any referenced inode in the filestreams cache. @@ -607,7 +598,7 @@ xfs_unmount( /* * Drop the reference count */ - VN_RELE(rvp); + IRELE(rip); /* * If we're forcing a shutdown, typically because of a media error, @@ -629,7 +620,7 @@ out: /* Note: mp structure must still exist for * XFS_SEND_UNMOUNT() call. */ - XFS_SEND_UNMOUNT(mp, error == 0 ? rvp : NULL, + XFS_SEND_UNMOUNT(mp, error == 0 ? rip : NULL, DM_RIGHT_NULL, 0, error, unmount_event_flags); } if (xfs_unmountfs_needed) { @@ -646,13 +637,12 @@ out: return XFS_ERROR(error); } -STATIC int +STATIC void xfs_quiesce_fs( xfs_mount_t *mp) { int count = 0, pincount; - xfs_refcache_purge_mp(mp); xfs_flush_buftarg(mp->m_ddev_targp, 0); xfs_finish_reclaim_all(mp, 0); @@ -671,8 +661,6 @@ xfs_quiesce_fs( count++; } } while (count < 2); - - return 0; } /* @@ -684,6 +672,8 @@ void xfs_attr_quiesce( xfs_mount_t *mp) { + int error = 0; + /* wait for all modifications to complete */ while (atomic_read(&mp->m_active_trans) > 0) delay(100); @@ -694,7 +684,11 @@ xfs_attr_quiesce( ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); /* Push the superblock and write an unmount record */ - xfs_log_sbcount(mp, 1); + error = xfs_log_sbcount(mp, 1); + if (error) + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); xfs_log_unmount_write(mp); xfs_unmountfs_writesb(mp); } @@ -790,8 +784,8 @@ xfs_unmount_flush( goto fscorrupt_out2; if (rbmip) { - VN_RELE(XFS_ITOV(rbmip)); - VN_RELE(XFS_ITOV(rsumip)); + IRELE(rbmip); + IRELE(rsumip); } xfs_iunlock(rip, XFS_ILOCK_EXCL); @@ -1169,10 +1163,10 @@ xfs_sync_inodes( * above, then wait until after we've unlocked * the inode to release the reference. This is * because we can be already holding the inode - * lock when VN_RELE() calls xfs_inactive(). + * lock when IRELE() calls xfs_inactive(). * * Make sure to drop the mount lock before calling - * VN_RELE() so that we don't trip over ourselves if + * IRELE() so that we don't trip over ourselves if * we have to go for the mount lock again in the * inactive code. */ @@ -1180,7 +1174,7 @@ xfs_sync_inodes( IPOINTER_INSERT(ip, mp); } - VN_RELE(vp); + IRELE(ip); vnode_refed = B_FALSE; } @@ -1323,30 +1317,8 @@ xfs_syncsub( } /* - * If this is the periodic sync, then kick some entries out of - * the reference cache. This ensures that idle entries are - * eventually kicked out of the cache. - */ - if (flags & SYNC_REFCACHE) { - if (flags & SYNC_WAIT) - xfs_refcache_purge_mp(mp); - else - xfs_refcache_purge_some(mp); - } - - /* - * If asked, update the disk superblock with incore counter values if we - * are using non-persistent counters so that they don't get too far out - * of sync if we crash or get a forced shutdown. We don't want to force - * this to disk, just get a transaction into the iclogs.... - */ - if (flags & SYNC_SUPER) - xfs_log_sbcount(mp, 0); - - /* * Now check to see if the log needs a "dummy" transaction. */ - if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) { xfs_trans_t *tp; xfs_inode_t *ip; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 64c5953..6650601 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -48,7 +48,6 @@ #include "xfs_quota.h" #include "xfs_utils.h" #include "xfs_rtalloc.h" -#include "xfs_refcache.h" #include "xfs_trans_space.h" #include "xfs_log_priv.h" #include "xfs_filestream.h" @@ -327,7 +326,7 @@ xfs_setattr( if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && !(flags & ATTR_DMI)) { int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR; - code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp, + code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip, vap->va_size, 0, dmflags, NULL); if (code) { lock_flags = 0; @@ -634,6 +633,15 @@ xfs_setattr( * Truncate file. Must have write permission and not be a directory. */ if (mask & XFS_AT_SIZE) { + /* + * Only change the c/mtime if we are changing the size + * or we are explicitly asked to change it. This handles + * the semantic difference between truncate() and ftruncate() + * as implemented in the VFS. + */ + if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME)) + timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + if (vap->va_size > ip->i_size) { xfs_igrow_finish(tp, ip, vap->va_size, !(flags & ATTR_DMI)); @@ -662,10 +670,6 @@ xfs_setattr( */ xfs_iflags_set(ip, XFS_ITRUNCATED); } - /* - * Have to do this even if the file's size doesn't change. - */ - timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; } /* @@ -877,7 +881,7 @@ xfs_setattr( if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) && !(flags & ATTR_DMI)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL, + (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0, AT_DELAY_FLAG(flags)); } @@ -1443,28 +1447,22 @@ xfs_inactive_attrs( tp = *tpp; mp = ip->i_mount; ASSERT(ip->i_d.di_forkoff != 0); - xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto error_unlock; error = xfs_attr_inactive(ip); - if (error) { - *tpp = NULL; - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; /* goto out */ - } + if (error) + goto error_unlock; tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_INACTIVE_LOG_COUNT); - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - *tpp = NULL; - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - } + if (error) + goto error_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); @@ -1475,6 +1473,14 @@ xfs_inactive_attrs( *tpp = tp; return 0; + +error_cancel: + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); +error_unlock: + *tpp = NULL; + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; } int @@ -1520,12 +1526,6 @@ xfs_release( xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); } -#ifdef HAVE_REFCACHE - /* If we are in the NFS reference cache then don't do this now */ - if (ip->i_refcache) - return 0; -#endif - if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || @@ -1588,9 +1588,8 @@ xfs_inactive( mp = ip->i_mount; - if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) { - (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL); - } + if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) + XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL); error = 0; @@ -1744,11 +1743,18 @@ xfs_inactive( XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1); /* - * Just ignore errors at this point. There is - * nothing we can do except to try to keep going. + * Just ignore errors at this point. There is nothing we can + * do except to try to keep going. Make sure it's not a silent + * error. */ - (void) xfs_bmap_finish(&tp, &free_list, &committed); - (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " + "xfs_bmap_finish() returned error %d", error); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " + "xfs_trans_commit() returned error %d", error); } /* * Release the dquots held by inode, if any. @@ -1765,8 +1771,8 @@ xfs_inactive( int xfs_lookup( xfs_inode_t *dp, - bhv_vname_t *dentry, - bhv_vnode_t **vpp) + struct xfs_name *name, + xfs_inode_t **ipp) { xfs_inode_t *ip; xfs_ino_t e_inum; @@ -1779,9 +1785,9 @@ xfs_lookup( return XFS_ERROR(EIO); lock_mode = xfs_ilock_map_shared(dp); - error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip); + error = xfs_dir_lookup_int(dp, lock_mode, name, &e_inum, &ip); if (!error) { - *vpp = XFS_ITOV(ip); + *ipp = ip; xfs_itrace_ref(ip); } xfs_iunlock_map_shared(dp, lock_mode); @@ -1791,19 +1797,16 @@ xfs_lookup( int xfs_create( xfs_inode_t *dp, - bhv_vname_t *dentry, + struct xfs_name *name, mode_t mode, xfs_dev_t rdev, - bhv_vnode_t **vpp, + xfs_inode_t **ipp, cred_t *credp) { - char *name = VNAME(dentry); - xfs_mount_t *mp = dp->i_mount; - bhv_vnode_t *dir_vp = XFS_ITOV(dp); + xfs_mount_t *mp = dp->i_mount; xfs_inode_t *ip; - bhv_vnode_t *vp = NULL; xfs_trans_t *tp; - int error; + int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; boolean_t unlock_dp_on_error = B_FALSE; @@ -1813,17 +1816,14 @@ xfs_create( xfs_prid_t prid; struct xfs_dquot *udqp, *gdqp; uint resblks; - int namelen; - ASSERT(!*vpp); + ASSERT(!*ipp); xfs_itrace_entry(dp); - namelen = VNAMELEN(dentry); - if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, - dir_vp, DM_RIGHT_NULL, NULL, - DM_RIGHT_NULL, name, NULL, + dp, DM_RIGHT_NULL, NULL, + DM_RIGHT_NULL, name->name, NULL, mode, 0, 0); if (error) @@ -1855,7 +1855,7 @@ xfs_create( tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_CREATE_SPACE_RES(mp, namelen); + resblks = XFS_CREATE_SPACE_RES(mp, name->len); /* * Initially assume that the file does not exist and * reserve the resources for that case. If that is not @@ -1888,7 +1888,8 @@ xfs_create( if (error) goto error_return; - if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen))) + error = xfs_dir_canenter(tp, dp, name, resblks); + if (error) goto error_return; error = xfs_dir_ialloc(&tp, dp, mode, 1, rdev, credp, prid, resblks > 0, @@ -1914,11 +1915,11 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - VN_HOLD(dir_vp); + IHOLD(dp); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; - error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino, + error = xfs_dir_createname(tp, dp, name, ip->i_ino, &first_block, &free_list, resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { @@ -1952,7 +1953,6 @@ xfs_create( * vnode to the caller, we bump the vnode ref count now. */ IHOLD(ip); - vp = XFS_ITOV(ip); error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { @@ -1970,17 +1970,17 @@ xfs_create( XFS_QM_DQRELE(mp, udqp); XFS_QM_DQRELE(mp, gdqp); - *vpp = vp; + *ipp = ip; /* Fallthrough to std_return with error = 0 */ std_return: - if ((*vpp || (error != 0 && dm_event_sent != 0)) && + if ((*ipp || (error != 0 && dm_event_sent != 0)) && DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, - dir_vp, DM_RIGHT_NULL, - *vpp ? vp:NULL, - DM_RIGHT_NULL, name, NULL, + dp, DM_RIGHT_NULL, + *ipp ? ip : NULL, + DM_RIGHT_NULL, name->name, NULL, mode, error, 0); } return error; @@ -2272,46 +2272,32 @@ int remove_which_error_return = 0; int xfs_remove( xfs_inode_t *dp, - bhv_vname_t *dentry) + struct xfs_name *name, + xfs_inode_t *ip) { - bhv_vnode_t *dir_vp = XFS_ITOV(dp); - char *name = VNAME(dentry); xfs_mount_t *mp = dp->i_mount; - xfs_inode_t *ip; xfs_trans_t *tp = NULL; int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; int cancel_flags; int committed; - int dm_di_mode = 0; int link_zero; uint resblks; - int namelen; xfs_itrace_entry(dp); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - namelen = VNAMELEN(dentry); - - if (!xfs_get_dir_entry(dentry, &ip)) { - dm_di_mode = ip->i_d.di_mode; - IRELE(ip); - } - if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp, - DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - name, NULL, dm_di_mode, 0, 0); + error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL, + NULL, DM_RIGHT_NULL, name->name, NULL, + ip->i_d.di_mode, 0, 0); if (error) return error; } - /* From this point on, return through std_return */ - ip = NULL; - /* * We need to get a reference to ip before we get our log * reservation. The reason for this is that we cannot call @@ -2324,13 +2310,7 @@ xfs_remove( * when we call xfs_iget. Instead we get an unlocked reference * to the inode before getting our log reservation. */ - error = xfs_get_dir_entry(dentry, &ip); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - goto std_return; - } - - dm_di_mode = ip->i_d.di_mode; + IHOLD(ip); xfs_itrace_entry(ip); xfs_itrace_ref(ip); @@ -2398,7 +2378,7 @@ xfs_remove( * Entry must exist since we did a lookup in xfs_lock_dir_and_entry. */ XFS_BMAP_INIT(&free_list, &first_block); - error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino, + error = xfs_dir_removename(tp, dp, name, ip->i_ino, &first_block, &free_list, 0); if (error) { ASSERT(error != ENOENT); @@ -2449,14 +2429,6 @@ xfs_remove( } /* - * Before we drop our extra reference to the inode, purge it - * from the refcache if it is there. By waiting until afterwards - * to do the IRELE, we ensure that we won't go inactive in the - * xfs_refcache_purge_ip routine (although that would be OK). - */ - xfs_refcache_purge_ip(ip); - - /* * If we are using filestreams, kill the stream association. * If the file is still open it may get a new one but that * will get killed on last close in xfs_close() so we don't @@ -2472,9 +2444,9 @@ xfs_remove( std_return: if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, - dir_vp, DM_RIGHT_NULL, + dp, DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - name, NULL, dm_di_mode, error, 0); + name->name, NULL, ip->i_d.di_mode, error, 0); } return error; @@ -2495,14 +2467,6 @@ xfs_remove( cancel_flags |= XFS_TRANS_ABORT; xfs_trans_cancel(tp, cancel_flags); - /* - * Before we drop our extra reference to the inode, purge it - * from the refcache if it is there. By waiting until afterwards - * to do the IRELE, we ensure that we won't go inactive in the - * xfs_refcache_purge_ip routine (although that would be OK). - */ - xfs_refcache_purge_ip(ip); - IRELE(ip); goto std_return; @@ -2511,12 +2475,10 @@ xfs_remove( int xfs_link( xfs_inode_t *tdp, - bhv_vnode_t *src_vp, - bhv_vname_t *dentry) + xfs_inode_t *sip, + struct xfs_name *target_name) { - bhv_vnode_t *target_dir_vp = XFS_ITOV(tdp); xfs_mount_t *mp = tdp->i_mount; - xfs_inode_t *sip = xfs_vtoi(src_vp); xfs_trans_t *tp; xfs_inode_t *ips[2]; int error; @@ -2525,23 +2487,20 @@ xfs_link( int cancel_flags; int committed; int resblks; - char *target_name = VNAME(dentry); - int target_namelen; xfs_itrace_entry(tdp); - xfs_itrace_entry(xfs_vtoi(src_vp)); + xfs_itrace_entry(sip); - target_namelen = VNAMELEN(dentry); - ASSERT(!VN_ISDIR(src_vp)); + ASSERT(!S_ISDIR(sip->i_d.di_mode)); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK, - target_dir_vp, DM_RIGHT_NULL, - src_vp, DM_RIGHT_NULL, - target_name, NULL, 0, 0, 0); + tdp, DM_RIGHT_NULL, + sip, DM_RIGHT_NULL, + target_name->name, NULL, 0, 0, 0); if (error) return error; } @@ -2556,7 +2515,7 @@ xfs_link( tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_LINK_SPACE_RES(mp, target_namelen); + resblks = XFS_LINK_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); if (error == ENOSPC) { @@ -2584,8 +2543,8 @@ xfs_link( * xfs_trans_cancel will both unlock the inodes and * decrement the associated ref counts. */ - VN_HOLD(src_vp); - VN_HOLD(target_dir_vp); + IHOLD(sip); + IHOLD(tdp); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); @@ -2608,15 +2567,14 @@ xfs_link( goto error_return; } - if (resblks == 0 && - (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen))) + error = xfs_dir_canenter(tp, tdp, target_name, resblks); + if (error) goto error_return; XFS_BMAP_INIT(&free_list, &first_block); - error = xfs_dir_createname(tp, tdp, target_name, target_namelen, - sip->i_ino, &first_block, &free_list, - resblks); + error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, + &first_block, &free_list, resblks); if (error) goto abort_return; xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -2650,9 +2608,9 @@ xfs_link( std_return: if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) { (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK, - target_dir_vp, DM_RIGHT_NULL, - src_vp, DM_RIGHT_NULL, - target_name, NULL, 0, error, 0); + tdp, DM_RIGHT_NULL, + sip, DM_RIGHT_NULL, + target_name->name, NULL, 0, error, 0); } return error; @@ -2669,17 +2627,13 @@ std_return: int xfs_mkdir( xfs_inode_t *dp, - bhv_vname_t *dentry, + struct xfs_name *dir_name, mode_t mode, - bhv_vnode_t **vpp, + xfs_inode_t **ipp, cred_t *credp) { - bhv_vnode_t *dir_vp = XFS_ITOV(dp); - char *dir_name = VNAME(dentry); - int dir_namelen = VNAMELEN(dentry); xfs_mount_t *mp = dp->i_mount; xfs_inode_t *cdp; /* inode of created dir */ - bhv_vnode_t *cvp; /* vnode of created dir */ xfs_trans_t *tp; int cancel_flags; int error; @@ -2700,8 +2654,8 @@ xfs_mkdir( if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, - dir_vp, DM_RIGHT_NULL, NULL, - DM_RIGHT_NULL, dir_name, NULL, + dp, DM_RIGHT_NULL, NULL, + DM_RIGHT_NULL, dir_name->name, NULL, mode, 0, 0); if (error) return error; @@ -2730,7 +2684,7 @@ xfs_mkdir( tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen); + resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len); error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT); if (error == ENOSPC) { @@ -2762,8 +2716,8 @@ xfs_mkdir( if (error) goto error_return; - if (resblks == 0 && - (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen))) + error = xfs_dir_canenter(tp, dp, dir_name, resblks); + if (error) goto error_return; /* * create the directory inode. @@ -2786,15 +2740,15 @@ xfs_mkdir( * from here on will result in the transaction cancel * unlocking dp so don't do it explicitly in the error path. */ - VN_HOLD(dir_vp); + IHOLD(dp); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; XFS_BMAP_INIT(&free_list, &first_block); - error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino, - &first_block, &free_list, resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino, + &first_block, &free_list, resblks ? + resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { ASSERT(error != ENOSPC); goto error1; @@ -2817,11 +2771,9 @@ xfs_mkdir( if (error) goto error2; - cvp = XFS_ITOV(cdp); - created = B_TRUE; - *vpp = cvp; + *ipp = cdp; IHOLD(cdp); /* @@ -2858,10 +2810,10 @@ std_return: if ((created || (error != 0 && dm_event_sent != 0)) && DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, - dir_vp, DM_RIGHT_NULL, - created ? XFS_ITOV(cdp):NULL, + dp, DM_RIGHT_NULL, + created ? cdp : NULL, DM_RIGHT_NULL, - dir_name, NULL, + dir_name->name, NULL, mode, error, 0); } return error; @@ -2885,20 +2837,17 @@ std_return: int xfs_rmdir( xfs_inode_t *dp, - bhv_vname_t *dentry) + struct xfs_name *name, + xfs_inode_t *cdp) { bhv_vnode_t *dir_vp = XFS_ITOV(dp); - char *name = VNAME(dentry); - int namelen = VNAMELEN(dentry); xfs_mount_t *mp = dp->i_mount; - xfs_inode_t *cdp; /* child directory */ xfs_trans_t *tp; int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; int cancel_flags; int committed; - int dm_di_mode = S_IFDIR; int last_cdp_link; uint resblks; @@ -2907,24 +2856,15 @@ xfs_rmdir( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (!xfs_get_dir_entry(dentry, &cdp)) { - dm_di_mode = cdp->i_d.di_mode; - IRELE(cdp); - } - if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, - dir_vp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, - name, NULL, dm_di_mode, 0, 0); + dp, DM_RIGHT_NULL, + NULL, DM_RIGHT_NULL, name->name, + NULL, cdp->i_d.di_mode, 0, 0); if (error) return XFS_ERROR(error); } - /* Return through std_return after this point. */ - - cdp = NULL; - /* * We need to get a reference to cdp before we get our log * reservation. The reason for this is that we cannot call @@ -2937,13 +2877,7 @@ xfs_rmdir( * when we call xfs_iget. Instead we get an unlocked reference * to the inode before getting our log reservation. */ - error = xfs_get_dir_entry(dentry, &cdp); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - goto std_return; - } - mp = dp->i_mount; - dm_di_mode = cdp->i_d.di_mode; + IHOLD(cdp); /* * Get the dquots for the inodes. @@ -3020,7 +2954,7 @@ xfs_rmdir( goto error_return; } - error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino, + error = xfs_dir_removename(tp, dp, name, cdp->i_ino, &first_block, &free_list, resblks); if (error) goto error1; @@ -3098,9 +3032,9 @@ xfs_rmdir( std_return: if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, - dir_vp, DM_RIGHT_NULL, + dp, DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - name, NULL, dm_di_mode, + name->name, NULL, cdp->i_d.di_mode, error, 0); } return error; @@ -3118,13 +3052,12 @@ xfs_rmdir( int xfs_symlink( xfs_inode_t *dp, - bhv_vname_t *dentry, - char *target_path, + struct xfs_name *link_name, + const char *target_path, mode_t mode, - bhv_vnode_t **vpp, + xfs_inode_t **ipp, cred_t *credp) { - bhv_vnode_t *dir_vp = XFS_ITOV(dp); xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp; xfs_inode_t *ip; @@ -3140,17 +3073,15 @@ xfs_symlink( int nmaps; xfs_bmbt_irec_t mval[SYMLINK_MAPS]; xfs_daddr_t d; - char *cur_chunk; + const char *cur_chunk; int byte_cnt; int n; xfs_buf_t *bp; xfs_prid_t prid; struct xfs_dquot *udqp, *gdqp; uint resblks; - char *link_name = VNAME(dentry); - int link_namelen; - *vpp = NULL; + *ipp = NULL; error = 0; ip = NULL; tp = NULL; @@ -3160,44 +3091,17 @@ xfs_symlink( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - link_namelen = VNAMELEN(dentry); - /* * Check component lengths of the target path name. */ pathlen = strlen(target_path); if (pathlen >= MAXPATHLEN) /* total string too long */ return XFS_ERROR(ENAMETOOLONG); - if (pathlen >= MAXNAMELEN) { /* is any component too long? */ - int len, total; - char *path; - - for (total = 0, path = target_path; total < pathlen;) { - /* - * Skip any slashes. - */ - while(*path == '/') { - total++; - path++; - } - - /* - * Count up to the next slash or end of path. - * Error out if the component is bigger than MAXNAMELEN. - */ - for(len = 0; *path != '/' && total < pathlen;total++, path++) { - if (++len >= MAXNAMELEN) { - error = ENAMETOOLONG; - return error; - } - } - } - } if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp, + error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp, DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - link_name, target_path, 0, 0, 0); + link_name->name, target_path, 0, 0, 0); if (error) return error; } @@ -3229,7 +3133,7 @@ xfs_symlink( fs_blocks = 0; else fs_blocks = XFS_B_TO_FSB(mp, pathlen); - resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks); + resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); if (error == ENOSPC && fs_blocks == 0) { @@ -3263,8 +3167,8 @@ xfs_symlink( /* * Check for ability to enter directory entry, if no space reserved. */ - if (resblks == 0 && - (error = xfs_dir_canenter(tp, dp, link_name, link_namelen))) + error = xfs_dir_canenter(tp, dp, link_name, resblks); + if (error) goto error_return; /* * Initialize the bmap freelist prior to calling either @@ -3289,7 +3193,7 @@ xfs_symlink( * transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - VN_HOLD(dir_vp); + IHOLD(dp); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; @@ -3356,8 +3260,8 @@ xfs_symlink( /* * Create the directory entry for the symlink. */ - error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino, - &first_block, &free_list, resblks); + error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, + &first_block, &free_list, resblks); if (error) goto error1; xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -3399,19 +3303,14 @@ xfs_symlink( std_return: if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) { (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK, - dir_vp, DM_RIGHT_NULL, - error ? NULL : XFS_ITOV(ip), - DM_RIGHT_NULL, link_name, target_path, - 0, error, 0); + dp, DM_RIGHT_NULL, + error ? NULL : ip, + DM_RIGHT_NULL, link_name->name, + target_path, 0, error, 0); } - if (!error) { - bhv_vnode_t *vp; - - ASSERT(ip); - vp = XFS_ITOV(ip); - *vpp = vp; - } + if (!error) + *ipp = ip; return error; error2: @@ -3431,60 +3330,11 @@ std_return: } int -xfs_rwlock( - xfs_inode_t *ip, - bhv_vrwlock_t locktype) -{ - if (S_ISDIR(ip->i_d.di_mode)) - return 1; - if (locktype == VRWLOCK_WRITE) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - } else if (locktype == VRWLOCK_TRY_READ) { - return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED); - } else if (locktype == VRWLOCK_TRY_WRITE) { - return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL); - } else { - ASSERT((locktype == VRWLOCK_READ) || - (locktype == VRWLOCK_WRITE_DIRECT)); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - } - - return 1; -} - - -void -xfs_rwunlock( - xfs_inode_t *ip, - bhv_vrwlock_t locktype) -{ - if (S_ISDIR(ip->i_d.di_mode)) - return; - if (locktype == VRWLOCK_WRITE) { - /* - * In the write case, we may have added a new entry to - * the reference cache. This might store a pointer to - * an inode to be released in this inode. If it is there, - * clear the pointer and release the inode after unlocking - * this one. - */ - xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL); - } else { - ASSERT((locktype == VRWLOCK_READ) || - (locktype == VRWLOCK_WRITE_DIRECT)); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - } - return; -} - - -int xfs_inode_flush( xfs_inode_t *ip, int flags) { xfs_mount_t *mp = ip->i_mount; - xfs_inode_log_item_t *iip = ip->i_itemp; int error = 0; if (XFS_FORCED_SHUTDOWN(mp)) @@ -3494,33 +3344,9 @@ xfs_inode_flush( * Bypass inodes which have already been cleaned by * the inode flush clustering code inside xfs_iflush */ - if ((ip->i_update_core == 0) && - ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) + if (xfs_inode_clean(ip)) return 0; - if (flags & FLUSH_LOG) { - if (iip && iip->ili_last_lsn) { - xlog_t *log = mp->m_log; - xfs_lsn_t sync_lsn; - int log_flags = XFS_LOG_FORCE; - - spin_lock(&log->l_grant_lock); - sync_lsn = log->l_last_sync_lsn; - spin_unlock(&log->l_grant_lock); - - if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) { - if (flags & FLUSH_SYNC) - log_flags |= XFS_LOG_SYNC; - error = xfs_log_force(mp, iip->ili_last_lsn, log_flags); - if (error) - return error; - } - - if (ip->i_update_core == 0) - return 0; - } - } - /* * We make this non-blocking if the inode is contended, * return EAGAIN to indicate to the caller that they @@ -3528,30 +3354,22 @@ xfs_inode_flush( * blocking on inodes inside another operation right * now, they get caught later by xfs_sync. */ - if (flags & FLUSH_INODE) { - int flush_flags; - - if (flags & FLUSH_SYNC) { - xfs_ilock(ip, XFS_ILOCK_SHARED); - xfs_iflock(ip); - } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { - if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return EAGAIN; - } - } else { + if (flags & FLUSH_SYNC) { + xfs_ilock(ip, XFS_ILOCK_SHARED); + xfs_iflock(ip); + } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); return EAGAIN; } - - if (flags & FLUSH_SYNC) - flush_flags = XFS_IFLUSH_SYNC; - else - flush_flags = XFS_IFLUSH_ASYNC; - - error = xfs_iflush(ip, flush_flags); - xfs_iunlock(ip, XFS_ILOCK_SHARED); + } else { + return EAGAIN; } + error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC + : XFS_IFLUSH_ASYNC_NOBLOCK); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; } @@ -3694,12 +3512,12 @@ xfs_finish_reclaim( * We get the flush lock regardless, though, just to make sure * we don't free it while it is being flushed. */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - if (!locked) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); - } + if (!locked) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_iflock(ip); + } + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { if (ip->i_update_core || ((ip->i_itemp != NULL) && (ip->i_itemp->ili_format.ilf_fields != 0))) { @@ -3719,17 +3537,11 @@ xfs_finish_reclaim( ASSERT(ip->i_update_core == 0); ASSERT(ip->i_itemp == NULL || ip->i_itemp->ili_format.ilf_fields == 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } else if (locked) { - /* - * We are not interested in doing an iflush if we're - * in the process of shutting down the filesystem forcibly. - * So, just reclaim the inode. - */ - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); } + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + reclaim: xfs_ireclaim(ip); return 0; @@ -3845,9 +3657,8 @@ xfs_alloc_file_space( end_dmi_offset = offset+len; if (end_dmi_offset > ip->i_size) end_dmi_offset = ip->i_size; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip), - offset, end_dmi_offset - offset, - 0, NULL); + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset, + end_dmi_offset - offset, 0, NULL); if (error) return error; } @@ -3956,8 +3767,8 @@ dmapi_enospc_check: if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, - XFS_ITOV(ip), DM_RIGHT_NULL, - XFS_ITOV(ip), DM_RIGHT_NULL, + ip, DM_RIGHT_NULL, + ip, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (error == 0) goto retry; /* Maybe DMAPI app. has made space */ @@ -4021,7 +3832,8 @@ xfs_zero_remaining_bytes( XFS_BUF_READ(bp); XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock)); xfsbdstrat(mp, bp); - if ((error = xfs_iowait(bp))) { + error = xfs_iowait(bp); + if (error) { xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", mp, bp, XFS_BUF_ADDR(bp)); break; @@ -4033,7 +3845,8 @@ xfs_zero_remaining_bytes( XFS_BUF_UNREAD(bp); XFS_BUF_WRITE(bp); xfsbdstrat(mp, bp); - if ((error = xfs_iowait(bp))) { + error = xfs_iowait(bp); + if (error) { xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", mp, bp, XFS_BUF_ADDR(bp)); break; @@ -4102,7 +3915,7 @@ xfs_free_file_space( DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { if (end_dmi_offset > ip->i_size) end_dmi_offset = ip->i_size; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset, end_dmi_offset - offset, AT_DELAY_FLAG(attr_flags), NULL); if (error) diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 4e3970f0..24c5392 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -23,31 +23,32 @@ int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start, xfs_off_t stop); int xfs_release(struct xfs_inode *ip); int xfs_inactive(struct xfs_inode *ip); -int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry, - bhv_vnode_t **vpp); -int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode, - xfs_dev_t rdev, bhv_vnode_t **vpp, struct cred *credp); -int xfs_remove(struct xfs_inode *dp, bhv_vname_t *dentry); -int xfs_link(struct xfs_inode *tdp, bhv_vnode_t *src_vp, - bhv_vname_t *dentry); -int xfs_mkdir(struct xfs_inode *dp, bhv_vname_t *dentry, - mode_t mode, bhv_vnode_t **vpp, struct cred *credp); -int xfs_rmdir(struct xfs_inode *dp, bhv_vname_t *dentry); +int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode **ipp); +int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, + xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp); +int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode *ip); +int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, + struct xfs_name *target_name); +int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name, + mode_t mode, struct xfs_inode **ipp, struct cred *credp); +int xfs_rmdir(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode *cdp); int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); -int xfs_symlink(struct xfs_inode *dp, bhv_vname_t *dentry, - char *target_path, mode_t mode, bhv_vnode_t **vpp, +int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, + const char *target_path, mode_t mode, struct xfs_inode **ipp, struct cred *credp); -int xfs_rwlock(struct xfs_inode *ip, bhv_vrwlock_t locktype); -void xfs_rwunlock(struct xfs_inode *ip, bhv_vrwlock_t locktype); int xfs_inode_flush(struct xfs_inode *ip, int flags); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); int xfs_reclaim(struct xfs_inode *ip); int xfs_change_file_space(struct xfs_inode *ip, int cmd, xfs_flock64_t *bf, xfs_off_t offset, struct cred *credp, int attr_flags); -int xfs_rename(struct xfs_inode *src_dp, bhv_vname_t *src_vname, - bhv_vnode_t *target_dir_vp, bhv_vname_t *target_vname); +int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, + struct xfs_inode *src_ip, struct xfs_inode *target_dp, + struct xfs_name *target_name); int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, int *valuelenp, int flags, cred_t *cred); int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, |