diff options
Diffstat (limited to 'fs')
126 files changed, 3403 insertions, 3307 deletions
@@ -440,14 +440,8 @@ config OCFS2_FS Tools web page: http://oss.oracle.com/projects/ocfs2-tools OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ - Note: Features which OCFS2 does not support yet: - - extended attributes - - quotas - - cluster aware flock - - Directory change notification (F_NOTIFY) - - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) - - POSIX ACLs - - readpages / writepages (not user visible) + For more information on OCFS2, see the file + <file:Documentation/filesystems/ocfs2.txt>. config OCFS2_DEBUG_MASKLOG bool "OCFS2 logging support" @@ -1028,8 +1022,8 @@ config HUGETLB_PAGE def_bool HUGETLBFS config CONFIGFS_FS - tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" - depends on SYSFS && EXPERIMENTAL + tristate "Userspace-driven configuration filesystem" + depends on SYSFS help configfs is a ram-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based @@ -1112,8 +1106,8 @@ config HFS_FS help If you say Y here, you will be able to mount Macintosh-formatted floppy disks and hard drive partitions with full read-write access. - Please read <file:fs/hfs/HFS.txt> to learn about the available mount - options. + Please read <file:Documentation/filesystems/hfs.txt> to learn about + the available mount options. To compile this file system support as a module, choose M here: the module will be called hfs. @@ -2130,4 +2124,3 @@ source "fs/nls/Kconfig" source "fs/dlm/Kconfig" endmenu - diff --git a/fs/block_dev.c b/fs/block_dev.c index 993f78c..e48a630 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -738,9 +738,9 @@ EXPORT_SYMBOL(bd_release); static struct kobject *bdev_get_kobj(struct block_device *bdev) { if (bdev->bd_contains != bdev) - return kobject_get(&bdev->bd_part->kobj); + return kobject_get(&bdev->bd_part->dev.kobj); else - return kobject_get(&bdev->bd_disk->kobj); + return kobject_get(&bdev->bd_disk->dev.kobj); } static struct kobject *bdev_get_holder(struct block_device *bdev) @@ -1176,7 +1176,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) ret = -ENXIO; goto out_first; } - kobject_get(&p->kobj); + kobject_get(&p->dev.kobj); bdev->bd_part = p; bd_set_size(bdev, (loff_t) p->nr_sects << 9); } @@ -1299,7 +1299,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part) module_put(owner); if (bdev->bd_contains != bdev) { - kobject_put(&bdev->bd_part->kobj); + kobject_put(&bdev->bd_part->dev.kobj); bdev->bd_part = NULL; } bdev->bd_disk = NULL; diff --git a/fs/char_dev.c b/fs/char_dev.c index c3bfa76..2c7a8b5 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -510,9 +510,8 @@ struct cdev *cdev_alloc(void) { struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); if (p) { - p->kobj.ktype = &ktype_cdev_dynamic; INIT_LIST_HEAD(&p->list); - kobject_init(&p->kobj); + kobject_init(&p->kobj, &ktype_cdev_dynamic); } return p; } @@ -529,8 +528,7 @@ void cdev_init(struct cdev *cdev, const struct file_operations *fops) { memset(cdev, 0, sizeof *cdev); INIT_LIST_HEAD(&cdev->list); - cdev->kobj.ktype = &ktype_cdev_default; - kobject_init(&cdev->kobj); + kobject_init(&cdev->kobj, &ktype_cdev_default); cdev->ops = fops; } diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index dcc6aea..e3eb355 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -362,8 +362,8 @@ static int init_coda_psdev(void) goto out_chrdev; } for (i = 0; i < MAX_CODADEVS; i++) - class_device_create(coda_psdev_class, NULL, - MKDEV(CODA_PSDEV_MAJOR,i), NULL, "cfs%d", i); + device_create(coda_psdev_class, NULL, + MKDEV(CODA_PSDEV_MAJOR,i), "cfs%d", i); coda_sysctl_init(); goto out; @@ -405,7 +405,7 @@ static int __init init_coda(void) return 0; out: for (i = 0; i < MAX_CODADEVS; i++) - class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); + device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); class_destroy(coda_psdev_class); unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); coda_sysctl_clean(); @@ -424,7 +424,7 @@ static void __exit exit_coda(void) printk("coda: failed to unregister filesystem\n"); } for (i = 0; i < MAX_CODADEVS; i++) - class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); + device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); class_destroy(coda_psdev_class); unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); coda_sysctl_clean(); diff --git a/fs/compat.c b/fs/compat.c index 15078ce..5216c3f 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1104,10 +1104,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, if (ret < 0) goto out; - ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE); - if (ret) - goto out; - fnv = NULL; if (type == READ) { fn = file->f_op->read; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 50ed691..a48dc7d 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -546,7 +546,7 @@ static int populate_groups(struct config_group *group) * That said, taking our i_mutex is closer to mkdir * emulation, and shouldn't hurt. */ - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); for (i = 0; group->default_groups[i]; i++) { new_group = group->default_groups[i]; @@ -1405,7 +1405,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) sd = configfs_sb->s_root->d_fsdata; link_group(to_config_group(sd->s_element), group); - mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); + mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, + I_MUTEX_PARENT); name.name = group->cg_item.ci_name; name.len = strlen(name.name); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index a3658f9..397cb50 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -320,7 +320,7 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); mutex_unlock(&dir->d_inode->i_mutex); diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 3bf0278..de3b31d 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -128,7 +128,7 @@ void configfs_release_fs(void) } -static decl_subsys(config, NULL, NULL); +static struct kobject *config_kobj; static int __init configfs_init(void) { @@ -140,9 +140,8 @@ static int __init configfs_init(void) if (!configfs_dir_cachep) goto out; - kobj_set_kset_s(&config_subsys, kernel_subsys); - err = subsystem_register(&config_subsys); - if (err) { + config_kobj = kobject_create_and_add("config", kernel_kobj); + if (!config_kobj) { kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; goto out; @@ -151,7 +150,7 @@ static int __init configfs_init(void) err = register_filesystem(&configfs_fs_type); if (err) { printk(KERN_ERR "configfs: Unable to register filesystem!\n"); - subsystem_unregister(&config_subsys); + kobject_put(config_kobj); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; goto out; @@ -160,7 +159,7 @@ static int __init configfs_init(void) err = configfs_inode_init(); if (err) { unregister_filesystem(&configfs_fs_type); - subsystem_unregister(&config_subsys); + kobject_put(config_kobj); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; } @@ -171,7 +170,7 @@ out: static void __exit configfs_exit(void) { unregister_filesystem(&configfs_fs_type); - subsystem_unregister(&config_subsys); + kobject_put(config_kobj); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; configfs_inode_exit(); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 6a713b3..d26e282 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -426,20 +426,19 @@ exit: } EXPORT_SYMBOL_GPL(debugfs_rename); -static decl_subsys(debug, NULL, NULL); +static struct kobject *debug_kobj; static int __init debugfs_init(void) { int retval; - kobj_set_kset_s(&debug_subsys, kernel_subsys); - retval = subsystem_register(&debug_subsys); - if (retval) - return retval; + debug_kobj = kobject_create_and_add("debug", kernel_kobj); + if (!debug_kobj) + return -EINVAL; retval = register_filesystem(&debug_fs_type); if (retval) - subsystem_unregister(&debug_subsys); + kobject_put(debug_kobj); return retval; } @@ -447,7 +446,7 @@ static void __exit debugfs_exit(void) { simple_release_fs(&debugfs_mount, &debugfs_mount_count); unregister_filesystem(&debug_fs_type); - subsystem_unregister(&debug_subsys); + kobject_put(debug_kobj); } core_initcall(debugfs_init); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 6353a83..5c108c4 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -166,26 +166,7 @@ static struct kobj_type dlm_ktype = { .release = lockspace_kobj_release, }; -static struct kset dlm_kset = { - .ktype = &dlm_ktype, -}; - -static int kobject_setup(struct dlm_ls *ls) -{ - char lsname[DLM_LOCKSPACE_LEN]; - int error; - - memset(lsname, 0, DLM_LOCKSPACE_LEN); - snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name); - - error = kobject_set_name(&ls->ls_kobj, "%s", lsname); - if (error) - return error; - - ls->ls_kobj.kset = &dlm_kset; - ls->ls_kobj.ktype = &dlm_ktype; - return 0; -} +static struct kset *dlm_kset; static int do_uevent(struct dlm_ls *ls, int in) { @@ -220,24 +201,22 @@ static int do_uevent(struct dlm_ls *ls, int in) int dlm_lockspace_init(void) { - int error; - ls_count = 0; mutex_init(&ls_lock); INIT_LIST_HEAD(&lslist); spin_lock_init(&lslist_lock); - kobject_set_name(&dlm_kset.kobj, "dlm"); - kobj_set_kset_s(&dlm_kset, kernel_subsys); - error = kset_register(&dlm_kset); - if (error) - printk("dlm_lockspace_init: cannot register kset %d\n", error); - return error; + dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj); + if (!dlm_kset) { + printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__); + return -ENOMEM; + } + return 0; } void dlm_lockspace_exit(void) { - kset_unregister(&dlm_kset); + kset_unregister(dlm_kset); } static int dlm_scand(void *data) @@ -549,13 +528,12 @@ static int new_lockspace(char *name, int namelen, void **lockspace, goto out_delist; } - error = kobject_setup(ls); - if (error) - goto out_stop; - - error = kobject_register(&ls->ls_kobj); + ls->ls_kobj.kset = dlm_kset; + error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL, + "%s", ls->ls_name); if (error) goto out_stop; + kobject_uevent(&ls->ls_kobj, KOBJ_ADD); /* let kobject handle freeing of ls if there's an error */ do_unreg = 1; @@ -601,7 +579,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, kfree(ls->ls_rsbtbl); out_lsfree: if (do_unreg) - kobject_unregister(&ls->ls_kobj); + kobject_put(&ls->ls_kobj); else kfree(ls); out: @@ -750,7 +728,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_clear_members(ls); dlm_clear_members_gone(ls); kfree(ls->ls_node_array); - kobject_unregister(&ls->ls_kobj); + kobject_put(&ls->ls_kobj); /* The ls structure will be freed when the kobject is done with */ mutex_lock(&ls_lock); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index e5580bc..0249aa4 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -734,127 +734,40 @@ static int ecryptfs_init_kmem_caches(void) return 0; } -struct ecryptfs_obj { - char *name; - struct list_head slot_list; - struct kobject kobj; -}; - -struct ecryptfs_attribute { - struct attribute attr; - ssize_t(*show) (struct ecryptfs_obj *, char *); - ssize_t(*store) (struct ecryptfs_obj *, const char *, size_t); -}; +static struct kobject *ecryptfs_kobj; -static ssize_t -ecryptfs_attr_store(struct kobject *kobj, - struct attribute *attr, const char *buf, size_t len) +static ssize_t version_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buff) { - struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj, - kobj); - struct ecryptfs_attribute *attribute = - container_of(attr, struct ecryptfs_attribute, attr); - - return (attribute->store ? attribute->store(obj, buf, len) : 0); + return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); } -static ssize_t -ecryptfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj, - kobj); - struct ecryptfs_attribute *attribute = - container_of(attr, struct ecryptfs_attribute, attr); - - return (attribute->show ? attribute->show(obj, buf) : 0); -} +static struct kobj_attribute version_attr = __ATTR_RO(version); -static struct sysfs_ops ecryptfs_sysfs_ops = { - .show = ecryptfs_attr_show, - .store = ecryptfs_attr_store +static struct attribute *attributes[] = { + &version_attr.attr, + NULL, }; -static struct kobj_type ecryptfs_ktype = { - .sysfs_ops = &ecryptfs_sysfs_ops +static struct attribute_group attr_group = { + .attrs = attributes, }; -static decl_subsys(ecryptfs, &ecryptfs_ktype, NULL); - -static ssize_t version_show(struct ecryptfs_obj *obj, char *buff) -{ - return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); -} - -static struct ecryptfs_attribute sysfs_attr_version = __ATTR_RO(version); - -static struct ecryptfs_version_str_map_elem { - u32 flag; - char *str; -} ecryptfs_version_str_map[] = { - {ECRYPTFS_VERSIONING_PASSPHRASE, "passphrase"}, - {ECRYPTFS_VERSIONING_PUBKEY, "pubkey"}, - {ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH, "plaintext passthrough"}, - {ECRYPTFS_VERSIONING_POLICY, "policy"}, - {ECRYPTFS_VERSIONING_XATTR, "metadata in extended attribute"}, - {ECRYPTFS_VERSIONING_MULTKEY, "multiple keys per file"} -}; - -static ssize_t version_str_show(struct ecryptfs_obj *obj, char *buff) -{ - int i; - int remaining = PAGE_SIZE; - int total_written = 0; - - buff[0] = '\0'; - for (i = 0; i < ARRAY_SIZE(ecryptfs_version_str_map); i++) { - int entry_size; - - if (!(ECRYPTFS_VERSIONING_MASK - & ecryptfs_version_str_map[i].flag)) - continue; - entry_size = strlen(ecryptfs_version_str_map[i].str); - if ((entry_size + 2) > remaining) - goto out; - memcpy(buff, ecryptfs_version_str_map[i].str, entry_size); - buff[entry_size++] = '\n'; - buff[entry_size] = '\0'; - buff += entry_size; - total_written += entry_size; - remaining -= entry_size; - } -out: - return total_written; -} - -static struct ecryptfs_attribute sysfs_attr_version_str = __ATTR_RO(version_str); - static int do_sysfs_registration(void) { int rc; - rc = subsystem_register(&ecryptfs_subsys); - if (rc) { - printk(KERN_ERR - "Unable to register ecryptfs sysfs subsystem\n"); - goto out; - } - rc = sysfs_create_file(&ecryptfs_subsys.kobj, - &sysfs_attr_version.attr); - if (rc) { - printk(KERN_ERR - "Unable to create ecryptfs version attribute\n"); - subsystem_unregister(&ecryptfs_subsys); + ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj); + if (!ecryptfs_kobj) { + printk(KERN_ERR "Unable to create ecryptfs kset\n"); + rc = -ENOMEM; goto out; } - rc = sysfs_create_file(&ecryptfs_subsys.kobj, - &sysfs_attr_version_str.attr); + rc = sysfs_create_group(ecryptfs_kobj, &attr_group); if (rc) { printk(KERN_ERR - "Unable to create ecryptfs version_str attribute\n"); - sysfs_remove_file(&ecryptfs_subsys.kobj, - &sysfs_attr_version.attr); - subsystem_unregister(&ecryptfs_subsys); - goto out; + "Unable to create ecryptfs version attributes\n"); + kobject_put(ecryptfs_kobj); } out: return rc; @@ -862,11 +775,8 @@ out: static void do_sysfs_unregistration(void) { - sysfs_remove_file(&ecryptfs_subsys.kobj, - &sysfs_attr_version.attr); - sysfs_remove_file(&ecryptfs_subsys.kobj, - &sysfs_attr_version_str.attr); - subsystem_unregister(&ecryptfs_subsys); + sysfs_remove_group(ecryptfs_kobj, &attr_group); + kobject_put(ecryptfs_kobj); } static int __init ecryptfs_init(void) @@ -894,7 +804,6 @@ static int __init ecryptfs_init(void) printk(KERN_ERR "Failed to register filesystem\n"); goto out_free_kmem_caches; } - kobj_set_kset_s(&ecryptfs_subsys, fs_subsys); rc = do_sysfs_registration(); if (rc) { printk(KERN_ERR "sysfs registration failed\n"); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0fca820..300324b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -482,8 +482,6 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) if (wbc->nr_to_write <= 0) break; } - if (!list_empty(&sb->s_more_io)) - wbc->more_io = 1; return; /* Leave any unwritten inodes on s_io */ } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 84f9f7d..e5e80d1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -744,9 +744,6 @@ static inline void unregister_fuseblk(void) } #endif -static decl_subsys(fuse, NULL, NULL); -static decl_subsys(connections, NULL, NULL); - static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo) { struct inode * inode = foo; @@ -791,32 +788,37 @@ static void fuse_fs_cleanup(void) kmem_cache_destroy(fuse_inode_cachep); } +static struct kobject *fuse_kobj; +static struct kobject *connections_kobj; + static int fuse_sysfs_init(void) { int err; - kobj_set_kset_s(&fuse_subsys, fs_subsys); - err = subsystem_register(&fuse_subsys); - if (err) + fuse_kobj = kobject_create_and_add("fuse", fs_kobj); + if (!fuse_kobj) { + err = -ENOMEM; goto out_err; + } - kobj_set_kset_s(&connections_subsys, fuse_subsys); - err = subsystem_register(&connections_subsys); - if (err) + connections_kobj = kobject_create_and_add("connections", fuse_kobj); + if (!connections_kobj) { + err = -ENOMEM; goto out_fuse_unregister; + } return 0; out_fuse_unregister: - subsystem_unregister(&fuse_subsys); + kobject_put(fuse_kobj); out_err: return err; } static void fuse_sysfs_cleanup(void) { - subsystem_unregister(&connections_subsys); - subsystem_unregister(&fuse_subsys); + kobject_put(connections_kobj); + kobject_put(fuse_kobj); } static int __init fuse_init(void) diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 04ad0ca..8fff110 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \ mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ - ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \ + ops_fstype.o ops_inode.o ops_super.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 93fa427..e4effc4 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -59,7 +59,6 @@ struct strip_mine { static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, u64 block, struct page *page) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; struct buffer_head *bh; int release = 0; @@ -95,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, set_buffer_uptodate(bh); if (!gfs2_is_jdata(ip)) mark_buffer_dirty(bh); - if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) + if (!gfs2_is_writeback(ip)) gfs2_trans_add_bh(ip->i_gl, bh, 0); if (release) { @@ -453,8 +452,8 @@ static inline void bmap_unlock(struct inode *inode, int create) * Returns: errno */ -int gfs2_block_map(struct inode *inode, u64 lblock, int create, - struct buffer_head *bh_map) +int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh_map, int create) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -470,6 +469,7 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create, unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; struct metapath mp; u64 size; + struct buffer_head *dibh = NULL; BUG_ON(maxlen == 0); @@ -500,6 +500,8 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create, error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out_fail; + dibh = bh; + get_bh(dibh); for (x = 0; x < end_of_metadata; x++) { lookup_block(ip, bh, x, &mp, create, &new, &dblock); @@ -518,13 +520,8 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create, if (boundary) set_buffer_boundary(bh_map); if (new) { - struct buffer_head *dibh; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); set_buffer_new(bh_map); goto out_brelse; } @@ -545,6 +542,8 @@ out_brelse: out_ok: error = 0; out_fail: + if (dibh) + brelse(dibh); bmap_unlock(inode, create); return error; } @@ -560,7 +559,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi BUG_ON(!new); bh.b_size = 1 << (inode->i_blkbits + 5); - ret = gfs2_block_map(inode, lblock, create, &bh); + ret = gfs2_block_map(inode, lblock, &bh, create); *extlen = bh.b_size >> inode->i_blkbits; *dblock = bh.b_blocknr; if (buffer_new(&bh)) @@ -684,7 +683,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, if (metadata) revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; - error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh); + error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); if (error) return error; @@ -786,7 +785,7 @@ out_rg_gunlock: out_rlist: gfs2_rlist_free(&rlist); out: - gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh); + gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); return error; } @@ -879,7 +878,6 @@ static int gfs2_block_truncate_page(struct address_space *mapping) { struct inode *inode = mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); loff_t from = inode->i_size; unsigned long index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); @@ -911,7 +909,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping) err = 0; if (!buffer_mapped(bh)) { - gfs2_get_block(inode, iblock, bh, 0); + gfs2_block_map(inode, iblock, bh, 0); /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) goto unlock; @@ -931,7 +929,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping) err = 0; } - if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) + if (!gfs2_is_writeback(ip)) gfs2_trans_add_bh(ip->i_gl, bh, 0); zero_user_page(page, offset, length, KM_USER0); @@ -1224,8 +1222,13 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, do_div(lblock_stop, bsize); } else { unsigned int shift = sdp->sd_sb.sb_bsize_shift; + u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift; lblock = offset >> shift; lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; + if (lblock_stop > end_of_file) { + *alloc_required = 1; + return 0; + } } for (; lblock < lblock_stop; lblock += extlen) { diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index ac2fd04..4e6cde2 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -15,7 +15,7 @@ struct gfs2_inode; struct page; int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); -int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh); +int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); int gfs2_truncatei(struct gfs2_inode *ip, u64 size); diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c index 3731ab0..e519919 100644 --- a/fs/gfs2/daemon.c +++ b/fs/gfs2/daemon.c @@ -83,56 +83,6 @@ int gfs2_recoverd(void *data) } /** - * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks - * @sdp: Pointer to GFS2 superblock - * - * Also, periodically check to make sure that we're using the most recent - * journal index. - */ - -int gfs2_logd(void *data) -{ - struct gfs2_sbd *sdp = data; - struct gfs2_holder ji_gh; - unsigned long t; - int need_flush; - - while (!kthread_should_stop()) { - /* Advance the log tail */ - - t = sdp->sd_log_flush_time + - gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; - - gfs2_ail1_empty(sdp, DIO_ALL); - gfs2_log_lock(sdp); - need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks); - gfs2_log_unlock(sdp); - if (need_flush || time_after_eq(jiffies, t)) { - gfs2_log_flush(sdp, NULL); - sdp->sd_log_flush_time = jiffies; - } - - /* Check for latest journal index */ - - t = sdp->sd_jindex_refresh_time + - gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ; - - if (time_after_eq(jiffies, t)) { - if (!gfs2_jindex_hold(sdp, &ji_gh)) - gfs2_glock_dq_uninit(&ji_gh); - sdp->sd_jindex_refresh_time = jiffies; - } - - t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; - if (freezing(current)) - refrigerator(); - schedule_timeout_interruptible(t); - } - - return 0; -} - -/** * gfs2_quotad - Write cached quota changes into the quota file * @sdp: Pointer to GFS2 superblock * diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h index 0de9b35..4be084f 100644 --- a/fs/gfs2/daemon.h +++ b/fs/gfs2/daemon.h @@ -12,7 +12,6 @@ int gfs2_glockd(void *data); int gfs2_recoverd(void *data); -int gfs2_logd(void *data); int gfs2_quotad(void *data); #endif /* __DAEMON_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 9949bb7..57e2ed9 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1876,7 +1876,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (error) goto out; - error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh); + error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); if (error) goto out_qs; @@ -1949,7 +1949,7 @@ out_rg_gunlock: gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); out_rlist: gfs2_rlist_free(&rlist); - gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh); + gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); out_qs: gfs2_quota_unhold(dip); out: diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c index aa8dbf3..f114ba2 100644 --- a/fs/gfs2/eaops.c +++ b/fs/gfs2/eaops.c @@ -56,46 +56,6 @@ unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name) return type; } -static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct inode *inode = &ip->i_inode; - int error = permission(inode, MAY_READ, NULL); - if (error) - return error; - - return gfs2_ea_get_i(ip, er); -} - -static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct inode *inode = &ip->i_inode; - - if (S_ISREG(inode->i_mode) || - (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) { - int error = permission(inode, MAY_WRITE, NULL); - if (error) - return error; - } else - return -EPERM; - - return gfs2_ea_set_i(ip, er); -} - -static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct inode *inode = &ip->i_inode; - - if (S_ISREG(inode->i_mode) || - (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) { - int error = permission(inode, MAY_WRITE, NULL); - if (error) - return error; - } else - return -EPERM; - - return gfs2_ea_remove_i(ip, er); -} - static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) { if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) && @@ -108,8 +68,6 @@ static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))) return -EOPNOTSUPP; - - return gfs2_ea_get_i(ip, er); } @@ -170,40 +128,10 @@ static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) return gfs2_ea_remove_i(ip, er); } -static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct inode *inode = &ip->i_inode; - int error = permission(inode, MAY_READ, NULL); - if (error) - return error; - - return gfs2_ea_get_i(ip, er); -} - -static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct inode *inode = &ip->i_inode; - int error = permission(inode, MAY_WRITE, NULL); - if (error) - return error; - - return gfs2_ea_set_i(ip, er); -} - -static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct inode *inode = &ip->i_inode; - int error = permission(inode, MAY_WRITE, NULL); - if (error) - return error; - - return gfs2_ea_remove_i(ip, er); -} - static const struct gfs2_eattr_operations gfs2_user_eaops = { - .eo_get = user_eo_get, - .eo_set = user_eo_set, - .eo_remove = user_eo_remove, + .eo_get = gfs2_ea_get_i, + .eo_set = gfs2_ea_set_i, + .eo_remove = gfs2_ea_remove_i, .eo_name = "user", }; @@ -215,9 +143,9 @@ const struct gfs2_eattr_operations gfs2_system_eaops = { }; static const struct gfs2_eattr_operations gfs2_security_eaops = { - .eo_get = security_eo_get, - .eo_set = security_eo_set, - .eo_remove = security_eo_remove, + .eo_get = gfs2_ea_get_i, + .eo_set = gfs2_ea_set_i, + .eo_remove = gfs2_ea_remove_i, .eo_name = "security", }; diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index 2a7435b..bee9970 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -1418,7 +1418,7 @@ out: static int ea_dealloc_block(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; struct gfs2_rgrpd *rgd; struct buffer_head *dibh; int error; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a37efe4..80e09c5 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -217,7 +217,6 @@ int gfs2_glock_put(struct gfs2_glock *gl) if (atomic_dec_and_test(&gl->gl_ref)) { hlist_del(&gl->gl_list); write_unlock(gl_lock_addr(gl->gl_hash)); - BUG_ON(spin_is_locked(&gl->gl_spin)); gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED); gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); gfs2_assert(sdp, list_empty(&gl->gl_holders)); @@ -346,7 +345,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_object = NULL; gl->gl_sbd = sdp; gl->gl_aspace = NULL; - lops_init_le(&gl->gl_le, &gfs2_glock_lops); INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); /* If this glock protects actual on-disk data or metadata blocks, @@ -461,7 +459,6 @@ static void wait_on_holder(struct gfs2_holder *gh) static void gfs2_demote_wake(struct gfs2_glock *gl) { - BUG_ON(!spin_is_locked(&gl->gl_spin)); gl->gl_demote_state = LM_ST_EXCLUSIVE; clear_bit(GLF_DEMOTE, &gl->gl_flags); smp_mb__after_clear_bit(); @@ -507,21 +504,12 @@ static int rq_mutex(struct gfs2_holder *gh) static int rq_promote(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { if (list_empty(&gl->gl_holders)) { gl->gl_req_gh = gh; set_bit(GLF_LOCK, &gl->gl_flags); spin_unlock(&gl->gl_spin); - - if (atomic_read(&sdp->sd_reclaim_count) > - gfs2_tune_get(sdp, gt_reclaim_limit) && - !(gh->gh_flags & LM_FLAG_PRIORITY)) { - gfs2_reclaim_glock(sdp); - gfs2_reclaim_glock(sdp); - } - gfs2_glock_xmote_th(gh->gh_gl, gh); spin_lock(&gl->gl_spin); } @@ -567,7 +555,10 @@ static int rq_demote(struct gfs2_glock *gl) gfs2_demote_wake(gl); return 0; } + set_bit(GLF_LOCK, &gl->gl_flags); + set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + if (gl->gl_demote_state == LM_ST_UNLOCKED || gl->gl_state != LM_ST_EXCLUSIVE) { spin_unlock(&gl->gl_spin); @@ -576,7 +567,9 @@ static int rq_demote(struct gfs2_glock *gl) spin_unlock(&gl->gl_spin); gfs2_glock_xmote_th(gl, NULL); } + spin_lock(&gl->gl_spin); + clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); return 0; } @@ -598,23 +591,18 @@ static void run_queue(struct gfs2_glock *gl) if (!list_empty(&gl->gl_waiters1)) { gh = list_entry(gl->gl_waiters1.next, struct gfs2_holder, gh_list); - - if (test_bit(HIF_MUTEX, &gh->gh_iflags)) - blocked = rq_mutex(gh); - else - gfs2_assert_warn(gl->gl_sbd, 0); - + blocked = rq_mutex(gh); } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { blocked = rq_demote(gl); + if (gl->gl_waiters2 && !blocked) { + set_bit(GLF_DEMOTE, &gl->gl_flags); + gl->gl_demote_state = LM_ST_UNLOCKED; + } + gl->gl_waiters2 = 0; } else if (!list_empty(&gl->gl_waiters3)) { gh = list_entry(gl->gl_waiters3.next, struct gfs2_holder, gh_list); - - if (test_bit(HIF_PROMOTE, &gh->gh_iflags)) - blocked = rq_promote(gh); - else - gfs2_assert_warn(gl->gl_sbd, 0); - + blocked = rq_promote(gh); } else break; @@ -632,27 +620,21 @@ static void run_queue(struct gfs2_glock *gl) static void gfs2_glmutex_lock(struct gfs2_glock *gl) { - struct gfs2_holder gh; - - gfs2_holder_init(gl, 0, 0, &gh); - set_bit(HIF_MUTEX, &gh.gh_iflags); - if (test_and_set_bit(HIF_WAIT, &gh.gh_iflags)) - BUG(); - spin_lock(&gl->gl_spin); if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + struct gfs2_holder gh; + + gfs2_holder_init(gl, 0, 0, &gh); + set_bit(HIF_WAIT, &gh.gh_iflags); list_add_tail(&gh.gh_list, &gl->gl_waiters1); + spin_unlock(&gl->gl_spin); + wait_on_holder(&gh); + gfs2_holder_uninit(&gh); } else { gl->gl_owner_pid = current->pid; gl->gl_ip = (unsigned long)__builtin_return_address(0); - clear_bit(HIF_WAIT, &gh.gh_iflags); - smp_mb(); - wake_up_bit(&gh.gh_iflags, HIF_WAIT); + spin_unlock(&gl->gl_spin); } - spin_unlock(&gl->gl_spin); - - wait_on_holder(&gh); - gfs2_holder_uninit(&gh); } /** @@ -691,7 +673,6 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl) gl->gl_owner_pid = 0; gl->gl_ip = 0; run_queue(gl); - BUG_ON(!spin_is_locked(&gl->gl_spin)); spin_unlock(&gl->gl_spin); } @@ -722,7 +703,10 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, } } else if (gl->gl_demote_state != LM_ST_UNLOCKED && gl->gl_demote_state != state) { - gl->gl_demote_state = LM_ST_UNLOCKED; + if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) + gl->gl_waiters2 = 1; + else + gl->gl_demote_state = LM_ST_UNLOCKED; } spin_unlock(&gl->gl_spin); } @@ -943,8 +927,8 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl) const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned int ret; - if (glops->go_drop_th) - glops->go_drop_th(gl); + if (glops->go_xmote_th) + glops->go_xmote_th(gl); gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); @@ -1156,8 +1140,6 @@ restart: return -EIO; } - set_bit(HIF_PROMOTE, &gh->gh_iflags); - spin_lock(&gl->gl_spin); add_to_queue(gh); run_queue(gl); @@ -1248,12 +1230,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh) list_del_init(&gh->gh_list); if (list_empty(&gl->gl_holders)) { - spin_unlock(&gl->gl_spin); - - if (glops->go_unlock) + if (glops->go_unlock) { + spin_unlock(&gl->gl_spin); glops->go_unlock(gh); - - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_spin); + } gl->gl_stamp = jiffies; } @@ -1910,8 +1891,6 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl) print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no"); print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no"); - print_dbg(gi, " le = %s\n", - (list_empty(&gl->gl_le.le_list)) ? "no" : "yes"); print_dbg(gi, " reclaim = %s\n", (list_empty(&gl->gl_reclaim)) ? "no" : "yes"); if (gl->gl_aspace) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 4670dcb..c663b7a 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -56,7 +56,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) bd = list_entry(head->next, struct gfs2_bufdata, bd_ail_gl_list); bh = bd->bd_bh; - gfs2_remove_from_ail(NULL, bd); + gfs2_remove_from_ail(bd); bd->bd_bh = NULL; bh->b_private = NULL; bd->bd_blkno = bh->b_blocknr; @@ -86,15 +86,10 @@ static void gfs2_pte_inval(struct gfs2_glock *gl) if (!ip || !S_ISREG(inode->i_mode)) return; - if (!test_bit(GIF_PAGED, &ip->i_flags)) - return; - unmap_shared_mapping_range(inode->i_mapping, 0, 0); - if (test_bit(GIF_SW_PAGED, &ip->i_flags)) set_bit(GLF_DIRTY, &gl->gl_flags); - clear_bit(GIF_SW_PAGED, &ip->i_flags); } /** @@ -143,44 +138,34 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags) static void inode_go_sync(struct gfs2_glock *gl) { struct gfs2_inode *ip = gl->gl_object; + struct address_space *metamapping = gl->gl_aspace->i_mapping; + int error; + + if (gl->gl_state != LM_ST_UNLOCKED) + gfs2_pte_inval(gl); + if (gl->gl_state != LM_ST_EXCLUSIVE) + return; if (ip && !S_ISREG(ip->i_inode.i_mode)) ip = NULL; if (test_bit(GLF_DIRTY, &gl->gl_flags)) { - if (ip && !gfs2_is_jdata(ip)) - filemap_fdatawrite(ip->i_inode.i_mapping); gfs2_log_flush(gl->gl_sbd, gl); - if (ip && gfs2_is_jdata(ip)) - filemap_fdatawrite(ip->i_inode.i_mapping); - gfs2_meta_sync(gl); + filemap_fdatawrite(metamapping); if (ip) { struct address_space *mapping = ip->i_inode.i_mapping; - int error = filemap_fdatawait(mapping); + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); mapping_set_error(mapping, error); } + error = filemap_fdatawait(metamapping); + mapping_set_error(metamapping, error); clear_bit(GLF_DIRTY, &gl->gl_flags); gfs2_ail_empty_gl(gl); } } /** - * inode_go_xmote_th - promote/demote a glock - * @gl: the glock - * @state: the requested state - * @flags: - * - */ - -static void inode_go_xmote_th(struct gfs2_glock *gl) -{ - if (gl->gl_state != LM_ST_UNLOCKED) - gfs2_pte_inval(gl); - if (gl->gl_state == LM_ST_EXCLUSIVE) - inode_go_sync(gl); -} - -/** * inode_go_xmote_bh - After promoting/demoting a glock * @gl: the glock * @@ -201,22 +186,6 @@ static void inode_go_xmote_bh(struct gfs2_glock *gl) } /** - * inode_go_drop_th - unlock a glock - * @gl: the glock - * - * Invoked from rq_demote(). - * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long) - * is being purged from our node's glock cache; we're dropping lock. - */ - -static void inode_go_drop_th(struct gfs2_glock *gl) -{ - gfs2_pte_inval(gl); - if (gl->gl_state == LM_ST_EXCLUSIVE) - inode_go_sync(gl); -} - -/** * inode_go_inval - prepare a inode glock to be released * @gl: the glock * @flags: @@ -234,10 +203,8 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) set_bit(GIF_INVALID, &ip->i_flags); } - if (ip && S_ISREG(ip->i_inode.i_mode)) { + if (ip && S_ISREG(ip->i_inode.i_mode)) truncate_inode_pages(ip->i_inode.i_mapping, 0); - clear_bit(GIF_PAGED, &ip->i_flags); - } } /** @@ -294,23 +261,6 @@ static int inode_go_lock(struct gfs2_holder *gh) } /** - * inode_go_unlock - operation done before an inode lock is unlocked by a - * process - * @gl: the glock - * @flags: - * - */ - -static void inode_go_unlock(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_inode *ip = gl->gl_object; - - if (ip) - gfs2_meta_cache_flush(ip); -} - -/** * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock * @gl: the glock * @@ -350,14 +300,14 @@ static void rgrp_go_unlock(struct gfs2_holder *gh) } /** - * trans_go_xmote_th - promote/demote the transaction glock + * trans_go_sync - promote/demote the transaction glock * @gl: the glock * @state: the requested state * @flags: * */ -static void trans_go_xmote_th(struct gfs2_glock *gl) +static void trans_go_sync(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; @@ -384,7 +334,6 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl) if (gl->gl_state != LM_ST_UNLOCKED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode)); j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head); @@ -402,24 +351,6 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl) } /** - * trans_go_drop_th - unlock the transaction glock - * @gl: the glock - * - * We want to sync the device even with localcaching. Remember - * that localcaching journal replay only marks buffers dirty. - */ - -static void trans_go_drop_th(struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - - if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - gfs2_meta_syncfs(sdp); - gfs2_log_shutdown(sdp); - } -} - -/** * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock * @gl: the glock * @@ -433,25 +364,21 @@ static int quota_go_demote_ok(struct gfs2_glock *gl) const struct gfs2_glock_operations gfs2_meta_glops = { .go_xmote_th = meta_go_sync, - .go_drop_th = meta_go_sync, .go_type = LM_TYPE_META, }; const struct gfs2_glock_operations gfs2_inode_glops = { - .go_xmote_th = inode_go_xmote_th, + .go_xmote_th = inode_go_sync, .go_xmote_bh = inode_go_xmote_bh, - .go_drop_th = inode_go_drop_th, .go_inval = inode_go_inval, .go_demote_ok = inode_go_demote_ok, .go_lock = inode_go_lock, - .go_unlock = inode_go_unlock, .go_type = LM_TYPE_INODE, .go_min_hold_time = HZ / 10, }; const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_xmote_th = meta_go_sync, - .go_drop_th = meta_go_sync, .go_inval = meta_go_inval, .go_demote_ok = rgrp_go_demote_ok, .go_lock = rgrp_go_lock, @@ -461,9 +388,8 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { }; const struct gfs2_glock_operations gfs2_trans_glops = { - .go_xmote_th = trans_go_xmote_th, + .go_xmote_th = trans_go_sync, .go_xmote_bh = trans_go_xmote_bh, - .go_drop_th = trans_go_drop_th, .go_type = LM_TYPE_NONDISK, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index eaddfb5..513aaf0 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -131,7 +131,6 @@ struct gfs2_bufdata { struct gfs2_glock_operations { void (*go_xmote_th) (struct gfs2_glock *gl); void (*go_xmote_bh) (struct gfs2_glock *gl); - void (*go_drop_th) (struct gfs2_glock *gl); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_demote_ok) (struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); @@ -141,10 +140,6 @@ struct gfs2_glock_operations { }; enum { - /* Actions */ - HIF_MUTEX = 0, - HIF_PROMOTE = 1, - /* States */ HIF_HOLDER = 6, HIF_FIRST = 7, @@ -171,6 +166,8 @@ enum { GLF_DEMOTE = 3, GLF_PENDING_DEMOTE = 4, GLF_DIRTY = 5, + GLF_DEMOTE_IN_PROGRESS = 6, + GLF_LFLUSH = 7, }; struct gfs2_glock { @@ -190,6 +187,7 @@ struct gfs2_glock { struct list_head gl_holders; struct list_head gl_waiters1; /* HIF_MUTEX */ struct list_head gl_waiters3; /* HIF_PROMOTE */ + int gl_waiters2; /* GIF_DEMOTE */ const struct gfs2_glock_operations *gl_ops; @@ -210,7 +208,6 @@ struct gfs2_glock { struct gfs2_sbd *gl_sbd; struct inode *gl_aspace; - struct gfs2_log_element gl_le; struct list_head gl_ail_list; atomic_t gl_ail_count; struct delayed_work gl_work; @@ -239,7 +236,6 @@ struct gfs2_alloc { enum { GIF_INVALID = 0, GIF_QD_LOCKED = 1, - GIF_PAGED = 2, GIF_SW_PAGED = 3, }; @@ -268,14 +264,10 @@ struct gfs2_inode { struct gfs2_glock *i_gl; /* Move into i_gh? */ struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ - struct gfs2_alloc i_alloc; + struct gfs2_alloc *i_alloc; u64 i_last_rg_alloc; - spinlock_t i_spin; struct rw_semaphore i_rw_mutex; - unsigned long i_last_pfault; - - struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT]; }; /* @@ -287,19 +279,12 @@ static inline struct gfs2_inode *GFS2_I(struct inode *inode) return container_of(inode, struct gfs2_inode, i_inode); } -/* To be removed? */ -static inline struct gfs2_sbd *GFS2_SB(struct inode *inode) +static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode) { return inode->i_sb->s_fs_info; } -enum { - GFF_DID_DIRECT_ALLOC = 0, - GFF_EXLOCK = 1, -}; - struct gfs2_file { - unsigned long f_flags; /* GFF_... */ struct mutex f_fl_mutex; struct gfs2_holder f_fl_gh; }; @@ -373,8 +358,17 @@ struct gfs2_ail { u64 ai_sync_gen; }; +struct gfs2_journal_extent { + struct list_head extent_list; + + unsigned int lblock; /* First logical block */ + u64 dblock; /* First disk block */ + u64 blocks; +}; + struct gfs2_jdesc { struct list_head jd_list; + struct list_head extent_list; struct inode *jd_inode; unsigned int jd_jid; @@ -421,13 +415,9 @@ struct gfs2_args { struct gfs2_tune { spinlock_t gt_spin; - unsigned int gt_ilimit; - unsigned int gt_ilimit_tries; - unsigned int gt_ilimit_min; unsigned int gt_demote_secs; /* Cache retention for unheld glock */ unsigned int gt_incore_log_blocks; unsigned int gt_log_flush_secs; - unsigned int gt_jindex_refresh_secs; /* Check for new journal index */ unsigned int gt_recoverd_secs; unsigned int gt_logd_secs; @@ -443,10 +433,8 @@ struct gfs2_tune { unsigned int gt_new_files_jdata; unsigned int gt_new_files_directio; unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ - unsigned int gt_lockdump_size; unsigned int gt_stall_secs; /* Detects trouble! */ unsigned int gt_complain_secs; - unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */ unsigned int gt_statfs_quantum; unsigned int gt_statfs_slow; }; @@ -539,7 +527,6 @@ struct gfs2_sbd { /* StatFS stuff */ spinlock_t sd_statfs_spin; - struct mutex sd_statfs_mutex; struct gfs2_statfs_change_host sd_statfs_master; struct gfs2_statfs_change_host sd_statfs_local; unsigned long sd_statfs_sync_time; @@ -602,20 +589,18 @@ struct gfs2_sbd { unsigned int sd_log_commited_databuf; unsigned int sd_log_commited_revoke; - unsigned int sd_log_num_gl; unsigned int sd_log_num_buf; unsigned int sd_log_num_revoke; unsigned int sd_log_num_rg; unsigned int sd_log_num_databuf; - struct list_head sd_log_le_gl; struct list_head sd_log_le_buf; struct list_head sd_log_le_revoke; struct list_head sd_log_le_rg; struct list_head sd_log_le_databuf; struct list_head sd_log_le_ordered; - unsigned int sd_log_blks_free; + atomic_t sd_log_blks_free; struct mutex sd_log_reserve_mutex; u64 sd_log_sequence; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 5f6dc32..728d3169 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -31,7 +31,6 @@ #include "log.h" #include "meta_io.h" #include "ops_address.h" -#include "ops_file.h" #include "ops_inode.h" #include "quota.h" #include "rgrp.h" @@ -132,15 +131,21 @@ static struct inode *gfs2_iget_skip(struct super_block *sb, void gfs2_set_iop(struct inode *inode) { + struct gfs2_sbd *sdp = GFS2_SB(inode); umode_t mode = inode->i_mode; if (S_ISREG(mode)) { inode->i_op = &gfs2_file_iops; - inode->i_fop = &gfs2_file_fops; - inode->i_mapping->a_ops = &gfs2_file_aops; + if (sdp->sd_args.ar_localflocks) + inode->i_fop = &gfs2_file_fops_nolock; + else + inode->i_fop = &gfs2_file_fops; } else if (S_ISDIR(mode)) { inode->i_op = &gfs2_dir_iops; - inode->i_fop = &gfs2_dir_fops; + if (sdp->sd_args.ar_localflocks) + inode->i_fop = &gfs2_dir_fops_nolock; + else + inode->i_fop = &gfs2_dir_fops; } else if (S_ISLNK(mode)) { inode->i_op = &gfs2_symlink_iops; } else { @@ -291,12 +296,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) di->di_entries = be32_to_cpu(str->di_entries); di->di_eattr = be64_to_cpu(str->di_eattr); - return 0; -} + if (S_ISREG(ip->i_inode.i_mode)) + gfs2_set_aops(&ip->i_inode); -static void gfs2_inode_bh(struct gfs2_inode *ip, struct buffer_head *bh) -{ - ip->i_cache[0] = bh; + return 0; } /** @@ -366,7 +369,8 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip) if (error) goto out_rg_gunlock; - gfs2_trans_add_gl(ip->i_gl); + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); + set_bit(GLF_LFLUSH, &ip->i_gl->gl_flags); gfs2_free_di(rgd, ip); @@ -707,9 +711,10 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); int error; - gfs2_alloc_get(dip); + if (gfs2_alloc_get(dip) == NULL) + return -ENOMEM; - dip->i_alloc.al_requested = RES_DINODE; + dip->i_alloc->al_requested = RES_DINODE; error = gfs2_inplace_reserve(dip); if (error) goto out; @@ -855,7 +860,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name); if (alloc_required < 0) - goto fail; + goto fail_quota_locks; if (alloc_required) { error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid); if (error) @@ -896,7 +901,7 @@ fail_end_trans: gfs2_trans_end(sdp); fail_ipreserv: - if (dip->i_alloc.al_rgd) + if (dip->i_alloc->al_rgd) gfs2_inplace_release(dip); fail_quota_locks: @@ -966,7 +971,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; int error; u64 generation; - struct buffer_head *bh=NULL; + struct buffer_head *bh = NULL; if (!name->len || name->len > GFS2_FNAMESIZE) return ERR_PTR(-ENAMETOOLONG); @@ -1003,8 +1008,6 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (IS_ERR(inode)) goto fail_gunlock2; - gfs2_inode_bh(GFS2_I(inode), bh); - error = gfs2_inode_refresh(GFS2_I(inode)); if (error) goto fail_gunlock2; @@ -1021,6 +1024,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock2; + if (bh) + brelse(bh); if (!inode) return ERR_PTR(-ENOMEM); return inode; @@ -1032,6 +1037,8 @@ fail_gunlock2: fail_gunlock: gfs2_glock_dq(ghs); fail: + if (bh) + brelse(bh); return ERR_PTR(error); } diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 351ac87..d446506 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -20,6 +20,18 @@ static inline int gfs2_is_jdata(const struct gfs2_inode *ip) return ip->i_di.di_flags & GFS2_DIF_JDATA; } +static inline int gfs2_is_writeback(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip); +} + +static inline int gfs2_is_ordered(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip); +} + static inline int gfs2_is_dir(const struct gfs2_inode *ip) { return S_ISDIR(ip->i_inode.i_mode); diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 41c5b04..f2efff4 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -67,6 +67,11 @@ static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir) memset(data, 0, 256); strncpy(data, data_arg, 255); + if (!strlen(data)) { + log_error("no mount options, (u)mount helpers not installed"); + return -EINVAL; + } + for (options = data; (x = strsep(&options, ":")); ) { if (!*x) continue; diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c index 1f7b038..2ebd374 100644 --- a/fs/gfs2/locking/dlm/plock.c +++ b/fs/gfs2/locking/dlm/plock.c @@ -89,15 +89,19 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name, op->info.number = name->ln_number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - op->info.owner = (__u64)(long) fl->fl_owner; if (fl->fl_lmops && fl->fl_lmops->fl_grant) { + /* fl_owner is lockd which doesn't distinguish + processes on the nfs client */ + op->info.owner = (__u64) fl->fl_pid; xop->callback = fl->fl_lmops->fl_grant; locks_init_lock(&xop->flc); locks_copy_lock(&xop->flc, fl); xop->fl = fl; xop->file = file; - } else + } else { + op->info.owner = (__u64)(long) fl->fl_owner; xop->callback = NULL; + } send_op(op); @@ -203,7 +207,10 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name, op->info.number = name->ln_number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - op->info.owner = (__u64)(long) fl->fl_owner; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; send_op(op); wait_event(recv_wq, (op->done != 0)); @@ -242,7 +249,10 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name, op->info.number = name->ln_number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - op->info.owner = (__u64)(long) fl->fl_owner; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; send_op(op); wait_event(recv_wq, (op->done != 0)); diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index ae9e6a2..a87b098 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -189,51 +189,39 @@ static struct kobj_type gdlm_ktype = { .sysfs_ops = &gdlm_attr_ops, }; -static struct kset gdlm_kset = { - .ktype = &gdlm_ktype, -}; +static struct kset *gdlm_kset; int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj) { int error; - error = kobject_set_name(&ls->kobj, "%s", "lock_module"); - if (error) { - log_error("can't set kobj name %d", error); - return error; - } - - ls->kobj.kset = &gdlm_kset; - ls->kobj.ktype = &gdlm_ktype; - ls->kobj.parent = fskobj; - - error = kobject_register(&ls->kobj); + ls->kobj.kset = gdlm_kset; + error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj, + "lock_module"); if (error) log_error("can't register kobj %d", error); + kobject_uevent(&ls->kobj, KOBJ_ADD); return error; } void gdlm_kobject_release(struct gdlm_ls *ls) { - kobject_unregister(&ls->kobj); + kobject_put(&ls->kobj); } int gdlm_sysfs_init(void) { - int error; - - kobject_set_name(&gdlm_kset.kobj, "lock_dlm"); - kobj_set_kset_s(&gdlm_kset, kernel_subsys); - error = kset_register(&gdlm_kset); - if (error) - printk("lock_dlm: cannot register kset %d\n", error); - - return error; + gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj); + if (!gdlm_kset) { + printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__); + return -ENOMEM; + } + return 0; } void gdlm_sysfs_exit(void) { - kset_unregister(&gdlm_kset); + kset_unregister(gdlm_kset); } diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c index bd938f0..521694f 100644 --- a/fs/gfs2/locking/dlm/thread.c +++ b/fs/gfs2/locking/dlm/thread.c @@ -273,18 +273,13 @@ static int gdlm_thread(void *data, int blist) struct gdlm_ls *ls = (struct gdlm_ls *) data; struct gdlm_lock *lp = NULL; uint8_t complete, blocking, submit, drop; - DECLARE_WAITQUEUE(wait, current); /* Only thread1 is allowed to do blocking callbacks since gfs may wait for a completion callback within a blocking cb. */ while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&ls->thread_wait, &wait); - if (no_work(ls, blist)) - schedule(); - remove_wait_queue(&ls->thread_wait, &wait); - set_current_state(TASK_RUNNING); + wait_event_interruptible(ls->thread_wait, + !no_work(ls, blist) || kthread_should_stop()); complete = blocking = submit = drop = 0; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 7df7024..161ab6f 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -16,6 +16,8 @@ #include <linux/crc32.h> #include <linux/lm_interface.h> #include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include "gfs2.h" #include "incore.h" @@ -68,14 +70,12 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, * */ -void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd) +void gfs2_remove_from_ail(struct gfs2_bufdata *bd) { bd->bd_ail = NULL; list_del_init(&bd->bd_ail_st_list); list_del_init(&bd->bd_ail_gl_list); atomic_dec(&bd->bd_gl->gl_ail_count); - if (mapping) - gfs2_meta_cache_flush(GFS2_I(mapping->host)); brelse(bd->bd_bh); } @@ -92,8 +92,6 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) struct buffer_head *bh; int retry; - BUG_ON(!spin_is_locked(&sdp->sd_log_lock)); - do { retry = 0; @@ -210,7 +208,7 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) gfs2_log_unlock(sdp); } -int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) +static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) { struct gfs2_ail *ai, *s; int ret; @@ -248,7 +246,7 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) bd = list_entry(head->prev, struct gfs2_bufdata, bd_ail_st_list); gfs2_assert(sdp, bd->bd_ail == ai); - gfs2_remove_from_ail(bd->bd_bh->b_page->mapping, bd); + gfs2_remove_from_ail(bd); } } @@ -303,7 +301,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) mutex_lock(&sdp->sd_log_reserve_mutex); gfs2_log_lock(sdp); - while(sdp->sd_log_blks_free <= (blks + reserved_blks)) { + while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) { gfs2_log_unlock(sdp); gfs2_ail1_empty(sdp, 0); gfs2_log_flush(sdp, NULL); @@ -312,7 +310,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) gfs2_ail1_start(sdp, 0); gfs2_log_lock(sdp); } - sdp->sd_log_blks_free -= blks; + atomic_sub(blks, &sdp->sd_log_blks_free); gfs2_log_unlock(sdp); mutex_unlock(&sdp->sd_log_reserve_mutex); @@ -332,27 +330,23 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) { gfs2_log_lock(sdp); - sdp->sd_log_blks_free += blks; + atomic_add(blks, &sdp->sd_log_blks_free); gfs2_assert_withdraw(sdp, - sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks); + atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); gfs2_log_unlock(sdp); up_read(&sdp->sd_log_flush_lock); } static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) { - struct inode *inode = sdp->sd_jdesc->jd_inode; - int error; - struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; - - bh_map.b_size = 1 << inode->i_blkbits; - error = gfs2_block_map(inode, lbn, 0, &bh_map); - if (error || !bh_map.b_blocknr) - printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error, - (unsigned long long)bh_map.b_blocknr, lbn); - gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr); - - return bh_map.b_blocknr; + struct gfs2_journal_extent *je; + + list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) { + if (lbn >= je->lblock && lbn < je->lblock + je->blocks) + return je->dblock + lbn - je->lblock; + } + + return -1; } /** @@ -561,8 +555,8 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) ail2_empty(sdp, new_tail); gfs2_log_lock(sdp); - sdp->sd_log_blks_free += dist; - gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks); + atomic_add(dist, &sdp->sd_log_blks_free); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); gfs2_log_unlock(sdp); sdp->sd_log_tail = new_tail; @@ -652,7 +646,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) get_bh(bh); gfs2_log_unlock(sdp); lock_buffer(bh); - if (test_clear_buffer_dirty(bh)) { + if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; submit_bh(WRITE, bh); } else { @@ -694,20 +688,16 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) * */ -void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) +void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) { struct gfs2_ail *ai; down_write(&sdp->sd_log_flush_lock); - if (gl) { - gfs2_log_lock(sdp); - if (list_empty(&gl->gl_le.le_list)) { - gfs2_log_unlock(sdp); - up_write(&sdp->sd_log_flush_lock); - return; - } - gfs2_log_unlock(sdp); + /* Log might have been flushed while we waited for the flush lock */ + if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) { + up_write(&sdp->sd_log_flush_lock); + return; } ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); @@ -739,7 +729,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) log_flush_commit(sdp); else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ gfs2_log_lock(sdp); - sdp->sd_log_blks_free--; /* Adjust for unreserved buffer */ + atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ gfs2_log_unlock(sdp); log_write_header(sdp, 0, PULL); } @@ -767,7 +757,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { unsigned int reserved; - unsigned int old; + unsigned int unused; gfs2_log_lock(sdp); @@ -779,14 +769,11 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); reserved = calc_reserved(sdp); - old = sdp->sd_log_blks_free; - sdp->sd_log_blks_free += tr->tr_reserved - - (reserved - sdp->sd_log_blks_reserved); - - gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old); - gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= + unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; + gfs2_assert_withdraw(sdp, unused >= 0); + atomic_add(unused, &sdp->sd_log_blks_free); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); - sdp->sd_log_blks_reserved = reserved; gfs2_log_unlock(sdp); @@ -825,7 +812,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) down_write(&sdp->sd_log_flush_lock); gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); - gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); @@ -838,7 +824,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL); - gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks); + gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); @@ -866,3 +852,42 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) } } + +/** + * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks + * @sdp: Pointer to GFS2 superblock + * + * Also, periodically check to make sure that we're using the most recent + * journal index. + */ + +int gfs2_logd(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t; + int need_flush; + + while (!kthread_should_stop()) { + /* Advance the log tail */ + + t = sdp->sd_log_flush_time + + gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; + + gfs2_ail1_empty(sdp, DIO_ALL); + gfs2_log_lock(sdp); + need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks); + gfs2_log_unlock(sdp); + if (need_flush || time_after_eq(jiffies, t)) { + gfs2_log_flush(sdp, NULL); + sdp->sd_log_flush_time = jiffies; + } + + t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; + if (freezing(current)) + refrigerator(); + schedule_timeout_interruptible(t); + } + + return 0; +} + diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index dae2824..7711528 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -48,8 +48,6 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); -int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags); - int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); void gfs2_log_incr_head(struct gfs2_sbd *sdp); @@ -57,11 +55,19 @@ void gfs2_log_incr_head(struct gfs2_sbd *sdp); struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, struct buffer_head *real); -void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); + +static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl) +{ + if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags)) + __gfs2_log_flush(sbd, gl); +} + void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); -void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd); +void gfs2_remove_from_ail(struct gfs2_bufdata *bd); void gfs2_log_shutdown(struct gfs2_sbd *sdp); void gfs2_meta_syncfs(struct gfs2_sbd *sdp); +int gfs2_logd(void *data); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 6c27cea..fae59d6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -87,6 +87,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, } bd->bd_ail = ai; list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); + clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); gfs2_log_unlock(sdp); unlock_buffer(bh); } @@ -124,49 +125,6 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type) return bh; } -static void __glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) -{ - struct gfs2_glock *gl; - struct gfs2_trans *tr = current->journal_info; - - tr->tr_touched = 1; - - gl = container_of(le, struct gfs2_glock, gl_le); - if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl))) - return; - - if (!list_empty(&le->le_list)) - return; - - gfs2_glock_hold(gl); - set_bit(GLF_DIRTY, &gl->gl_flags); - sdp->sd_log_num_gl++; - list_add(&le->le_list, &sdp->sd_log_le_gl); -} - -static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) -{ - gfs2_log_lock(sdp); - __glock_lo_add(sdp, le); - gfs2_log_unlock(sdp); -} - -static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) -{ - struct list_head *head = &sdp->sd_log_le_gl; - struct gfs2_glock *gl; - - while (!list_empty(head)) { - gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list); - list_del_init(&gl->gl_le.le_list); - sdp->sd_log_num_gl--; - - gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)); - gfs2_glock_put(gl); - } - gfs2_assert_warn(sdp, !sdp->sd_log_num_gl); -} - static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) { struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); @@ -182,7 +140,8 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) list_add(&bd->bd_list_tr, &tr->tr_list_buf); if (!list_empty(&le->le_list)) goto out; - __glock_lo_add(sdp, &bd->bd_gl->gl_le); + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); gfs2_meta_check(sdp, bd->bd_bh); gfs2_pin(sdp, bd->bd_bh); sdp->sd_log_num_buf++; @@ -556,17 +515,20 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) lock_buffer(bd->bd_bh); gfs2_log_lock(sdp); - if (!list_empty(&bd->bd_list_tr)) - goto out; - tr->tr_touched = 1; - if (gfs2_is_jdata(ip)) { - tr->tr_num_buf++; - list_add(&bd->bd_list_tr, &tr->tr_list_buf); + if (tr) { + if (!list_empty(&bd->bd_list_tr)) + goto out; + tr->tr_touched = 1; + if (gfs2_is_jdata(ip)) { + tr->tr_num_buf++; + list_add(&bd->bd_list_tr, &tr->tr_list_buf); + } } if (!list_empty(&le->le_list)) goto out; - __glock_lo_add(sdp, &bd->bd_gl->gl_le); + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); if (gfs2_is_jdata(ip)) { gfs2_pin(sdp, bd->bd_bh); tr->tr_num_databuf_new++; @@ -773,12 +735,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) } -const struct gfs2_log_operations gfs2_glock_lops = { - .lo_add = glock_lo_add, - .lo_after_commit = glock_lo_after_commit, - .lo_name = "glock", -}; - const struct gfs2_log_operations gfs2_buf_lops = { .lo_add = buf_lo_add, .lo_incore_commit = buf_lo_incore_commit, @@ -816,7 +772,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = { }; const struct gfs2_log_operations *gfs2_log_ops[] = { - &gfs2_glock_lops, &gfs2_databuf_lops, &gfs2_buf_lops, &gfs2_rg_lops, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 7ecfe0d..9c7765c 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -29,9 +29,8 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo) struct gfs2_inode *ip = foo; inode_init_once(&ip->i_inode); - spin_lock_init(&ip->i_spin); init_rwsem(&ip->i_rw_mutex); - memset(ip->i_cache, 0, sizeof(ip->i_cache)); + ip->i_alloc = NULL; } static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 4da4239..85aea27 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -50,6 +50,7 @@ static int gfs2_aspace_writepage(struct page *page, static const struct address_space_operations aspace_aops = { .writepage = gfs2_aspace_writepage, .releasepage = gfs2_releasepage, + .sync_page = block_sync_page, }; /** @@ -221,13 +222,14 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head **bhp) { *bhp = getbuf(gl, blkno, CREATE); - if (!buffer_uptodate(*bhp)) + if (!buffer_uptodate(*bhp)) { ll_rw_block(READ_META, 1, bhp); - if (flags & DIO_WAIT) { - int error = gfs2_meta_wait(gl->gl_sbd, *bhp); - if (error) { - brelse(*bhp); - return error; + if (flags & DIO_WAIT) { + int error = gfs2_meta_wait(gl->gl_sbd, *bhp); + if (error) { + brelse(*bhp); + return error; + } } } @@ -282,7 +284,7 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, return; } - bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL), + bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); bd->bd_bh = bh; bd->bd_gl = gl; @@ -317,7 +319,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int } if (bd) { if (bd->bd_ail) { - gfs2_remove_from_ail(NULL, bd); + gfs2_remove_from_ail(bd); bh->b_private = NULL; bd->bd_bh = NULL; bd->bd_blkno = bh->b_blocknr; @@ -358,32 +360,6 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) } /** - * gfs2_meta_cache_flush - get rid of any references on buffers for this inode - * @ip: The GFS2 inode - * - * This releases buffers that are in the most-recently-used array of - * blocks used for indirect block addressing for this inode. - */ - -void gfs2_meta_cache_flush(struct gfs2_inode *ip) -{ - struct buffer_head **bh_slot; - unsigned int x; - - spin_lock(&ip->i_spin); - - for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) { - bh_slot = &ip->i_cache[x]; - if (*bh_slot) { - brelse(*bh_slot); - *bh_slot = NULL; - } - } - - spin_unlock(&ip->i_spin); -} - -/** * gfs2_meta_indirect_buffer - Get a metadata buffer * @ip: The GFS2 inode * @height: The level of this buf in the metadata (indir addr) tree (if any) @@ -391,8 +367,6 @@ void gfs2_meta_cache_flush(struct gfs2_inode *ip) * @new: Non-zero if we may create a new buffer * @bhp: the buffer is returned here * - * Try to use the gfs2_inode's MRU metadata tree cache. - * * Returns: errno */ @@ -401,58 +375,25 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_glock *gl = ip->i_gl; - struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height; - int in_cache = 0; - - BUG_ON(!gl); - BUG_ON(!sdp); - - spin_lock(&ip->i_spin); - if (*bh_slot && (*bh_slot)->b_blocknr == num) { - bh = *bh_slot; - get_bh(bh); - in_cache = 1; - } - spin_unlock(&ip->i_spin); - - if (!bh) - bh = getbuf(gl, num, CREATE); - - if (!bh) - return -ENOBUFS; + struct buffer_head *bh; + int ret = 0; if (new) { - if (gfs2_assert_warn(sdp, height)) - goto err; - meta_prep_new(bh); + BUG_ON(height == 0); + bh = gfs2_meta_new(gl, num); gfs2_trans_add_bh(ip->i_gl, bh, 1); gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); } else { u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; - if (!buffer_uptodate(bh)) { - ll_rw_block(READ_META, 1, &bh); - if (gfs2_meta_wait(sdp, bh)) - goto err; + ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh); + if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { + brelse(bh); + ret = -EIO; } - if (gfs2_metatype_check(sdp, bh, mtype)) - goto err; - } - - if (!in_cache) { - spin_lock(&ip->i_spin); - if (*bh_slot) - brelse(*bh_slot); - *bh_slot = bh; - get_bh(bh); - spin_unlock(&ip->i_spin); } - *bhp = bh; - return 0; -err: - brelse(bh); - return -EIO; + return ret; } /** diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index b704822..73e3b1c 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -56,7 +56,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); -void gfs2_meta_cache_flush(struct gfs2_inode *ip); int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, int new, struct buffer_head **bhp); diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 9679f8b..38dbe99 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -20,6 +20,8 @@ #include <linux/swap.h> #include <linux/gfs2_ondisk.h> #include <linux/lm_interface.h> +#include <linux/backing-dev.h> +#include <linux/pagevec.h> #include "gfs2.h" #include "incore.h" @@ -32,7 +34,6 @@ #include "quota.h" #include "trans.h" #include "rgrp.h" -#include "ops_file.h" #include "super.h" #include "util.h" #include "glops.h" @@ -58,22 +59,6 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, } /** - * gfs2_get_block - Fills in a buffer head with details about a block - * @inode: The inode - * @lblock: The block number to look up - * @bh_result: The buffer head to return the result in - * @create: Non-zero if we may add block to the file - * - * Returns: errno - */ - -int gfs2_get_block(struct inode *inode, sector_t lblock, - struct buffer_head *bh_result, int create) -{ - return gfs2_block_map(inode, lblock, create, bh_result); -} - -/** * gfs2_get_block_noalloc - Fills in a buffer head with details about a block * @inode: The inode * @lblock: The block number to look up @@ -88,7 +73,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, { int error; - error = gfs2_block_map(inode, lblock, 0, bh_result); + error = gfs2_block_map(inode, lblock, bh_result, 0); if (error) return error; if (!buffer_mapped(bh_result)) @@ -99,20 +84,19 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, struct buffer_head *bh_result, int create) { - return gfs2_block_map(inode, lblock, 0, bh_result); + return gfs2_block_map(inode, lblock, bh_result, 0); } /** - * gfs2_writepage - Write complete page - * @page: Page to write + * gfs2_writepage_common - Common bits of writepage + * @page: The page to be written + * @wbc: The writeback control * - * Returns: errno - * - * Some of this is copied from block_write_full_page() although we still - * call it to do most of the work. + * Returns: 1 if writepage is ok, otherwise an error code or zero if no error. */ -static int gfs2_writepage(struct page *page, struct writeback_control *wbc) +static int gfs2_writepage_common(struct page *page, + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); @@ -120,41 +104,133 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc) loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; unsigned offset; - int error; - int done_trans = 0; + int ret = -EIO; - if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) { - unlock_page(page); - return -EIO; - } + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) + goto out; + ret = 0; if (current->journal_info) - goto out_ignore; - + goto redirty; /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_CACHE_SIZE-1); + offset = i_size & (PAGE_CACHE_SIZE-1); if (page->index > end_index || (page->index == end_index && !offset)) { page->mapping->a_ops->invalidatepage(page, 0); - unlock_page(page); - return 0; /* don't care */ + goto out; + } + return 1; +redirty: + redirty_page_for_writepage(wbc, page); +out: + unlock_page(page); + return 0; +} + +/** + * gfs2_writeback_writepage - Write page for writeback mappings + * @page: The page + * @wbc: The writeback control + * + */ + +static int gfs2_writeback_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret; + + ret = gfs2_writepage_common(page, wbc); + if (ret <= 0) + return ret; + + ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc); + if (ret == -EAGAIN) + ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc); + return ret; +} + +/** + * gfs2_ordered_writepage - Write page for ordered data files + * @page: The page to write + * @wbc: The writeback control + * + */ + +static int gfs2_ordered_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + int ret; + + ret = gfs2_writepage_common(page, wbc); + if (ret <= 0) + return ret; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); } + gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1); + return block_write_full_page(page, gfs2_get_block_noalloc, wbc); +} - if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) && - PageChecked(page)) { +/** + * __gfs2_jdata_writepage - The core of jdata writepage + * @page: The page to write + * @wbc: The writeback control + * + * This is shared between writepage and writepages and implements the + * core of the writepage operation. If a transaction is required then + * PageChecked will have been set and the transaction will have + * already been started before this is called. + */ + +static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (PageChecked(page)) { ClearPageChecked(page); - error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); - if (error) - goto out_ignore; if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); } gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); + } + return block_write_full_page(page, gfs2_get_block_noalloc, wbc); +} + +/** + * gfs2_jdata_writepage - Write complete page + * @page: Page to write + * + * Returns: errno + * + */ + +static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_sbd *sdp = GFS2_SB(inode); + int error; + int done_trans = 0; + + error = gfs2_writepage_common(page, wbc); + if (error <= 0) + return error; + + if (PageChecked(page)) { + if (wbc->sync_mode != WB_SYNC_ALL) + goto out_ignore; + error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); + if (error) + goto out_ignore; done_trans = 1; } - error = block_write_full_page(page, gfs2_get_block_noalloc, wbc); + error = __gfs2_jdata_writepage(page, wbc); if (done_trans) gfs2_trans_end(sdp); - gfs2_meta_cache_flush(ip); return error; out_ignore: @@ -164,29 +240,190 @@ out_ignore: } /** - * gfs2_writepages - Write a bunch of dirty pages back to disk + * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk * @mapping: The mapping to write * @wbc: Write-back control * - * For journaled files and/or ordered writes this just falls back to the - * kernel's default writepages path for now. We will probably want to change - * that eventually (i.e. when we look at allocate on flush). - * - * For the data=writeback case though we can already ignore buffer heads + * For the data=writeback case we can already ignore buffer heads * and write whole extents at once. This is a big reduction in the * number of I/O requests we send and the bmap calls we make in this case. */ -static int gfs2_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int gfs2_writeback_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); +} + +/** + * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages + * @mapping: The mapping + * @wbc: The writeback control + * @writepage: The writepage function to call for each page + * @pvec: The vector of pages + * @nr_pages: The number of pages to write + * + * Returns: non-zero if loop should terminate, zero otherwise + */ + +static int gfs2_write_jdata_pagevec(struct address_space *mapping, + struct writeback_control *wbc, + struct pagevec *pvec, + int nr_pages, pgoff_t end) { struct inode *inode = mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset = i_size & (PAGE_CACHE_SIZE-1); + unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); + struct backing_dev_info *bdi = mapping->backing_dev_info; + int i; + int ret; + + ret = gfs2_trans_begin(sdp, nrblocks, 0); + if (ret < 0) + return ret; + + for(i = 0; i < nr_pages; i++) { + struct page *page = pvec->pages[i]; + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + ret = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + /* Is the page fully outside i_size? (truncate in progress) */ + if (page->index > end_index || (page->index == end_index && !offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + continue; + } + + ret = __gfs2_jdata_writepage(page, wbc); + + if (ret || (--(wbc->nr_to_write) <= 0)) + ret = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + ret = 1; + } + + } + gfs2_trans_end(sdp); + return ret; +} + +/** + * gfs2_write_cache_jdata - Like write_cache_pages but different + * @mapping: The mapping to write + * @wbc: The writeback control + * @writepage: The writepage function to call + * @data: The data to pass to writepage + * + * The reason that we use our own function here is that we need to + * start transactions before we grab page locks. This allows us + * to get the ordering right. + */ + +static int gfs2_write_cache_jdata(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; + int scanned = 0; + int range_whole = 0; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } - if (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK && !gfs2_is_jdata(ip)) - return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + scanned = 1; + ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); + if (ret) + done = 1; + if (ret > 0) + ret = 0; + + pagevec_release(&pvec); + cond_resched(); + } + + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + return ret; +} + + +/** + * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: The writeback control + * + */ - return generic_writepages(mapping, wbc); +static int gfs2_jdata_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + int ret; + + ret = gfs2_write_cache_jdata(mapping, wbc); + if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { + gfs2_log_flush(sdp, ip->i_gl); + ret = gfs2_write_cache_jdata(mapping, wbc); + } + return ret; } /** @@ -231,62 +468,107 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) /** - * gfs2_readpage - readpage with locking - * @file: The file to read a page for. N.B. This may be NULL if we are - * reading an internal file. + * __gfs2_readpage - readpage + * @file: The file to read a page for * @page: The page to read * - * Returns: errno + * This is the core of gfs2's readpage. Its used by the internal file + * reading code as in that case we already hold the glock. Also its + * called by gfs2_readpage() once the required lock has been granted. + * */ -static int gfs2_readpage(struct file *file, struct page *page) +static int __gfs2_readpage(void *file, struct page *page) { struct gfs2_inode *ip = GFS2_I(page->mapping->host); struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); - struct gfs2_file *gf = NULL; - struct gfs2_holder gh; int error; - int do_unlock = 0; - - if (likely(file != &gfs2_internal_file_sentinel)) { - if (file) { - gf = file->private_data; - if (test_bit(GFF_EXLOCK, &gf->f_flags)) - /* gfs2_sharewrite_fault has grabbed the ip->i_gl already */ - goto skip_lock; - } - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh); - do_unlock = 1; - error = gfs2_glock_nq_atime(&gh); - if (unlikely(error)) - goto out_unlock; - } -skip_lock: if (gfs2_is_stuffed(ip)) { error = stuffed_readpage(ip, page); unlock_page(page); - } else - error = mpage_readpage(page, gfs2_get_block); + } else { + error = mpage_readpage(page, gfs2_block_map); + } if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = -EIO; + return -EIO; + + return error; +} + +/** + * gfs2_readpage - read a page of a file + * @file: The file to read + * @page: The page of the file + * + * This deals with the locking required. We use a trylock in order to + * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE + * in the event that we are unable to get the lock. + */ + +static int gfs2_readpage(struct file *file, struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct gfs2_holder gh; + int error; - if (do_unlock) { - gfs2_glock_dq_m(1, &gh); - gfs2_holder_uninit(&gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh); + error = gfs2_glock_nq_atime(&gh); + if (unlikely(error)) { + unlock_page(page); + goto out; } + error = __gfs2_readpage(file, page); + gfs2_glock_dq(&gh); out: - return error; -out_unlock: - unlock_page(page); + gfs2_holder_uninit(&gh); if (error == GLR_TRYFAILED) { - error = AOP_TRUNCATED_PAGE; yield(); + return AOP_TRUNCATED_PAGE; } - if (do_unlock) - gfs2_holder_uninit(&gh); - goto out; + return error; +} + +/** + * gfs2_internal_read - read an internal file + * @ip: The gfs2 inode + * @ra_state: The readahead state (or NULL for no readahead) + * @buf: The buffer to fill + * @pos: The file position + * @size: The amount to read + * + */ + +int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size) +{ + struct address_space *mapping = ip->i_inode.i_mapping; + unsigned long index = *pos / PAGE_CACHE_SIZE; + unsigned offset = *pos & (PAGE_CACHE_SIZE - 1); + unsigned copied = 0; + unsigned amt; + struct page *page; + void *p; + + do { + amt = size - copied; + if (offset + size > PAGE_CACHE_SIZE) + amt = PAGE_CACHE_SIZE - offset; + page = read_cache_page(mapping, index, __gfs2_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + p = kmap_atomic(page, KM_USER0); + memcpy(buf + copied, p + offset, amt); + kunmap_atomic(p, KM_USER0); + mark_page_accessed(page); + page_cache_release(page); + copied += amt; + index++; + offset = 0; + } while(copied < size); + (*pos) += size; + return size; } /** @@ -300,10 +582,9 @@ out_unlock: * Any I/O we ignore at this time will be done via readpage later. * 2. We don't handle stuffed files here we let readpage do the honours. * 3. mpage_readpages() does most of the heavy lifting in the common case. - * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places. - * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as - * well as read-ahead. + * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. */ + static int gfs2_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { @@ -311,42 +592,20 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_holder gh; - int ret = 0; - int do_unlock = 0; + int ret; - if (likely(file != &gfs2_internal_file_sentinel)) { - if (file) { - struct gfs2_file *gf = file->private_data; - if (test_bit(GFF_EXLOCK, &gf->f_flags)) - goto skip_lock; - } - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, - LM_FLAG_TRY_1CB|GL_ATIME, &gh); - do_unlock = 1; - ret = gfs2_glock_nq_atime(&gh); - if (ret == GLR_TRYFAILED) - goto out_noerror; - if (unlikely(ret)) - goto out_unlock; - } -skip_lock: + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); + ret = gfs2_glock_nq_atime(&gh); + if (unlikely(ret)) + goto out_uninit; if (!gfs2_is_stuffed(ip)) - ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block); - - if (do_unlock) { - gfs2_glock_dq_m(1, &gh); - gfs2_holder_uninit(&gh); - } -out: + ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) ret = -EIO; return ret; -out_noerror: - ret = 0; -out_unlock: - if (do_unlock) - gfs2_holder_uninit(&gh); - goto out; } /** @@ -382,20 +641,11 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (unlikely(error)) goto out_uninit; - error = -ENOMEM; - page = __grab_cache_page(mapping, index); - *pagep = page; - if (!page) - goto out_unlock; - gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); - error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); if (error) - goto out_putpage; - + goto out_unlock; - ip->i_alloc.al_requested = 0; if (alloc_required) { al = gfs2_alloc_get(ip); @@ -424,40 +674,47 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (error) goto out_trans_fail; + error = -ENOMEM; + page = __grab_cache_page(mapping, index); + *pagep = page; + if (unlikely(!page)) + goto out_endtrans; + if (gfs2_is_stuffed(ip)) { + error = 0; if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { error = gfs2_unstuff_dinode(ip, page); if (error == 0) goto prepare_write; - } else if (!PageUptodate(page)) + } else if (!PageUptodate(page)) { error = stuffed_readpage(ip, page); + } goto out; } prepare_write: - error = block_prepare_write(page, from, to, gfs2_get_block); - + error = block_prepare_write(page, from, to, gfs2_block_map); out: - if (error) { - gfs2_trans_end(sdp); + if (error == 0) + return 0; + + page_cache_release(page); + if (pos + len > ip->i_inode.i_size) + vmtruncate(&ip->i_inode, ip->i_inode.i_size); +out_endtrans: + gfs2_trans_end(sdp); out_trans_fail: - if (alloc_required) { - gfs2_inplace_release(ip); + if (alloc_required) { + gfs2_inplace_release(ip); out_qunlock: - gfs2_quota_unlock(ip); + gfs2_quota_unlock(ip); out_alloc_put: - gfs2_alloc_put(ip); - } -out_putpage: - page_cache_release(page); - if (pos + len > ip->i_inode.i_size) - vmtruncate(&ip->i_inode, ip->i_inode.i_size); + gfs2_alloc_put(ip); + } out_unlock: - gfs2_glock_dq_m(1, &ip->i_gh); + gfs2_glock_dq(&ip->i_gh); out_uninit: - gfs2_holder_uninit(&ip->i_gh); - } - + gfs2_holder_uninit(&ip->i_gh); return error; } @@ -565,7 +822,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh; - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; struct gfs2_dinode *di; unsigned int from = pos & (PAGE_CACHE_SIZE - 1); unsigned int to = from + len; @@ -585,19 +842,16 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, if (gfs2_is_stuffed(ip)) return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); - if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) + if (!gfs2_is_writeback(ip)) gfs2_page_add_databufs(ip, page, from, to); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (likely(ret >= 0)) { - copied = ret; - if ((pos + copied) > inode->i_size) { - di = (struct gfs2_dinode *)dibh->b_data; - ip->i_di.di_size = inode->i_size; - di->di_size = cpu_to_be64(inode->i_size); - mark_inode_dirty(inode); - } + if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) { + di = (struct gfs2_dinode *)dibh->b_data; + ip->i_di.di_size = inode->i_size; + di->di_size = cpu_to_be64(inode->i_size); + mark_inode_dirty(inode); } if (inode == sdp->sd_rindex) @@ -606,7 +860,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, brelse(dibh); gfs2_trans_end(sdp); failed: - if (al->al_requested) { + if (al) { gfs2_inplace_release(ip); gfs2_quota_unlock(ip); gfs2_alloc_put(ip); @@ -625,11 +879,7 @@ failed: static int gfs2_set_page_dirty(struct page *page) { - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); - - if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) - SetPageChecked(page); + SetPageChecked(page); return __set_page_dirty_buffers(page); } @@ -653,7 +903,7 @@ static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock) return 0; if (!gfs2_is_stuffed(ip)) - dblock = generic_block_bmap(mapping, lblock, gfs2_get_block); + dblock = generic_block_bmap(mapping, lblock, gfs2_block_map); gfs2_glock_dq_uninit(&i_gh); @@ -719,13 +969,9 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) { /* * Should we return an error here? I can't see that O_DIRECT for - * a journaled file makes any sense. For now we'll silently fall - * back to buffered I/O, likewise we do the same for stuffed - * files since they are (a) small and (b) unaligned. + * a stuffed file makes any sense. For now we'll silently fall + * back to buffered I/O */ - if (gfs2_is_jdata(ip)) - return 0; - if (gfs2_is_stuffed(ip)) return 0; @@ -836,9 +1082,23 @@ cannot_release: return 0; } -const struct address_space_operations gfs2_file_aops = { - .writepage = gfs2_writepage, - .writepages = gfs2_writepages, +static const struct address_space_operations gfs2_writeback_aops = { + .writepage = gfs2_writeback_writepage, + .writepages = gfs2_writeback_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .sync_page = block_sync_page, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, + .migratepage = buffer_migrate_page, +}; + +static const struct address_space_operations gfs2_ordered_aops = { + .writepage = gfs2_ordered_writepage, .readpage = gfs2_readpage, .readpages = gfs2_readpages, .sync_page = block_sync_page, @@ -849,5 +1109,34 @@ const struct address_space_operations gfs2_file_aops = { .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, .direct_IO = gfs2_direct_IO, + .migratepage = buffer_migrate_page, }; +static const struct address_space_operations gfs2_jdata_aops = { + .writepage = gfs2_jdata_writepage, + .writepages = gfs2_jdata_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .sync_page = block_sync_page, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .set_page_dirty = gfs2_set_page_dirty, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, +}; + +void gfs2_set_aops(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (gfs2_is_writeback(ip)) + inode->i_mapping->a_ops = &gfs2_writeback_aops; + else if (gfs2_is_ordered(ip)) + inode->i_mapping->a_ops = &gfs2_ordered_aops; + else if (gfs2_is_jdata(ip)) + inode->i_mapping->a_ops = &gfs2_jdata_aops; + else + BUG(); +} + diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h index fa1b5b3..5da2128 100644 --- a/fs/gfs2/ops_address.h +++ b/fs/gfs2/ops_address.h @@ -14,9 +14,10 @@ #include <linux/buffer_head.h> #include <linux/mm.h> -extern const struct address_space_operations gfs2_file_aops; -extern int gfs2_get_block(struct inode *inode, sector_t lblock, - struct buffer_head *bh_result, int create); extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); +extern int gfs2_internal_read(struct gfs2_inode *ip, + struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size); +extern void gfs2_set_aops(struct inode *inode); #endif /* __OPS_ADDRESS_DOT_H__ */ diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index bb11fd6..f4842f2 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -33,57 +33,12 @@ #include "lm.h" #include "log.h" #include "meta_io.h" -#include "ops_file.h" -#include "ops_vm.h" #include "quota.h" #include "rgrp.h" #include "trans.h" #include "util.h" #include "eaops.h" - -/* - * Most fields left uninitialised to catch anybody who tries to - * use them. f_flags set to prevent file_accessed() from touching - * any other part of this. Its use is purely as a flag so that we - * know (in readpage()) whether or not do to locking. - */ -struct file gfs2_internal_file_sentinel = { - .f_flags = O_NOATIME|O_RDONLY, -}; - -static int gfs2_read_actor(read_descriptor_t *desc, struct page *page, - unsigned long offset, unsigned long size) -{ - char *kaddr; - unsigned long count = desc->count; - - if (size > count) - size = count; - - kaddr = kmap(page); - memcpy(desc->arg.data, kaddr + offset, size); - kunmap(page); - - desc->count = count - size; - desc->written += size; - desc->arg.buf += size; - return size; -} - -int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, - char *buf, loff_t *pos, unsigned size) -{ - struct inode *inode = &ip->i_inode; - read_descriptor_t desc; - desc.written = 0; - desc.arg.data = buf; - desc.count = size; - desc.error = 0; - do_generic_mapping_read(inode->i_mapping, ra_state, - &gfs2_internal_file_sentinel, pos, &desc, - gfs2_read_actor); - return desc.written ? desc.written : desc.error; -} +#include "ops_address.h" /** * gfs2_llseek - seek to a location in a file @@ -214,7 +169,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) if (put_user(fsflags, ptr)) error = -EFAULT; - gfs2_glock_dq_m(1, &gh); + gfs2_glock_dq(&gh); gfs2_holder_uninit(&gh); return error; } @@ -291,7 +246,16 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) if (error) goto out; } - + if ((flags ^ new_flags) & GFS2_DIF_JDATA) { + if (flags & GFS2_DIF_JDATA) + gfs2_log_flush(sdp, ip->i_gl); + error = filemap_fdatawrite(inode->i_mapping); + if (error) + goto out; + error = filemap_fdatawait(inode->i_mapping); + if (error) + goto out; + } error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) goto out; @@ -303,6 +267,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) gfs2_dinode_out(ip, bh->b_data); brelse(bh); gfs2_set_inode_flags(inode); + gfs2_set_aops(inode); out_trans_end: gfs2_trans_end(sdp); out: @@ -338,6 +303,128 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -ENOTTY; } +/** + * gfs2_allocate_page_backing - Use bmap to allocate blocks + * @page: The (locked) page to allocate backing for + * + * We try to allocate all the blocks required for the page in + * one go. This might fail for various reasons, so we keep + * trying until all the blocks to back this page are allocated. + * If some of the blocks are already allocated, thats ok too. + */ + +static int gfs2_allocate_page_backing(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct buffer_head bh; + unsigned long size = PAGE_CACHE_SIZE; + u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(inode, lblock, &bh, 1); + if (!buffer_mapped(&bh)) + return -EIO; + size -= bh.b_size; + lblock += (bh.b_size >> inode->i_blkbits); + } while(size > 0); + return 0; +} + +/** + * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable + * @vma: The virtual memory area + * @page: The page which is about to become writable + * + * When the page becomes writable, we need to ensure that we have + * blocks allocated on disk to back that page. + */ + +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned long last_index; + u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits); + unsigned int data_blocks, ind_blocks, rblocks; + int alloc_required = 0; + struct gfs2_holder gh; + struct gfs2_alloc *al; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh); + ret = gfs2_glock_nq_atime(&gh); + if (ret) + goto out; + + set_bit(GIF_SW_PAGED, &ip->i_flags); + gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); + ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); + if (ret || !alloc_required) + goto out_unlock; + ret = -ENOMEM; + al = gfs2_alloc_get(ip); + if (al == NULL) + goto out_unlock; + + ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (ret) + goto out_alloc_put; + ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); + if (ret) + goto out_quota_unlock; + al->al_requested = data_blocks + ind_blocks; + ret = gfs2_inplace_reserve(ip); + if (ret) + goto out_quota_unlock; + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + ret = gfs2_trans_begin(sdp, rblocks, 0); + if (ret) + goto out_trans_fail; + + lock_page(page); + ret = -EINVAL; + last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; + if (page->index > last_index) + goto out_unlock_page; + ret = 0; + if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) + goto out_unlock_page; + if (gfs2_is_stuffed(ip)) { + ret = gfs2_unstuff_dinode(ip, page); + if (ret) + goto out_unlock_page; + } + ret = gfs2_allocate_page_backing(page); + +out_unlock_page: + unlock_page(page); + gfs2_trans_end(sdp); +out_trans_fail: + gfs2_inplace_release(ip); +out_quota_unlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&gh); +out: + gfs2_holder_uninit(&gh); + return ret; +} + +static struct vm_operations_struct gfs2_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = gfs2_page_mkwrite, +}; + /** * gfs2_mmap - @@ -360,14 +447,7 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) return error; } - /* This is VM_MAYWRITE instead of VM_WRITE because a call - to mprotect() can turn on VM_WRITE later. */ - - if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) == - (VM_MAYSHARE | VM_MAYWRITE)) - vma->vm_ops = &gfs2_vm_ops_sharewrite; - else - vma->vm_ops = &gfs2_vm_ops_private; + vma->vm_ops = &gfs2_vm_ops; gfs2_glock_dq_uninit(&i_gh); @@ -538,15 +618,6 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (__mandatory_lock(&ip->i_inode)) return -ENOLCK; - if (sdp->sd_args.ar_localflocks) { - if (IS_GETLK(cmd)) { - posix_test_lock(file, fl); - return 0; - } else { - return posix_lock_file_wait(file, fl); - } - } - if (cmd == F_CANCELLK) { /* Hack: */ cmd = F_SETLK; @@ -632,16 +703,12 @@ static void do_unflock(struct file *file, struct file_lock *fl) static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; if (__mandatory_lock(&ip->i_inode)) return -ENOLCK; - if (sdp->sd_args.ar_localflocks) - return flock_lock_file_wait(file, fl); - if (fl->fl_type == F_UNLCK) { do_unflock(file, fl); return 0; @@ -678,3 +745,27 @@ const struct file_operations gfs2_dir_fops = { .flock = gfs2_flock, }; +const struct file_operations gfs2_file_fops_nolock = { + .llseek = gfs2_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .setlease = gfs2_setlease, +}; + +const struct file_operations gfs2_dir_fops_nolock = { + .readdir = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, +}; + diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h deleted file mode 100644 index 7e5d8ec..0000000 --- a/fs/gfs2/ops_file.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_FILE_DOT_H__ -#define __OPS_FILE_DOT_H__ - -#include <linux/fs.h> -struct gfs2_inode; - -extern struct file gfs2_internal_file_sentinel; -extern int gfs2_internal_read(struct gfs2_inode *ip, - struct file_ra_state *ra_state, - char *buf, loff_t *pos, unsigned size); -extern void gfs2_set_inode_flags(struct inode *inode); -extern const struct file_operations gfs2_file_fops; -extern const struct file_operations gfs2_dir_fops; - -#endif /* __OPS_FILE_DOT_H__ */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 17de58e..43d511b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -21,6 +21,7 @@ #include "gfs2.h" #include "incore.h" +#include "bmap.h" #include "daemon.h" #include "glock.h" #include "glops.h" @@ -59,7 +60,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mutex_init(&sdp->sd_inum_mutex); spin_lock_init(&sdp->sd_statfs_spin); - mutex_init(&sdp->sd_statfs_mutex); spin_lock_init(&sdp->sd_rindex_spin); mutex_init(&sdp->sd_rindex_mutex); @@ -77,7 +77,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_log_lock); - INIT_LIST_HEAD(&sdp->sd_log_le_gl); INIT_LIST_HEAD(&sdp->sd_log_le_buf); INIT_LIST_HEAD(&sdp->sd_log_le_revoke); INIT_LIST_HEAD(&sdp->sd_log_le_rg); @@ -303,6 +302,67 @@ out: return error; } +/** + * map_journal_extents - create a reusable "extent" mapping from all logical + * blocks to all physical blocks for the given journal. This will save + * us time when writing journal blocks. Most journals will have only one + * extent that maps all their logical blocks. That's because gfs2.mkfs + * arranges the journal blocks sequentially to maximize performance. + * So the extent would map the first block for the entire file length. + * However, gfs2_jadd can happen while file activity is happening, so + * those journals may not be sequential. Less likely is the case where + * the users created their own journals by mounting the metafs and + * laying it out. But it's still possible. These journals might have + * several extents. + * + * TODO: This should be done in bigger chunks rather than one block at a time, + * but since it's only done at mount time, I'm not worried about the + * time it takes. + */ +static int map_journal_extents(struct gfs2_sbd *sdp) +{ + struct gfs2_jdesc *jd = sdp->sd_jdesc; + unsigned int lb; + u64 db, prev_db; /* logical block, disk block, prev disk block */ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_journal_extent *jext = NULL; + struct buffer_head bh; + int rc = 0; + + prev_db = 0; + + for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) { + bh.b_state = 0; + bh.b_blocknr = 0; + bh.b_size = 1 << ip->i_inode.i_blkbits; + rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0); + db = bh.b_blocknr; + if (rc || !db) { + printk(KERN_INFO "GFS2 journal mapping error %d: lb=" + "%u db=%llu\n", rc, lb, (unsigned long long)db); + break; + } + if (!prev_db || db != prev_db + 1) { + jext = kzalloc(sizeof(struct gfs2_journal_extent), + GFP_KERNEL); + if (!jext) { + printk(KERN_INFO "GFS2 error: out of memory " + "mapping journal extents.\n"); + rc = -ENOMEM; + break; + } + jext->dblock = db; + jext->lblock = lb; + jext->blocks = 1; + list_add_tail(&jext->extent_list, &jd->extent_list); + } else { + jext->blocks++; + } + prev_db = db; + } + return rc; +} + static int init_journal(struct gfs2_sbd *sdp, int undo) { struct gfs2_holder ji_gh; @@ -340,7 +400,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (sdp->sd_args.ar_spectator) { sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); - sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks; + atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); } else { if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { fs_err(sdp, "can't mount journal #%u\n", @@ -377,7 +437,10 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) sdp->sd_jdesc->jd_jid, error); goto fail_jinode_gh; } - sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks; + atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + + /* Map the extents for this journal's blocks */ + map_journal_extents(sdp); } if (sdp->sd_lockstruct.ls_first) { diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 291f0c7..9f71372 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -61,7 +61,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry, inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0); if (!IS_ERR(inode)) { gfs2_trans_end(sdp); - if (dip->i_alloc.al_rgd) + if (dip->i_alloc->al_rgd) gfs2_inplace_release(dip); gfs2_quota_unlock(dip); gfs2_alloc_put(dip); @@ -113,8 +113,18 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, if (inode && IS_ERR(inode)) return ERR_PTR(PTR_ERR(inode)); - if (inode) + if (inode) { + struct gfs2_glock *gl = GFS2_I(inode)->i_gl; + struct gfs2_holder gh; + int error; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) { + iput(inode); + return ERR_PTR(error); + } + gfs2_glock_dq_uninit(&gh); return d_splice_alias(inode, dentry); + } d_add(dentry, inode); return NULL; @@ -366,7 +376,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, } gfs2_trans_end(sdp); - if (dip->i_alloc.al_rgd) + if (dip->i_alloc->al_rgd) gfs2_inplace_release(dip); gfs2_quota_unlock(dip); gfs2_alloc_put(dip); @@ -442,7 +452,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) gfs2_assert_withdraw(sdp, !error); /* dip already pinned */ gfs2_trans_end(sdp); - if (dip->i_alloc.al_rgd) + if (dip->i_alloc->al_rgd) gfs2_inplace_release(dip); gfs2_quota_unlock(dip); gfs2_alloc_put(dip); @@ -548,7 +558,7 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, } gfs2_trans_end(sdp); - if (dip->i_alloc.al_rgd) + if (dip->i_alloc->al_rgd) gfs2_inplace_release(dip); gfs2_quota_unlock(dip); gfs2_alloc_put(dip); diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h index 34f0caa..fd8cee2 100644 --- a/fs/gfs2/ops_inode.h +++ b/fs/gfs2/ops_inode.h @@ -16,5 +16,11 @@ extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; extern const struct inode_operations gfs2_symlink_iops; extern const struct inode_operations gfs2_dev_iops; +extern const struct file_operations gfs2_file_fops; +extern const struct file_operations gfs2_dir_fops; +extern const struct file_operations gfs2_file_fops_nolock; +extern const struct file_operations gfs2_dir_fops_nolock; + +extern void gfs2_set_inode_flags(struct inode *inode); #endif /* __OPS_INODE_DOT_H__ */ diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 950f314..5e52421 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -487,7 +487,6 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) if (ip) { ip->i_flags = 0; ip->i_gl = NULL; - ip->i_last_pfault = jiffies; } return &ip->i_inode; } diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c deleted file mode 100644 index 927d739..0000000 --- a/fs/gfs2/ops_vm.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> - -#include "gfs2.h" -#include "incore.h" -#include "bmap.h" -#include "glock.h" -#include "inode.h" -#include "ops_vm.h" -#include "quota.h" -#include "rgrp.h" -#include "trans.h" -#include "util.h" - -static int gfs2_private_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct gfs2_inode *ip = GFS2_I(vma->vm_file->f_mapping->host); - - set_bit(GIF_PAGED, &ip->i_flags); - return filemap_fault(vma, vmf); -} - -static int alloc_page_backing(struct gfs2_inode *ip, struct page *page) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned long index = page->index; - u64 lblock = index << (PAGE_CACHE_SHIFT - - sdp->sd_sb.sb_bsize_shift); - unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift; - struct gfs2_alloc *al; - unsigned int data_blocks, ind_blocks; - unsigned int x; - int error; - - al = gfs2_alloc_get(ip); - - error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); - if (error) - goto out; - - error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); - if (error) - goto out_gunlock_q; - - gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); - - al->al_requested = data_blocks + ind_blocks; - - error = gfs2_inplace_reserve(ip); - if (error) - goto out_gunlock_q; - - error = gfs2_trans_begin(sdp, al->al_rgd->rd_length + - ind_blocks + RES_DINODE + - RES_STATFS + RES_QUOTA, 0); - if (error) - goto out_ipres; - - if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); - if (error) - goto out_trans; - } - - for (x = 0; x < blocks; ) { - u64 dblock; - unsigned int extlen; - int new = 1; - - error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen); - if (error) - goto out_trans; - - lblock += extlen; - x += extlen; - } - - gfs2_assert_warn(sdp, al->al_alloced); - -out_trans: - gfs2_trans_end(sdp); -out_ipres: - gfs2_inplace_release(ip); -out_gunlock_q: - gfs2_quota_unlock(ip); -out: - gfs2_alloc_put(ip); - return error; -} - -static int gfs2_sharewrite_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - struct file *file = vma->vm_file; - struct gfs2_file *gf = file->private_data; - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - struct gfs2_holder i_gh; - int alloc_required; - int error; - int ret = 0; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); - if (error) - goto out; - - set_bit(GIF_PAGED, &ip->i_flags); - set_bit(GIF_SW_PAGED, &ip->i_flags); - - error = gfs2_write_alloc_required(ip, - (u64)vmf->pgoff << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, &alloc_required); - if (error) { - ret = VM_FAULT_OOM; /* XXX: are these right? */ - goto out_unlock; - } - - set_bit(GFF_EXLOCK, &gf->f_flags); - ret = filemap_fault(vma, vmf); - clear_bit(GFF_EXLOCK, &gf->f_flags); - if (ret & VM_FAULT_ERROR) - goto out_unlock; - - if (alloc_required) { - /* XXX: do we need to drop page lock around alloc_page_backing?*/ - error = alloc_page_backing(ip, vmf->page); - if (error) { - /* - * VM_FAULT_LOCKED should always be the case for - * filemap_fault, but it may not be in a future - * implementation. - */ - if (ret & VM_FAULT_LOCKED) - unlock_page(vmf->page); - page_cache_release(vmf->page); - ret = VM_FAULT_OOM; - goto out_unlock; - } - set_page_dirty(vmf->page); - } - -out_unlock: - gfs2_glock_dq_uninit(&i_gh); -out: - return ret; -} - -struct vm_operations_struct gfs2_vm_ops_private = { - .fault = gfs2_private_fault, -}; - -struct vm_operations_struct gfs2_vm_ops_sharewrite = { - .fault = gfs2_sharewrite_fault, -}; - diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h deleted file mode 100644 index 4ae8f43..0000000 --- a/fs/gfs2/ops_vm.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_VM_DOT_H__ -#define __OPS_VM_DOT_H__ - -#include <linux/mm.h> - -extern struct vm_operations_struct gfs2_vm_ops_private; -extern struct vm_operations_struct gfs2_vm_ops_sharewrite; - -#endif /* __OPS_VM_DOT_H__ */ diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index addb51e..a08dabd 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -59,7 +59,6 @@ #include "super.h" #include "trans.h" #include "inode.h" -#include "ops_file.h" #include "ops_address.h" #include "util.h" @@ -274,10 +273,10 @@ static int bh_get(struct gfs2_quota_data *qd) } block = qd->qd_slot / sdp->sd_qc_per_block; - offset = qd->qd_slot % sdp->sd_qc_per_block;; + offset = qd->qd_slot % sdp->sd_qc_per_block; bh_map.b_size = 1 << ip->i_inode.i_blkbits; - error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map); + error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); if (error) goto fail; error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh); @@ -454,7 +453,7 @@ static void qdsb_put(struct gfs2_quota_data *qd) int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; struct gfs2_quota_data **qd = al->al_qd; int error; @@ -502,7 +501,7 @@ out: void gfs2_quota_unhold(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; unsigned int x; gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); @@ -646,7 +645,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, } if (!buffer_mapped(bh)) { - gfs2_get_block(inode, iblock, bh, 1); + gfs2_block_map(inode, iblock, bh, 1); if (!buffer_mapped(bh)) goto unlock; } @@ -793,11 +792,9 @@ static int do_glock(struct gfs2_quota_data *qd, int force_refresh, struct gfs2_holder i_gh; struct gfs2_quota_host q; char buf[sizeof(struct gfs2_quota)]; - struct file_ra_state ra_state; int error; struct gfs2_quota_lvb *qlvb; - file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping); restart: error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); if (error) @@ -820,8 +817,8 @@ restart: memset(buf, 0, sizeof(struct gfs2_quota)); pos = qd2offset(qd); - error = gfs2_internal_read(ip, &ra_state, buf, - &pos, sizeof(struct gfs2_quota)); + error = gfs2_internal_read(ip, NULL, buf, &pos, + sizeof(struct gfs2_quota)); if (error < 0) goto fail_gunlock; @@ -856,7 +853,7 @@ fail: int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; unsigned int x; int error = 0; @@ -924,7 +921,7 @@ static int need_sync(struct gfs2_quota_data *qd) void gfs2_quota_unlock(struct gfs2_inode *ip) { - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; struct gfs2_quota_data *qda[4]; unsigned int count = 0; unsigned int x; @@ -972,7 +969,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type) int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; struct gfs2_quota_data *qd; s64 value; unsigned int x; @@ -1016,10 +1013,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 uid, u32 gid) { - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; struct gfs2_quota_data *qd; unsigned int x; - unsigned int found = 0; if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) return; @@ -1032,7 +1028,6 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { do_qc(qd, change); - found++; } } } diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index beb6c7a..b249e29 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -391,7 +391,7 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea lblock = head->lh_blkno; gfs2_replay_incr_blk(sdp, &lblock); bh_map.b_size = 1 << ip->i_inode.i_blkbits; - error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map); + error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0); if (error) return error; if (!bh_map.b_blocknr) { @@ -504,13 +504,21 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) ro = 1; } else { - if (sdp->sd_vfs->s_flags & MS_RDONLY) - ro = 1; + if (sdp->sd_vfs->s_flags & MS_RDONLY) { + /* check if device itself is read-only */ + ro = bdev_read_only(sdp->sd_vfs->s_bdev); + if (!ro) { + fs_info(sdp, "recovery required on " + "read-only filesystem.\n"); + fs_info(sdp, "write access will be " + "enabled during recovery.\n"); + } + } } if (ro) { - fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n", - jd->jd_jid); + fs_warn(sdp, "jid=%u: Can't replay: read-only block " + "device\n", jd->jd_jid); error = -EROFS; goto fail_gunlock_tr; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 708c287..3552110 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -25,10 +25,10 @@ #include "rgrp.h" #include "super.h" #include "trans.h" -#include "ops_file.h" #include "util.h" #include "log.h" #include "inode.h" +#include "ops_address.h" #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) @@ -126,41 +126,43 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, * Return: the block number (bitmap buffer scope) that was found */ -static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer, - unsigned int buflen, u32 goal, - unsigned char old_state) +static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal, + unsigned char old_state) { - unsigned char *byte, *end, alloc; + unsigned char *byte; u32 blk = goal; - unsigned int bit; + unsigned int bit, bitlong; + unsigned long *plong, plong55; byte = buffer + (goal / GFS2_NBBY); + plong = (unsigned long *)(buffer + (goal / GFS2_NBBY)); bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; - end = buffer + buflen; - alloc = (old_state == GFS2_BLKST_FREE) ? 0x55 : 0; - - while (byte < end) { - /* If we're looking for a free block we can eliminate all - bitmap settings with 0x55, which represents four data - blocks in a row. If we're looking for a data block, we can - eliminate 0x00 which corresponds to four free blocks. */ - if ((*byte & 0x55) == alloc) { - blk += (8 - bit) >> 1; - - bit = 0; - byte++; - + bitlong = bit; +#if BITS_PER_LONG == 32 + plong55 = 0x55555555; +#else + plong55 = 0x5555555555555555; +#endif + while (byte < buffer + buflen) { + + if (bitlong == 0 && old_state == 0 && *plong == plong55) { + plong++; + byte += sizeof(unsigned long); + blk += sizeof(unsigned long) * GFS2_NBBY; continue; } - if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) return blk; - bit += GFS2_BIT_SIZE; if (bit >= 8) { bit = 0; byte++; } + bitlong += GFS2_BIT_SIZE; + if (bitlong >= sizeof(unsigned long) * 8) { + bitlong = 0; + plong++; + } blk++; } @@ -817,11 +819,9 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) { - struct gfs2_alloc *al = &ip->i_alloc; - - /* FIXME: Should assert that the correct locks are held here... */ - memset(al, 0, sizeof(*al)); - return al; + BUG_ON(ip->i_alloc != NULL); + ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL); + return ip->i_alloc; } /** @@ -1059,26 +1059,34 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) struct inode *inode = NULL; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; int flags = LM_FLAG_TRY; int skipped = 0; int loops = 0; - int error; + int error, rg_locked; /* Try recently successful rgrps */ rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc); while (rgd) { - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, - LM_FLAG_TRY, &al->al_rgd_gh); + rg_locked = 0; + + if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { + rg_locked = 1; + error = 0; + } else { + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_TRY, &al->al_rgd_gh); + } switch (error) { case 0: if (try_rgrp_fit(rgd, al)) goto out; if (rgd->rd_flags & GFS2_RDF_CHECK) inode = try_rgrp_unlink(rgd, last_unlinked); - gfs2_glock_dq_uninit(&al->al_rgd_gh); + if (!rg_locked) + gfs2_glock_dq_uninit(&al->al_rgd_gh); if (inode) return inode; rgd = recent_rgrp_next(rgd, 1); @@ -1098,15 +1106,23 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) begin = rgd = forward_rgrp_get(sdp); for (;;) { - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, - &al->al_rgd_gh); + rg_locked = 0; + + if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { + rg_locked = 1; + error = 0; + } else { + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, + &al->al_rgd_gh); + } switch (error) { case 0: if (try_rgrp_fit(rgd, al)) goto out; if (rgd->rd_flags & GFS2_RDF_CHECK) inode = try_rgrp_unlink(rgd, last_unlinked); - gfs2_glock_dq_uninit(&al->al_rgd_gh); + if (!rg_locked) + gfs2_glock_dq_uninit(&al->al_rgd_gh); if (inode) return inode; break; @@ -1158,7 +1174,7 @@ out: int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; struct inode *inode; int error = 0; u64 last_unlinked = NO_BLOCK; @@ -1204,7 +1220,7 @@ try_again: void gfs2_inplace_release(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) fs_warn(sdp, "al_alloced = %u, al_requested = %u " @@ -1213,7 +1229,8 @@ void gfs2_inplace_release(struct gfs2_inode *ip) al->al_line); al->al_rgd = NULL; - gfs2_glock_dq_uninit(&al->al_rgd_gh); + if (al->al_rgd_gh.gh_gl) + gfs2_glock_dq_uninit(&al->al_rgd_gh); if (ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); } @@ -1301,11 +1318,10 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone bitmaps, so we must search the originals for that. */ if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) - blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset, + blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset, bi->bi_len, goal, old_state); else - blk = gfs2_bitfit(rgd, - bi->bi_bh->b_data + bi->bi_offset, + blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset, bi->bi_len, goal, old_state); if (blk != BFITNOENT) break; @@ -1394,7 +1410,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, u64 gfs2_alloc_data(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; struct gfs2_rgrpd *rgd = al->al_rgd; u32 goal, blk; u64 block; @@ -1439,7 +1455,7 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip) u64 gfs2_alloc_meta(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_alloc *al = ip->i_alloc; struct gfs2_rgrpd *rgd = al->al_rgd; u32 goal, blk; u64 block; @@ -1485,7 +1501,7 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip) u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_alloc *al = &dip->i_alloc; + struct gfs2_alloc *al = dip->i_alloc; struct gfs2_rgrpd *rgd = al->al_rgd; u32 blk; u64 block; diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index b4c6adf..149bb16 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -32,7 +32,9 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); static inline void gfs2_alloc_put(struct gfs2_inode *ip) { - return; /* So we can see where ip->i_alloc is used */ + BUG_ON(ip->i_alloc == NULL); + kfree(ip->i_alloc); + ip->i_alloc = NULL; } int gfs2_inplace_reserve_i(struct gfs2_inode *ip, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index dd3e737..ef0562c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -51,13 +51,9 @@ void gfs2_tune_init(struct gfs2_tune *gt) { spin_lock_init(>->gt_spin); - gt->gt_ilimit = 100; - gt->gt_ilimit_tries = 3; - gt->gt_ilimit_min = 1; gt->gt_demote_secs = 300; gt->gt_incore_log_blocks = 1024; gt->gt_log_flush_secs = 60; - gt->gt_jindex_refresh_secs = 60; gt->gt_recoverd_secs = 60; gt->gt_logd_secs = 1; gt->gt_quotad_secs = 5; @@ -71,10 +67,8 @@ void gfs2_tune_init(struct gfs2_tune *gt) gt->gt_new_files_jdata = 0; gt->gt_new_files_directio = 0; gt->gt_max_readahead = 1 << 18; - gt->gt_lockdump_size = 131072; gt->gt_stall_secs = 600; gt->gt_complain_secs = 10; - gt->gt_reclaim_limit = 5000; gt->gt_statfs_quantum = 30; gt->gt_statfs_slow = 0; } @@ -393,6 +387,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) if (!jd) break; + INIT_LIST_HEAD(&jd->extent_list); jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL); if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { if (!jd->jd_inode) @@ -422,8 +417,9 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) void gfs2_jindex_free(struct gfs2_sbd *sdp) { - struct list_head list; + struct list_head list, *head; struct gfs2_jdesc *jd; + struct gfs2_journal_extent *jext; spin_lock(&sdp->sd_jindex_spin); list_add(&list, &sdp->sd_jindex_list); @@ -433,6 +429,14 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp) while (!list_empty(&list)) { jd = list_entry(list.next, struct gfs2_jdesc, jd_list); + head = &jd->extent_list; + while (!list_empty(head)) { + jext = list_entry(head->next, + struct gfs2_journal_extent, + extent_list); + list_del(&jext->extent_list); + kfree(jext); + } list_del(&jd->jd_list); iput(jd->jd_inode); kfree(jd); @@ -543,7 +547,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) if (error) return error; - gfs2_meta_cache_flush(ip); j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head); @@ -686,9 +689,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, if (error) return; - mutex_lock(&sdp->sd_statfs_mutex); gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); - mutex_unlock(&sdp->sd_statfs_mutex); spin_lock(&sdp->sd_statfs_spin); l_sc->sc_total += total; @@ -736,9 +737,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp) if (error) goto out_bh2; - mutex_lock(&sdp->sd_statfs_mutex); gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); - mutex_unlock(&sdp->sd_statfs_mutex); spin_lock(&sdp->sd_statfs_spin); m_sc->sc_total += l_sc->sc_total; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 06e0b77..eaa3b7b 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -32,7 +32,8 @@ spinlock_t gfs2_sys_margs_lock; static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) { - return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_vfs->s_id); + return snprintf(buf, PAGE_SIZE, "%u:%u\n", + MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev)); } static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) @@ -221,9 +222,7 @@ static struct kobj_type gfs2_ktype = { .sysfs_ops = &gfs2_attr_ops, }; -static struct kset gfs2_kset = { - .ktype = &gfs2_ktype, -}; +static struct kset *gfs2_kset; /* * display struct lm_lockstruct fields @@ -427,13 +426,11 @@ TUNE_ATTR_2(name, name##_store) TUNE_ATTR(demote_secs, 0); TUNE_ATTR(incore_log_blocks, 0); TUNE_ATTR(log_flush_secs, 0); -TUNE_ATTR(jindex_refresh_secs, 0); TUNE_ATTR(quota_warn_period, 0); TUNE_ATTR(quota_quantum, 0); TUNE_ATTR(atime_quantum, 0); TUNE_ATTR(max_readahead, 0); TUNE_ATTR(complain_secs, 0); -TUNE_ATTR(reclaim_limit, 0); TUNE_ATTR(statfs_slow, 0); TUNE_ATTR(new_files_jdata, 0); TUNE_ATTR(new_files_directio, 0); @@ -450,13 +447,11 @@ static struct attribute *tune_attrs[] = { &tune_attr_demote_secs.attr, &tune_attr_incore_log_blocks.attr, &tune_attr_log_flush_secs.attr, - &tune_attr_jindex_refresh_secs.attr, &tune_attr_quota_warn_period.attr, &tune_attr_quota_quantum.attr, &tune_attr_atime_quantum.attr, &tune_attr_max_readahead.attr, &tune_attr_complain_secs.attr, - &tune_attr_reclaim_limit.attr, &tune_attr_statfs_slow.attr, &tune_attr_quota_simul_sync.attr, &tune_attr_quota_cache_secs.attr, @@ -495,14 +490,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) { int error; - sdp->sd_kobj.kset = &gfs2_kset; - sdp->sd_kobj.ktype = &gfs2_ktype; - - error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name); - if (error) - goto fail; - - error = kobject_register(&sdp->sd_kobj); + sdp->sd_kobj.kset = gfs2_kset; + error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, + "%s", sdp->sd_table_name); if (error) goto fail; @@ -522,6 +512,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) if (error) goto fail_args; + kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); return 0; fail_args: @@ -531,7 +522,7 @@ fail_counters: fail_lockstruct: sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); fail_reg: - kobject_unregister(&sdp->sd_kobj); + kobject_put(&sdp->sd_kobj); fail: fs_err(sdp, "error %d adding sysfs files", error); return error; @@ -543,21 +534,22 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) sysfs_remove_group(&sdp->sd_kobj, &args_group); sysfs_remove_group(&sdp->sd_kobj, &counters_group); sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); - kobject_unregister(&sdp->sd_kobj); + kobject_put(&sdp->sd_kobj); } int gfs2_sys_init(void) { gfs2_sys_margs = NULL; spin_lock_init(&gfs2_sys_margs_lock); - kobject_set_name(&gfs2_kset.kobj, "gfs2"); - kobj_set_kset_s(&gfs2_kset, fs_subsys); - return kset_register(&gfs2_kset); + gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj); + if (!gfs2_kset) + return -ENOMEM; + return 0; } void gfs2_sys_uninit(void) { kfree(gfs2_sys_margs); - kset_unregister(&gfs2_kset); + kset_unregister(gfs2_kset); } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 717983e..73e5d92 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -114,11 +114,6 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) gfs2_log_flush(sdp, NULL); } -void gfs2_trans_add_gl(struct gfs2_glock *gl) -{ - lops_add(gl->gl_sbd, &gl->gl_le); -} - /** * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction * @gl: the glock the buffer belongs to diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index 043d5f4..e826f0d 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -30,7 +30,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, void gfs2_trans_end(struct gfs2_sbd *sdp); -void gfs2_trans_add_gl(struct gfs2_glock *gl); void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno); diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 31284c7..110dd35 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -61,7 +61,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) - goto free_tree; + goto free_inode; /* Load the header */ head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); @@ -99,11 +99,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke page_cache_release(page); return tree; - fail_page: +fail_page: page_cache_release(page); - free_tree: +free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; iput(tree->inode); +free_tree: kfree(tree); return NULL; } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 08ff6c7..038ed74 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -288,10 +288,12 @@ handle_t *journal_start(journal_t *journal, int nblocks) jbd_free_handle(handle); current->journal_info = NULL; handle = ERR_PTR(err); + goto out; } lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); +out: return handle; } diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index df25ecc..4dcc058 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -284,11 +284,11 @@ static struct dir_table_slot *find_index(struct inode *ip, u32 index, release_metapage(*mp); *mp = NULL; } - if (*mp == 0) { + if (!(*mp)) { *lblock = blkno; *mp = read_index_page(ip, blkno); } - if (*mp == 0) { + if (!(*mp)) { jfs_err("free_index: error reading directory table"); return NULL; } @@ -413,7 +413,8 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) } ip->i_size = PSIZE; - if ((mp = get_index_page(ip, 0)) == 0) { + mp = get_index_page(ip, 0); + if (!mp) { jfs_err("add_index: get_metapage failed!"); xtTruncate(tid, ip, 0, COMMIT_PWMAP); memcpy(&jfs_ip->i_dirtable, temp_table, @@ -461,7 +462,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) } else mp = read_index_page(ip, blkno); - if (mp == 0) { + if (!mp) { jfs_err("add_index: get/read_metapage failed!"); goto clean_up; } @@ -499,7 +500,7 @@ static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) dirtab_slot = find_index(ip, index, &mp, &lblock); - if (dirtab_slot == 0) + if (!dirtab_slot) return; dirtab_slot->flag = DIR_INDEX_FREE; @@ -526,7 +527,7 @@ static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, dirtab_slot = find_index(ip, index, mp, lblock); - if (dirtab_slot == 0) + if (!dirtab_slot) return; DTSaddress(dirtab_slot, bn); @@ -552,7 +553,7 @@ static int read_index(struct inode *ip, u32 index, struct dir_table_slot *slot; slot = find_index(ip, index, &mp, &lblock); - if (slot == 0) { + if (!slot) { return -EIO; } @@ -592,10 +593,8 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, struct component_name ciKey; struct super_block *sb = ip->i_sb; - ciKey.name = - (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), - GFP_NOFS); - if (ciKey.name == 0) { + ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS); + if (!ciKey.name) { rc = -ENOMEM; goto dtSearch_Exit2; } @@ -957,10 +956,8 @@ static int dtSplitUp(tid_t tid, smp = split->mp; sp = DT_PAGE(ip, smp); - key.name = - (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), - GFP_NOFS); - if (key.name == 0) { + key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS); + if (!key.name) { DT_PUTPAGE(smp); rc = -ENOMEM; goto dtSplitUp_Exit; diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h index 8561c6e..cdac2d5 100644 --- a/fs/jfs/jfs_dtree.h +++ b/fs/jfs/jfs_dtree.h @@ -74,7 +74,7 @@ struct idtentry { #define DTIHDRDATALEN 11 /* compute number of slots for entry */ -#define NDTINTERNAL(klen) ( ((4 + (klen)) + (15 - 1)) / 15 ) +#define NDTINTERNAL(klen) (DIV_ROUND_UP((4 + (klen)), 15)) /* @@ -133,7 +133,7 @@ struct dir_table_slot { ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) ) /* compute number of slots for entry */ -#define NDTLEAF_LEGACY(klen) ( ((2 + (klen)) + (15 - 1)) / 15 ) +#define NDTLEAF_LEGACY(klen) (DIV_ROUND_UP((2 + (klen)), 15)) #define NDTLEAF NDTINTERNAL diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 3870ba8..9bf29f7 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -381,7 +381,7 @@ int diRead(struct inode *ip) /* read the page of disk inode */ mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); - if (mp == 0) { + if (!mp) { jfs_err("diRead: read_metapage failed"); return -EIO; } @@ -654,7 +654,7 @@ int diWrite(tid_t tid, struct inode *ip) /* read the page of disk inode */ retry: mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); - if (mp == 0) + if (!mp) return -EIO; /* get the pointer to the disk inode */ diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 15a3974..325a967 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -208,6 +208,17 @@ static struct lmStat { } lmStat; #endif +static void write_special_inodes(struct jfs_log *log, + int (*writer)(struct address_space *)) +{ + struct jfs_sb_info *sbi; + + list_for_each_entry(sbi, &log->sb_list, log_list) { + writer(sbi->ipbmap->i_mapping); + writer(sbi->ipimap->i_mapping); + writer(sbi->direct_inode->i_mapping); + } +} /* * NAME: lmLog() @@ -935,22 +946,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) struct lrd lrd; int lsn; struct logsyncblk *lp; - struct jfs_sb_info *sbi; unsigned long flags; /* push dirty metapages out to disk */ if (hard_sync) - list_for_each_entry(sbi, &log->sb_list, log_list) { - filemap_fdatawrite(sbi->ipbmap->i_mapping); - filemap_fdatawrite(sbi->ipimap->i_mapping); - filemap_fdatawrite(sbi->direct_inode->i_mapping); - } + write_special_inodes(log, filemap_fdatawrite); else - list_for_each_entry(sbi, &log->sb_list, log_list) { - filemap_flush(sbi->ipbmap->i_mapping); - filemap_flush(sbi->ipimap->i_mapping); - filemap_flush(sbi->direct_inode->i_mapping); - } + write_special_inodes(log, filemap_flush); /* * forward syncpt @@ -1536,7 +1538,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait) { int i; struct tblock *target = NULL; - struct jfs_sb_info *sbi; /* jfs_write_inode may call us during read-only mount */ if (!log) @@ -1598,11 +1599,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait) if (wait < 2) return; - list_for_each_entry(sbi, &log->sb_list, log_list) { - filemap_fdatawrite(sbi->ipbmap->i_mapping); - filemap_fdatawrite(sbi->ipimap->i_mapping); - filemap_fdatawrite(sbi->direct_inode->i_mapping); - } + write_special_inodes(log, filemap_fdatawrite); /* * If there was recent activity, we may need to wait @@ -1611,6 +1608,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait) if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) { for (i = 0; i < 200; i++) { /* Too much? */ msleep(250); + write_special_inodes(log, filemap_fdatawrite); if (list_empty(&log->cqueue) && list_empty(&log->synclist)) break; @@ -2347,7 +2345,7 @@ int jfsIOWait(void *arg) do { spin_lock_irq(&log_redrive_lock); - while ((bp = log_redrive_list) != 0) { + while ((bp = log_redrive_list)) { log_redrive_list = bp->l_redrive_next; bp->l_redrive_next = NULL; spin_unlock_irq(&log_redrive_lock); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index f5cd8d3..d1e64f2 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -39,11 +39,11 @@ static struct { #endif #define metapage_locked(mp) test_bit(META_locked, &(mp)->flag) -#define trylock_metapage(mp) test_and_set_bit(META_locked, &(mp)->flag) +#define trylock_metapage(mp) test_and_set_bit_lock(META_locked, &(mp)->flag) static inline void unlock_metapage(struct metapage *mp) { - clear_bit(META_locked, &mp->flag); + clear_bit_unlock(META_locked, &mp->flag); wake_up(&mp->wait); } @@ -88,7 +88,7 @@ struct meta_anchor { }; #define mp_anchor(page) ((struct meta_anchor *)page_private(page)) -static inline struct metapage *page_to_mp(struct page *page, uint offset) +static inline struct metapage *page_to_mp(struct page *page, int offset) { if (!PagePrivate(page)) return NULL; @@ -153,7 +153,7 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *)) } #else -static inline struct metapage *page_to_mp(struct page *page, uint offset) +static inline struct metapage *page_to_mp(struct page *page, int offset) { return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL; } @@ -249,7 +249,7 @@ static inline void drop_metapage(struct page *page, struct metapage *mp) */ static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock, - unsigned int *len) + int *len) { int rc = 0; int xflag; @@ -352,25 +352,27 @@ static void metapage_write_end_io(struct bio *bio, int err) static int metapage_writepage(struct page *page, struct writeback_control *wbc) { struct bio *bio = NULL; - unsigned int block_offset; /* block offset of mp within page */ + int block_offset; /* block offset of mp within page */ struct inode *inode = page->mapping->host; - unsigned int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage; - unsigned int len; - unsigned int xlen; + int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage; + int len; + int xlen; struct metapage *mp; int redirty = 0; sector_t lblock; + int nr_underway = 0; sector_t pblock; sector_t next_block = 0; sector_t page_start; unsigned long bio_bytes = 0; unsigned long bio_offset = 0; - unsigned int offset; + int offset; page_start = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); + set_page_writeback(page); for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { mp = page_to_mp(page, offset); @@ -413,11 +415,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) if (!bio->bi_size) goto dump_bio; submit_bio(WRITE, bio); + nr_underway++; bio = NULL; - } else { - set_page_writeback(page); + } else inc_io(page); - } xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits; pblock = metapage_get_blocks(inode, lblock, &xlen); if (!pblock) { @@ -427,7 +428,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) continue; } set_bit(META_io, &mp->flag); - len = min(xlen, (uint) JFS_SBI(inode->i_sb)->nbperpage); + len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = inode->i_sb->s_bdev; @@ -449,12 +450,16 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) goto dump_bio; submit_bio(WRITE, bio); + nr_underway++; } if (redirty) redirty_page_for_writepage(wbc, page); unlock_page(page); + if (nr_underway == 0) + end_page_writeback(page); + return 0; add_failed: /* We should never reach here, since we're only adding one vec */ @@ -475,13 +480,13 @@ static int metapage_readpage(struct file *fp, struct page *page) { struct inode *inode = page->mapping->host; struct bio *bio = NULL; - unsigned int block_offset; - unsigned int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; + int block_offset; + int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; sector_t page_start; /* address of page in fs blocks */ sector_t pblock; - unsigned int xlen; + int xlen; unsigned int len; - unsigned int offset; + int offset; BUG_ON(!PageLocked(page)); page_start = (sector_t)page->index << @@ -530,7 +535,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask) { struct metapage *mp; int ret = 1; - unsigned int offset; + int offset; for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { mp = page_to_mp(page, offset); diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 644429a..7b698f2 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -147,7 +147,7 @@ int jfs_mount(struct super_block *sb) */ if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); - if (ipaimap2 == 0) { + if (!ipaimap2) { jfs_err("jfs_mount: Faild to read AGGREGATE_I"); rc = -EIO; goto errout35; diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index 7971f37..adcf92d 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb) /* * Wait for outstanding transactions to be written to log: */ - jfs_flush_journal(log, 2); + jfs_flush_journal(log, 1); /* * close fileset inode allocation map (aka fileset inode) @@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb) * * remove file system from log active file system list. */ - jfs_flush_journal(log, 2); + jfs_flush_journal(log, 1); /* * Make sure all metadata makes it to disk diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 4e0a849..f8718de 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1103,8 +1103,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, * Make sure dest inode number (if any) is what we think it is */ rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP); - if (rc == 0) { - if ((new_ip == 0) || (ino != new_ip->i_ino)) { + if (!rc) { + if ((!new_ip) || (ino != new_ip->i_ino)) { rc = -ESTALE; goto out3; } diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 71984ee..7f24a0b 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -172,7 +172,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) */ t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP) << L2BPERDMAP; - t32 = ((t64 + (BITSPERPAGE - 1)) / BITSPERPAGE) + 1 + 50; + t32 = DIV_ROUND_UP(t64, BITSPERPAGE) + 1 + 50; newFSCKSize = t32 << sbi->l2nbperpage; newFSCKAddress = newLogAddress - newFSCKSize; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 314bb4f..70a1400 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -598,6 +598,12 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",umask=%03o", sbi->umask); if (sbi->flag & JFS_NOINTEGRITY) seq_puts(seq, ",nointegrity"); + if (sbi->nls_tab) + seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset); + if (sbi->flag & JFS_ERR_CONTINUE) + seq_printf(seq, ",errors=continue"); + if (sbi->flag & JFS_ERR_PANIC) + seq_printf(seq, ",errors=panic"); #ifdef CONFIG_QUOTA if (sbi->flag & JFS_USRQUOTA) @@ -1605,7 +1605,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) if (S_ISLNK(inode->i_mode)) return -ELOOP; - if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE)) + if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE)) return -EISDIR; /* @@ -1620,7 +1620,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) return -EACCES; flag &= ~O_TRUNC; - } else if (IS_RDONLY(inode) && (flag & FMODE_WRITE)) + } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE)) return -EROFS; error = vfs_permission(nd, acc_mode); diff --git a/fs/namespace.c b/fs/namespace.c index 0608388..61bf376 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -41,8 +41,8 @@ static struct kmem_cache *mnt_cache __read_mostly; static struct rw_semaphore namespace_sem; /* /sys/fs */ -decl_subsys(fs, NULL, NULL); -EXPORT_SYMBOL_GPL(fs_subsys); +struct kobject *fs_kobj; +EXPORT_SYMBOL_GPL(fs_kobj); static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) { @@ -1861,10 +1861,9 @@ void __init mnt_init(void) if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", __FUNCTION__, err); - err = subsystem_register(&fs_subsys); - if (err) - printk(KERN_WARNING "%s: subsystem_register error: %d\n", - __FUNCTION__, err); + fs_kobj = kobject_create_and_add("fs", NULL); + if (!fs_kobj) + printk(KERN_WARNING "%s: kobj create error\n", __FUNCTION__); init_rootfs(); init_mount_tree(); } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2d116d2..f917fd2 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -388,8 +388,11 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, * Round the length of the data which was specified up to * the next multiple of XDR units and then compare that * against the length which was actually received. + * Note that when RPCSEC/GSS (for example) is used, the + * data buffer can be padded so dlen might be larger + * than required. It must never be smaller. */ - if (dlen != XDR_QUADLEN(len)*4) + if (dlen < XDR_QUADLEN(len)*4) return 0; if (args->count > max_blocksize) { diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 986f9b3..b86e365 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -313,8 +313,11 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, * Round the length of the data which was specified up to * the next multiple of XDR units and then compare that * against the length which was actually received. + * Note that when RPCSEC/GSS (for example) is used, the + * data buffer can be padded so dlen might be larger + * than required. It must never be smaller. */ - if (dlen != XDR_QUADLEN(len)*4) + if (dlen < XDR_QUADLEN(len)*4) return 0; rqstp->rq_vec[0].iov_base = (void*)p; diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 9fb8132f..4d4ce48 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -19,16 +19,17 @@ ocfs2-objs := \ ioctl.o \ journal.o \ localalloc.o \ + locks.o \ mmap.o \ namei.o \ + resize.o \ slot_map.o \ suballoc.o \ super.o \ symlink.o \ sysfile.o \ uptodate.o \ - ver.o \ - vote.o + ver.o obj-$(CONFIG_OCFS2_FS) += cluster/ obj-$(CONFIG_OCFS2_FS) += dlm/ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 23c8cda43..e6df06a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4731,7 +4731,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) mutex_lock(&data_alloc_inode->i_mutex); - status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1); + status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); if (status < 0) { mlog_errno(status); goto out_mutex; @@ -4753,7 +4753,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) out_unlock: brelse(data_alloc_bh); - ocfs2_meta_unlock(data_alloc_inode, 1); + ocfs2_inode_unlock(data_alloc_inode, 1); out_mutex: mutex_unlock(&data_alloc_inode->i_mutex); @@ -5077,7 +5077,7 @@ static int ocfs2_free_cached_items(struct ocfs2_super *osb, mutex_lock(&inode->i_mutex); - ret = ocfs2_meta_lock(inode, &di_bh, 1); + ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { mlog_errno(ret); goto out_mutex; @@ -5118,7 +5118,7 @@ out_journal: ocfs2_commit_trans(osb, handle); out_unlock: - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); brelse(di_bh); out_mutex: mutex_unlock(&inode->i_mutex); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 56f7790..bc7b4cb 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -26,6 +26,7 @@ #include <asm/byteorder.h> #include <linux/swap.h> #include <linux/pipe_fs_i.h> +#include <linux/mpage.h> #define MLOG_MASK_PREFIX ML_FILE_IO #include <cluster/masklog.h> @@ -139,7 +140,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, { int err = 0; unsigned int ext_flags; - u64 p_blkno, past_eof; + u64 max_blocks = bh_result->b_size >> inode->i_blkbits; + u64 p_blkno, count, past_eof; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, @@ -155,7 +157,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, goto bail; } - err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, + err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count, &ext_flags); if (err) { mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " @@ -164,6 +166,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, goto bail; } + if (max_blocks < count) + count = max_blocks; + /* * ocfs2 never allocates in this function - the only time we * need to use BH_New is when we're extending i_size on a file @@ -178,6 +183,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) map_bh(bh_result, inode->i_sb, p_blkno); + bh_result->b_size = count << inode->i_blkbits; + if (!ocfs2_sparse_alloc(osb)) { if (p_blkno == 0) { err = -EIO; @@ -210,7 +217,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, struct buffer_head *di_bh) { void *kaddr; - unsigned int size; + loff_t size; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { @@ -224,8 +231,9 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, if (size > PAGE_CACHE_SIZE || size > ocfs2_max_inline_data(inode->i_sb)) { ocfs2_error(inode->i_sb, - "Inode %llu has with inline data has bad size: %u", - (unsigned long long)OCFS2_I(inode)->ip_blkno, size); + "Inode %llu has with inline data has bad size: %Lu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)size); return -EROFS; } @@ -275,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page) mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); - ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); + ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); if (ret != 0) { if (ret == AOP_TRUNCATED_PAGE) unlock = 0; @@ -285,7 +293,7 @@ static int ocfs2_readpage(struct file *file, struct page *page) if (down_read_trylock(&oi->ip_alloc_sem) == 0) { ret = AOP_TRUNCATED_PAGE; - goto out_meta_unlock; + goto out_inode_unlock; } /* @@ -305,25 +313,16 @@ static int ocfs2_readpage(struct file *file, struct page *page) goto out_alloc; } - ret = ocfs2_data_lock_with_page(inode, 0, page); - if (ret != 0) { - if (ret == AOP_TRUNCATED_PAGE) - unlock = 0; - mlog_errno(ret); - goto out_alloc; - } - if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) ret = ocfs2_readpage_inline(inode, page); else ret = block_read_full_page(page, ocfs2_get_block); unlock = 0; - ocfs2_data_unlock(inode, 0); out_alloc: up_read(&OCFS2_I(inode)->ip_alloc_sem); -out_meta_unlock: - ocfs2_meta_unlock(inode, 0); +out_inode_unlock: + ocfs2_inode_unlock(inode, 0); out: if (unlock) unlock_page(page); @@ -331,6 +330,62 @@ out: return ret; } +/* + * This is used only for read-ahead. Failures or difficult to handle + * situations are safe to ignore. + * + * Right now, we don't bother with BH_Boundary - in-inode extent lists + * are quite large (243 extents on 4k blocks), so most inodes don't + * grow out to a tree. If need be, detecting boundary extents could + * trivially be added in a future version of ocfs2_get_block(). + */ +static int ocfs2_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + int ret, err = -EIO; + struct inode *inode = mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + loff_t start; + struct page *last; + + /* + * Use the nonblocking flag for the dlm code to avoid page + * lock inversion, but don't bother with retrying. + */ + ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); + if (ret) + return err; + + if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + ocfs2_inode_unlock(inode, 0); + return err; + } + + /* + * Don't bother with inline-data. There isn't anything + * to read-ahead in that case anyway... + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + goto out_unlock; + + /* + * Check whether a remote node truncated this file - we just + * drop out in that case as it's not worth handling here. + */ + last = list_entry(pages->prev, struct page, lru); + start = (loff_t)last->index << PAGE_CACHE_SHIFT; + if (start >= i_size_read(inode)) + goto out_unlock; + + err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + +out_unlock: + up_read(&oi->ip_alloc_sem); + ocfs2_inode_unlock(inode, 0); + + return err; +} + /* Note: Because we don't support holes, our allocation has * already happened (allocation writes zeros to the file data) * so we don't have to worry about ordered writes in @@ -452,7 +507,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) * accessed concurrently from multiple nodes. */ if (!INODE_JOURNAL(inode)) { - err = ocfs2_meta_lock(inode, NULL, 0); + err = ocfs2_inode_lock(inode, NULL, 0); if (err) { if (err != -ENOENT) mlog_errno(err); @@ -467,7 +522,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) if (!INODE_JOURNAL(inode)) { up_read(&OCFS2_I(inode)->ip_alloc_sem); - ocfs2_meta_unlock(inode, 0); + ocfs2_inode_unlock(inode, 0); } if (err) { @@ -638,34 +693,12 @@ static ssize_t ocfs2_direct_IO(int rw, if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { - /* - * We get PR data locks even for O_DIRECT. This - * allows concurrent O_DIRECT I/O but doesn't let - * O_DIRECT with extending and buffered zeroing writes - * race. If they did race then the buffered zeroing - * could be written back after the O_DIRECT I/O. It's - * one thing to tell people not to mix buffered and - * O_DIRECT writes, but expecting them to understand - * that file extension is also an implicit buffered - * write is too much. By getting the PR we force - * writeback of the buffered zeroing before - * proceeding. - */ - ret = ocfs2_data_lock(inode, 0); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - ocfs2_data_unlock(inode, 0); - } - ret = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io); -out: + mlog_exit(ret); return ret; } @@ -1754,7 +1787,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, struct buffer_head *di_bh = NULL; struct inode *inode = mapping->host; - ret = ocfs2_meta_lock(inode, &di_bh, 1); + ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { mlog_errno(ret); return ret; @@ -1769,30 +1802,22 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_data_lock(inode, 1); - if (ret) { - mlog_errno(ret); - goto out_fail; - } - ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); - goto out_fail_data; + goto out_fail; } brelse(di_bh); return 0; -out_fail_data: - ocfs2_data_unlock(inode, 1); out_fail: up_write(&OCFS2_I(inode)->ip_alloc_sem); brelse(di_bh); - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); return ret; } @@ -1908,15 +1933,15 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); - ocfs2_data_unlock(inode, 1); up_write(&OCFS2_I(inode)->ip_alloc_sem); - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); return ret; } const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, + .readpages = ocfs2_readpages, .writepage = ocfs2_writepage, .write_begin = ocfs2_write_begin, .write_end = ocfs2_write_end, diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index c903741..f136639 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, * information for this bh as it's not marked locally * uptodate. */ ret = -EIO; - brelse(bh); + put_bh(bh); } mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); @@ -256,7 +256,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, * for this bh as it's not marked locally * uptodate. */ status = -EIO; - brelse(bh); + put_bh(bh); bhs[i] = NULL; continue; } @@ -280,3 +280,64 @@ bail: mlog_exit(status); return status; } + +/* Check whether the blkno is the super block or one of the backups. */ +static void ocfs2_check_super_or_backup(struct super_block *sb, + sector_t blkno) +{ + int i; + u64 backup_blkno; + + if (blkno == OCFS2_SUPER_BLOCK_BLKNO) + return; + + for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { + backup_blkno = ocfs2_backup_super_blkno(sb, i); + if (backup_blkno == blkno) + return; + } + + BUG(); +} + +/* + * Write super block and backups doesn't need to collaborate with journal, + * so we don't need to lock ip_io_mutex and inode doesn't need to bea passed + * into this function. + */ +int ocfs2_write_super_or_backup(struct ocfs2_super *osb, + struct buffer_head *bh) +{ + int ret = 0; + + mlog_entry_void(); + + BUG_ON(buffer_jbd(bh)); + ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) { + ret = -EROFS; + goto out; + } + + lock_buffer(bh); + set_buffer_uptodate(bh); + + /* remove from dirty list before I/O. */ + clear_buffer_dirty(bh); + + get_bh(bh); /* for end_buffer_write_sync() */ + bh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, bh); + + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + ret = -EIO; + put_bh(bh); + } + +out: + mlog_exit(ret); + return ret; +} diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index 6cc2093..c2e7861 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -47,6 +47,8 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, int flags, struct inode *inode); +int ocfs2_write_super_or_backup(struct ocfs2_super *osb, + struct buffer_head *bh); #define OCFS2_BH_CACHED 1 #define OCFS2_BH_READAHEAD 8 diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 35397dd..e511339 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -35,7 +35,7 @@ #define O2HB_LIVE_THRESHOLD 2 /* number of equal samples to be seen as dead */ extern unsigned int o2hb_dead_threshold; -#define O2HB_DEFAULT_DEAD_THRESHOLD 7 +#define O2HB_DEFAULT_DEAD_THRESHOLD 31 /* Otherwise MAX_WRITE_TIMEOUT will be zero... */ #define O2HB_MIN_DEAD_THRESHOLD 2 #define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index a4882c8..23c732f 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -146,7 +146,7 @@ static struct kset mlog_kset = { .kobj = {.ktype = &mlog_ktype}, }; -int mlog_sys_init(struct kset *o2cb_subsys) +int mlog_sys_init(struct kset *o2cb_kset) { int i = 0; @@ -157,7 +157,7 @@ int mlog_sys_init(struct kset *o2cb_subsys) mlog_attr_ptrs[i] = NULL; kobject_set_name(&mlog_kset.kobj, "logmask"); - kobj_set_kset_s(&mlog_kset, *o2cb_subsys); + mlog_kset.kobj.kset = o2cb_kset; return kset_register(&mlog_kset); } diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index 64f6f37..a4b0773 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -28,96 +28,55 @@ #include <linux/module.h> #include <linux/kobject.h> #include <linux/sysfs.h> +#include <linux/fs.h> #include "ocfs2_nodemanager.h" #include "masklog.h" #include "sys.h" -struct o2cb_attribute { - struct attribute attr; - ssize_t (*show)(char *buf); - ssize_t (*store)(const char *buf, size_t count); -}; - -#define O2CB_ATTR(_name, _mode, _show, _store) \ -struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store) - -#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr) -static ssize_t o2cb_interface_revision_show(char *buf) +static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) { return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); } - -static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL); +static struct kobj_attribute attr_version = + __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL); static struct attribute *o2cb_attrs[] = { - &o2cb_attr_interface_revision.attr, + &attr_version.attr, NULL, }; -static ssize_t -o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer); -static ssize_t -o2cb_store(struct kobject * kobj, struct attribute * attr, - const char * buffer, size_t count); -static struct sysfs_ops o2cb_sysfs_ops = { - .show = o2cb_show, - .store = o2cb_store, +static struct attribute_group o2cb_attr_group = { + .attrs = o2cb_attrs, }; -static struct kobj_type o2cb_subsys_type = { - .default_attrs = o2cb_attrs, - .sysfs_ops = &o2cb_sysfs_ops, -}; - -/* gives us o2cb_subsys */ -static decl_subsys(o2cb, NULL, NULL); - -static ssize_t -o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer) -{ - struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr); - struct kset *sbs = to_kset(kobj); - - BUG_ON(sbs != &o2cb_subsys); - - if (o2cb_attr->show) - return o2cb_attr->show(buffer); - return -EIO; -} - -static ssize_t -o2cb_store(struct kobject * kobj, struct attribute * attr, - const char * buffer, size_t count) -{ - struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr); - struct kset *sbs = to_kset(kobj); - - BUG_ON(sbs != &o2cb_subsys); - - if (o2cb_attr->store) - return o2cb_attr->store(buffer, count); - return -EIO; -} +static struct kset *o2cb_kset; void o2cb_sys_shutdown(void) { mlog_sys_shutdown(); - subsystem_unregister(&o2cb_subsys); + kset_unregister(o2cb_kset); } int o2cb_sys_init(void) { int ret; - o2cb_subsys.kobj.ktype = &o2cb_subsys_type; - ret = subsystem_register(&o2cb_subsys); + o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj); + if (!o2cb_kset) + return -ENOMEM; + + ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); if (ret) - return ret; + goto error; - ret = mlog_sys_init(&o2cb_subsys); + ret = mlog_sys_init(o2cb_kset); if (ret) - subsystem_unregister(&o2cb_subsys); + goto error; + return 0; +error: + kset_unregister(o2cb_kset); return ret; } diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index da880fc..f36f66a 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -60,8 +60,8 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data, /* same as hb delay, we're waiting for another node to recognize our hb */ #define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000 -#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 5000 -#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 10000 +#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 +#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 /* TODO: figure this out.... */ diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 9606111..b2e832a 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -38,6 +38,12 @@ * locking semantics of the file system using the protocol. It should * be somewhere else, I'm sure, but right now it isn't. * + * New in version 10: + * - Meta/data locks combined + * + * New in version 9: + * - All votes removed + * * New in version 8: * - Replace delete inode votes with a cluster lock * @@ -60,7 +66,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 8ULL +#define O2NET_PROTOCOL_VERSION 10ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c index 7286c48..a56eee6 100644 --- a/fs/ocfs2/cluster/ver.c +++ b/fs/ocfs2/cluster/ver.c @@ -28,7 +28,7 @@ #include "ver.h" -#define CLUSTER_BUILD_VERSION "1.3.3" +#define CLUSTER_BUILD_VERSION "1.5.0" #define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 9923278..b1cc7c3 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -128,9 +128,9 @@ static int ocfs2_match_dentry(struct dentry *dentry, /* * Walk the inode alias list, and find a dentry which has a given * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it - * is looking for a dentry_lock reference. The vote thread is looking - * to unhash aliases, so we allow it to skip any that already have - * that property. + * is looking for a dentry_lock reference. The downconvert thread is + * looking to unhash aliases, so we allow it to skip any that already + * have that property. */ struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, @@ -266,7 +266,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, dl->dl_count = 0; /* * Does this have to happen below, for all attaches, in case - * the struct inode gets blown away by votes? + * the struct inode gets blown away by the downconvert thread? */ dl->dl_inode = igrab(inode); dl->dl_parent_blkno = parent_blkno; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 63b28fd..6b0107f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -846,14 +846,14 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) mlog_entry("dirino=%llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); - error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); + error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); if (lock_level && error >= 0) { /* We release EX lock which used to update atime * and get PR lock again to reduce contention * on commonly accessed directories. */ - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); lock_level = 0; - error = ocfs2_meta_lock(inode, NULL, 0); + error = ocfs2_inode_lock(inode, NULL, 0); } if (error < 0) { if (error != -ENOENT) @@ -865,7 +865,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, dirent, filldir, NULL); - ocfs2_meta_unlock(inode, lock_level); + ocfs2_inode_unlock(inode, lock_level); bail_nolock: mlog_exit(error); diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c index d2be3ad..a733b33 100644 --- a/fs/ocfs2/dlm/dlmfsver.c +++ b/fs/ocfs2/dlm/dlmfsver.c @@ -28,7 +28,7 @@ #include "dlmfsver.h" -#define DLM_BUILD_VERSION "1.3.3" +#define DLM_BUILD_VERSION "1.5.0" #define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 2fde7bf..91f747b 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2270,6 +2270,12 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) } } + /* Clean up join state on node death. */ + if (dlm->joining_node == idx) { + mlog(0, "Clearing join state for node %u\n", idx); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + } + /* check to see if the node is already considered dead */ if (!test_bit(idx, dlm->live_nodes_map)) { mlog(0, "for domain %s, node %d is already dead. " @@ -2288,12 +2294,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) clear_bit(idx, dlm->live_nodes_map); - /* Clean up join state on node death. */ - if (dlm->joining_node == idx) { - mlog(0, "Clearing join state for node %u\n", idx); - __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); - } - /* make sure local cleanup occurs before the heartbeat events */ if (!test_bit(idx, dlm->recovery_map)) dlm_do_local_recovery_cleanup(dlm, idx); @@ -2321,6 +2321,13 @@ void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) if (!dlm_grab(dlm)) return; + /* + * This will notify any dlm users that a node in our domain + * went away without notifying us first. + */ + if (test_bit(idx, dlm->domain_map)) + dlm_fire_domain_eviction_callbacks(dlm, idx); + spin_lock(&dlm->spinlock); __dlm_hb_node_down(dlm, idx); spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c index 7ef2653..dfc0da4 100644 --- a/fs/ocfs2/dlm/dlmver.c +++ b/fs/ocfs2/dlm/dlmver.c @@ -28,7 +28,7 @@ #include "dlmver.h" -#define DLM_BUILD_VERSION "1.3.3" +#define DLM_BUILD_VERSION "1.5.0" #define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4e97dcc..3867244 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -55,7 +55,6 @@ #include "slot_map.h" #include "super.h" #include "uptodate.h" -#include "vote.h" #include "buffer_head_io.h" @@ -69,6 +68,7 @@ struct ocfs2_mask_waiter { static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres); /* * Return value from ->downconvert_worker functions. @@ -153,10 +153,10 @@ struct ocfs2_lock_res_ops { struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *); /* - * Optionally called in the downconvert (or "vote") thread - * after a successful downconvert. The lockres will not be - * referenced after this callback is called, so it is safe to - * free memory, etc. + * Optionally called in the downconvert thread after a + * successful downconvert. The lockres will not be referenced + * after this callback is called, so it is safe to free + * memory, etc. * * The exact semantics of when this is called are controlled * by ->downconvert_worker() @@ -225,17 +225,12 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { .flags = 0, }; -static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { +static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = { .get_osb = ocfs2_get_inode_osb, .check_downconvert = ocfs2_check_meta_downconvert, .set_lvb = ocfs2_set_meta_lvb, - .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, -}; - -static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = { - .get_osb = ocfs2_get_inode_osb, .downconvert_worker = ocfs2_data_convert_worker, - .flags = 0, + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, }; static struct ocfs2_lock_res_ops ocfs2_super_lops = { @@ -258,10 +253,14 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_flock_lops = { + .get_osb = ocfs2_get_file_osb, + .flags = 0, +}; + static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) { return lockres->l_type == OCFS2_LOCK_TYPE_META || - lockres->l_type == OCFS2_LOCK_TYPE_DATA || lockres->l_type == OCFS2_LOCK_TYPE_RW || lockres->l_type == OCFS2_LOCK_TYPE_OPEN; } @@ -310,12 +309,24 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, "resource %s: %s\n", dlm_errname(_stat), _func, \ _lockres->l_name, dlm_errmsg(_stat)); \ } while (0) -static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, - struct ocfs2_lock_res *lockres); -static int ocfs2_meta_lock_update(struct inode *inode, +static int ocfs2_downconvert_thread(void *arg); +static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static int ocfs2_inode_lock_update(struct inode *inode, struct buffer_head **bh); static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); static inline int ocfs2_highest_compat_lock_level(int level); +static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static int ocfs2_downconvert_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int new_level, + int lvb); +static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static int ocfs2_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + static void ocfs2_build_lock_name(enum ocfs2_lock_type type, u64 blkno, @@ -402,10 +413,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, ops = &ocfs2_inode_rw_lops; break; case OCFS2_LOCK_TYPE_META: - ops = &ocfs2_inode_meta_lops; - break; - case OCFS2_LOCK_TYPE_DATA: - ops = &ocfs2_inode_data_lops; + ops = &ocfs2_inode_inode_lops; break; case OCFS2_LOCK_TYPE_OPEN: ops = &ocfs2_inode_open_lops; @@ -428,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres) return OCFS2_SB(inode->i_sb); } +static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_file_private *fp = lockres->l_priv; + + return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb); +} + static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres) { __be64 inode_blkno_be; @@ -508,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_rename_lops, osb); } +void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_file_private *fp) +{ + struct inode *inode = fp->fp_file->f_mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno, + inode->i_generation, lockres->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres, + OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops, + fp); + lockres->l_flags |= OCFS2_LOCK_NOCACHE; +} + void ocfs2_lock_res_free(struct ocfs2_lock_res *res) { mlog_entry_void(); @@ -724,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level) lockres->l_name, level, lockres->l_level, ocfs2_lock_type_string(lockres->l_type)); + /* + * We can skip the bast for locks which don't enable caching - + * they'll be dropped at the earliest possible time anyway. + */ + if (lockres->l_flags & OCFS2_LOCK_NOCACHE) + return; + spin_lock_irqsave(&lockres->l_lock, flags); needs_downconvert = ocfs2_generic_handle_bast(lockres, level); if (needs_downconvert) @@ -732,7 +769,7 @@ static void ocfs2_blocking_ast(void *opaque, int level) wake_up(&lockres->l_event); - ocfs2_kick_vote_thread(osb); + ocfs2_wake_downconvert_thread(osb); } static void ocfs2_locking_ast(void *opaque) @@ -935,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, } +static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw, + struct ocfs2_lock_res *lockres) +{ + int ret; + + ret = wait_for_completion_interruptible(&mw->mw_complete); + if (ret) + lockres_remove_mask_waiter(lockres, mw); + else + ret = mw->mw_status; + /* Re-arm the completion in case we want to wait on it again */ + INIT_COMPLETION(mw->mw_complete); + return ret; +} + static int ocfs2_cluster_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres, int level, @@ -1089,7 +1141,7 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb, mlog_entry_void(); spin_lock_irqsave(&lockres->l_lock, flags); ocfs2_dec_holders(lockres, level); - ocfs2_vote_on_unlock(osb, lockres); + ocfs2_downconvert_on_unlock(osb, lockres); spin_unlock_irqrestore(&lockres->l_lock, flags); mlog_exit_void(); } @@ -1147,13 +1199,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode) * We don't want to use LKM_LOCAL on a meta data lock as they * don't use a generation in their lock names. */ - ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0); - if (ret) { - mlog_errno(ret); - goto bail; - } - - ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1); + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0); if (ret) { mlog_errno(ret); goto bail; @@ -1311,76 +1357,221 @@ out: mlog_exit_void(); } -int ocfs2_data_lock_full(struct inode *inode, - int write, - int arg_flags) +static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres, + int level) { - int status = 0, level; - struct ocfs2_lock_res *lockres; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int ret; + struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); + unsigned long flags; + struct ocfs2_mask_waiter mw; - BUG_ON(!inode); + ocfs2_init_mask_waiter(&mw); - mlog_entry_void(); +retry_cancel: + spin_lock_irqsave(&lockres->l_lock, flags); + if (lockres->l_flags & OCFS2_LOCK_BUSY) { + ret = ocfs2_prepare_cancel_convert(osb, lockres); + if (ret) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = ocfs2_cancel_convert(osb, lockres); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + goto retry_cancel; + } + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); - mlog(0, "inode %llu take %s DATA lock\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - write ? "EXMODE" : "PRMODE"); + ocfs2_wait_for_mask(&mw); + goto retry_cancel; + } - /* We'll allow faking a readonly data lock for - * rodevices. */ - if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) { - if (write) { - status = -EROFS; - mlog_errno(status); + ret = -ERESTARTSYS; + /* + * We may still have gotten the lock, in which case there's no + * point to restarting the syscall. + */ + if (lockres->l_level == level) + ret = 0; + + mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret, + lockres->l_flags, lockres->l_level, lockres->l_action); + + spin_unlock_irqrestore(&lockres->l_lock, flags); + +out: + return ret; +} + +/* + * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of + * flock() calls. The locking approach this requires is sufficiently + * different from all other cluster lock types that we implement a + * seperate path to the "low-level" dlm calls. In particular: + * + * - No optimization of lock levels is done - we take at exactly + * what's been requested. + * + * - No lock caching is employed. We immediately downconvert to + * no-lock at unlock time. This also means flock locks never go on + * the blocking list). + * + * - Since userspace can trivially deadlock itself with flock, we make + * sure to allow cancellation of a misbehaving applications flock() + * request. + * + * - Access to any flock lockres doesn't require concurrency, so we + * can simplify the code by requiring the caller to guarantee + * serialization of dlmglue flock calls. + */ +int ocfs2_file_lock(struct file *file, int ex, int trylock) +{ + int ret, level = ex ? LKM_EXMODE : LKM_PRMODE; + unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0; + unsigned long flags; + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_lock_res *lockres = &fp->fp_flock; + struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb); + struct ocfs2_mask_waiter mw; + + ocfs2_init_mask_waiter(&mw); + + if ((lockres->l_flags & OCFS2_LOCK_BUSY) || + (lockres->l_level > LKM_NLMODE)) { + mlog(ML_ERROR, + "File lock \"%s\" has busy or locked state: flags: 0x%lx, " + "level: %u\n", lockres->l_name, lockres->l_flags, + lockres->l_level); + return -EINVAL; + } + + spin_lock_irqsave(&lockres->l_lock, flags); + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + /* + * Get the lock at NLMODE to start - that way we + * can cancel the upconvert request if need be. + */ + ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; } - goto out; + + ret = ocfs2_wait_for_mask(&mw); + if (ret) { + mlog_errno(ret); + goto out; + } + spin_lock_irqsave(&lockres->l_lock, flags); } - if (ocfs2_mount_local(osb)) - goto out; + lockres->l_action = OCFS2_AST_CONVERT; + lkm_flags |= LKM_CONVERT; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); - lockres = &OCFS2_I(inode)->ip_data_lockres; + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); - level = write ? LKM_EXMODE : LKM_PRMODE; + ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags, + lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, + ocfs2_locking_ast, lockres, ocfs2_blocking_ast); + if (ret != DLM_NORMAL) { + if (trylock && ret == DLM_NOTQUEUED) + ret = -EAGAIN; + else { + ocfs2_log_dlm_error("dlmlock", ret, lockres); + ret = -EINVAL; + } - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, - 0, arg_flags); - if (status < 0 && status != -EAGAIN) - mlog_errno(status); + ocfs2_recover_from_dlm_error(lockres, 1); + lockres_remove_mask_waiter(lockres, &mw); + goto out; + } + + ret = ocfs2_wait_for_mask_interruptible(&mw, lockres); + if (ret == -ERESTARTSYS) { + /* + * Userspace can cause deadlock itself with + * flock(). Current behavior locally is to allow the + * deadlock, but abort the system call if a signal is + * received. We follow this example, otherwise a + * poorly written program could sit in kernel until + * reboot. + * + * Handling this is a bit more complicated for Ocfs2 + * though. We can't exit this function with an + * outstanding lock request, so a cancel convert is + * required. We intentionally overwrite 'ret' - if the + * cancel fails and the lock was granted, it's easier + * to just bubble sucess back up to the user. + */ + ret = ocfs2_flock_handle_signal(lockres, level); + } out: - mlog_exit(status); - return status; + + mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n", + lockres->l_name, ex, trylock, ret); + return ret; } -/* see ocfs2_meta_lock_with_page() */ -int ocfs2_data_lock_with_page(struct inode *inode, - int write, - struct page *page) +void ocfs2_file_unlock(struct file *file) { int ret; + unsigned long flags; + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_lock_res *lockres = &fp->fp_flock; + struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb); + struct ocfs2_mask_waiter mw; - ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK); - if (ret == -EAGAIN) { - unlock_page(page); - if (ocfs2_data_lock(inode, write) == 0) - ocfs2_data_unlock(inode, write); - ret = AOP_TRUNCATED_PAGE; + ocfs2_init_mask_waiter(&mw); + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) + return; + + if (lockres->l_level == LKM_NLMODE) + return; + + mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n", + lockres->l_name, lockres->l_flags, lockres->l_level, + lockres->l_action); + + spin_lock_irqsave(&lockres->l_lock, flags); + /* + * Fake a blocking ast for the downconvert code. + */ + lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); + lockres->l_blocking = LKM_EXMODE; + + ocfs2_prepare_downconvert(lockres, LKM_NLMODE); + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0); + if (ret) { + mlog_errno(ret); + return; } - return ret; + ret = ocfs2_wait_for_mask(&mw); + if (ret) + mlog_errno(ret); } -static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, - struct ocfs2_lock_res *lockres) +static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) { int kick = 0; mlog_entry_void(); /* If we know that another node is waiting on our lock, kick - * the vote thread * pre-emptively when we reach a release + * the downconvert thread * pre-emptively when we reach a release * condition. */ if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { switch(lockres->l_blocking) { @@ -1398,27 +1589,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, } if (kick) - ocfs2_kick_vote_thread(osb); - - mlog_exit_void(); -} - -void ocfs2_data_unlock(struct inode *inode, - int write) -{ - int level = write ? LKM_EXMODE : LKM_PRMODE; - struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - mlog_entry_void(); - - mlog(0, "inode %llu drop %s DATA lock\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - write ? "EXMODE" : "PRMODE"); - - if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && - !ocfs2_mount_local(osb)) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + ocfs2_wake_downconvert_thread(osb); mlog_exit_void(); } @@ -1442,11 +1613,11 @@ static u64 ocfs2_pack_timespec(struct timespec *spec) /* Call this with the lockres locked. I am reasonably sure we don't * need ip_lock in this function as anyone who would be changing those - * values is supposed to be blocked in ocfs2_meta_lock right now. */ + * values is supposed to be blocked in ocfs2_inode_lock right now. */ static void __ocfs2_stuff_meta_lvb(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; + struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; mlog_entry_void(); @@ -1496,7 +1667,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec, static void ocfs2_refresh_inode_from_lvb(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; + struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; mlog_entry_void(); @@ -1604,12 +1775,12 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre } /* may or may not return a bh if it went to disk. */ -static int ocfs2_meta_lock_update(struct inode *inode, +static int ocfs2_inode_lock_update(struct inode *inode, struct buffer_head **bh) { int status = 0; struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; + struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_dinode *fe; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -1721,7 +1892,7 @@ static int ocfs2_assign_bh(struct inode *inode, * returns < 0 error if the callback will never be called, otherwise * the result of the lock will be communicated via the callback. */ -int ocfs2_meta_lock_full(struct inode *inode, +int ocfs2_inode_lock_full(struct inode *inode, struct buffer_head **ret_bh, int ex, int arg_flags) @@ -1756,7 +1927,7 @@ int ocfs2_meta_lock_full(struct inode *inode, wait_event(osb->recovery_event, ocfs2_node_map_is_empty(osb, &osb->recovery_map)); - lockres = &OCFS2_I(inode)->ip_meta_lockres; + lockres = &OCFS2_I(inode)->ip_inode_lockres; level = ex ? LKM_EXMODE : LKM_PRMODE; dlm_flags = 0; if (arg_flags & OCFS2_META_LOCK_NOQUEUE) @@ -1795,11 +1966,11 @@ local: } /* This is fun. The caller may want a bh back, or it may - * not. ocfs2_meta_lock_update definitely wants one in, but + * not. ocfs2_inode_lock_update definitely wants one in, but * may or may not read one, depending on what's in the * LVB. The result of all of this is that we've *only* gone to * disk if we have to, so the complexity is worthwhile. */ - status = ocfs2_meta_lock_update(inode, &local_bh); + status = ocfs2_inode_lock_update(inode, &local_bh); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -1821,7 +1992,7 @@ bail: *ret_bh = NULL; } if (acquired) - ocfs2_meta_unlock(inode, ex); + ocfs2_inode_unlock(inode, ex); } if (local_bh) @@ -1832,19 +2003,20 @@ bail: } /* - * This is working around a lock inversion between tasks acquiring DLM locks - * while holding a page lock and the vote thread which blocks dlm lock acquiry - * while acquiring page locks. + * This is working around a lock inversion between tasks acquiring DLM + * locks while holding a page lock and the downconvert thread which + * blocks dlm lock acquiry while acquiring page locks. * * ** These _with_page variantes are only intended to be called from aop * methods that hold page locks and return a very specific *positive* error * code that aop methods pass up to the VFS -- test for errors with != 0. ** * - * The DLM is called such that it returns -EAGAIN if it would have blocked - * waiting for the vote thread. In that case we unlock our page so the vote - * thread can make progress. Once we've done this we have to return - * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up - * into the VFS who will then immediately retry the aop call. + * The DLM is called such that it returns -EAGAIN if it would have + * blocked waiting for the downconvert thread. In that case we unlock + * our page so the downconvert thread can make progress. Once we've + * done this we have to return AOP_TRUNCATED_PAGE so the aop method + * that called us can bubble that back up into the VFS who will then + * immediately retry the aop call. * * We do a blocking lock and immediate unlock before returning, though, so that * the lock has a great chance of being cached on this node by the time the VFS @@ -1852,32 +2024,32 @@ bail: * ping locks back and forth, but that's a risk we're willing to take to avoid * the lock inversion simply. */ -int ocfs2_meta_lock_with_page(struct inode *inode, +int ocfs2_inode_lock_with_page(struct inode *inode, struct buffer_head **ret_bh, int ex, struct page *page) { int ret; - ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); + ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { unlock_page(page); - if (ocfs2_meta_lock(inode, ret_bh, ex) == 0) - ocfs2_meta_unlock(inode, ex); + if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) + ocfs2_inode_unlock(inode, ex); ret = AOP_TRUNCATED_PAGE; } return ret; } -int ocfs2_meta_lock_atime(struct inode *inode, +int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, int *level) { int ret; mlog_entry_void(); - ret = ocfs2_meta_lock(inode, NULL, 0); + ret = ocfs2_inode_lock(inode, NULL, 0); if (ret < 0) { mlog_errno(ret); return ret; @@ -1890,8 +2062,8 @@ int ocfs2_meta_lock_atime(struct inode *inode, if (ocfs2_should_update_atime(inode, vfsmnt)) { struct buffer_head *bh = NULL; - ocfs2_meta_unlock(inode, 0); - ret = ocfs2_meta_lock(inode, &bh, 1); + ocfs2_inode_unlock(inode, 0); + ret = ocfs2_inode_lock(inode, &bh, 1); if (ret < 0) { mlog_errno(ret); return ret; @@ -1908,11 +2080,11 @@ int ocfs2_meta_lock_atime(struct inode *inode, return ret; } -void ocfs2_meta_unlock(struct inode *inode, +void ocfs2_inode_unlock(struct inode *inode, int ex) { int level = ex ? LKM_EXMODE : LKM_PRMODE; - struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry_void(); @@ -2320,11 +2492,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) goto bail; } - /* launch vote thread */ - osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote"); - if (IS_ERR(osb->vote_task)) { - status = PTR_ERR(osb->vote_task); - osb->vote_task = NULL; + /* launch downconvert thread */ + osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc"); + if (IS_ERR(osb->dc_task)) { + status = PTR_ERR(osb->dc_task); + osb->dc_task = NULL; mlog_errno(status); goto bail; } @@ -2353,8 +2525,8 @@ local: bail: if (status < 0) { ocfs2_dlm_shutdown_debug(osb); - if (osb->vote_task) - kthread_stop(osb->vote_task); + if (osb->dc_task) + kthread_stop(osb->dc_task); } mlog_exit(status); @@ -2369,9 +2541,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb) ocfs2_drop_osb_locks(osb); - if (osb->vote_task) { - kthread_stop(osb->vote_task); - osb->vote_task = NULL; + if (osb->dc_task) { + kthread_stop(osb->dc_task); + osb->dc_task = NULL; } ocfs2_lock_res_free(&osb->osb_super_lockres); @@ -2527,7 +2699,7 @@ out: /* Mark the lockres as being dropped. It will no longer be * queued if blocking, but we still may have to wait on it - * being dequeued from the vote thread before we can consider + * being dequeued from the downconvert thread before we can consider * it safe to drop. * * You can *not* attempt to call cluster_lock on this lockres anymore. */ @@ -2590,14 +2762,7 @@ int ocfs2_drop_inode_locks(struct inode *inode) status = err; err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), - &OCFS2_I(inode)->ip_data_lockres); - if (err < 0) - mlog_errno(err); - if (err < 0 && !status) - status = err; - - err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), - &OCFS2_I(inode)->ip_meta_lockres); + &OCFS2_I(inode)->ip_inode_lockres); if (err < 0) mlog_errno(err); if (err < 0 && !status) @@ -2850,6 +3015,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, inode = ocfs2_lock_res_inode(lockres); mapping = inode->i_mapping; + if (S_ISREG(inode->i_mode)) + goto out; + /* * We need this before the filemap_fdatawrite() so that it can * transfer the dirty bit from the PTE to the @@ -2875,6 +3043,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, filemap_fdatawait(mapping); } +out: return UNBLOCK_CONTINUE; } @@ -2903,7 +3072,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres) /* * Does the final reference drop on our dentry lock. Right now this - * happens in the vote thread, but we could choose to simplify the + * happens in the downconvert thread, but we could choose to simplify the * dlmglue API and push these off to the ocfs2_wq in the future. */ static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, @@ -3042,7 +3211,7 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb, mlog(0, "lockres %s blocked.\n", lockres->l_name); /* Detect whether a lock has been marked as going away while - * the vote thread was processing other things. A lock can + * the downconvert thread was processing other things. A lock can * still be marked with OCFS2_LOCK_FREEING after this check, * but short circuiting here will still save us some * performance. */ @@ -3091,13 +3260,104 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); - spin_lock(&osb->vote_task_lock); + spin_lock(&osb->dc_task_lock); if (list_empty(&lockres->l_blocked_list)) { list_add_tail(&lockres->l_blocked_list, &osb->blocked_lock_list); osb->blocked_lock_count++; } - spin_unlock(&osb->vote_task_lock); + spin_unlock(&osb->dc_task_lock); + + mlog_exit_void(); +} + +static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) +{ + unsigned long processed; + struct ocfs2_lock_res *lockres; + + mlog_entry_void(); + + spin_lock(&osb->dc_task_lock); + /* grab this early so we know to try again if a state change and + * wake happens part-way through our work */ + osb->dc_work_sequence = osb->dc_wake_sequence; + + processed = osb->blocked_lock_count; + while (processed) { + BUG_ON(list_empty(&osb->blocked_lock_list)); + + lockres = list_entry(osb->blocked_lock_list.next, + struct ocfs2_lock_res, l_blocked_list); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock(&osb->dc_task_lock); + + BUG_ON(!processed); + processed--; + + ocfs2_process_blocked_lock(osb, lockres); + + spin_lock(&osb->dc_task_lock); + } + spin_unlock(&osb->dc_task_lock); mlog_exit_void(); } + +static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb) +{ + int empty = 0; + + spin_lock(&osb->dc_task_lock); + if (list_empty(&osb->blocked_lock_list)) + empty = 1; + + spin_unlock(&osb->dc_task_lock); + return empty; +} + +static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb) +{ + int should_wake = 0; + + spin_lock(&osb->dc_task_lock); + if (osb->dc_work_sequence != osb->dc_wake_sequence) + should_wake = 1; + spin_unlock(&osb->dc_task_lock); + + return should_wake; +} + +int ocfs2_downconvert_thread(void *arg) +{ + int status = 0; + struct ocfs2_super *osb = arg; + + /* only quit once we've been asked to stop and there is no more + * work available */ + while (!(kthread_should_stop() && + ocfs2_downconvert_thread_lists_empty(osb))) { + + wait_event_interruptible(osb->dc_event, + ocfs2_downconvert_thread_should_wake(osb) || + kthread_should_stop()); + + mlog(0, "downconvert_thread: awoken\n"); + + ocfs2_downconvert_thread_do_work(osb); + } + + osb->dc_task = NULL; + return status; +} + +void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb) +{ + spin_lock(&osb->dc_task_lock); + /* make sure the voting thread gets a swipe at whatever changes + * the caller may have made to the voting state */ + osb->dc_wake_sequence++; + spin_unlock(&osb->dc_task_lock); + wake_up(&osb->dc_event); +} diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 87a785e..5f17243 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -49,12 +49,12 @@ struct ocfs2_meta_lvb { __be32 lvb_reserved2; }; -/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */ +/* ocfs2_inode_lock_full() 'arg_flags' flags */ /* don't wait on recovery. */ #define OCFS2_META_LOCK_RECOVERY (0x01) /* Instruct the dlm not to queue ourselves on the other node. */ #define OCFS2_META_LOCK_NOQUEUE (0x02) -/* don't block waiting for the vote thread, instead return -EAGAIN */ +/* don't block waiting for the downconvert thread, instead return -EAGAIN */ #define OCFS2_LOCK_NONBLOCK (0x04) int ocfs2_dlm_init(struct ocfs2_super *osb); @@ -66,38 +66,32 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, struct inode *inode); void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, u64 parent, struct inode *inode); +struct ocfs2_file_private; +void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_file_private *fp); void ocfs2_lock_res_free(struct ocfs2_lock_res *res); int ocfs2_create_new_inode_locks(struct inode *inode); int ocfs2_drop_inode_locks(struct inode *inode); -int ocfs2_data_lock_full(struct inode *inode, - int write, - int arg_flags); -#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0) -int ocfs2_data_lock_with_page(struct inode *inode, - int write, - struct page *page); -void ocfs2_data_unlock(struct inode *inode, - int write); int ocfs2_rw_lock(struct inode *inode, int write); void ocfs2_rw_unlock(struct inode *inode, int write); int ocfs2_open_lock(struct inode *inode); int ocfs2_try_open_lock(struct inode *inode, int write); void ocfs2_open_unlock(struct inode *inode); -int ocfs2_meta_lock_atime(struct inode *inode, +int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, int *level); -int ocfs2_meta_lock_full(struct inode *inode, +int ocfs2_inode_lock_full(struct inode *inode, struct buffer_head **ret_bh, int ex, int arg_flags); -int ocfs2_meta_lock_with_page(struct inode *inode, +int ocfs2_inode_lock_with_page(struct inode *inode, struct buffer_head **ret_bh, int ex, struct page *page); /* 99% of the time we don't want to supply any additional flags -- * those are for very specific cases only. */ -#define ocfs2_meta_lock(i, b, e) ocfs2_meta_lock_full(i, b, e, 0) -void ocfs2_meta_unlock(struct inode *inode, +#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0) +void ocfs2_inode_unlock(struct inode *inode, int ex); int ocfs2_super_lock(struct ocfs2_super *osb, int ex); @@ -107,14 +101,17 @@ int ocfs2_rename_lock(struct ocfs2_super *osb); void ocfs2_rename_unlock(struct ocfs2_super *osb); int ocfs2_dentry_lock(struct dentry *dentry, int ex); void ocfs2_dentry_unlock(struct dentry *dentry, int ex); +int ocfs2_file_lock(struct file *file, int ex, int trylock); +void ocfs2_file_unlock(struct file *file); void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); -/* for the vote thread */ +/* for the downconvert thread */ void ocfs2_process_blocked_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); +void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb); struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h index ff25762..1942e09 100644 --- a/fs/ocfs2/endian.h +++ b/fs/ocfs2/endian.h @@ -37,11 +37,6 @@ static inline void le64_add_cpu(__le64 *var, u64 val) *var = cpu_to_le64(le64_to_cpu(*var) + val); } -static inline void le32_and_cpu(__le32 *var, u32 val) -{ - *var = cpu_to_le32(le32_to_cpu(*var) & val); -} - static inline void be32_add_cpu(__be32 *var, u32 val) { *var = cpu_to_be32(be32_to_cpu(*var) + val); diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 535bfa9..67527ce 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -58,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, return ERR_PTR(-ESTALE); } - inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0); + inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0); if (IS_ERR(inode)) return (void *)inode; @@ -95,7 +95,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) mlog(0, "find parent of directory %llu\n", (unsigned long long)OCFS2_I(dir)->ip_blkno); - status = ocfs2_meta_lock(dir, NULL, 0); + status = ocfs2_inode_lock(dir, NULL, 0); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -109,7 +109,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) goto bail_unlock; } - inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); + inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0); if (IS_ERR(inode)) { mlog(ML_ERROR, "Unable to create inode %llu\n", (unsigned long long)blkno); @@ -126,7 +126,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) parent->d_op = &ocfs2_dentry_ops; bail_unlock: - ocfs2_meta_unlock(dir, 0); + ocfs2_inode_unlock(dir, 0); bail: mlog_exit_ptr(parent); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index b75b2e1..ed5d523 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -51,6 +51,7 @@ #include "inode.h" #include "ioctl.h" #include "journal.h" +#include "locks.h" #include "mmap.h" #include "suballoc.h" #include "super.h" @@ -63,6 +64,35 @@ static int ocfs2_sync_inode(struct inode *inode) return sync_mapping_buffers(inode->i_mapping); } +static int ocfs2_init_file_private(struct inode *inode, struct file *file) +{ + struct ocfs2_file_private *fp; + + fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + fp->fp_file = file; + mutex_init(&fp->fp_mutex); + ocfs2_file_lock_res_init(&fp->fp_flock, fp); + file->private_data = fp; + + return 0; +} + +static void ocfs2_free_file_private(struct inode *inode, struct file *file) +{ + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (fp) { + ocfs2_simple_drop_lockres(osb, &fp->fp_flock); + ocfs2_lock_res_free(&fp->fp_flock); + kfree(fp); + file->private_data = NULL; + } +} + static int ocfs2_file_open(struct inode *inode, struct file *file) { int status; @@ -89,7 +119,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) oi->ip_open_count++; spin_unlock(&oi->ip_lock); - status = 0; + + status = ocfs2_init_file_private(inode, file); + if (status) { + /* + * We want to set open count back if we're failing the + * open. + */ + spin_lock(&oi->ip_lock); + oi->ip_open_count--; + spin_unlock(&oi->ip_lock); + } + leave: mlog_exit(status); return status; @@ -108,11 +149,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file) oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; spin_unlock(&oi->ip_lock); + ocfs2_free_file_private(inode, file); + mlog_exit(0); return 0; } +static int ocfs2_dir_open(struct inode *inode, struct file *file) +{ + return ocfs2_init_file_private(inode, file); +} + +static int ocfs2_dir_release(struct inode *inode, struct file *file) +{ + ocfs2_free_file_private(inode, file); + return 0; +} + static int ocfs2_sync_file(struct file *file, struct dentry *dentry, int datasync) @@ -382,18 +436,13 @@ static int ocfs2_truncate_file(struct inode *inode, down_write(&OCFS2_I(inode)->ip_alloc_sem); - /* This forces other nodes to sync and drop their pages. Do - * this even if we have a truncate without allocation change - - * ocfs2 cluster sizes can be much greater than page size, so - * we have to truncate them anyway. */ - status = ocfs2_data_lock(inode, 1); - if (status < 0) { - up_write(&OCFS2_I(inode)->ip_alloc_sem); - - mlog_errno(status); - goto bail; - } - + /* + * The inode lock forced other nodes to sync and drop their + * pages, which (correctly) happens even if we have a truncate + * without allocation change - ocfs2 cluster sizes can be much + * greater than page size, so we have to truncate them + * anyway. + */ unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); truncate_inode_pages(inode->i_mapping, new_i_size); @@ -403,7 +452,7 @@ static int ocfs2_truncate_file(struct inode *inode, if (status) mlog_errno(status); - goto bail_unlock_data; + goto bail_unlock_sem; } /* alright, we're going to need to do a full blown alloc size @@ -413,25 +462,23 @@ static int ocfs2_truncate_file(struct inode *inode, status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); if (status < 0) { mlog_errno(status); - goto bail_unlock_data; + goto bail_unlock_sem; } status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); if (status < 0) { mlog_errno(status); - goto bail_unlock_data; + goto bail_unlock_sem; } status = ocfs2_commit_truncate(osb, inode, di_bh, tc); if (status < 0) { mlog_errno(status); - goto bail_unlock_data; + goto bail_unlock_sem; } /* TODO: orphan dir cleanup here. */ -bail_unlock_data: - ocfs2_data_unlock(inode, 1); - +bail_unlock_sem: up_write(&OCFS2_I(inode)->ip_alloc_sem); bail: @@ -579,7 +626,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " "clusters_to_add = %u, extents_to_split = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), + (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); num_free_extents = ocfs2_num_free_extents(osb, inode, di); @@ -760,7 +807,7 @@ restarted_transaction: le32_to_cpu(fe->i_clusters), (unsigned long long)le64_to_cpu(fe->i_size)); mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", - OCFS2_I(inode)->ip_clusters, i_size_read(inode)); + OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); leave: if (handle) { @@ -917,7 +964,7 @@ static int ocfs2_extend_file(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size) { - int ret = 0, data_locked = 0; + int ret = 0; struct ocfs2_inode_info *oi = OCFS2_I(inode); BUG_ON(!di_bh); @@ -943,20 +990,6 @@ static int ocfs2_extend_file(struct inode *inode, && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) goto out_update_size; - /* - * protect the pages that ocfs2_zero_extend is going to be - * pulling into the page cache.. we do this before the - * metadata extend so that we don't get into the situation - * where we've extended the metadata but can't get the data - * lock to zero. - */ - ret = ocfs2_data_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - data_locked = 1; - /* * The alloc sem blocks people in read/write from reading our * allocation until we're done changing it. We depend on @@ -980,7 +1013,7 @@ static int ocfs2_extend_file(struct inode *inode, up_write(&oi->ip_alloc_sem); mlog_errno(ret); - goto out_unlock; + goto out; } } @@ -991,7 +1024,7 @@ static int ocfs2_extend_file(struct inode *inode, if (ret < 0) { mlog_errno(ret); - goto out_unlock; + goto out; } out_update_size: @@ -999,10 +1032,6 @@ out_update_size: if (ret < 0) mlog_errno(ret); -out_unlock: - if (data_locked) - ocfs2_data_unlock(inode, 1); - out: return ret; } @@ -1050,7 +1079,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - status = ocfs2_meta_lock(inode, &bh, 1); + status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -1102,7 +1131,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); @@ -1149,7 +1178,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) mlog_entry_void(); - ret = ocfs2_meta_lock(inode, NULL, 0); + ret = ocfs2_inode_lock(inode, NULL, 0); if (ret) { if (ret != -ENOENT) mlog_errno(ret); @@ -1158,7 +1187,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) ret = generic_permission(inode, mask, NULL); - ocfs2_meta_unlock(inode, 0); + ocfs2_inode_unlock(inode, 0); out: mlog_exit(ret); return ret; @@ -1630,7 +1659,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out; } - ret = ocfs2_meta_lock(inode, &di_bh, 1); + ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { mlog_errno(ret); goto out_rw_unlock; @@ -1638,7 +1667,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { ret = -EPERM; - goto out_meta_unlock; + goto out_inode_unlock; } switch (sr->l_whence) { @@ -1652,7 +1681,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, break; default: ret = -EINVAL; - goto out_meta_unlock; + goto out_inode_unlock; } sr->l_whence = 0; @@ -1663,14 +1692,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, || (sr->l_start + llen) < 0 || (sr->l_start + llen) > max_off) { ret = -EINVAL; - goto out_meta_unlock; + goto out_inode_unlock; } size = sr->l_start + sr->l_len; if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { if (sr->l_len <= 0) { ret = -EINVAL; - goto out_meta_unlock; + goto out_inode_unlock; } } @@ -1678,7 +1707,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, ret = __ocfs2_write_remove_suid(inode, di_bh); if (ret) { mlog_errno(ret); - goto out_meta_unlock; + goto out_inode_unlock; } } @@ -1704,7 +1733,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, up_write(&OCFS2_I(inode)->ip_alloc_sem); if (ret) { mlog_errno(ret); - goto out_meta_unlock; + goto out_inode_unlock; } /* @@ -1714,7 +1743,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_meta_unlock; + goto out_inode_unlock; } if (change_size && i_size_read(inode) < size) @@ -1727,9 +1756,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, ocfs2_commit_trans(osb, handle); -out_meta_unlock: +out_inode_unlock: brelse(di_bh); - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); out_rw_unlock: ocfs2_rw_unlock(inode, 1); @@ -1799,7 +1828,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, * if we need to make modifications here. */ for(;;) { - ret = ocfs2_meta_lock(inode, NULL, meta_level); + ret = ocfs2_inode_lock(inode, NULL, meta_level); if (ret < 0) { meta_level = -1; mlog_errno(ret); @@ -1817,7 +1846,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, * set inode->i_size at the end of a write. */ if (should_remove_suid(dentry)) { if (meta_level == 0) { - ocfs2_meta_unlock(inode, meta_level); + ocfs2_inode_unlock(inode, meta_level); meta_level = 1; continue; } @@ -1886,7 +1915,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, *ppos = saved_pos; out_unlock: - ocfs2_meta_unlock(inode, meta_level); + ocfs2_inode_unlock(inode, meta_level); out: return ret; @@ -2099,12 +2128,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in, /* * See the comment in ocfs2_file_aio_read() */ - ret = ocfs2_meta_lock(inode, NULL, 0); + ret = ocfs2_inode_lock(inode, NULL, 0); if (ret < 0) { mlog_errno(ret); goto bail; } - ocfs2_meta_unlock(inode, 0); + ocfs2_inode_unlock(inode, 0); ret = generic_file_splice_read(in, ppos, pipe, len, flags); @@ -2160,12 +2189,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, * like i_size. This allows the checks down below * generic_file_aio_read() a chance of actually working. */ - ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); + ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); if (ret < 0) { mlog_errno(ret); goto bail; } - ocfs2_meta_unlock(inode, lock_level); + ocfs2_inode_unlock(inode, lock_level); ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); if (ret == -EINVAL) @@ -2204,6 +2233,7 @@ const struct inode_operations ocfs2_special_file_iops = { }; const struct file_operations ocfs2_fops = { + .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, .mmap = ocfs2_mmap, @@ -2216,16 +2246,21 @@ const struct file_operations ocfs2_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif + .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, }; const struct file_operations ocfs2_dops = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = ocfs2_readdir, .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, .ioctl = ocfs2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif + .flock = ocfs2_flock, }; diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 066f14a..048ddca 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -32,6 +32,12 @@ extern const struct inode_operations ocfs2_file_iops; extern const struct inode_operations ocfs2_special_file_iops; struct ocfs2_alloc_context; +struct ocfs2_file_private { + struct file *fp_file; + struct mutex fp_mutex; + struct ocfs2_lock_res fp_flock; +}; + enum ocfs2_alloc_restarted { RESTART_NONE = 0, RESTART_TRANS, diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index c4c3617..c0efd94 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -30,9 +30,6 @@ #include <linux/highmem.h> #include <linux/kmod.h> -#include <cluster/heartbeat.h> -#include <cluster/nodemanager.h> - #include <dlm/dlmapi.h> #define MLOG_MASK_PREFIX ML_SUPER @@ -44,13 +41,9 @@ #include "heartbeat.h" #include "inode.h" #include "journal.h" -#include "vote.h" #include "buffer_head_io.h" -#define OCFS2_HB_NODE_DOWN_PRI (0x0000002) -#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI - static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, int bit); static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, @@ -64,9 +57,7 @@ static void __ocfs2_node_map_set(struct ocfs2_node_map *target, void ocfs2_init_node_maps(struct ocfs2_super *osb) { spin_lock_init(&osb->node_map_lock); - ocfs2_node_map_init(&osb->mounted_map); ocfs2_node_map_init(&osb->recovery_map); - ocfs2_node_map_init(&osb->umount_map); ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); } @@ -87,24 +78,7 @@ static void ocfs2_do_node_down(int node_num, return; } - if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) { - /* If a node is in the umount map, then we've been - * expecting him to go down and we know ahead of time - * that recovery is not necessary. */ - ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); - return; - } - ocfs2_recovery_thread(osb, node_num); - - ocfs2_remove_node_from_vote_queues(osb, node_num); -} - -static void ocfs2_hb_node_down_cb(struct o2nm_node *node, - int node_num, - void *data) -{ - ocfs2_do_node_down(node_num, (struct ocfs2_super *) data); } /* Called from the dlm when it's about to evict a node. We may also @@ -121,27 +95,8 @@ static void ocfs2_dlm_eviction_cb(int node_num, ocfs2_do_node_down(node_num, osb); } -static void ocfs2_hb_node_up_cb(struct o2nm_node *node, - int node_num, - void *data) -{ - struct ocfs2_super *osb = data; - - BUG_ON(osb->node_num == node_num); - - mlog(0, "node up event for %d\n", node_num); - ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); -} - void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) { - o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB, - ocfs2_hb_node_down_cb, osb, - OCFS2_HB_NODE_DOWN_PRI); - - o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB, - ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI); - /* Not exactly a heartbeat callback, but leads to essentially * the same path so we set it up here. */ dlm_setup_eviction_cb(&osb->osb_eviction_cb, @@ -149,39 +104,6 @@ void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) osb); } -/* Most functions here are just stubs for now... */ -int ocfs2_register_hb_callbacks(struct ocfs2_super *osb) -{ - int status; - - if (ocfs2_mount_local(osb)) - return 0; - - status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up); - if (status < 0) { - mlog_errno(status); - o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down); - } - -bail: - return status; -} - -void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb) -{ - if (ocfs2_mount_local(osb)) - return; - - o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down); - o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up); -} - void ocfs2_stop_heartbeat(struct ocfs2_super *osb) { int ret; @@ -341,8 +263,6 @@ int ocfs2_recovery_map_set(struct ocfs2_super *osb, spin_lock(&osb->node_map_lock); - __ocfs2_node_map_clear_bit(&osb->mounted_map, num); - if (!test_bit(num, osb->recovery_map.map)) { __ocfs2_node_map_set_bit(&osb->recovery_map, num); set = 1; diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h index e8fb079..5685921 100644 --- a/fs/ocfs2/heartbeat.h +++ b/fs/ocfs2/heartbeat.h @@ -29,8 +29,6 @@ void ocfs2_init_node_maps(struct ocfs2_super *osb); void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); -int ocfs2_register_hb_callbacks(struct ocfs2_super *osb); -void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb); void ocfs2_stop_heartbeat(struct ocfs2_super *osb); /* node map functions - used to keep track of mounted and in-recovery diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index ebb2bbe..7e9e4c7 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -49,7 +49,6 @@ #include "symlink.h" #include "sysfile.h" #include "uptodate.h" -#include "vote.h" #include "buffer_head_io.h" @@ -58,8 +57,11 @@ struct ocfs2_find_inode_args u64 fi_blkno; unsigned long fi_ino; unsigned int fi_flags; + unsigned int fi_sysfile_type; }; +static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; + static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_find_inode_args *args); static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); @@ -107,7 +109,8 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) oi->ip_attr |= OCFS2_DIRSYNC_FL; } -struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, + int sysfile_type) { struct inode *inode = NULL; struct super_block *sb = osb->sb; @@ -127,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) args.fi_blkno = blkno; args.fi_flags = flags; args.fi_ino = ino_from_blkno(sb, blkno); + args.fi_sysfile_type = sysfile_type; inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, ocfs2_init_locked_inode, &args); @@ -201,6 +205,9 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) inode->i_ino = args->fi_ino; OCFS2_I(inode)->ip_blkno = args->fi_blkno; + if (args->fi_sysfile_type != 0) + lockdep_set_class(&inode->i_mutex, + &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); mlog_exit(0); return 0; @@ -322,7 +329,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, */ BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); - ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, OCFS2_LOCK_TYPE_META, 0, inode); ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, @@ -333,10 +340,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, OCFS2_LOCK_TYPE_RW, inode->i_generation, inode); - ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, - OCFS2_LOCK_TYPE_DATA, inode->i_generation, - inode); - ocfs2_set_inode_flags(inode); status = 0; @@ -414,7 +417,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) generation = osb->fs_generation; - ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, OCFS2_LOCK_TYPE_META, generation, inode); @@ -429,7 +432,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, mlog_errno(status); return status; } - status = ocfs2_meta_lock(inode, NULL, 0); + status = ocfs2_inode_lock(inode, NULL, 0); if (status) { make_bad_inode(inode); mlog_errno(status); @@ -484,7 +487,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, bail: if (can_lock) - ocfs2_meta_unlock(inode, 0); + ocfs2_inode_unlock(inode, 0); if (status < 0) make_bad_inode(inode); @@ -586,7 +589,7 @@ static int ocfs2_remove_inode(struct inode *inode, } mutex_lock(&inode_alloc_inode->i_mutex); - status = ocfs2_meta_lock(inode_alloc_inode, &inode_alloc_bh, 1); + status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); if (status < 0) { mutex_unlock(&inode_alloc_inode->i_mutex); @@ -617,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode, } di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); - le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); + di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); status = ocfs2_journal_dirty(handle, di_bh); if (status < 0) { @@ -635,7 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode, bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - ocfs2_meta_unlock(inode_alloc_inode, 1); + ocfs2_inode_unlock(inode_alloc_inode, 1); mutex_unlock(&inode_alloc_inode->i_mutex); brelse(inode_alloc_bh); bail: @@ -709,7 +712,7 @@ static int ocfs2_wipe_inode(struct inode *inode, * delete_inode operation. We do this now to avoid races with * recovery completion on other nodes. */ mutex_lock(&orphan_dir_inode->i_mutex); - status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { mutex_unlock(&orphan_dir_inode->i_mutex); @@ -718,8 +721,8 @@ static int ocfs2_wipe_inode(struct inode *inode, } /* we do this while holding the orphan dir lock because we - * don't want recovery being run from another node to vote for - * an inode delete on us -- this will result in two nodes + * don't want recovery being run from another node to try an + * inode delete underneath us -- this will result in two nodes * truncating the same file! */ status = ocfs2_truncate_for_delete(osb, inode, di_bh); if (status < 0) { @@ -733,7 +736,7 @@ static int ocfs2_wipe_inode(struct inode *inode, mlog_errno(status); bail_unlock_dir: - ocfs2_meta_unlock(orphan_dir_inode, 1); + ocfs2_inode_unlock(orphan_dir_inode, 1); mutex_unlock(&orphan_dir_inode->i_mutex); brelse(orphan_dir_bh); bail: @@ -744,7 +747,7 @@ bail: } /* There is a series of simple checks that should be done before a - * vote is even considered. Encapsulate those in this function. */ + * trylock is even considered. Encapsulate those in this function. */ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) { int ret = 0; @@ -758,14 +761,14 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail; } - /* If we're coming from process_vote we can't go into our own + /* If we're coming from downconvert_thread we can't go into our own * voting [hello, deadlock city!], so unforuntately we just * have to skip deleting this guy. That's OK though because * the node who's doing the actual deleting should handle it * anyway. */ - if (current == osb->vote_task) { + if (current == osb->dc_task) { mlog(0, "Skipping delete of %lu because we're currently " - "in process_vote\n", inode->i_ino); + "in downconvert\n", inode->i_ino); goto bail; } @@ -779,10 +782,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail_unlock; } - /* If we have voted "yes" on the wipe of this inode for - * another node, it will be marked here so we can safely skip - * it. Recovery will cleanup any inodes we might inadvertantly - * skip here. */ + /* If we have allowd wipe of this inode for another node, it + * will be marked here so we can safely skip it. Recovery will + * cleanup any inodes we might inadvertantly skip here. */ if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { mlog(0, "Skipping delete of %lu because another node " "has done this for us.\n", inode->i_ino); @@ -929,13 +931,13 @@ void ocfs2_delete_inode(struct inode *inode) /* Lock down the inode. This gives us an up to date view of * it's metadata (for verification), and allows us to - * serialize delete_inode votes. + * serialize delete_inode on multiple nodes. * * Even though we might be doing a truncate, we don't take the * allocation lock here as it won't be needed - nobody will * have the file open. */ - status = ocfs2_meta_lock(inode, &di_bh, 1); + status = ocfs2_inode_lock(inode, &di_bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -947,15 +949,15 @@ void ocfs2_delete_inode(struct inode *inode) * before we go ahead and wipe the inode. */ status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); if (!wipe || status < 0) { - /* Error and inode busy vote both mean we won't be + /* Error and remote inode busy both mean we won't be * removing the inode, so they take almost the same * path. */ if (status < 0) mlog_errno(status); - /* Someone in the cluster has voted to not wipe this - * inode, or it was never completely orphaned. Write - * out the pages and exit now. */ + /* Someone in the cluster has disallowed a wipe of + * this inode, or it was never completely + * orphaned. Write out the pages and exit now. */ ocfs2_cleanup_delete_inode(inode, 1); goto bail_unlock_inode; } @@ -981,7 +983,7 @@ void ocfs2_delete_inode(struct inode *inode) OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; bail_unlock_inode: - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); brelse(di_bh); bail_unblock: status = sigprocmask(SIG_SETMASK, &oldset, NULL); @@ -1008,15 +1010,14 @@ void ocfs2_clear_inode(struct inode *inode) mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); - /* For remove delete_inode vote, we hold open lock before, - * now it is time to unlock PR and EX open locks. */ + /* To preven remote deletes we hold open lock before, now it + * is time to unlock PR and EX open locks. */ ocfs2_open_unlock(inode); /* Do these before all the other work so that we don't bounce - * the vote thread while waiting to destroy the locks. */ + * the downconvert thread while waiting to destroy the locks. */ ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); - ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); - ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); /* We very well may get a clear_inode before all an inodes @@ -1039,8 +1040,7 @@ void ocfs2_clear_inode(struct inode *inode) mlog_errno(status); ocfs2_lock_res_free(&oi->ip_rw_lockres); - ocfs2_lock_res_free(&oi->ip_meta_lockres); - ocfs2_lock_res_free(&oi->ip_data_lockres); + ocfs2_lock_res_free(&oi->ip_inode_lockres); ocfs2_lock_res_free(&oi->ip_open_lockres); ocfs2_metadata_cache_purge(inode); @@ -1184,15 +1184,15 @@ int ocfs2_inode_revalidate(struct dentry *dentry) } spin_unlock(&OCFS2_I(inode)->ip_lock); - /* Let ocfs2_meta_lock do the work of updating our struct + /* Let ocfs2_inode_lock do the work of updating our struct * inode for us. */ - status = ocfs2_meta_lock(inode, NULL, 0); + status = ocfs2_inode_lock(inode, NULL, 0); if (status < 0) { if (status != -ENOENT) mlog_errno(status); goto bail; } - ocfs2_meta_unlock(inode, 0); + ocfs2_inode_unlock(inode, 0); bail: mlog_exit(status); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 70e881c..390a855 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -34,8 +34,7 @@ struct ocfs2_inode_info u64 ip_blkno; struct ocfs2_lock_res ip_rw_lockres; - struct ocfs2_lock_res ip_meta_lockres; - struct ocfs2_lock_res ip_data_lockres; + struct ocfs2_lock_res ip_inode_lockres; struct ocfs2_lock_res ip_open_lockres; /* protects allocation changes on this inode. */ @@ -121,9 +120,10 @@ void ocfs2_delete_inode(struct inode *inode); void ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ -#define OCFS2_FI_FLAG_SYSFILE 0x4 -#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8 -struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); +#define OCFS2_FI_FLAG_SYSFILE 0x1 +#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, + int sysfile_type); int ocfs2_inode_init_private(struct inode *inode); int ocfs2_inode_revalidate(struct dentry *dentry); int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 87dcece..5177fba 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -20,6 +20,7 @@ #include "ocfs2_fs.h" #include "ioctl.h" +#include "resize.h" #include <linux/ext2_fs.h> @@ -27,14 +28,14 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) { int status; - status = ocfs2_meta_lock(inode, NULL, 0); + status = ocfs2_inode_lock(inode, NULL, 0); if (status < 0) { mlog_errno(status); return status; } ocfs2_get_inode_flags(OCFS2_I(inode)); *flags = OCFS2_I(inode)->ip_attr; - ocfs2_meta_unlock(inode, 0); + ocfs2_inode_unlock(inode, 0); mlog_exit(status); return status; @@ -52,7 +53,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, mutex_lock(&inode->i_mutex); - status = ocfs2_meta_lock(inode, &bh, 1); + status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { mlog_errno(status); goto bail; @@ -100,7 +101,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, ocfs2_commit_trans(osb, handle); bail_unlock: - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); bail: mutex_unlock(&inode->i_mutex); @@ -115,8 +116,10 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp, unsigned int cmd, unsigned long arg) { unsigned int flags; + int new_clusters; int status; struct ocfs2_space_resv sr; + struct ocfs2_new_group_input input; switch (cmd) { case OCFS2_IOC_GETFLAGS: @@ -140,6 +143,23 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp, return -EFAULT; return ocfs2_change_file_space(filp, cmd, &sr); + case OCFS2_IOC_GROUP_EXTEND: + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (get_user(new_clusters, (int __user *)arg)) + return -EFAULT; + + return ocfs2_group_extend(inode, new_clusters); + case OCFS2_IOC_GROUP_ADD: + case OCFS2_IOC_GROUP_ADD64: + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (copy_from_user(&input, (int __user *) arg, sizeof(input))) + return -EFAULT; + + return ocfs2_group_add(inode, &input); default: return -ENOTTY; } @@ -162,6 +182,9 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: case OCFS2_IOC_UNRESVSP64: + case OCFS2_IOC_GROUP_EXTEND: + case OCFS2_IOC_GROUP_ADD: + case OCFS2_IOC_GROUP_ADD64: break; default: return -ENOIOCTLCMD; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 8d81f6c..f31c7e8 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -44,7 +44,6 @@ #include "localalloc.h" #include "slot_map.h" #include "super.h" -#include "vote.h" #include "sysfile.h" #include "buffer_head_io.h" @@ -103,7 +102,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", journal->j_trans_id, flushed); - ocfs2_kick_vote_thread(osb); + ocfs2_wake_downconvert_thread(osb); wake_up(&journal->j_checkpointed); finally: mlog_exit(status); @@ -314,14 +313,18 @@ int ocfs2_journal_dirty_data(handle_t *handle, return err; } -#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) +#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE) void ocfs2_set_journal_params(struct ocfs2_super *osb) { journal_t *journal = osb->journal->j_journal; + unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; + + if (osb->osb_commit_interval) + commit_interval = osb->osb_commit_interval; spin_lock(&journal->j_state_lock); - journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; + journal->j_commit_interval = commit_interval; if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) journal->j_flags |= JFS_BARRIER; else @@ -337,7 +340,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) struct ocfs2_dinode *di = NULL; struct buffer_head *bh = NULL; struct ocfs2_super *osb; - int meta_lock = 0; + int inode_lock = 0; mlog_entry_void(); @@ -367,14 +370,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) /* Skip recovery waits here - journal inode metadata never * changes in a live cluster so it can be considered an * exception to the rule. */ - status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); + status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); if (status < 0) { if (status != -ERESTARTSYS) mlog(ML_ERROR, "Could not get lock on journal!\n"); goto done; } - meta_lock = 1; + inode_lock = 1; di = (struct ocfs2_dinode *)bh->b_data; if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { @@ -414,8 +417,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) status = 0; done: if (status < 0) { - if (meta_lock) - ocfs2_meta_unlock(inode, 1); + if (inode_lock) + ocfs2_inode_unlock(inode, 1); if (bh != NULL) brelse(bh); if (inode) { @@ -544,7 +547,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) OCFS2_I(inode)->ip_open_count--; /* unlock our journal */ - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); brelse(journal->j_bh); journal->j_bh = NULL; @@ -883,8 +886,8 @@ restart: ocfs2_super_unlock(osb, 1); /* We always run recovery on our own orphan dir - the dead - * node(s) may have voted "no" on an inode delete earlier. A - * revote is therefore required. */ + * node(s) may have disallowd a previos inode delete. Re-processing + * is therefore required. */ ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, NULL); @@ -973,9 +976,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, } SET_INODE_JOURNAL(inode); - status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); + status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); if (status < 0) { - mlog(0, "status returned from ocfs2_meta_lock=%d\n", status); + mlog(0, "status returned from ocfs2_inode_lock=%d\n", status); if (status != -ERESTARTSYS) mlog(ML_ERROR, "Could not lock journal!\n"); goto done; @@ -1047,7 +1050,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, done: /* drop the lock on this nodes journal */ if (got_lock) - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); if (inode) iput(inode); @@ -1162,14 +1165,14 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb, SET_INODE_JOURNAL(inode); flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; - status = ocfs2_meta_lock_full(inode, NULL, 1, flags); + status = ocfs2_inode_lock_full(inode, NULL, 1, flags); if (status < 0) { if (status != -EAGAIN) mlog_errno(status); goto bail; } - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); bail: if (inode) iput(inode); @@ -1241,7 +1244,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len, /* Skip bad inodes so that recovery can continue */ iter = ocfs2_iget(p->osb, ino, - OCFS2_FI_FLAG_ORPHAN_RECOVERY); + OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); if (IS_ERR(iter)) return 0; @@ -1277,7 +1280,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, } mutex_lock(&orphan_dir_inode->i_mutex); - status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0); + status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); if (status < 0) { mlog_errno(status); goto out; @@ -1293,7 +1296,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, *head = priv.head; out_cluster: - ocfs2_meta_unlock(orphan_dir_inode, 0); + ocfs2_inode_unlock(orphan_dir_inode, 0); out: mutex_unlock(&orphan_dir_inode->i_mutex); iput(orphan_dir_inode); @@ -1380,10 +1383,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; spin_lock(&oi->ip_lock); - /* Delete voting may have set these on the assumption - * that the other node would wipe them successfully. - * If they are still in the node's orphan dir, we need - * to reset that state. */ + /* The remote delete code may have set these on the + * assumption that the other node would wipe them + * successfully. If they are still in the node's + * orphan dir, we need to reset that state. */ oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); /* Set the proper information to get us going into diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 4b32e09..220f3e8 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -278,6 +278,12 @@ int ocfs2_journal_dirty_data(handle_t *handle, /* simple file updates like chmod, etc. */ #define OCFS2_INODE_UPDATE_CREDITS 1 +/* group extend. inode update and last group update. */ +#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* group add. inode update and the new group update. */ +#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + /* get one bit out of a suballocator: dinode + group descriptor + * prev. group desc. if we relink. */ #define OCFS2_SUBALLOC_ALLOC (3) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 58ea88b..add1ffd 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -75,18 +75,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, struct inode *local_alloc_inode); -/* - * Determine how large our local alloc window should be, in bits. - * - * These values (and the behavior in ocfs2_alloc_should_use_local) have - * been chosen so that most allocations, including new block groups go - * through local alloc. - */ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) { - BUG_ON(osb->s_clustersize_bits < 12); + BUG_ON(osb->s_clustersize_bits > 20); - return 2048 >> (osb->s_clustersize_bits - 12); + /* Size local alloc windows by the megabyte */ + return osb->local_alloc_size << (20 - osb->s_clustersize_bits); } /* @@ -96,18 +90,23 @@ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) { int la_bits = ocfs2_local_alloc_window_bits(osb); + int ret = 0; if (osb->local_alloc_state != OCFS2_LA_ENABLED) - return 0; + goto bail; /* la_bits should be at least twice the size (in clusters) of * a new block group. We want to be sure block group * allocations go through the local alloc, so allow an * allocation to take up to half the bitmap. */ if (bits > (la_bits / 2)) - return 0; + goto bail; - return 1; + ret = 1; +bail: + mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n", + osb->local_alloc_state, (unsigned long long)bits, la_bits, ret); + return ret; } int ocfs2_load_local_alloc(struct ocfs2_super *osb) @@ -121,6 +120,19 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) mlog_entry_void(); + if (ocfs2_mount_local(osb)) + goto bail; + + if (osb->local_alloc_size == 0) + goto bail; + + if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) { + mlog(ML_NOTICE, "Requested local alloc window %d is larger " + "than max possible %u. Using defaults.\n", + ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1)); + osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; + } + /* read the alloc off disk */ inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, osb->slot_num); @@ -181,6 +193,9 @@ bail: if (inode) iput(inode); + mlog(0, "Local alloc window bits = %d\n", + ocfs2_local_alloc_window_bits(osb)); + mlog_exit(status); return status; } @@ -231,7 +246,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) mutex_lock(&main_bm_inode->i_mutex); - status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1); + status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { mlog_errno(status); goto out_mutex; @@ -286,7 +301,7 @@ out_unlock: if (main_bm_bh) brelse(main_bm_bh); - ocfs2_meta_unlock(main_bm_inode, 1); + ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: mutex_unlock(&main_bm_inode->i_mutex); @@ -399,7 +414,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, mutex_lock(&main_bm_inode->i_mutex); - status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1); + status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { mlog_errno(status); goto out_mutex; @@ -424,7 +439,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, ocfs2_commit_trans(osb, handle); out_unlock: - ocfs2_meta_unlock(main_bm_inode, 1); + ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: mutex_unlock(&main_bm_inode->i_mutex); @@ -521,6 +536,9 @@ bail: iput(local_alloc_inode); } + mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num, + status); + mlog_exit(status); return status; } diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c new file mode 100644 index 0000000..203f871 --- /dev/null +++ b/fs/ocfs2/locks.c @@ -0,0 +1,125 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * locks.c + * + * Userspace file locking support + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> + +#define MLOG_MASK_PREFIX ML_INODE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "dlmglue.h" +#include "file.h" +#include "locks.h" + +static int ocfs2_do_flock(struct file *file, struct inode *inode, + int cmd, struct file_lock *fl) +{ + int ret = 0, level = 0, trylock = 0; + struct ocfs2_file_private *fp = file->private_data; + struct ocfs2_lock_res *lockres = &fp->fp_flock; + + if (fl->fl_type == F_WRLCK) + level = 1; + if (!IS_SETLKW(cmd)) + trylock = 1; + + mutex_lock(&fp->fp_mutex); + + if (lockres->l_flags & OCFS2_LOCK_ATTACHED && + lockres->l_level > LKM_NLMODE) { + int old_level = 0; + + if (lockres->l_level == LKM_EXMODE) + old_level = 1; + + if (level == old_level) + goto out; + + /* + * Converting an existing lock is not guaranteed to be + * atomic, so we can get away with simply unlocking + * here and allowing the lock code to try at the new + * level. + */ + + flock_lock_file_wait(file, + &(struct file_lock){.fl_type = F_UNLCK}); + + ocfs2_file_unlock(file); + } + + ret = ocfs2_file_lock(file, level, trylock); + if (ret) { + if (ret == -EAGAIN && trylock) + ret = -EWOULDBLOCK; + else + mlog_errno(ret); + goto out; + } + + ret = flock_lock_file_wait(file, fl); + +out: + mutex_unlock(&fp->fp_mutex); + + return ret; +} + +static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl) +{ + int ret; + struct ocfs2_file_private *fp = file->private_data; + + mutex_lock(&fp->fp_mutex); + ocfs2_file_unlock(file); + ret = flock_lock_file_wait(file, fl); + mutex_unlock(&fp->fp_mutex); + + return ret; +} + +/* + * Overall flow of ocfs2_flock() was influenced by gfs2_flock(). + */ +int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + if (__mandatory_lock(inode)) + return -ENOLCK; + + if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || + ocfs2_mount_local(osb)) + return flock_lock_file_wait(file, fl); + + if (fl->fl_type == F_UNLCK) + return ocfs2_do_funlock(file, cmd, fl); + else + return ocfs2_do_flock(file, inode, cmd, fl); +} diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/locks.h index 9ea46f6..9743ef2 100644 --- a/fs/ocfs2/vote.h +++ b/fs/ocfs2/locks.h @@ -1,9 +1,9 @@ /* -*- mode: c; c-basic-offset: 8; -*- * vim: noexpandtab sw=8 ts=8 sts=0: * - * vote.h + * locks.h * - * description here + * Function prototypes for Userspace file locking support * * Copyright (C) 2002, 2004 Oracle. All rights reserved. * @@ -23,26 +23,9 @@ * Boston, MA 021110-1307, USA. */ +#ifndef OCFS2_LOCKS_H +#define OCFS2_LOCKS_H -#ifndef VOTE_H -#define VOTE_H +int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl); -int ocfs2_vote_thread(void *arg); -static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb) -{ - spin_lock(&osb->vote_task_lock); - /* make sure the voting thread gets a swipe at whatever changes - * the caller may have made to the voting state */ - osb->vote_wake_sequence++; - spin_unlock(&osb->vote_task_lock); - wake_up(&osb->vote_event); -} - -int ocfs2_request_mount_vote(struct ocfs2_super *osb); -int ocfs2_request_umount_vote(struct ocfs2_super *osb); -int ocfs2_register_net_handlers(struct ocfs2_super *osb); -void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); - -void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, - int node_num); -#endif +#endif /* OCFS2_LOCKS_H */ diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 9875615..3dc18d6 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -168,7 +168,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) * node. Taking the data lock will also ensure that we don't * attempt page truncation as part of a downconvert. */ - ret = ocfs2_meta_lock(inode, &di_bh, 1); + ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); goto out; @@ -181,21 +181,12 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_data_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto out_meta_unlock; - } - ret = __ocfs2_page_mkwrite(inode, di_bh, page); - ocfs2_data_unlock(inode, 1); - -out_meta_unlock: up_write(&OCFS2_I(inode)->ip_alloc_sem); brelse(di_bh); - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); out: ret2 = ocfs2_vm_op_unblock_sigs(&oldset); @@ -214,13 +205,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) { int ret = 0, lock_level = 0; - ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, + ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode, file->f_vfsmnt, &lock_level); if (ret < 0) { mlog_errno(ret); goto out; } - ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level); + ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); out: vma->vm_ops = &ocfs2_file_vm_ops; vma->vm_flags |= VM_CAN_NONLINEAR; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 989ac27..ae9ad95 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -60,7 +60,6 @@ #include "symlink.h" #include "sysfile.h" #include "uptodate.h" -#include "vote.h" #include "buffer_head_io.h" @@ -116,7 +115,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); - status = ocfs2_meta_lock(dir, NULL, 0); + status = ocfs2_inode_lock(dir, NULL, 0); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -129,7 +128,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, if (status < 0) goto bail_add; - inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); + inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0); if (IS_ERR(inode)) { ret = ERR_PTR(-EACCES); goto bail_unlock; @@ -176,8 +175,8 @@ bail_unlock: /* Don't drop the cluster lock until *after* the d_add -- * unlink on another node will message us to remove that * dentry under this lock so otherwise we can race this with - * the vote thread and have a stale dentry. */ - ocfs2_meta_unlock(dir, 0); + * the downconvert thread and have a stale dentry. */ + ocfs2_inode_unlock(dir, 0); bail: @@ -209,7 +208,7 @@ static int ocfs2_mknod(struct inode *dir, /* get our super block */ osb = OCFS2_SB(dir->i_sb); - status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); + status = ocfs2_inode_lock(dir, &parent_fe_bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -323,7 +322,7 @@ leave: if (handle) ocfs2_commit_trans(osb, handle); - ocfs2_meta_unlock(dir, 1); + ocfs2_inode_unlock(dir, 1); if (status == -ENOSPC) mlog(0, "Disk is full\n"); @@ -553,7 +552,7 @@ static int ocfs2_link(struct dentry *old_dentry, if (S_ISDIR(inode->i_mode)) return -EPERM; - err = ocfs2_meta_lock(dir, &parent_fe_bh, 1); + err = ocfs2_inode_lock(dir, &parent_fe_bh, 1); if (err < 0) { if (err != -ENOENT) mlog_errno(err); @@ -578,7 +577,7 @@ static int ocfs2_link(struct dentry *old_dentry, goto out; } - err = ocfs2_meta_lock(inode, &fe_bh, 1); + err = ocfs2_inode_lock(inode, &fe_bh, 1); if (err < 0) { if (err != -ENOENT) mlog_errno(err); @@ -643,10 +642,10 @@ static int ocfs2_link(struct dentry *old_dentry, out_commit: ocfs2_commit_trans(osb, handle); out_unlock_inode: - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); out: - ocfs2_meta_unlock(dir, 1); + ocfs2_inode_unlock(dir, 1); if (de_bh) brelse(de_bh); @@ -720,7 +719,7 @@ static int ocfs2_unlink(struct inode *dir, return -EPERM; } - status = ocfs2_meta_lock(dir, &parent_node_bh, 1); + status = ocfs2_inode_lock(dir, &parent_node_bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -745,7 +744,7 @@ static int ocfs2_unlink(struct inode *dir, goto leave; } - status = ocfs2_meta_lock(inode, &fe_bh, 1); + status = ocfs2_inode_lock(inode, &fe_bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -765,7 +764,7 @@ static int ocfs2_unlink(struct inode *dir, status = ocfs2_remote_dentry_delete(dentry); if (status < 0) { - /* This vote should succeed under all normal + /* This remote delete should succeed under all normal * circumstances. */ mlog_errno(status); goto leave; @@ -841,13 +840,13 @@ leave: ocfs2_commit_trans(osb, handle); if (child_locked) - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); - ocfs2_meta_unlock(dir, 1); + ocfs2_inode_unlock(dir, 1); if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ - ocfs2_meta_unlock(orphan_dir, 1); + ocfs2_inode_unlock(orphan_dir, 1); mutex_unlock(&orphan_dir->i_mutex); iput(orphan_dir); } @@ -908,7 +907,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, inode1 = tmpinode; } /* lock id2 */ - status = ocfs2_meta_lock(inode2, bh2, 1); + status = ocfs2_inode_lock(inode2, bh2, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -917,14 +916,14 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, } /* lock id1 */ - status = ocfs2_meta_lock(inode1, bh1, 1); + status = ocfs2_inode_lock(inode1, bh1, 1); if (status < 0) { /* * An error return must mean that no cluster locks * were held on function exit. */ if (oi1->ip_blkno != oi2->ip_blkno) - ocfs2_meta_unlock(inode2, 1); + ocfs2_inode_unlock(inode2, 1); if (status != -ENOENT) mlog_errno(status); @@ -937,10 +936,10 @@ bail: static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) { - ocfs2_meta_unlock(inode1, 1); + ocfs2_inode_unlock(inode1, 1); if (inode1 != inode2) - ocfs2_meta_unlock(inode2, 1); + ocfs2_inode_unlock(inode2, 1); } static int ocfs2_rename(struct inode *old_dir, @@ -1031,10 +1030,11 @@ static int ocfs2_rename(struct inode *old_dir, /* * Aside from allowing a meta data update, the locking here - * also ensures that the vote thread on other nodes won't have - * to concurrently downconvert the inode and the dentry locks. + * also ensures that the downconvert thread on other nodes + * won't have to concurrently downconvert the inode and the + * dentry locks. */ - status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1); + status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -1143,7 +1143,7 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - status = ocfs2_meta_lock(new_inode, &newfe_bh, 1); + status = ocfs2_inode_lock(new_inode, &newfe_bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -1355,14 +1355,14 @@ bail: ocfs2_double_unlock(old_dir, new_dir); if (old_child_locked) - ocfs2_meta_unlock(old_inode, 1); + ocfs2_inode_unlock(old_inode, 1); if (new_child_locked) - ocfs2_meta_unlock(new_inode, 1); + ocfs2_inode_unlock(new_inode, 1); if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ - ocfs2_meta_unlock(orphan_dir, 1); + ocfs2_inode_unlock(orphan_dir, 1); mutex_unlock(&orphan_dir->i_mutex); iput(orphan_dir); } @@ -1530,7 +1530,7 @@ static int ocfs2_symlink(struct inode *dir, credits = ocfs2_calc_symlink_credits(sb); /* lock the parent directory */ - status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); + status = ocfs2_inode_lock(dir, &parent_fe_bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -1657,7 +1657,7 @@ bail: if (handle) ocfs2_commit_trans(osb, handle); - ocfs2_meta_unlock(dir, 1); + ocfs2_inode_unlock(dir, 1); if (new_fe_bh) brelse(new_fe_bh); @@ -1735,7 +1735,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, mutex_lock(&orphan_dir_inode->i_mutex); - status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { mlog_errno(status); goto leave; @@ -1745,7 +1745,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, orphan_dir_bh, name, OCFS2_ORPHAN_NAMELEN, de_bh); if (status < 0) { - ocfs2_meta_unlock(orphan_dir_inode, 1); + ocfs2_inode_unlock(orphan_dir_inode, 1); mlog_errno(status); goto leave; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 60a23e1..d084805 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -101,6 +101,7 @@ enum ocfs2_unlock_action { * about to be * dropped. */ #define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ +#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ struct ocfs2_lock_res_ops; @@ -170,6 +171,7 @@ enum ocfs2_mount_options OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ + OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -189,9 +191,7 @@ struct ocfs2_super struct ocfs2_slot_info *slot_info; spinlock_t node_map_lock; - struct ocfs2_node_map mounted_map; struct ocfs2_node_map recovery_map; - struct ocfs2_node_map umount_map; u64 root_blkno; u64 system_dir_blkno; @@ -231,7 +231,9 @@ struct ocfs2_super wait_queue_head_t checkpoint_event; atomic_t needs_checkpoint; struct ocfs2_journal *journal; + unsigned long osb_commit_interval; + int local_alloc_size; enum ocfs2_local_alloc_state local_alloc_state; struct buffer_head *local_alloc_bh; u64 la_last_gd; @@ -254,28 +256,21 @@ struct ocfs2_super wait_queue_head_t recovery_event; - spinlock_t vote_task_lock; - struct task_struct *vote_task; - wait_queue_head_t vote_event; - unsigned long vote_wake_sequence; - unsigned long vote_work_sequence; + spinlock_t dc_task_lock; + struct task_struct *dc_task; + wait_queue_head_t dc_event; + unsigned long dc_wake_sequence; + unsigned long dc_work_sequence; + /* + * Any thread can add locks to the list, but the downconvert + * thread is the only one allowed to remove locks. Any change + * to this rule requires updating + * ocfs2_downconvert_thread_do_work(). + */ struct list_head blocked_lock_list; unsigned long blocked_lock_count; - struct list_head vote_list; - int vote_count; - - u32 net_key; - spinlock_t net_response_lock; - unsigned int net_response_ids; - struct list_head net_response_list; - - struct o2hb_callback_func osb_hb_up; - struct o2hb_callback_func osb_hb_down; - - struct list_head osb_net_handlers; - wait_queue_head_t osb_mount_event; /* Truncate log info */ diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 6ef8767..3633edd3 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -231,6 +231,20 @@ struct ocfs2_space_resv { #define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) #define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) +/* Used to pass group descriptor data when online resize is done */ +struct ocfs2_new_group_input { + __u64 group; /* Group descriptor's blkno. */ + __u32 clusters; /* Total number of clusters in this group */ + __u32 frees; /* Total free clusters in this group */ + __u16 chain; /* Chain for this group */ + __u16 reserved1; + __u32 reserved2; +}; + +#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int) +#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input) +#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input) + /* * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) */ @@ -256,6 +270,14 @@ struct ocfs2_space_resv { /* Journal limits (in bytes) */ #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) +/* + * Default local alloc size (in megabytes) + * + * The value chosen should be such that most allocations, including new + * block groups, use local alloc. + */ +#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8 + struct ocfs2_system_inode_info { char *si_name; int si_iflags; diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 4ca02b1..86f3e37 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -45,6 +45,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_RW, OCFS2_LOCK_TYPE_DENTRY, OCFS2_LOCK_TYPE_OPEN, + OCFS2_LOCK_TYPE_FLOCK, OCFS2_NUM_LOCK_TYPES }; @@ -73,6 +74,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_OPEN: c = 'O'; break; + case OCFS2_LOCK_TYPE_FLOCK: + c = 'F'; + break; default: c = '\0'; } @@ -90,6 +94,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_RW] = "Write/Read", [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", [OCFS2_LOCK_TYPE_OPEN] = "Open", + [OCFS2_LOCK_TYPE_FLOCK] = "Flock", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c new file mode 100644 index 0000000..37835ff --- /dev/null +++ b/fs/ocfs2/resize.c @@ -0,0 +1,634 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * resize.c + * + * volume resize. + * Inspired by ext3/resize.c. + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> + +#define MLOG_MASK_PREFIX ML_DISK_ALLOC +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" + +#include "buffer_head_io.h" +#include "suballoc.h" +#include "resize.h" + +/* + * Check whether there are new backup superblocks exist + * in the last group. If there are some, mark them or clear + * them in the bitmap. + * + * Return how many backups we find in the last group. + */ +static u16 ocfs2_calc_new_backup_super(struct inode *inode, + struct ocfs2_group_desc *gd, + int new_clusters, + u32 first_new_cluster, + u16 cl_cpg, + int set) +{ + int i; + u16 backups = 0; + u32 cluster; + u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno); + + for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { + blkno = ocfs2_backup_super_blkno(inode->i_sb, i); + cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + + gd_blkno = ocfs2_which_cluster_group(inode, cluster); + if (gd_blkno < lgd_blkno) + continue; + else if (gd_blkno > lgd_blkno) + break; + + if (set) + ocfs2_set_bit(cluster % cl_cpg, + (unsigned long *)gd->bg_bitmap); + else + ocfs2_clear_bit(cluster % cl_cpg, + (unsigned long *)gd->bg_bitmap); + backups++; + } + + mlog_exit_void(); + return backups; +} + +static int ocfs2_update_last_group_and_inode(handle_t *handle, + struct inode *bm_inode, + struct buffer_head *bm_bh, + struct buffer_head *group_bh, + u32 first_new_cluster, + int new_clusters) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb); + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data; + struct ocfs2_chain_list *cl = &fe->id2.i_chain; + struct ocfs2_chain_rec *cr; + struct ocfs2_group_desc *group; + u16 chain, num_bits, backups = 0; + u16 cl_bpc = le16_to_cpu(cl->cl_bpc); + u16 cl_cpg = le16_to_cpu(cl->cl_cpg); + + mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", + new_clusters, first_new_cluster); + + ret = ocfs2_journal_access(handle, bm_inode, group_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + group = (struct ocfs2_group_desc *)group_bh->b_data; + + /* update the group first. */ + num_bits = new_clusters * cl_bpc; + le16_add_cpu(&group->bg_bits, num_bits); + le16_add_cpu(&group->bg_free_bits_count, num_bits); + + /* + * check whether there are some new backup superblocks exist in + * this group and update the group bitmap accordingly. + */ + if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, + OCFS2_FEATURE_COMPAT_BACKUP_SB)) { + backups = ocfs2_calc_new_backup_super(bm_inode, + group, + new_clusters, + first_new_cluster, + cl_cpg, 1); + le16_add_cpu(&group->bg_free_bits_count, -1 * backups); + } + + ret = ocfs2_journal_dirty(handle, group_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_rollback; + } + + /* update the inode accordingly. */ + ret = ocfs2_journal_access(handle, bm_inode, bm_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_rollback; + } + + chain = le16_to_cpu(group->bg_chain); + cr = (&cl->cl_recs[chain]); + le32_add_cpu(&cr->c_total, num_bits); + le32_add_cpu(&cr->c_free, num_bits); + le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits); + le32_add_cpu(&fe->i_clusters, new_clusters); + + if (backups) { + le32_add_cpu(&cr->c_free, -1 * backups); + le32_add_cpu(&fe->id1.bitmap1.i_used, backups); + } + + spin_lock(&OCFS2_I(bm_inode)->ip_lock); + OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits); + spin_unlock(&OCFS2_I(bm_inode)->ip_lock); + i_size_write(bm_inode, le64_to_cpu(fe->i_size)); + + ocfs2_journal_dirty(handle, bm_bh); + +out_rollback: + if (ret < 0) { + ocfs2_calc_new_backup_super(bm_inode, + group, + new_clusters, + first_new_cluster, + cl_cpg, 0); + le16_add_cpu(&group->bg_free_bits_count, backups); + le16_add_cpu(&group->bg_bits, -1 * num_bits); + le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); + } +out: + mlog_exit(ret); + return ret; +} + +static int update_backups(struct inode * inode, u32 clusters, char *data) +{ + int i, ret = 0; + u32 cluster; + u64 blkno; + struct buffer_head *backup = NULL; + struct ocfs2_dinode *backup_di = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* calculate the real backups we need to update. */ + for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { + blkno = ocfs2_backup_super_blkno(inode->i_sb, i); + cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + if (cluster > clusters) + break; + + ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL); + if (ret < 0) { + mlog_errno(ret); + break; + } + + memcpy(backup->b_data, data, inode->i_sb->s_blocksize); + + backup_di = (struct ocfs2_dinode *)backup->b_data; + backup_di->i_blkno = cpu_to_le64(blkno); + + ret = ocfs2_write_super_or_backup(osb, backup); + brelse(backup); + backup = NULL; + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static void ocfs2_update_super_and_backups(struct inode *inode, + int new_clusters) +{ + int ret; + u32 clusters = 0; + struct buffer_head *super_bh = NULL; + struct ocfs2_dinode *super_di = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* + * update the superblock last. + * It doesn't matter if the write failed. + */ + ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO, + &super_bh, 0, NULL); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + super_di = (struct ocfs2_dinode *)super_bh->b_data; + le32_add_cpu(&super_di->i_clusters, new_clusters); + clusters = le32_to_cpu(super_di->i_clusters); + + ret = ocfs2_write_super_or_backup(osb, super_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB)) + ret = update_backups(inode, clusters, super_bh->b_data); + +out: + brelse(super_bh); + if (ret) + printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s" + " during fs resize. This condition is not fatal," + " but fsck.ocfs2 should be run to fix it\n", + osb->dev_str); + return; +} + +/* + * Extend the filesystem to the new number of clusters specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. + */ +int ocfs2_group_extend(struct inode * inode, int new_clusters) +{ + int ret; + handle_t *handle; + struct buffer_head *main_bm_bh = NULL; + struct buffer_head *group_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_group_desc *group = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u16 cl_bpc; + u32 first_new_cluster; + u64 lgd_blkno; + + mlog_entry_void(); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + if (new_clusters < 0) + return -EINVAL; + else if (new_clusters == 0) + return 0; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + fe = (struct ocfs2_dinode *)main_bm_bh->b_data; + + if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != + ocfs2_group_bitmap_size(osb->sb) * 8) { + mlog(ML_ERROR, "The disk is too old and small. " + "Force to do offline resize."); + ret = -EINVAL; + goto out_unlock; + } + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe); + ret = -EIO; + goto out_unlock; + } + + first_new_cluster = le32_to_cpu(fe->i_clusters); + lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, + first_new_cluster - 1); + + ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED, + main_bm_inode); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + group = (struct ocfs2_group_desc *)group_bh->b_data; + + ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); + if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters > + le16_to_cpu(fe->id2.i_chain.cl_cpg)) { + ret = -EINVAL; + goto out_unlock; + } + + mlog(0, "extend the last group at %llu, new clusters = %d\n", + (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters); + + handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + ret = -EINVAL; + goto out_unlock; + } + + /* update the last group descriptor and inode. */ + ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode, + main_bm_bh, group_bh, + first_new_cluster, + new_clusters); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_update_super_and_backups(main_bm_inode, new_clusters); + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + brelse(group_bh); + brelse(main_bm_bh); + + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); + +out: + mlog_exit_void(); + return ret; +} + +static int ocfs2_check_new_group(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_new_group_input *input, + struct buffer_head *group_bh) +{ + int ret; + struct ocfs2_group_desc *gd; + u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc); + unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * + le16_to_cpu(di->id2.i_chain.cl_bpc); + + + gd = (struct ocfs2_group_desc *)group_bh->b_data; + + ret = -EIO; + if (!OCFS2_IS_VALID_GROUP_DESC(gd)) + mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno)); + else if (di->i_blkno != gd->bg_parent_dinode) + mlog(ML_ERROR, "Group descriptor # %llu has bad parent " + "pointer (%llu, expected %llu)\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), + (unsigned long long)le64_to_cpu(di->i_blkno)); + else if (le16_to_cpu(gd->bg_bits) > max_bits) + mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits)); + else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) + mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " + "claims that %u are free\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + le16_to_cpu(gd->bg_free_bits_count)); + else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) + mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " + "max bitmap bits of %u\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + 8 * le16_to_cpu(gd->bg_size)); + else if (le16_to_cpu(gd->bg_chain) != input->chain) + mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u " + "while input has %u set.\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_chain), input->chain); + else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc) + mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " + "input has %u clusters set\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), input->clusters); + else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc) + mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u " + "but it should have %u set\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + input->frees * cl_bpc); + else + ret = 0; + + return ret; +} + +static int ocfs2_verify_group_and_input(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_new_group_input *input, + struct buffer_head *group_bh) +{ + u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count); + u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); + u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec); + u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group); + u32 total_clusters = le32_to_cpu(di->i_clusters); + int ret = -EINVAL; + + if (cluster < total_clusters) + mlog(ML_ERROR, "add a group which is in the current volume.\n"); + else if (input->chain >= cl_count) + mlog(ML_ERROR, "input chain exceeds the limit.\n"); + else if (next_free != cl_count && next_free != input->chain) + mlog(ML_ERROR, + "the add group should be in chain %u\n", next_free); + else if (total_clusters + input->clusters < total_clusters) + mlog(ML_ERROR, "add group's clusters overflow.\n"); + else if (input->clusters > cl_cpg) + mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n"); + else if (input->frees > input->clusters) + mlog(ML_ERROR, "the free cluster exceeds the total clusters\n"); + else if (total_clusters % cl_cpg != 0) + mlog(ML_ERROR, + "the last group isn't full. Use group extend first.\n"); + else if (input->group != ocfs2_which_cluster_group(inode, cluster)) + mlog(ML_ERROR, "group blkno is invalid\n"); + else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh))) + mlog(ML_ERROR, "group descriptor check failed.\n"); + else + ret = 0; + + return ret; +} + +/* Add a new group descriptor to global_bitmap. */ +int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) +{ + int ret; + handle_t *handle; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *group = NULL; + struct ocfs2_chain_list *cl; + struct ocfs2_chain_rec *cr; + u16 cl_bpc; + + mlog_entry_void(); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + fe = (struct ocfs2_dinode *)main_bm_bh->b_data; + + if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != + ocfs2_group_bitmap_size(osb->sb) * 8) { + mlog(ML_ERROR, "The disk is too old and small." + " Force to do offline resize."); + ret = -EINVAL; + goto out_unlock; + } + + ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL); + if (ret < 0) { + mlog(ML_ERROR, "Can't read the group descriptor # %llu " + "from the device.", (unsigned long long)input->group); + goto out_unlock; + } + + ocfs2_set_new_buffer_uptodate(inode, group_bh); + + ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + mlog(0, "Add a new group %llu in chain = %u, length = %u\n", + (unsigned long long)input->group, input->chain, input->clusters); + + handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + ret = -EINVAL; + goto out_unlock; + } + + cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); + cl = &fe->id2.i_chain; + cr = &cl->cl_recs[input->chain]; + + ret = ocfs2_journal_access(handle, main_bm_inode, group_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + group = (struct ocfs2_group_desc *)group_bh->b_data; + group->bg_next_group = cr->c_blkno; + + ret = ocfs2_journal_dirty(handle, group_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) { + le16_add_cpu(&cl->cl_next_free_rec, 1); + memset(cr, 0, sizeof(struct ocfs2_chain_rec)); + } + + cr->c_blkno = le64_to_cpu(input->group); + le32_add_cpu(&cr->c_total, input->clusters * cl_bpc); + le32_add_cpu(&cr->c_free, input->frees * cl_bpc); + + le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc); + le32_add_cpu(&fe->id1.bitmap1.i_used, + (input->clusters - input->frees) * cl_bpc); + le32_add_cpu(&fe->i_clusters, input->clusters); + + ocfs2_journal_dirty(handle, main_bm_bh); + + spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); + OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits); + spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); + i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); + + ocfs2_update_super_and_backups(main_bm_inode, input->clusters); + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + brelse(group_bh); + brelse(main_bm_bh); + + ocfs2_inode_unlock(main_bm_inode, 1); + +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); + +out: + mlog_exit_void(); + return ret; +} diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h new file mode 100644 index 0000000..f38841a --- /dev/null +++ b/fs/ocfs2/resize.h @@ -0,0 +1,32 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * resize.h + * + * Function prototypes + * + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_RESIZE_H +#define OCFS2_RESIZE_H + +int ocfs2_group_extend(struct inode * inode, int new_clusters); +int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input); + +#endif /* OCFS2_RESIZE_H */ diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index af4882b..3a50ce5 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -48,25 +48,6 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, s16 slot_num, s16 node_num); -/* Use the slot information we've collected to create a map of mounted - * nodes. Should be holding an EX on super block. assumes slot info is - * up to date. Note that we call this *after* we find a slot, so our - * own node should be set in the map too... */ -void ocfs2_populate_mounted_map(struct ocfs2_super *osb) -{ - int i; - struct ocfs2_slot_info *si = osb->slot_info; - - spin_lock(&si->si_lock); - - for (i = 0; i < si->si_size; i++) - if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT) - ocfs2_node_map_set_bit(osb, &osb->mounted_map, - si->si_global_node_nums[i]); - - spin_unlock(&si->si_lock); -} - /* post the slot information on disk into our slot_info struct. */ void ocfs2_update_slot_info(struct ocfs2_slot_info *si) { diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h index d8c8cee..1025872 100644 --- a/fs/ocfs2/slot_map.h +++ b/fs/ocfs2/slot_map.h @@ -52,8 +52,6 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, void ocfs2_clear_slot(struct ocfs2_slot_info *si, s16 slot_num); -void ocfs2_populate_mounted_map(struct ocfs2_super *osb); - static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, int slot_num) { diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8f09f52..7e397e2 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -101,8 +101,6 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, u64 bg_blkno, u16 bg_bit_off); -static inline u64 ocfs2_which_cluster_group(struct inode *inode, - u32 cluster); static inline void ocfs2_block_to_cluster_group(struct inode *inode, u64 data_blkno, u64 *bg_blkno, @@ -114,7 +112,7 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) if (inode) { if (ac->ac_which != OCFS2_AC_USE_LOCAL) - ocfs2_meta_unlock(inode, 1); + ocfs2_inode_unlock(inode, 1); mutex_unlock(&inode->i_mutex); @@ -131,9 +129,9 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) } /* somewhat more expensive than our other checks, so use sparingly. */ -static int ocfs2_check_group_descriptor(struct super_block *sb, - struct ocfs2_dinode *di, - struct ocfs2_group_desc *gd) +int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct ocfs2_group_desc *gd) { unsigned int max_bits; @@ -412,7 +410,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, mutex_lock(&alloc_inode->i_mutex); - status = ocfs2_meta_lock(alloc_inode, &bh, 1); + status = ocfs2_inode_lock(alloc_inode, &bh, 1); if (status < 0) { mutex_unlock(&alloc_inode->i_mutex); iput(alloc_inode); @@ -1443,8 +1441,7 @@ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, /* given a cluster offset, calculate which block group it belongs to * and return that block offset. */ -static inline u64 ocfs2_which_cluster_group(struct inode *inode, - u32 cluster) +u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u32 group_no; @@ -1519,8 +1516,9 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb, if (min_clusters > (osb->bitmap_cpg - 1)) { /* The only paths asking for contiguousness * should know about this already. */ - mlog(ML_ERROR, "minimum allocation requested exceeds " - "group bitmap size!"); + mlog(ML_ERROR, "minimum allocation requested %u exceeds " + "group bitmap size %u!\n", min_clusters, + osb->bitmap_cpg); status = -ENOSPC; goto bail; } diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index cafe937..8799033 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -147,4 +147,12 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode) int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac); +/* given a cluster offset, calculate which block group it belongs to + * and return that block offset. */ +u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); + +/* somewhat more expensive than our other checks, so use sparingly. */ +int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct ocfs2_group_desc *gd); #endif /* _CHAINALLOC_H_ */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 5ee7754..01fe40e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -65,7 +65,6 @@ #include "sysfile.h" #include "uptodate.h" #include "ver.h" -#include "vote.h" #include "buffer_head_io.h" @@ -84,9 +83,11 @@ MODULE_LICENSE("GPL"); struct mount_options { + unsigned long commit_interval; unsigned long mount_opt; unsigned int atime_quantum; signed short slot; + unsigned int localalloc_opt; }; static int ocfs2_parse_options(struct super_block *sb, char *options, @@ -150,6 +151,9 @@ enum { Opt_data_writeback, Opt_atime_quantum, Opt_slot, + Opt_commit, + Opt_localalloc, + Opt_localflocks, Opt_err, }; @@ -165,6 +169,9 @@ static match_table_t tokens = { {Opt_data_writeback, "data=writeback"}, {Opt_atime_quantum, "atime_quantum=%u"}, {Opt_slot, "preferred_slot=%u"}, + {Opt_commit, "commit=%u"}, + {Opt_localalloc, "localalloc=%d"}, + {Opt_localflocks, "localflocks"}, {Opt_err, NULL} }; @@ -213,7 +220,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) mlog_entry_void(); - new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE); + new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0); if (IS_ERR(new)) { status = PTR_ERR(new); mlog_errno(status); @@ -221,7 +228,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) } osb->root_inode = new; - new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE); + new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0); if (IS_ERR(new)) { status = PTR_ERR(new); mlog_errno(status); @@ -443,6 +450,8 @@ unlock_osb: osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; + if (parsed_options.commit_interval) + osb->osb_commit_interval = parsed_options.commit_interval; if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); @@ -597,6 +606,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; + osb->osb_commit_interval = parsed_options.commit_interval; + osb->local_alloc_size = parsed_options.localalloc_opt; sb->s_magic = OCFS2_SUPER_MAGIC; @@ -747,9 +758,11 @@ static int ocfs2_parse_options(struct super_block *sb, mlog_entry("remount: %d, options: \"%s\"\n", is_remount, options ? options : "(none)"); + mopt->commit_interval = 0; mopt->mount_opt = 0; mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; mopt->slot = OCFS2_INVALID_SLOT; + mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; if (!options) { status = 1; @@ -816,6 +829,41 @@ static int ocfs2_parse_options(struct super_block *sb, if (option) mopt->slot = (s16)option; break; + case Opt_commit: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option < 0) + return 0; + if (option == 0) + option = JBD_DEFAULT_MAX_COMMIT_AGE; + mopt->commit_interval = HZ * option; + break; + case Opt_localalloc: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8)) + mopt->localalloc_opt = option; + break; + case Opt_localflocks: + /* + * Changing this during remount could race + * flock() requests, or "unbalance" existing + * ones (e.g., a lock is taken in one mode but + * dropped in the other). If users care enough + * to flip locking modes during remount, we + * could add a "local" flag to individual + * flock structures for proper tracking of + * state. + */ + if (!is_remount) + mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -864,6 +912,16 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); + if (osb->osb_commit_interval) + seq_printf(s, ",commit=%u", + (unsigned) (osb->osb_commit_interval / HZ)); + + if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) + seq_printf(s, ",localalloc=%d", osb->local_alloc_size); + + if (opts & OCFS2_MOUNT_LOCALFLOCKS) + seq_printf(s, ",localflocks,"); + return 0; } @@ -965,7 +1023,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) goto bail; } - status = ocfs2_meta_lock(inode, &bh, 0); + status = ocfs2_inode_lock(inode, &bh, 0); if (status < 0) { mlog_errno(status); goto bail; @@ -989,7 +1047,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) brelse(bh); - ocfs2_meta_unlock(inode, 0); + ocfs2_inode_unlock(inode, 0); status = 0; bail: if (inode) @@ -1020,8 +1078,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data) oi->ip_clusters = 0; ocfs2_lock_res_init_once(&oi->ip_rw_lockres); - ocfs2_lock_res_init_once(&oi->ip_meta_lockres); - ocfs2_lock_res_init_once(&oi->ip_data_lockres); + ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); ocfs2_metadata_cache_init(&oi->vfs_inode); @@ -1117,25 +1174,12 @@ static int ocfs2_mount_volume(struct super_block *sb) goto leave; } - status = ocfs2_register_hb_callbacks(osb); - if (status < 0) { - mlog_errno(status); - goto leave; - } - status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); goto leave; } - /* requires vote_thread to be running. */ - status = ocfs2_register_net_handlers(osb); - if (status < 0) { - mlog_errno(status); - goto leave; - } - status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); @@ -1150,8 +1194,6 @@ static int ocfs2_mount_volume(struct super_block *sb) goto leave; } - ocfs2_populate_mounted_map(osb); - /* load all node-local system inodes */ status = ocfs2_init_local_system_inodes(osb); if (status < 0) { @@ -1174,15 +1216,6 @@ static int ocfs2_mount_volume(struct super_block *sb) if (ocfs2_mount_local(osb)) goto leave; - /* This should be sent *after* we recovered our journal as it - * will cause other nodes to unmark us as needing - * recovery. However, we need to send it *before* dropping the - * super block lock as otherwise their recovery threads might - * try to clean us up while we're live! */ - status = ocfs2_request_mount_vote(osb); - if (status < 0) - mlog_errno(status); - leave: if (unlock_super) ocfs2_super_unlock(osb, 1); @@ -1240,10 +1273,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) mlog_errno(tmp); return; } - - tmp = ocfs2_request_umount_vote(osb); - if (tmp < 0) - mlog_errno(tmp); } if (osb->slot_num != OCFS2_INVALID_SLOT) @@ -1254,13 +1283,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_release_system_inodes(osb); - if (osb->dlm) { - ocfs2_unregister_net_handlers(osb); - + if (osb->dlm) ocfs2_dlm_shutdown(osb); - } - - ocfs2_clear_hb_callbacks(osb); debugfs_remove(osb->osb_debug_root); @@ -1315,7 +1339,6 @@ static int ocfs2_initialize_super(struct super_block *sb, int i, cbits, bbits; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; struct inode *inode = NULL; - struct buffer_head *bitmap_bh = NULL; struct ocfs2_journal *journal; __le32 uuid_net_key; struct ocfs2_super *osb; @@ -1344,19 +1367,13 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->s_sectsize_bits = blksize_bits(sector_size); BUG_ON(!osb->s_sectsize_bits); - osb->net_response_ids = 0; - spin_lock_init(&osb->net_response_lock); - INIT_LIST_HEAD(&osb->net_response_list); - - INIT_LIST_HEAD(&osb->osb_net_handlers); init_waitqueue_head(&osb->recovery_event); - spin_lock_init(&osb->vote_task_lock); - init_waitqueue_head(&osb->vote_event); - osb->vote_work_sequence = 0; - osb->vote_wake_sequence = 0; + spin_lock_init(&osb->dc_task_lock); + init_waitqueue_head(&osb->dc_event); + osb->dc_work_sequence = 0; + osb->dc_wake_sequence = 0; INIT_LIST_HEAD(&osb->blocked_lock_list); osb->blocked_lock_count = 0; - INIT_LIST_HEAD(&osb->vote_list); spin_lock_init(&osb->osb_lock); atomic_set(&osb->alloc_stats.moves, 0); @@ -1496,7 +1513,6 @@ static int ocfs2_initialize_super(struct super_block *sb, } memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); - osb->net_key = le32_to_cpu(uuid_net_key); strncpy(osb->vol_label, di->id2.i_super.s_label, 63); osb->vol_label[63] = '\0'; @@ -1539,25 +1555,9 @@ static int ocfs2_initialize_super(struct super_block *sb, } osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; - - /* We don't have a cluster lock on the bitmap here because - * we're only interested in static information and the extra - * complexity at mount time isn't worht it. Don't pass the - * inode in to the read function though as we don't want it to - * be put in the cache. */ - status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0, - NULL); iput(inode); - if (status < 0) { - mlog_errno(status); - goto bail; - } - di = (struct ocfs2_dinode *) bitmap_bh->b_data; - osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); - brelse(bitmap_bh); - mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n", - (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg); + osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8; status = ocfs2_init_slot_info(osb); if (status < 0) { diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index fd2e846..ab713eb 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -112,7 +112,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, goto bail; } - inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE); + inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type); if (IS_ERR(inode)) { mlog_errno(PTR_ERR(inode)); inode = NULL; diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c index 5405ce1..e2488f4 100644 --- a/fs/ocfs2/ver.c +++ b/fs/ocfs2/ver.c @@ -29,7 +29,7 @@ #include "ver.h" -#define OCFS2_BUILD_VERSION "1.3.3" +#define OCFS2_BUILD_VERSION "1.5.0" #define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c deleted file mode 100644 index c053585..0000000 --- a/fs/ocfs2/vote.c +++ /dev/null @@ -1,756 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * vote.c - * - * description here - * - * Copyright (C) 2003, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/highmem.h> -#include <linux/kthread.h> - -#include <cluster/heartbeat.h> -#include <cluster/nodemanager.h> -#include <cluster/tcp.h> - -#include <dlm/dlmapi.h> - -#define MLOG_MASK_PREFIX ML_VOTE -#include <cluster/masklog.h> - -#include "ocfs2.h" - -#include "alloc.h" -#include "dlmglue.h" -#include "extent_map.h" -#include "heartbeat.h" -#include "inode.h" -#include "journal.h" -#include "slot_map.h" -#include "vote.h" - -#include "buffer_head_io.h" - -#define OCFS2_MESSAGE_TYPE_VOTE (0x1) -#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2) -struct ocfs2_msg_hdr -{ - __be32 h_response_id; /* used to lookup message handle on sending - * node. */ - __be32 h_request; - __be64 h_blkno; - __be32 h_generation; - __be32 h_node_num; /* node sending this particular message. */ -}; - -struct ocfs2_vote_msg -{ - struct ocfs2_msg_hdr v_hdr; - __be32 v_reserved1; -} __attribute__ ((packed)); - -/* Responses are given these values to maintain backwards - * compatibility with older ocfs2 versions */ -#define OCFS2_RESPONSE_OK (0) -#define OCFS2_RESPONSE_BUSY (-16) -#define OCFS2_RESPONSE_BAD_MSG (-22) - -struct ocfs2_response_msg -{ - struct ocfs2_msg_hdr r_hdr; - __be32 r_response; -} __attribute__ ((packed)); - -struct ocfs2_vote_work { - struct list_head w_list; - struct ocfs2_vote_msg w_msg; -}; - -enum ocfs2_vote_request { - OCFS2_VOTE_REQ_INVALID = 0, - OCFS2_VOTE_REQ_MOUNT, - OCFS2_VOTE_REQ_UMOUNT, - OCFS2_VOTE_REQ_LAST -}; - -static inline int ocfs2_is_valid_vote_request(int request) -{ - return OCFS2_VOTE_REQ_INVALID < request && - request < OCFS2_VOTE_REQ_LAST; -} - -typedef void (*ocfs2_net_response_callback)(void *priv, - struct ocfs2_response_msg *resp); -struct ocfs2_net_response_cb { - ocfs2_net_response_callback rc_cb; - void *rc_priv; -}; - -struct ocfs2_net_wait_ctxt { - struct list_head n_list; - u32 n_response_id; - wait_queue_head_t n_event; - struct ocfs2_node_map n_node_map; - int n_response; /* an agreggate response. 0 if - * all nodes are go, < 0 on any - * negative response from any - * node or network error. */ - struct ocfs2_net_response_cb *n_callback; -}; - -static void ocfs2_process_mount_request(struct ocfs2_super *osb, - unsigned int node_num) -{ - mlog(0, "MOUNT vote from node %u\n", node_num); - /* The other node only sends us this message when he has an EX - * on the superblock, so our recovery threads (if having been - * launched) are waiting on it.*/ - ocfs2_recovery_map_clear(osb, node_num); - ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num); - - /* We clear the umount map here because a node may have been - * previously mounted, safely unmounted but never stopped - * heartbeating - in which case we'd have a stale entry. */ - ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); -} - -static void ocfs2_process_umount_request(struct ocfs2_super *osb, - unsigned int node_num) -{ - mlog(0, "UMOUNT vote from node %u\n", node_num); - ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num); - ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); -} - -static void ocfs2_process_vote(struct ocfs2_super *osb, - struct ocfs2_vote_msg *msg) -{ - int net_status, vote_response; - unsigned int node_num; - u64 blkno; - enum ocfs2_vote_request request; - struct ocfs2_msg_hdr *hdr = &msg->v_hdr; - struct ocfs2_response_msg response; - - /* decode the network mumbo jumbo into local variables. */ - request = be32_to_cpu(hdr->h_request); - blkno = be64_to_cpu(hdr->h_blkno); - node_num = be32_to_cpu(hdr->h_node_num); - - mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n", - request, (unsigned long long)blkno, node_num); - - if (!ocfs2_is_valid_vote_request(request)) { - mlog(ML_ERROR, "Invalid vote request %d from node %u\n", - request, node_num); - vote_response = OCFS2_RESPONSE_BAD_MSG; - goto respond; - } - - vote_response = OCFS2_RESPONSE_OK; - - switch (request) { - case OCFS2_VOTE_REQ_UMOUNT: - ocfs2_process_umount_request(osb, node_num); - goto respond; - case OCFS2_VOTE_REQ_MOUNT: - ocfs2_process_mount_request(osb, node_num); - goto respond; - default: - /* avoids a gcc warning */ - break; - } - -respond: - /* Response struture is small so we just put it on the stack - * and stuff it inline. */ - memset(&response, 0, sizeof(struct ocfs2_response_msg)); - response.r_hdr.h_response_id = hdr->h_response_id; - response.r_hdr.h_blkno = hdr->h_blkno; - response.r_hdr.h_generation = hdr->h_generation; - response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); - response.r_response = cpu_to_be32(vote_response); - - net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, - osb->net_key, - &response, - sizeof(struct ocfs2_response_msg), - node_num, - NULL); - /* We still want to error print for ENOPROTOOPT here. The - * sending node shouldn't have unregistered his net handler - * without sending an unmount vote 1st */ - if (net_status < 0 - && net_status != -ETIMEDOUT - && net_status != -ENOTCONN) - mlog(ML_ERROR, "message to node %u fails with error %d!\n", - node_num, net_status); -} - -static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) -{ - unsigned long processed; - struct ocfs2_lock_res *lockres; - struct ocfs2_vote_work *work; - - mlog_entry_void(); - - spin_lock(&osb->vote_task_lock); - /* grab this early so we know to try again if a state change and - * wake happens part-way through our work */ - osb->vote_work_sequence = osb->vote_wake_sequence; - - processed = osb->blocked_lock_count; - while (processed) { - BUG_ON(list_empty(&osb->blocked_lock_list)); - - lockres = list_entry(osb->blocked_lock_list.next, - struct ocfs2_lock_res, l_blocked_list); - list_del_init(&lockres->l_blocked_list); - osb->blocked_lock_count--; - spin_unlock(&osb->vote_task_lock); - - BUG_ON(!processed); - processed--; - - ocfs2_process_blocked_lock(osb, lockres); - - spin_lock(&osb->vote_task_lock); - } - - while (osb->vote_count) { - BUG_ON(list_empty(&osb->vote_list)); - work = list_entry(osb->vote_list.next, - struct ocfs2_vote_work, w_list); - list_del(&work->w_list); - osb->vote_count--; - spin_unlock(&osb->vote_task_lock); - - ocfs2_process_vote(osb, &work->w_msg); - kfree(work); - - spin_lock(&osb->vote_task_lock); - } - spin_unlock(&osb->vote_task_lock); - - mlog_exit_void(); -} - -static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb) -{ - int empty = 0; - - spin_lock(&osb->vote_task_lock); - if (list_empty(&osb->blocked_lock_list) && - list_empty(&osb->vote_list)) - empty = 1; - - spin_unlock(&osb->vote_task_lock); - return empty; -} - -static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb) -{ - int should_wake = 0; - - spin_lock(&osb->vote_task_lock); - if (osb->vote_work_sequence != osb->vote_wake_sequence) - should_wake = 1; - spin_unlock(&osb->vote_task_lock); - - return should_wake; -} - -int ocfs2_vote_thread(void *arg) -{ - int status = 0; - struct ocfs2_super *osb = arg; - - /* only quit once we've been asked to stop and there is no more - * work available */ - while (!(kthread_should_stop() && - ocfs2_vote_thread_lists_empty(osb))) { - - wait_event_interruptible(osb->vote_event, - ocfs2_vote_thread_should_wake(osb) || - kthread_should_stop()); - - mlog(0, "vote_thread: awoken\n"); - - ocfs2_vote_thread_do_work(osb); - } - - osb->vote_task = NULL; - return status; -} - -static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id) -{ - struct ocfs2_net_wait_ctxt *w; - - w = kzalloc(sizeof(*w), GFP_NOFS); - if (!w) { - mlog_errno(-ENOMEM); - goto bail; - } - - INIT_LIST_HEAD(&w->n_list); - init_waitqueue_head(&w->n_event); - ocfs2_node_map_init(&w->n_node_map); - w->n_response_id = response_id; - w->n_callback = NULL; -bail: - return w; -} - -static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb) -{ - unsigned int ret; - - spin_lock(&osb->net_response_lock); - ret = ++osb->net_response_ids; - spin_unlock(&osb->net_response_lock); - - return ret; -} - -static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb, - struct ocfs2_net_wait_ctxt *w) -{ - spin_lock(&osb->net_response_lock); - list_del(&w->n_list); - spin_unlock(&osb->net_response_lock); -} - -static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb, - struct ocfs2_net_wait_ctxt *w) -{ - spin_lock(&osb->net_response_lock); - list_add_tail(&w->n_list, - &osb->net_response_list); - spin_unlock(&osb->net_response_lock); -} - -static void __ocfs2_mark_node_responded(struct ocfs2_super *osb, - struct ocfs2_net_wait_ctxt *w, - int node_num) -{ - assert_spin_locked(&osb->net_response_lock); - - ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num); - if (ocfs2_node_map_is_empty(osb, &w->n_node_map)) - wake_up(&w->n_event); -} - -/* Intended to be called from the node down callback, we fake remove - * the node from all our response contexts */ -void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, - int node_num) -{ - struct list_head *p; - struct ocfs2_net_wait_ctxt *w = NULL; - - spin_lock(&osb->net_response_lock); - - list_for_each(p, &osb->net_response_list) { - w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list); - - __ocfs2_mark_node_responded(osb, w, node_num); - } - - spin_unlock(&osb->net_response_lock); -} - -static int ocfs2_broadcast_vote(struct ocfs2_super *osb, - struct ocfs2_vote_msg *request, - unsigned int response_id, - int *response, - struct ocfs2_net_response_cb *callback) -{ - int status, i, remote_err; - struct ocfs2_net_wait_ctxt *w = NULL; - int dequeued = 0; - - mlog_entry_void(); - - w = ocfs2_new_net_wait_ctxt(response_id); - if (!w) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - w->n_callback = callback; - - /* we're pretty much ready to go at this point, and this fills - * in n_response which we need anyway... */ - ocfs2_queue_net_wait_ctxt(osb, w); - - i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0); - - while (i != O2NM_INVALID_NODE_NUM) { - if (i != osb->node_num) { - mlog(0, "trying to send request to node %i\n", i); - ocfs2_node_map_set_bit(osb, &w->n_node_map, i); - - remote_err = 0; - status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE, - osb->net_key, - request, - sizeof(*request), - i, - &remote_err); - if (status == -ETIMEDOUT) { - mlog(0, "remote node %d timed out!\n", i); - status = -EAGAIN; - goto bail; - } - if (remote_err < 0) { - status = remote_err; - mlog(0, "remote error %d on node %d!\n", - remote_err, i); - mlog_errno(status); - goto bail; - } - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - i++; - i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i); - mlog(0, "next is %d, i am %d\n", i, osb->node_num); - } - mlog(0, "done sending, now waiting on responses...\n"); - - wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map)); - - ocfs2_dequeue_net_wait_ctxt(osb, w); - dequeued = 1; - - *response = w->n_response; - status = 0; -bail: - if (w) { - if (!dequeued) - ocfs2_dequeue_net_wait_ctxt(osb, w); - kfree(w); - } - - mlog_exit(status); - return status; -} - -static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, - u64 blkno, - unsigned int generation, - enum ocfs2_vote_request type) -{ - struct ocfs2_vote_msg *request; - struct ocfs2_msg_hdr *hdr; - - BUG_ON(!ocfs2_is_valid_vote_request(type)); - - request = kzalloc(sizeof(*request), GFP_NOFS); - if (!request) { - mlog_errno(-ENOMEM); - } else { - hdr = &request->v_hdr; - hdr->h_node_num = cpu_to_be32(osb->node_num); - hdr->h_request = cpu_to_be32(type); - hdr->h_blkno = cpu_to_be64(blkno); - hdr->h_generation = cpu_to_be32(generation); - } - - return request; -} - -/* Complete the buildup of a new vote request and process the - * broadcast return value. */ -static int ocfs2_do_request_vote(struct ocfs2_super *osb, - struct ocfs2_vote_msg *request, - struct ocfs2_net_response_cb *callback) -{ - int status, response = -EBUSY; - unsigned int response_id; - struct ocfs2_msg_hdr *hdr; - - response_id = ocfs2_new_response_id(osb); - - hdr = &request->v_hdr; - hdr->h_response_id = cpu_to_be32(response_id); - - status = ocfs2_broadcast_vote(osb, request, response_id, &response, - callback); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - status = response; -bail: - - return status; -} - -int ocfs2_request_mount_vote(struct ocfs2_super *osb) -{ - int status; - struct ocfs2_vote_msg *request = NULL; - - request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT); - if (!request) { - status = -ENOMEM; - goto bail; - } - - status = -EAGAIN; - while (status == -EAGAIN) { - if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && - signal_pending(current)) { - status = -ERESTARTSYS; - goto bail; - } - - if (ocfs2_node_map_is_only(osb, &osb->mounted_map, - osb->node_num)) { - status = 0; - goto bail; - } - - status = ocfs2_do_request_vote(osb, request, NULL); - } - -bail: - kfree(request); - return status; -} - -int ocfs2_request_umount_vote(struct ocfs2_super *osb) -{ - int status; - struct ocfs2_vote_msg *request = NULL; - - request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT); - if (!request) { - status = -ENOMEM; - goto bail; - } - - status = -EAGAIN; - while (status == -EAGAIN) { - /* Do not check signals on this vote... We really want - * this one to go all the way through. */ - - if (ocfs2_node_map_is_only(osb, &osb->mounted_map, - osb->node_num)) { - status = 0; - goto bail; - } - - status = ocfs2_do_request_vote(osb, request, NULL); - } - -bail: - kfree(request); - return status; -} - -/* TODO: This should eventually be a hash table! */ -static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb, - u32 response_id) -{ - struct list_head *p; - struct ocfs2_net_wait_ctxt *w = NULL; - - list_for_each(p, &osb->net_response_list) { - w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list); - if (response_id == w->n_response_id) - break; - w = NULL; - } - - return w; -} - -/* Translate response codes into local node errno values */ -static inline int ocfs2_translate_response(int response) -{ - int ret; - - switch (response) { - case OCFS2_RESPONSE_OK: - ret = 0; - break; - - case OCFS2_RESPONSE_BUSY: - ret = -EBUSY; - break; - - default: - ret = -EINVAL; - } - - return ret; -} - -static int ocfs2_handle_response_message(struct o2net_msg *msg, - u32 len, - void *data, void **ret_data) -{ - unsigned int response_id, node_num; - int response_status; - struct ocfs2_super *osb = data; - struct ocfs2_response_msg *resp; - struct ocfs2_net_wait_ctxt * w; - struct ocfs2_net_response_cb *resp_cb; - - resp = (struct ocfs2_response_msg *) msg->buf; - - response_id = be32_to_cpu(resp->r_hdr.h_response_id); - node_num = be32_to_cpu(resp->r_hdr.h_node_num); - response_status = - ocfs2_translate_response(be32_to_cpu(resp->r_response)); - - mlog(0, "received response message:\n"); - mlog(0, "h_response_id = %u\n", response_id); - mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request)); - mlog(0, "h_blkno = %llu\n", - (unsigned long long)be64_to_cpu(resp->r_hdr.h_blkno)); - mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation)); - mlog(0, "h_node_num = %u\n", node_num); - mlog(0, "r_response = %d\n", response_status); - - spin_lock(&osb->net_response_lock); - w = __ocfs2_find_net_wait_ctxt(osb, response_id); - if (!w) { - mlog(0, "request not found!\n"); - goto bail; - } - resp_cb = w->n_callback; - - if (response_status && (!w->n_response)) { - /* we only really need one negative response so don't - * set it twice. */ - w->n_response = response_status; - } - - if (resp_cb) { - spin_unlock(&osb->net_response_lock); - - resp_cb->rc_cb(resp_cb->rc_priv, resp); - - spin_lock(&osb->net_response_lock); - } - - __ocfs2_mark_node_responded(osb, w, node_num); -bail: - spin_unlock(&osb->net_response_lock); - - return 0; -} - -static int ocfs2_handle_vote_message(struct o2net_msg *msg, - u32 len, - void *data, void **ret_data) -{ - int status; - struct ocfs2_super *osb = data; - struct ocfs2_vote_work *work; - - work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS); - if (!work) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - INIT_LIST_HEAD(&work->w_list); - memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg)); - - mlog(0, "scheduling vote request:\n"); - mlog(0, "h_response_id = %u\n", - be32_to_cpu(work->w_msg.v_hdr.h_response_id)); - mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request)); - mlog(0, "h_blkno = %llu\n", - (unsigned long long)be64_to_cpu(work->w_msg.v_hdr.h_blkno)); - mlog(0, "h_generation = %u\n", - be32_to_cpu(work->w_msg.v_hdr.h_generation)); - mlog(0, "h_node_num = %u\n", - be32_to_cpu(work->w_msg.v_hdr.h_node_num)); - - spin_lock(&osb->vote_task_lock); - list_add_tail(&work->w_list, &osb->vote_list); - osb->vote_count++; - spin_unlock(&osb->vote_task_lock); - - ocfs2_kick_vote_thread(osb); - - status = 0; -bail: - return status; -} - -void ocfs2_unregister_net_handlers(struct ocfs2_super *osb) -{ - if (!osb->net_key) - return; - - o2net_unregister_handler_list(&osb->osb_net_handlers); - - if (!list_empty(&osb->net_response_list)) - mlog(ML_ERROR, "net response list not empty!\n"); - - osb->net_key = 0; -} - -int ocfs2_register_net_handlers(struct ocfs2_super *osb) -{ - int status = 0; - - if (ocfs2_mount_local(osb)) - return 0; - - status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE, - osb->net_key, - sizeof(struct ocfs2_response_msg), - ocfs2_handle_response_message, - osb, NULL, &osb->osb_net_handlers); - if (status) { - mlog_errno(status); - goto bail; - } - - status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE, - osb->net_key, - sizeof(struct ocfs2_vote_msg), - ocfs2_handle_vote_message, - osb, NULL, &osb->osb_net_handlers); - if (status) { - mlog_errno(status); - goto bail; - } -bail: - if (status < 0) - ocfs2_unregister_net_handlers(osb); - - return status; -} diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index d88173840..6b7ff16 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -131,7 +131,7 @@ static void property_stop(struct seq_file *f, void *v) /* Nothing to do */ } -static struct seq_operations property_op = { +static const struct seq_operations property_op = { .start = property_start, .next = property_next, .stop = property_stop, diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 722e12e..739da70 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -195,96 +195,45 @@ check_partition(struct gendisk *hd, struct block_device *bdev) return ERR_PTR(res); } -/* - * sysfs bindings for partitions - */ - -struct part_attribute { - struct attribute attr; - ssize_t (*show)(struct hd_struct *,char *); - ssize_t (*store)(struct hd_struct *,const char *, size_t); -}; - -static ssize_t -part_attr_show(struct kobject * kobj, struct attribute * attr, char * page) +static ssize_t part_start_show(struct device *dev, + struct device_attribute *attr, char *buf) { - struct hd_struct * p = container_of(kobj,struct hd_struct,kobj); - struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr); - ssize_t ret = 0; - if (part_attr->show) - ret = part_attr->show(p, page); - return ret; -} -static ssize_t -part_attr_store(struct kobject * kobj, struct attribute * attr, - const char *page, size_t count) -{ - struct hd_struct * p = container_of(kobj,struct hd_struct,kobj); - struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr); - ssize_t ret = 0; + struct hd_struct *p = dev_to_part(dev); - if (part_attr->store) - ret = part_attr->store(p, page, count); - return ret; + return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); } -static struct sysfs_ops part_sysfs_ops = { - .show = part_attr_show, - .store = part_attr_store, -}; - -static ssize_t part_uevent_store(struct hd_struct * p, - const char *page, size_t count) +static ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) { - kobject_uevent(&p->kobj, KOBJ_ADD); - return count; + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } -static ssize_t part_dev_read(struct hd_struct * p, char *page) -{ - struct gendisk *disk = container_of(p->kobj.parent,struct gendisk,kobj); - dev_t dev = MKDEV(disk->major, disk->first_minor + p->partno); - return print_dev_t(page, dev); -} -static ssize_t part_start_read(struct hd_struct * p, char *page) -{ - return sprintf(page, "%llu\n",(unsigned long long)p->start_sect); -} -static ssize_t part_size_read(struct hd_struct * p, char *page) -{ - return sprintf(page, "%llu\n",(unsigned long long)p->nr_sects); -} -static ssize_t part_stat_read(struct hd_struct * p, char *page) + +static ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) { - return sprintf(page, "%8u %8llu %8u %8llu\n", + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%8u %8llu %8u %8llu\n", p->ios[0], (unsigned long long)p->sectors[0], p->ios[1], (unsigned long long)p->sectors[1]); } -static struct part_attribute part_attr_uevent = { - .attr = {.name = "uevent", .mode = S_IWUSR }, - .store = part_uevent_store -}; -static struct part_attribute part_attr_dev = { - .attr = {.name = "dev", .mode = S_IRUGO }, - .show = part_dev_read -}; -static struct part_attribute part_attr_start = { - .attr = {.name = "start", .mode = S_IRUGO }, - .show = part_start_read -}; -static struct part_attribute part_attr_size = { - .attr = {.name = "size", .mode = S_IRUGO }, - .show = part_size_read -}; -static struct part_attribute part_attr_stat = { - .attr = {.name = "stat", .mode = S_IRUGO }, - .show = part_stat_read -}; #ifdef CONFIG_FAIL_MAKE_REQUEST +static ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); -static ssize_t part_fail_store(struct hd_struct * p, + return sprintf(buf, "%d\n", p->make_it_fail); +} + +static ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { + struct hd_struct *p = dev_to_part(dev); int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) @@ -292,50 +241,53 @@ static ssize_t part_fail_store(struct hd_struct * p, return count; } -static ssize_t part_fail_read(struct hd_struct * p, char *page) -{ - return sprintf(page, "%d\n", p->make_it_fail); -} -static struct part_attribute part_attr_fail = { - .attr = {.name = "make-it-fail", .mode = S_IRUGO | S_IWUSR }, - .store = part_fail_store, - .show = part_fail_read -}; +#endif +static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); +static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +#ifdef CONFIG_FAIL_MAKE_REQUEST +static struct device_attribute dev_attr_fail = + __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); #endif -static struct attribute * default_attrs[] = { - &part_attr_uevent.attr, - &part_attr_dev.attr, - &part_attr_start.attr, - &part_attr_size.attr, - &part_attr_stat.attr, +static struct attribute *part_attrs[] = { + &dev_attr_start.attr, + &dev_attr_size.attr, + &dev_attr_stat.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST - &part_attr_fail.attr, + &dev_attr_fail.attr, #endif - NULL, + NULL }; -extern struct kset block_subsys; +static struct attribute_group part_attr_group = { + .attrs = part_attrs, +}; -static void part_release(struct kobject *kobj) +static struct attribute_group *part_attr_groups[] = { + &part_attr_group, + NULL +}; + +static void part_release(struct device *dev) { - struct hd_struct * p = container_of(kobj,struct hd_struct,kobj); + struct hd_struct *p = dev_to_part(dev); kfree(p); } -struct kobj_type ktype_part = { +struct device_type part_type = { + .name = "partition", + .groups = part_attr_groups, .release = part_release, - .default_attrs = default_attrs, - .sysfs_ops = &part_sysfs_ops, }; static inline void partition_sysfs_add_subdir(struct hd_struct *p) { struct kobject *k; - k = kobject_get(&p->kobj); - p->holder_dir = kobject_add_dir(k, "holders"); + k = kobject_get(&p->dev.kobj); + p->holder_dir = kobject_create_and_add("holders", k); kobject_put(k); } @@ -343,15 +295,16 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk) { struct kobject *k; - k = kobject_get(&disk->kobj); - disk->holder_dir = kobject_add_dir(k, "holders"); - disk->slave_dir = kobject_add_dir(k, "slaves"); + k = kobject_get(&disk->dev.kobj); + disk->holder_dir = kobject_create_and_add("holders", k); + disk->slave_dir = kobject_create_and_add("slaves", k); kobject_put(k); } void delete_partition(struct gendisk *disk, int part) { struct hd_struct *p = disk->part[part-1]; + if (!p) return; if (!p->nr_sects) @@ -361,113 +314,55 @@ void delete_partition(struct gendisk *disk, int part) p->nr_sects = 0; p->ios[0] = p->ios[1] = 0; p->sectors[0] = p->sectors[1] = 0; - sysfs_remove_link(&p->kobj, "subsystem"); - kobject_unregister(p->holder_dir); - kobject_uevent(&p->kobj, KOBJ_REMOVE); - kobject_del(&p->kobj); - kobject_put(&p->kobj); + kobject_put(p->holder_dir); + device_del(&p->dev); + put_device(&p->dev); } void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) { struct hd_struct *p; + int err; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return; - + p->start_sect = start; p->nr_sects = len; p->partno = part; p->policy = disk->policy; - if (isdigit(disk->kobj.k_name[strlen(disk->kobj.k_name)-1])) - kobject_set_name(&p->kobj, "%sp%d", - kobject_name(&disk->kobj), part); + if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1])) + snprintf(p->dev.bus_id, BUS_ID_SIZE, + "%sp%d", disk->dev.bus_id, part); else - kobject_set_name(&p->kobj, "%s%d", - kobject_name(&disk->kobj),part); - p->kobj.parent = &disk->kobj; - p->kobj.ktype = &ktype_part; - kobject_init(&p->kobj); - kobject_add(&p->kobj); - if (!disk->part_uevent_suppress) - kobject_uevent(&p->kobj, KOBJ_ADD); - sysfs_create_link(&p->kobj, &block_subsys.kobj, "subsystem"); + snprintf(p->dev.bus_id, BUS_ID_SIZE, + "%s%d", disk->dev.bus_id, part); + + device_initialize(&p->dev); + p->dev.devt = MKDEV(disk->major, disk->first_minor + part); + p->dev.class = &block_class; + p->dev.type = &part_type; + p->dev.parent = &disk->dev; + disk->part[part-1] = p; + + /* delay uevent until 'holders' subdir is created */ + p->dev.uevent_suppress = 1; + device_add(&p->dev); + partition_sysfs_add_subdir(p); + p->dev.uevent_suppress = 0; if (flags & ADDPART_FLAG_WHOLEDISK) { static struct attribute addpartattr = { .name = "whole_disk", .mode = S_IRUSR | S_IRGRP | S_IROTH, }; - - sysfs_create_file(&p->kobj, &addpartattr); + err = sysfs_create_file(&p->dev.kobj, &addpartattr); } - partition_sysfs_add_subdir(p); - disk->part[part-1] = p; -} -static char *make_block_name(struct gendisk *disk) -{ - char *name; - static char *block_str = "block:"; - int size; - char *s; - - size = strlen(block_str) + strlen(disk->disk_name) + 1; - name = kmalloc(size, GFP_KERNEL); - if (!name) - return NULL; - strcpy(name, block_str); - strcat(name, disk->disk_name); - /* ewww... some of these buggers have / in name... */ - s = strchr(name, '/'); - if (s) - *s = '!'; - return name; -} - -static int disk_sysfs_symlinks(struct gendisk *disk) -{ - struct device *target = get_device(disk->driverfs_dev); - int err; - char *disk_name = NULL; - - if (target) { - disk_name = make_block_name(disk); - if (!disk_name) { - err = -ENOMEM; - goto err_out; - } - - err = sysfs_create_link(&disk->kobj, &target->kobj, "device"); - if (err) - goto err_out_disk_name; - - err = sysfs_create_link(&target->kobj, &disk->kobj, disk_name); - if (err) - goto err_out_dev_link; - } - - err = sysfs_create_link(&disk->kobj, &block_subsys.kobj, - "subsystem"); - if (err) - goto err_out_disk_name_lnk; - - kfree(disk_name); - - return 0; - -err_out_disk_name_lnk: - if (target) { - sysfs_remove_link(&target->kobj, disk_name); -err_out_dev_link: - sysfs_remove_link(&disk->kobj, "device"); -err_out_disk_name: - kfree(disk_name); -err_out: - put_device(target); - } - return err; + /* suppress uevent if the disk supresses it */ + if (!disk->dev.uevent_suppress) + kobject_uevent(&p->dev.kobj, KOBJ_ADD); } /* Not exported, helper to add_disk(). */ @@ -479,19 +374,29 @@ void register_disk(struct gendisk *disk) struct hd_struct *p; int err; - kobject_set_name(&disk->kobj, "%s", disk->disk_name); - /* ewww... some of these buggers have / in name... */ - s = strchr(disk->kobj.k_name, '/'); + disk->dev.parent = disk->driverfs_dev; + disk->dev.devt = MKDEV(disk->major, disk->first_minor); + + strlcpy(disk->dev.bus_id, disk->disk_name, KOBJ_NAME_LEN); + /* ewww... some of these buggers have / in the name... */ + s = strchr(disk->dev.bus_id, '/'); if (s) *s = '!'; - if ((err = kobject_add(&disk->kobj))) + + /* delay uevents, until we scanned partition table */ + disk->dev.uevent_suppress = 1; + + if (device_add(&disk->dev)) return; - err = disk_sysfs_symlinks(disk); +#ifndef CONFIG_SYSFS_DEPRECATED + err = sysfs_create_link(block_depr, &disk->dev.kobj, + kobject_name(&disk->dev.kobj)); if (err) { - kobject_del(&disk->kobj); + device_del(&disk->dev); return; } - disk_sysfs_add_subdirs(disk); +#endif + disk_sysfs_add_subdirs(disk); /* No minors to use for partitions */ if (disk->minors == 1) @@ -505,25 +410,23 @@ void register_disk(struct gendisk *disk) if (!bdev) goto exit; - /* scan partition table, but suppress uevents */ bdev->bd_invalidated = 1; - disk->part_uevent_suppress = 1; err = blkdev_get(bdev, FMODE_READ, 0); - disk->part_uevent_suppress = 0; if (err < 0) goto exit; blkdev_put(bdev); exit: - /* announce disk after possible partitions are already created */ - kobject_uevent(&disk->kobj, KOBJ_ADD); + /* announce disk after possible partitions are created */ + disk->dev.uevent_suppress = 0; + kobject_uevent(&disk->dev.kobj, KOBJ_ADD); /* announce possible partitions */ for (i = 1; i < disk->minors; i++) { p = disk->part[i-1]; if (!p || !p->nr_sects) continue; - kobject_uevent(&p->kobj, KOBJ_ADD); + kobject_uevent(&p->dev.kobj, KOBJ_ADD); } } @@ -602,19 +505,11 @@ void del_gendisk(struct gendisk *disk) disk_stat_set_all(disk, 0); disk->stamp = 0; - kobject_uevent(&disk->kobj, KOBJ_REMOVE); - kobject_unregister(disk->holder_dir); - kobject_unregister(disk->slave_dir); - if (disk->driverfs_dev) { - char *disk_name = make_block_name(disk); - sysfs_remove_link(&disk->kobj, "device"); - if (disk_name) { - sysfs_remove_link(&disk->driverfs_dev->kobj, disk_name); - kfree(disk_name); - } - put_device(disk->driverfs_dev); - disk->driverfs_dev = NULL; - } - sysfs_remove_link(&disk->kobj, "subsystem"); - kobject_del(&disk->kobj); + kobject_put(disk->holder_dir); + kobject_put(disk->slave_dir); + disk->driverfs_dev = NULL; +#ifndef CONFIG_SYSFS_DEPRECATED + sysfs_remove_link(block_depr, disk->dev.bus_id); +#endif + device_del(&disk->dev); } diff --git a/fs/proc/array.c b/fs/proc/array.c index 65c62e1..eb97f28 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -169,7 +169,7 @@ static inline char *task_state(struct task_struct *p, char *buffer) ppid = pid_alive(p) ? task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; tpid = pid_alive(p) && p->ptrace ? - task_ppid_nr_ns(rcu_dereference(p->parent), ns) : 0; + task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0; buffer += sprintf(buffer, "State:\t%s\n" "Tgid:\t%d\n" @@ -464,8 +464,8 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) } sid = task_session_nr_ns(task, ns); + ppid = task_tgid_nr_ns(task->real_parent, ns); pgid = task_pgrp_nr_ns(task, ns); - ppid = task_ppid_nr_ns(task, ns); unlock_task_sighand(task, &flags); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 7411bfb..91fa8e6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -310,6 +310,77 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer) } #endif +#ifdef CONFIG_LATENCYTOP +static int lstats_show_proc(struct seq_file *m, void *v) +{ + int i; + struct task_struct *task = m->private; + seq_puts(m, "Latency Top version : v0.1\n"); + + for (i = 0; i < 32; i++) { + if (task->latency_record[i].backtrace[0]) { + int q; + seq_printf(m, "%i %li %li ", + task->latency_record[i].count, + task->latency_record[i].time, + task->latency_record[i].max); + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + char sym[KSYM_NAME_LEN]; + char *c; + if (!task->latency_record[i].backtrace[q]) + break; + if (task->latency_record[i].backtrace[q] == ULONG_MAX) + break; + sprint_symbol(sym, task->latency_record[i].backtrace[q]); + c = strchr(sym, '+'); + if (c) + *c = 0; + seq_printf(m, "%s ", sym); + } + seq_printf(m, "\n"); + } + + } + return 0; +} + +static int lstats_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + struct task_struct *task = get_proc_task(inode); + + ret = single_open(file, lstats_show_proc, NULL); + if (!ret) { + m = file->private_data; + m->private = task; + } + return ret; +} + +static ssize_t lstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + struct seq_file *m; + struct task_struct *task; + + m = file->private_data; + task = m->private; + clear_all_latency_tracing(task); + + return count; +} + +static const struct file_operations proc_lstats_operations = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + /* The badness from the OOM killer */ unsigned long badness(struct task_struct *p, unsigned long uptime); static int proc_oom_score(struct task_struct *task, char *buffer) @@ -1020,6 +1091,7 @@ static const struct file_operations proc_fault_inject_operations = { }; #endif + #ifdef CONFIG_SCHED_DEBUG /* * Print out various scheduling related per-task fields: @@ -2230,6 +2302,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, pid_schedstat), #endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, lstats), +#endif #ifdef CONFIG_PROC_PID_CPUSET REG("cpuset", S_IRUGO, cpuset), #endif @@ -2555,6 +2630,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, pid_schedstat), #endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, lstats), +#endif #ifdef CONFIG_PROC_PID_CPUSET REG("cpuset", S_IRUGO, cpuset), #endif diff --git a/fs/read_write.c b/fs/read_write.c index ea1f94c..c4d3d17 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -197,25 +197,27 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count { struct inode *inode; loff_t pos; + int retval = -EINVAL; inode = file->f_path.dentry->d_inode; if (unlikely((ssize_t) count < 0)) - goto Einval; + return retval; pos = *ppos; if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) - goto Einval; + return retval; if (unlikely(inode->i_flock && mandatory_lock(inode))) { - int retval = locks_mandatory_area( + retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); if (retval < 0) return retval; } + retval = security_file_permission(file, + read_write == READ ? MAY_READ : MAY_WRITE); + if (retval) + return retval; return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; - -Einval: - return -EINVAL; } static void wait_on_retry_sync_kiocb(struct kiocb *iocb) @@ -267,18 +269,15 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) ret = rw_verify_area(READ, file, pos, count); if (ret >= 0) { count = ret; - ret = security_file_permission (file, MAY_READ); - if (!ret) { - if (file->f_op->read) - ret = file->f_op->read(file, buf, count, pos); - else - ret = do_sync_read(file, buf, count, pos); - if (ret > 0) { - fsnotify_access(file->f_path.dentry); - add_rchar(current, ret); - } - inc_syscr(current); + if (file->f_op->read) + ret = file->f_op->read(file, buf, count, pos); + else + ret = do_sync_read(file, buf, count, pos); + if (ret > 0) { + fsnotify_access(file->f_path.dentry); + add_rchar(current, ret); } + inc_syscr(current); } return ret; @@ -325,18 +324,15 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ ret = rw_verify_area(WRITE, file, pos, count); if (ret >= 0) { count = ret; - ret = security_file_permission (file, MAY_WRITE); - if (!ret) { - if (file->f_op->write) - ret = file->f_op->write(file, buf, count, pos); - else - ret = do_sync_write(file, buf, count, pos); - if (ret > 0) { - fsnotify_modify(file->f_path.dentry); - add_wchar(current, ret); - } - inc_syscw(current); + if (file->f_op->write) + ret = file->f_op->write(file, buf, count, pos); + else + ret = do_sync_write(file, buf, count, pos); + if (ret > 0) { + fsnotify_modify(file->f_path.dentry); + add_wchar(current, ret); } + inc_syscw(current); } return ret; @@ -603,9 +599,6 @@ static ssize_t do_readv_writev(int type, struct file *file, ret = rw_verify_area(type, file, pos, tot_len); if (ret < 0) goto out; - ret = security_file_permission(file, type == READ ? MAY_READ : MAY_WRITE); - if (ret) - goto out; fnv = NULL; if (type == READ) { @@ -737,10 +730,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, goto fput_in; count = retval; - retval = security_file_permission (in_file, MAY_READ); - if (retval) - goto fput_in; - /* * Get output file, and verify that it is ok.. */ @@ -759,10 +748,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, goto fput_out; count = retval; - retval = security_file_permission (out_file, MAY_WRITE); - if (retval) - goto fput_out; - if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); diff --git a/fs/splice.c b/fs/splice.c index 6bdcb61..56b802b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -908,10 +908,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(ret < 0)) return ret; - ret = security_file_permission(out, MAY_WRITE); - if (unlikely(ret < 0)) - return ret; - return out->f_op->splice_write(pipe, out, ppos, len, flags); } @@ -934,10 +930,6 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; - ret = security_file_permission(in, MAY_READ); - if (unlikely(ret < 0)) - return ret; - return in->f_op->splice_read(in, ppos, pipe, len, flags); } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 3371629..4948d9b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -440,7 +440,7 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) /** * sysfs_remove_one - remove sysfs_dirent from parent * @acxt: addrm context to use - * @sd: sysfs_dirent to be added + * @sd: sysfs_dirent to be removed * * Mark @sd removed and drop nlink of parent inode if @sd is a * directory. @sd is unlinked from the children list. @@ -678,8 +678,10 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, sd = sysfs_find_dirent(parent_sd, dentry->d_name.name); /* no such entry */ - if (!sd) + if (!sd) { + ret = ERR_PTR(-ENOENT); goto out_unlock; + } /* attach dentry and inode */ inode = sysfs_get_inode(sd); @@ -781,6 +783,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) old_dentry = sysfs_get_dentry(sd); if (IS_ERR(old_dentry)) { error = PTR_ERR(old_dentry); + old_dentry = NULL; goto out; } @@ -848,6 +851,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) old_dentry = sysfs_get_dentry(sd); if (IS_ERR(old_dentry)) { error = PTR_ERR(old_dentry); + old_dentry = NULL; goto out; } old_parent = old_dentry->d_parent; @@ -855,6 +859,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) new_parent = sysfs_get_dentry(new_parent_sd); if (IS_ERR(new_parent)) { error = PTR_ERR(new_parent); + new_parent = NULL; goto out; } @@ -878,7 +883,6 @@ again: error = 0; d_add(new_dentry, NULL); d_move(old_dentry, new_dentry); - dput(new_dentry); /* Remove from old parent's list and insert into new parent's list. */ sysfs_unlink_sibling(sd); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index b834f17..a271c87 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -20,43 +20,6 @@ #include "sysfs.h" -#define to_sattr(a) container_of(a,struct subsys_attribute, attr) - -/* - * Subsystem file operations. - * These operations allow subsystems to have files that can be - * read/written. - */ -static ssize_t -subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page) -{ - struct kset *kset = to_kset(kobj); - struct subsys_attribute * sattr = to_sattr(attr); - ssize_t ret = -EIO; - - if (sattr->show) - ret = sattr->show(kset, page); - return ret; -} - -static ssize_t -subsys_attr_store(struct kobject * kobj, struct attribute * attr, - const char * page, size_t count) -{ - struct kset *kset = to_kset(kobj); - struct subsys_attribute * sattr = to_sattr(attr); - ssize_t ret = -EIO; - - if (sattr->store) - ret = sattr->store(kset, page, count); - return ret; -} - -static struct sysfs_ops subsys_sysfs_ops = { - .show = subsys_attr_show, - .store = subsys_attr_store, -}; - /* * There's one sysfs_buffer for each open file and one * sysfs_open_dirent for each sysfs_dirent with one or more open @@ -66,7 +29,7 @@ static struct sysfs_ops subsys_sysfs_ops = { * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open * is protected by sysfs_open_dirent_lock. */ -static spinlock_t sysfs_open_dirent_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(sysfs_open_dirent_lock); struct sysfs_open_dirent { atomic_t refcnt; @@ -354,31 +317,23 @@ static int sysfs_open_file(struct inode *inode, struct file *file) { struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - struct sysfs_buffer * buffer; - struct sysfs_ops * ops = NULL; - int error; + struct sysfs_buffer *buffer; + struct sysfs_ops *ops; + int error = -EACCES; /* need attr_sd for attr and ops, its parent for kobj */ if (!sysfs_get_active_two(attr_sd)) return -ENODEV; - /* if the kobject has no ktype, then we assume that it is a subsystem - * itself, and use ops for it. - */ - if (kobj->kset && kobj->kset->ktype) - ops = kobj->kset->ktype->sysfs_ops; - else if (kobj->ktype) + /* every kobject with an attribute needs a ktype assigned */ + if (kobj->ktype && kobj->ktype->sysfs_ops) ops = kobj->ktype->sysfs_ops; - else - ops = &subsys_sysfs_ops; - - error = -EACCES; - - /* No sysfs operations, either from having no subsystem, - * or the subsystem have no operations. - */ - if (!ops) + else { + printk(KERN_ERR "missing sysfs attribute operations for " + "kobject: %s\n", kobject_name(kobj)); + WARN_ON(1); goto err_out; + } /* File needs write support. * The inode's perms must say it's ok, diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 3eac20c..5f66c44 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -19,39 +19,6 @@ #include "sysfs.h" -static int object_depth(struct sysfs_dirent *sd) -{ - int depth = 0; - - for (; sd->s_parent; sd = sd->s_parent) - depth++; - - return depth; -} - -static int object_path_length(struct sysfs_dirent * sd) -{ - int length = 1; - - for (; sd->s_parent; sd = sd->s_parent) - length += strlen(sd->s_name) + 1; - - return length; -} - -static void fill_object_path(struct sysfs_dirent *sd, char *buffer, int length) -{ - --length; - for (; sd->s_parent; sd = sd->s_parent) { - int cur = strlen(sd->s_name); - - /* back up enough to print this bus id with '/' */ - length -= cur; - strncpy(buffer + length, sd->s_name, cur); - *(buffer + --length) = '/'; - } -} - /** * sysfs_create_link - create symlink between two objects. * @kobj: object whose directory we're creating the link in. @@ -112,7 +79,6 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char return error; } - /** * sysfs_remove_link - remove symlink in object's directory. * @kobj: object we're acting for. @@ -124,24 +90,54 @@ void sysfs_remove_link(struct kobject * kobj, const char * name) sysfs_hash_and_remove(kobj->sd, name); } -static int sysfs_get_target_path(struct sysfs_dirent * parent_sd, - struct sysfs_dirent * target_sd, char *path) +static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, + struct sysfs_dirent *target_sd, char *path) { - char * s; - int depth, size; + struct sysfs_dirent *base, *sd; + char *s = path; + int len = 0; + + /* go up to the root, stop at the base */ + base = parent_sd; + while (base->s_parent) { + sd = target_sd->s_parent; + while (sd->s_parent && base != sd) + sd = sd->s_parent; + + if (base == sd) + break; + + strcpy(s, "../"); + s += 3; + base = base->s_parent; + } + + /* determine end of target string for reverse fillup */ + sd = target_sd; + while (sd->s_parent && sd != base) { + len += strlen(sd->s_name) + 1; + sd = sd->s_parent; + } - depth = object_depth(parent_sd); - size = object_path_length(target_sd) + depth * 3 - 1; - if (size > PATH_MAX) + /* check limits */ + if (len < 2) + return -EINVAL; + len--; + if ((s - path) + len > PATH_MAX) return -ENAMETOOLONG; - pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size); + /* reverse fillup of target string from target to base */ + sd = target_sd; + while (sd->s_parent && sd != base) { + int slen = strlen(sd->s_name); - for (s = path; depth--; s += 3) - strcpy(s,"../"); + len -= slen; + strncpy(s + len, sd->s_name, slen); + if (len) + s[--len] = '/'; - fill_object_path(target_sd, path, size); - pr_debug("%s: path = '%s'\n", __FUNCTION__, path); + sd = sd->s_parent; + } return 0; } |